{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997983193277311, "eval_steps": 500, "global_step": 1859, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005378151260504202, "grad_norm": 4.937017440795898, "learning_rate": 2.688172043010753e-07, "loss": 1.3181, "step": 1 }, { "epoch": 0.0010756302521008404, "grad_norm": 5.109012603759766, "learning_rate": 5.376344086021506e-07, "loss": 1.3866, "step": 2 }, { "epoch": 0.0016134453781512605, "grad_norm": 4.604580879211426, "learning_rate": 8.064516129032258e-07, "loss": 1.3097, "step": 3 }, { "epoch": 0.0021512605042016807, "grad_norm": 5.282269477844238, "learning_rate": 1.0752688172043011e-06, "loss": 1.4689, "step": 4 }, { "epoch": 0.002689075630252101, "grad_norm": 4.910592555999756, "learning_rate": 1.3440860215053765e-06, "loss": 1.2829, "step": 5 }, { "epoch": 0.003226890756302521, "grad_norm": 4.68040657043457, "learning_rate": 1.6129032258064516e-06, "loss": 1.2692, "step": 6 }, { "epoch": 0.0037647058823529413, "grad_norm": 5.114385604858398, "learning_rate": 1.881720430107527e-06, "loss": 1.4047, "step": 7 }, { "epoch": 0.0043025210084033615, "grad_norm": 4.183392524719238, "learning_rate": 2.1505376344086023e-06, "loss": 1.2257, "step": 8 }, { "epoch": 0.004840336134453781, "grad_norm": 4.43973970413208, "learning_rate": 2.4193548387096776e-06, "loss": 1.2095, "step": 9 }, { "epoch": 0.005378151260504202, "grad_norm": 4.051783084869385, "learning_rate": 2.688172043010753e-06, "loss": 1.1394, "step": 10 }, { "epoch": 0.005915966386554622, "grad_norm": 4.523661136627197, "learning_rate": 2.9569892473118283e-06, "loss": 1.318, "step": 11 }, { "epoch": 0.006453781512605042, "grad_norm": 3.742058753967285, "learning_rate": 3.225806451612903e-06, "loss": 1.1366, "step": 12 }, { "epoch": 0.006991596638655462, "grad_norm": 3.91782808303833, "learning_rate": 3.4946236559139785e-06, "loss": 1.2175, "step": 13 }, { "epoch": 0.0075294117647058826, "grad_norm": 3.6185474395751953, "learning_rate": 3.763440860215054e-06, "loss": 1.1784, "step": 14 }, { "epoch": 0.008067226890756302, "grad_norm": 2.9413998126983643, "learning_rate": 4.032258064516129e-06, "loss": 1.0017, "step": 15 }, { "epoch": 0.008605042016806723, "grad_norm": 2.58379864692688, "learning_rate": 4.3010752688172045e-06, "loss": 0.936, "step": 16 }, { "epoch": 0.009142857142857144, "grad_norm": 2.4405081272125244, "learning_rate": 4.56989247311828e-06, "loss": 0.9726, "step": 17 }, { "epoch": 0.009680672268907562, "grad_norm": 2.0846762657165527, "learning_rate": 4.838709677419355e-06, "loss": 0.95, "step": 18 }, { "epoch": 0.010218487394957983, "grad_norm": 1.9459309577941895, "learning_rate": 5.1075268817204305e-06, "loss": 0.8736, "step": 19 }, { "epoch": 0.010756302521008404, "grad_norm": 1.6321192979812622, "learning_rate": 5.376344086021506e-06, "loss": 0.8658, "step": 20 }, { "epoch": 0.011294117647058824, "grad_norm": 1.704120397567749, "learning_rate": 5.64516129032258e-06, "loss": 0.8129, "step": 21 }, { "epoch": 0.011831932773109243, "grad_norm": 1.5241268873214722, "learning_rate": 5.9139784946236566e-06, "loss": 0.7752, "step": 22 }, { "epoch": 0.012369747899159664, "grad_norm": 1.7300010919570923, "learning_rate": 6.182795698924732e-06, "loss": 0.8952, "step": 23 }, { "epoch": 0.012907563025210084, "grad_norm": 1.5992904901504517, "learning_rate": 6.451612903225806e-06, "loss": 0.8105, "step": 24 }, { "epoch": 0.013445378151260505, "grad_norm": 1.4070361852645874, "learning_rate": 6.720430107526882e-06, "loss": 0.7197, "step": 25 }, { "epoch": 0.013983193277310924, "grad_norm": 1.4357038736343384, "learning_rate": 6.989247311827957e-06, "loss": 0.7314, "step": 26 }, { "epoch": 0.014521008403361345, "grad_norm": 1.1927409172058105, "learning_rate": 7.258064516129033e-06, "loss": 0.5567, "step": 27 }, { "epoch": 0.015058823529411765, "grad_norm": 1.3240512609481812, "learning_rate": 7.526881720430108e-06, "loss": 0.6843, "step": 28 }, { "epoch": 0.015596638655462186, "grad_norm": 1.1877593994140625, "learning_rate": 7.795698924731183e-06, "loss": 0.5861, "step": 29 }, { "epoch": 0.016134453781512605, "grad_norm": 1.172675609588623, "learning_rate": 8.064516129032258e-06, "loss": 0.64, "step": 30 }, { "epoch": 0.016672268907563025, "grad_norm": 1.0072952508926392, "learning_rate": 8.333333333333334e-06, "loss": 0.5194, "step": 31 }, { "epoch": 0.017210084033613446, "grad_norm": 0.9239219427108765, "learning_rate": 8.602150537634409e-06, "loss": 0.4914, "step": 32 }, { "epoch": 0.017747899159663866, "grad_norm": 0.7992980480194092, "learning_rate": 8.870967741935484e-06, "loss": 0.4762, "step": 33 }, { "epoch": 0.018285714285714287, "grad_norm": 0.8525938987731934, "learning_rate": 9.13978494623656e-06, "loss": 0.5301, "step": 34 }, { "epoch": 0.018823529411764704, "grad_norm": 0.6349143385887146, "learning_rate": 9.408602150537635e-06, "loss": 0.5018, "step": 35 }, { "epoch": 0.019361344537815125, "grad_norm": 0.6189707517623901, "learning_rate": 9.67741935483871e-06, "loss": 0.4809, "step": 36 }, { "epoch": 0.019899159663865545, "grad_norm": 0.4752948582172394, "learning_rate": 9.946236559139786e-06, "loss": 0.4221, "step": 37 }, { "epoch": 0.020436974789915966, "grad_norm": 0.5394821166992188, "learning_rate": 1.0215053763440861e-05, "loss": 0.4725, "step": 38 }, { "epoch": 0.020974789915966387, "grad_norm": 0.5483920574188232, "learning_rate": 1.0483870967741936e-05, "loss": 0.4877, "step": 39 }, { "epoch": 0.021512605042016807, "grad_norm": 0.5472224354743958, "learning_rate": 1.0752688172043012e-05, "loss": 0.5236, "step": 40 }, { "epoch": 0.022050420168067228, "grad_norm": 0.4507257342338562, "learning_rate": 1.1021505376344087e-05, "loss": 0.4615, "step": 41 }, { "epoch": 0.02258823529411765, "grad_norm": 0.48021742701530457, "learning_rate": 1.129032258064516e-05, "loss": 0.4244, "step": 42 }, { "epoch": 0.023126050420168066, "grad_norm": 0.404839426279068, "learning_rate": 1.1559139784946236e-05, "loss": 0.3701, "step": 43 }, { "epoch": 0.023663865546218486, "grad_norm": 0.46083515882492065, "learning_rate": 1.1827956989247313e-05, "loss": 0.4188, "step": 44 }, { "epoch": 0.024201680672268907, "grad_norm": 0.4987565875053406, "learning_rate": 1.2096774193548388e-05, "loss": 0.4059, "step": 45 }, { "epoch": 0.024739495798319328, "grad_norm": 0.4575386047363281, "learning_rate": 1.2365591397849464e-05, "loss": 0.3509, "step": 46 }, { "epoch": 0.025277310924369748, "grad_norm": 0.4604824483394623, "learning_rate": 1.2634408602150537e-05, "loss": 0.4452, "step": 47 }, { "epoch": 0.02581512605042017, "grad_norm": 0.4816606044769287, "learning_rate": 1.2903225806451613e-05, "loss": 0.4087, "step": 48 }, { "epoch": 0.02635294117647059, "grad_norm": 0.4182151257991791, "learning_rate": 1.3172043010752688e-05, "loss": 0.3575, "step": 49 }, { "epoch": 0.02689075630252101, "grad_norm": 0.43688341975212097, "learning_rate": 1.3440860215053763e-05, "loss": 0.3974, "step": 50 }, { "epoch": 0.027428571428571427, "grad_norm": 0.4294030964374542, "learning_rate": 1.3709677419354839e-05, "loss": 0.353, "step": 51 }, { "epoch": 0.027966386554621848, "grad_norm": 0.38825103640556335, "learning_rate": 1.3978494623655914e-05, "loss": 0.3267, "step": 52 }, { "epoch": 0.02850420168067227, "grad_norm": 0.3865625858306885, "learning_rate": 1.4247311827956991e-05, "loss": 0.3382, "step": 53 }, { "epoch": 0.02904201680672269, "grad_norm": 0.38936108350753784, "learning_rate": 1.4516129032258066e-05, "loss": 0.3045, "step": 54 }, { "epoch": 0.02957983193277311, "grad_norm": 0.47213318943977356, "learning_rate": 1.4784946236559142e-05, "loss": 0.3864, "step": 55 }, { "epoch": 0.03011764705882353, "grad_norm": 0.48206374049186707, "learning_rate": 1.5053763440860215e-05, "loss": 0.391, "step": 56 }, { "epoch": 0.03065546218487395, "grad_norm": 0.4362124800682068, "learning_rate": 1.5322580645161292e-05, "loss": 0.3856, "step": 57 }, { "epoch": 0.03119327731092437, "grad_norm": 0.4609488844871521, "learning_rate": 1.5591397849462366e-05, "loss": 0.328, "step": 58 }, { "epoch": 0.03173109243697479, "grad_norm": 0.36876922845840454, "learning_rate": 1.586021505376344e-05, "loss": 0.3766, "step": 59 }, { "epoch": 0.03226890756302521, "grad_norm": 0.35254091024398804, "learning_rate": 1.6129032258064517e-05, "loss": 0.3624, "step": 60 }, { "epoch": 0.03280672268907563, "grad_norm": 0.4049266278743744, "learning_rate": 1.639784946236559e-05, "loss": 0.3755, "step": 61 }, { "epoch": 0.03334453781512605, "grad_norm": 0.4366215765476227, "learning_rate": 1.6666666666666667e-05, "loss": 0.3573, "step": 62 }, { "epoch": 0.03388235294117647, "grad_norm": 0.34553131461143494, "learning_rate": 1.693548387096774e-05, "loss": 0.3205, "step": 63 }, { "epoch": 0.03442016806722689, "grad_norm": 0.38835960626602173, "learning_rate": 1.7204301075268818e-05, "loss": 0.3486, "step": 64 }, { "epoch": 0.03495798319327731, "grad_norm": 0.34760186076164246, "learning_rate": 1.7473118279569895e-05, "loss": 0.3571, "step": 65 }, { "epoch": 0.03549579831932773, "grad_norm": 0.34857356548309326, "learning_rate": 1.774193548387097e-05, "loss": 0.3364, "step": 66 }, { "epoch": 0.03603361344537815, "grad_norm": 0.3425612449645996, "learning_rate": 1.8010752688172046e-05, "loss": 0.3229, "step": 67 }, { "epoch": 0.036571428571428574, "grad_norm": 0.35244220495224, "learning_rate": 1.827956989247312e-05, "loss": 0.3606, "step": 68 }, { "epoch": 0.03710924369747899, "grad_norm": 0.39218392968177795, "learning_rate": 1.8548387096774193e-05, "loss": 0.3663, "step": 69 }, { "epoch": 0.03764705882352941, "grad_norm": 0.3076382875442505, "learning_rate": 1.881720430107527e-05, "loss": 0.3198, "step": 70 }, { "epoch": 0.03818487394957983, "grad_norm": 0.2868969440460205, "learning_rate": 1.9086021505376344e-05, "loss": 0.3233, "step": 71 }, { "epoch": 0.03872268907563025, "grad_norm": 0.36267876625061035, "learning_rate": 1.935483870967742e-05, "loss": 0.3934, "step": 72 }, { "epoch": 0.039260504201680674, "grad_norm": 0.34814783930778503, "learning_rate": 1.9623655913978494e-05, "loss": 0.353, "step": 73 }, { "epoch": 0.03979831932773109, "grad_norm": 0.33452489972114563, "learning_rate": 1.989247311827957e-05, "loss": 0.377, "step": 74 }, { "epoch": 0.040336134453781515, "grad_norm": 0.30888330936431885, "learning_rate": 2.0161290322580645e-05, "loss": 0.3301, "step": 75 }, { "epoch": 0.04087394957983193, "grad_norm": 0.2973141074180603, "learning_rate": 2.0430107526881722e-05, "loss": 0.2872, "step": 76 }, { "epoch": 0.041411764705882356, "grad_norm": 0.28969722986221313, "learning_rate": 2.06989247311828e-05, "loss": 0.3319, "step": 77 }, { "epoch": 0.04194957983193277, "grad_norm": 0.2909839153289795, "learning_rate": 2.0967741935483873e-05, "loss": 0.3356, "step": 78 }, { "epoch": 0.04248739495798319, "grad_norm": 0.32320553064346313, "learning_rate": 2.1236559139784946e-05, "loss": 0.343, "step": 79 }, { "epoch": 0.043025210084033615, "grad_norm": 0.31354013085365295, "learning_rate": 2.1505376344086024e-05, "loss": 0.2839, "step": 80 }, { "epoch": 0.04356302521008403, "grad_norm": 0.32794976234436035, "learning_rate": 2.1774193548387097e-05, "loss": 0.3293, "step": 81 }, { "epoch": 0.044100840336134456, "grad_norm": 0.26586076617240906, "learning_rate": 2.2043010752688174e-05, "loss": 0.3004, "step": 82 }, { "epoch": 0.04463865546218487, "grad_norm": 0.31163427233695984, "learning_rate": 2.2311827956989248e-05, "loss": 0.3828, "step": 83 }, { "epoch": 0.0451764705882353, "grad_norm": 0.30241164565086365, "learning_rate": 2.258064516129032e-05, "loss": 0.2991, "step": 84 }, { "epoch": 0.045714285714285714, "grad_norm": 0.29193735122680664, "learning_rate": 2.28494623655914e-05, "loss": 0.3527, "step": 85 }, { "epoch": 0.04625210084033613, "grad_norm": 0.31733816862106323, "learning_rate": 2.3118279569892472e-05, "loss": 0.3585, "step": 86 }, { "epoch": 0.046789915966386555, "grad_norm": 0.2820446789264679, "learning_rate": 2.338709677419355e-05, "loss": 0.3217, "step": 87 }, { "epoch": 0.04732773109243697, "grad_norm": 0.30359527468681335, "learning_rate": 2.3655913978494626e-05, "loss": 0.3326, "step": 88 }, { "epoch": 0.0478655462184874, "grad_norm": 0.2908801734447479, "learning_rate": 2.39247311827957e-05, "loss": 0.3038, "step": 89 }, { "epoch": 0.048403361344537814, "grad_norm": 0.28603604435920715, "learning_rate": 2.4193548387096777e-05, "loss": 0.3025, "step": 90 }, { "epoch": 0.04894117647058824, "grad_norm": 0.36291128396987915, "learning_rate": 2.446236559139785e-05, "loss": 0.361, "step": 91 }, { "epoch": 0.049478991596638655, "grad_norm": 0.28745701909065247, "learning_rate": 2.4731182795698928e-05, "loss": 0.3092, "step": 92 }, { "epoch": 0.05001680672268907, "grad_norm": 0.30134356021881104, "learning_rate": 2.5e-05, "loss": 0.3019, "step": 93 }, { "epoch": 0.050554621848739496, "grad_norm": 0.30430078506469727, "learning_rate": 2.5268817204301075e-05, "loss": 0.3396, "step": 94 }, { "epoch": 0.051092436974789913, "grad_norm": 0.31510689854621887, "learning_rate": 2.5537634408602152e-05, "loss": 0.2958, "step": 95 }, { "epoch": 0.05163025210084034, "grad_norm": 0.2963063716888428, "learning_rate": 2.5806451612903226e-05, "loss": 0.3184, "step": 96 }, { "epoch": 0.052168067226890755, "grad_norm": 0.2719542682170868, "learning_rate": 2.6075268817204303e-05, "loss": 0.2933, "step": 97 }, { "epoch": 0.05270588235294118, "grad_norm": 0.28713732957839966, "learning_rate": 2.6344086021505376e-05, "loss": 0.3139, "step": 98 }, { "epoch": 0.053243697478991596, "grad_norm": 0.37673336267471313, "learning_rate": 2.661290322580645e-05, "loss": 0.3803, "step": 99 }, { "epoch": 0.05378151260504202, "grad_norm": 0.2833888828754425, "learning_rate": 2.6881720430107527e-05, "loss": 0.3051, "step": 100 }, { "epoch": 0.05431932773109244, "grad_norm": 0.3063603937625885, "learning_rate": 2.71505376344086e-05, "loss": 0.3357, "step": 101 }, { "epoch": 0.054857142857142854, "grad_norm": 0.3169352412223816, "learning_rate": 2.7419354838709678e-05, "loss": 0.3272, "step": 102 }, { "epoch": 0.05539495798319328, "grad_norm": 0.29939785599708557, "learning_rate": 2.768817204301075e-05, "loss": 0.2994, "step": 103 }, { "epoch": 0.055932773109243696, "grad_norm": 0.31200557947158813, "learning_rate": 2.7956989247311828e-05, "loss": 0.3387, "step": 104 }, { "epoch": 0.05647058823529412, "grad_norm": 0.3000110685825348, "learning_rate": 2.822580645161291e-05, "loss": 0.312, "step": 105 }, { "epoch": 0.05700840336134454, "grad_norm": 0.2704688012599945, "learning_rate": 2.8494623655913982e-05, "loss": 0.3082, "step": 106 }, { "epoch": 0.05754621848739496, "grad_norm": 0.31490373611450195, "learning_rate": 2.8763440860215056e-05, "loss": 0.336, "step": 107 }, { "epoch": 0.05808403361344538, "grad_norm": 0.2622876465320587, "learning_rate": 2.9032258064516133e-05, "loss": 0.2774, "step": 108 }, { "epoch": 0.058621848739495795, "grad_norm": 0.39069652557373047, "learning_rate": 2.9301075268817207e-05, "loss": 0.3482, "step": 109 }, { "epoch": 0.05915966386554622, "grad_norm": 0.35069721937179565, "learning_rate": 2.9569892473118284e-05, "loss": 0.367, "step": 110 }, { "epoch": 0.059697478991596636, "grad_norm": 0.3116728365421295, "learning_rate": 2.9838709677419357e-05, "loss": 0.3202, "step": 111 }, { "epoch": 0.06023529411764706, "grad_norm": 0.2586362361907959, "learning_rate": 3.010752688172043e-05, "loss": 0.3013, "step": 112 }, { "epoch": 0.06077310924369748, "grad_norm": 0.27376314997673035, "learning_rate": 3.0376344086021508e-05, "loss": 0.2941, "step": 113 }, { "epoch": 0.0613109243697479, "grad_norm": 0.3034414052963257, "learning_rate": 3.0645161290322585e-05, "loss": 0.3081, "step": 114 }, { "epoch": 0.06184873949579832, "grad_norm": 0.2919054329395294, "learning_rate": 3.091397849462366e-05, "loss": 0.3224, "step": 115 }, { "epoch": 0.06238655462184874, "grad_norm": 0.3134850263595581, "learning_rate": 3.118279569892473e-05, "loss": 0.3258, "step": 116 }, { "epoch": 0.06292436974789915, "grad_norm": 0.2965404987335205, "learning_rate": 3.1451612903225806e-05, "loss": 0.3508, "step": 117 }, { "epoch": 0.06346218487394958, "grad_norm": 0.29333922266960144, "learning_rate": 3.172043010752688e-05, "loss": 0.3266, "step": 118 }, { "epoch": 0.064, "grad_norm": 0.2950519919395447, "learning_rate": 3.198924731182796e-05, "loss": 0.3291, "step": 119 }, { "epoch": 0.06453781512605042, "grad_norm": 0.2944244146347046, "learning_rate": 3.2258064516129034e-05, "loss": 0.291, "step": 120 }, { "epoch": 0.06507563025210084, "grad_norm": 0.3558066785335541, "learning_rate": 3.252688172043011e-05, "loss": 0.3377, "step": 121 }, { "epoch": 0.06561344537815127, "grad_norm": 0.22373619675636292, "learning_rate": 3.279569892473118e-05, "loss": 0.2355, "step": 122 }, { "epoch": 0.06615126050420168, "grad_norm": 0.2771933674812317, "learning_rate": 3.306451612903226e-05, "loss": 0.2613, "step": 123 }, { "epoch": 0.0666890756302521, "grad_norm": 0.29239460825920105, "learning_rate": 3.3333333333333335e-05, "loss": 0.3219, "step": 124 }, { "epoch": 0.06722689075630252, "grad_norm": 0.33710119128227234, "learning_rate": 3.360215053763441e-05, "loss": 0.3299, "step": 125 }, { "epoch": 0.06776470588235294, "grad_norm": 0.3195329010486603, "learning_rate": 3.387096774193548e-05, "loss": 0.2994, "step": 126 }, { "epoch": 0.06830252100840337, "grad_norm": 0.27329522371292114, "learning_rate": 3.4139784946236556e-05, "loss": 0.2849, "step": 127 }, { "epoch": 0.06884033613445378, "grad_norm": 0.3232622742652893, "learning_rate": 3.4408602150537636e-05, "loss": 0.262, "step": 128 }, { "epoch": 0.0693781512605042, "grad_norm": 0.28699207305908203, "learning_rate": 3.467741935483872e-05, "loss": 0.2961, "step": 129 }, { "epoch": 0.06991596638655462, "grad_norm": 0.288438081741333, "learning_rate": 3.494623655913979e-05, "loss": 0.2837, "step": 130 }, { "epoch": 0.07045378151260505, "grad_norm": 0.25113406777381897, "learning_rate": 3.5215053763440864e-05, "loss": 0.2582, "step": 131 }, { "epoch": 0.07099159663865547, "grad_norm": 0.29629459977149963, "learning_rate": 3.548387096774194e-05, "loss": 0.2779, "step": 132 }, { "epoch": 0.07152941176470588, "grad_norm": 0.315860778093338, "learning_rate": 3.575268817204301e-05, "loss": 0.3143, "step": 133 }, { "epoch": 0.0720672268907563, "grad_norm": 0.326097697019577, "learning_rate": 3.602150537634409e-05, "loss": 0.3531, "step": 134 }, { "epoch": 0.07260504201680672, "grad_norm": 0.2970331311225891, "learning_rate": 3.6290322580645165e-05, "loss": 0.2913, "step": 135 }, { "epoch": 0.07314285714285715, "grad_norm": 0.28848981857299805, "learning_rate": 3.655913978494624e-05, "loss": 0.294, "step": 136 }, { "epoch": 0.07368067226890757, "grad_norm": 0.30767518281936646, "learning_rate": 3.682795698924731e-05, "loss": 0.3041, "step": 137 }, { "epoch": 0.07421848739495798, "grad_norm": 0.28856533765792847, "learning_rate": 3.7096774193548386e-05, "loss": 0.3024, "step": 138 }, { "epoch": 0.0747563025210084, "grad_norm": 0.34398627281188965, "learning_rate": 3.736559139784947e-05, "loss": 0.3401, "step": 139 }, { "epoch": 0.07529411764705882, "grad_norm": 0.3126499652862549, "learning_rate": 3.763440860215054e-05, "loss": 0.2965, "step": 140 }, { "epoch": 0.07583193277310925, "grad_norm": 0.3086839020252228, "learning_rate": 3.7903225806451614e-05, "loss": 0.2919, "step": 141 }, { "epoch": 0.07636974789915967, "grad_norm": 0.2689349949359894, "learning_rate": 3.817204301075269e-05, "loss": 0.2736, "step": 142 }, { "epoch": 0.07690756302521008, "grad_norm": 0.2607412040233612, "learning_rate": 3.844086021505376e-05, "loss": 0.2568, "step": 143 }, { "epoch": 0.0774453781512605, "grad_norm": 0.3339359760284424, "learning_rate": 3.870967741935484e-05, "loss": 0.3592, "step": 144 }, { "epoch": 0.07798319327731093, "grad_norm": 0.3384217619895935, "learning_rate": 3.8978494623655915e-05, "loss": 0.3257, "step": 145 }, { "epoch": 0.07852100840336135, "grad_norm": 0.28650546073913574, "learning_rate": 3.924731182795699e-05, "loss": 0.3068, "step": 146 }, { "epoch": 0.07905882352941176, "grad_norm": 0.32297849655151367, "learning_rate": 3.951612903225806e-05, "loss": 0.2888, "step": 147 }, { "epoch": 0.07959663865546218, "grad_norm": 0.33793962001800537, "learning_rate": 3.978494623655914e-05, "loss": 0.3196, "step": 148 }, { "epoch": 0.0801344537815126, "grad_norm": 0.29474982619285583, "learning_rate": 4.005376344086022e-05, "loss": 0.3048, "step": 149 }, { "epoch": 0.08067226890756303, "grad_norm": 0.30823761224746704, "learning_rate": 4.032258064516129e-05, "loss": 0.406, "step": 150 }, { "epoch": 0.08121008403361345, "grad_norm": 0.2610779106616974, "learning_rate": 4.0591397849462364e-05, "loss": 0.2842, "step": 151 }, { "epoch": 0.08174789915966386, "grad_norm": 0.29009851813316345, "learning_rate": 4.0860215053763444e-05, "loss": 0.3147, "step": 152 }, { "epoch": 0.08228571428571428, "grad_norm": 0.3253105580806732, "learning_rate": 4.112903225806452e-05, "loss": 0.323, "step": 153 }, { "epoch": 0.08282352941176471, "grad_norm": 0.30503425002098083, "learning_rate": 4.13978494623656e-05, "loss": 0.2583, "step": 154 }, { "epoch": 0.08336134453781513, "grad_norm": 0.29400327801704407, "learning_rate": 4.166666666666667e-05, "loss": 0.3272, "step": 155 }, { "epoch": 0.08389915966386555, "grad_norm": 0.2912876605987549, "learning_rate": 4.1935483870967746e-05, "loss": 0.2949, "step": 156 }, { "epoch": 0.08443697478991596, "grad_norm": 0.31227219104766846, "learning_rate": 4.220430107526882e-05, "loss": 0.2999, "step": 157 }, { "epoch": 0.08497478991596638, "grad_norm": 0.3412877023220062, "learning_rate": 4.247311827956989e-05, "loss": 0.3154, "step": 158 }, { "epoch": 0.08551260504201681, "grad_norm": 0.27500101923942566, "learning_rate": 4.2741935483870973e-05, "loss": 0.2788, "step": 159 }, { "epoch": 0.08605042016806723, "grad_norm": 0.3277937173843384, "learning_rate": 4.301075268817205e-05, "loss": 0.2836, "step": 160 }, { "epoch": 0.08658823529411765, "grad_norm": 0.3013443648815155, "learning_rate": 4.327956989247312e-05, "loss": 0.2846, "step": 161 }, { "epoch": 0.08712605042016806, "grad_norm": 0.36392027139663696, "learning_rate": 4.3548387096774194e-05, "loss": 0.345, "step": 162 }, { "epoch": 0.08766386554621848, "grad_norm": 0.3518577218055725, "learning_rate": 4.381720430107527e-05, "loss": 0.29, "step": 163 }, { "epoch": 0.08820168067226891, "grad_norm": 0.35187238454818726, "learning_rate": 4.408602150537635e-05, "loss": 0.299, "step": 164 }, { "epoch": 0.08873949579831933, "grad_norm": 0.3398917615413666, "learning_rate": 4.435483870967742e-05, "loss": 0.3271, "step": 165 }, { "epoch": 0.08927731092436975, "grad_norm": 0.2711006999015808, "learning_rate": 4.4623655913978496e-05, "loss": 0.2478, "step": 166 }, { "epoch": 0.08981512605042016, "grad_norm": 0.26182228326797485, "learning_rate": 4.489247311827957e-05, "loss": 0.3053, "step": 167 }, { "epoch": 0.0903529411764706, "grad_norm": 0.3300965130329132, "learning_rate": 4.516129032258064e-05, "loss": 0.296, "step": 168 }, { "epoch": 0.09089075630252101, "grad_norm": 0.29774487018585205, "learning_rate": 4.543010752688172e-05, "loss": 0.2929, "step": 169 }, { "epoch": 0.09142857142857143, "grad_norm": 0.28421086072921753, "learning_rate": 4.56989247311828e-05, "loss": 0.2957, "step": 170 }, { "epoch": 0.09196638655462185, "grad_norm": 0.3338155448436737, "learning_rate": 4.596774193548387e-05, "loss": 0.3345, "step": 171 }, { "epoch": 0.09250420168067226, "grad_norm": 0.2805405855178833, "learning_rate": 4.6236559139784944e-05, "loss": 0.2787, "step": 172 }, { "epoch": 0.0930420168067227, "grad_norm": 0.30192992091178894, "learning_rate": 4.650537634408602e-05, "loss": 0.3175, "step": 173 }, { "epoch": 0.09357983193277311, "grad_norm": 0.3377910554409027, "learning_rate": 4.67741935483871e-05, "loss": 0.3192, "step": 174 }, { "epoch": 0.09411764705882353, "grad_norm": 0.3049200177192688, "learning_rate": 4.704301075268818e-05, "loss": 0.2912, "step": 175 }, { "epoch": 0.09465546218487395, "grad_norm": 0.3491119146347046, "learning_rate": 4.731182795698925e-05, "loss": 0.3242, "step": 176 }, { "epoch": 0.09519327731092438, "grad_norm": 0.30324605107307434, "learning_rate": 4.7580645161290326e-05, "loss": 0.2678, "step": 177 }, { "epoch": 0.0957310924369748, "grad_norm": 0.3010377287864685, "learning_rate": 4.78494623655914e-05, "loss": 0.3086, "step": 178 }, { "epoch": 0.09626890756302521, "grad_norm": 0.2898170053958893, "learning_rate": 4.811827956989248e-05, "loss": 0.2706, "step": 179 }, { "epoch": 0.09680672268907563, "grad_norm": 0.33306750655174255, "learning_rate": 4.8387096774193554e-05, "loss": 0.3561, "step": 180 }, { "epoch": 0.09734453781512604, "grad_norm": 0.2879464626312256, "learning_rate": 4.865591397849463e-05, "loss": 0.2773, "step": 181 }, { "epoch": 0.09788235294117648, "grad_norm": 0.30047672986984253, "learning_rate": 4.89247311827957e-05, "loss": 0.2728, "step": 182 }, { "epoch": 0.09842016806722689, "grad_norm": 0.27168819308280945, "learning_rate": 4.9193548387096775e-05, "loss": 0.2336, "step": 183 }, { "epoch": 0.09895798319327731, "grad_norm": 0.29804688692092896, "learning_rate": 4.9462365591397855e-05, "loss": 0.2777, "step": 184 }, { "epoch": 0.09949579831932773, "grad_norm": 0.28282034397125244, "learning_rate": 4.973118279569893e-05, "loss": 0.2777, "step": 185 }, { "epoch": 0.10003361344537814, "grad_norm": 0.323350727558136, "learning_rate": 5e-05, "loss": 0.3117, "step": 186 }, { "epoch": 0.10057142857142858, "grad_norm": 0.27245283126831055, "learning_rate": 4.999995592241934e-05, "loss": 0.2892, "step": 187 }, { "epoch": 0.10110924369747899, "grad_norm": 0.2725326418876648, "learning_rate": 4.9999823689832785e-05, "loss": 0.2701, "step": 188 }, { "epoch": 0.10164705882352941, "grad_norm": 0.2738107442855835, "learning_rate": 4.9999603302706614e-05, "loss": 0.2814, "step": 189 }, { "epoch": 0.10218487394957983, "grad_norm": 0.3000781834125519, "learning_rate": 4.999929476181796e-05, "loss": 0.3087, "step": 190 }, { "epoch": 0.10272268907563026, "grad_norm": 0.29324737191200256, "learning_rate": 4.99988980682548e-05, "loss": 0.2819, "step": 191 }, { "epoch": 0.10326050420168068, "grad_norm": 0.3278168737888336, "learning_rate": 4.999841322341595e-05, "loss": 0.3424, "step": 192 }, { "epoch": 0.10379831932773109, "grad_norm": 0.29901933670043945, "learning_rate": 4.9997840229011085e-05, "loss": 0.2999, "step": 193 }, { "epoch": 0.10433613445378151, "grad_norm": 0.30633270740509033, "learning_rate": 4.9997179087060695e-05, "loss": 0.3098, "step": 194 }, { "epoch": 0.10487394957983193, "grad_norm": 0.2496650665998459, "learning_rate": 4.999642979989611e-05, "loss": 0.2257, "step": 195 }, { "epoch": 0.10541176470588236, "grad_norm": 0.28163766860961914, "learning_rate": 4.999559237015946e-05, "loss": 0.2687, "step": 196 }, { "epoch": 0.10594957983193277, "grad_norm": 0.299674391746521, "learning_rate": 4.9994666800803716e-05, "loss": 0.3095, "step": 197 }, { "epoch": 0.10648739495798319, "grad_norm": 0.27861109375953674, "learning_rate": 4.9993653095092594e-05, "loss": 0.2719, "step": 198 }, { "epoch": 0.10702521008403361, "grad_norm": 0.2727408707141876, "learning_rate": 4.999255125660065e-05, "loss": 0.2513, "step": 199 }, { "epoch": 0.10756302521008404, "grad_norm": 0.25216174125671387, "learning_rate": 4.9991361289213203e-05, "loss": 0.2271, "step": 200 }, { "epoch": 0.10810084033613446, "grad_norm": 0.27564382553100586, "learning_rate": 4.99900831971263e-05, "loss": 0.2753, "step": 201 }, { "epoch": 0.10863865546218487, "grad_norm": 0.32505062222480774, "learning_rate": 4.9988716984846775e-05, "loss": 0.2991, "step": 202 }, { "epoch": 0.10917647058823529, "grad_norm": 0.30531519651412964, "learning_rate": 4.998726265719217e-05, "loss": 0.2907, "step": 203 }, { "epoch": 0.10971428571428571, "grad_norm": 0.24494339525699615, "learning_rate": 4.9985720219290745e-05, "loss": 0.2407, "step": 204 }, { "epoch": 0.11025210084033614, "grad_norm": 0.25707924365997314, "learning_rate": 4.998408967658145e-05, "loss": 0.2745, "step": 205 }, { "epoch": 0.11078991596638656, "grad_norm": 0.25832727551460266, "learning_rate": 4.998237103481391e-05, "loss": 0.2873, "step": 206 }, { "epoch": 0.11132773109243697, "grad_norm": 0.29838934540748596, "learning_rate": 4.998056430004844e-05, "loss": 0.3395, "step": 207 }, { "epoch": 0.11186554621848739, "grad_norm": 0.26841628551483154, "learning_rate": 4.9978669478655925e-05, "loss": 0.2914, "step": 208 }, { "epoch": 0.11240336134453781, "grad_norm": 0.2618709206581116, "learning_rate": 4.997668657731791e-05, "loss": 0.284, "step": 209 }, { "epoch": 0.11294117647058824, "grad_norm": 0.25725850462913513, "learning_rate": 4.997461560302652e-05, "loss": 0.2932, "step": 210 }, { "epoch": 0.11347899159663866, "grad_norm": 0.283725768327713, "learning_rate": 4.997245656308443e-05, "loss": 0.2979, "step": 211 }, { "epoch": 0.11401680672268907, "grad_norm": 0.27782928943634033, "learning_rate": 4.9970209465104866e-05, "loss": 0.3252, "step": 212 }, { "epoch": 0.11455462184873949, "grad_norm": 0.3113950490951538, "learning_rate": 4.9967874317011556e-05, "loss": 0.3345, "step": 213 }, { "epoch": 0.11509243697478992, "grad_norm": 0.28892624378204346, "learning_rate": 4.9965451127038714e-05, "loss": 0.3224, "step": 214 }, { "epoch": 0.11563025210084034, "grad_norm": 0.32412204146385193, "learning_rate": 4.996293990373101e-05, "loss": 0.3322, "step": 215 }, { "epoch": 0.11616806722689076, "grad_norm": 0.26274049282073975, "learning_rate": 4.996034065594354e-05, "loss": 0.2657, "step": 216 }, { "epoch": 0.11670588235294117, "grad_norm": 0.27728816866874695, "learning_rate": 4.9957653392841774e-05, "loss": 0.3226, "step": 217 }, { "epoch": 0.11724369747899159, "grad_norm": 0.2501899003982544, "learning_rate": 4.995487812390157e-05, "loss": 0.2802, "step": 218 }, { "epoch": 0.11778151260504202, "grad_norm": 0.24002771079540253, "learning_rate": 4.995201485890909e-05, "loss": 0.2408, "step": 219 }, { "epoch": 0.11831932773109244, "grad_norm": 0.2784670889377594, "learning_rate": 4.994906360796081e-05, "loss": 0.3177, "step": 220 }, { "epoch": 0.11885714285714286, "grad_norm": 0.31582459807395935, "learning_rate": 4.994602438146344e-05, "loss": 0.3042, "step": 221 }, { "epoch": 0.11939495798319327, "grad_norm": 0.2620047330856323, "learning_rate": 4.994289719013393e-05, "loss": 0.2798, "step": 222 }, { "epoch": 0.1199327731092437, "grad_norm": 0.2757277190685272, "learning_rate": 4.993968204499939e-05, "loss": 0.3082, "step": 223 }, { "epoch": 0.12047058823529412, "grad_norm": 0.3029164969921112, "learning_rate": 4.993637895739708e-05, "loss": 0.294, "step": 224 }, { "epoch": 0.12100840336134454, "grad_norm": 0.28546348214149475, "learning_rate": 4.99329879389744e-05, "loss": 0.2912, "step": 225 }, { "epoch": 0.12154621848739496, "grad_norm": 0.24452543258666992, "learning_rate": 4.992950900168875e-05, "loss": 0.2805, "step": 226 }, { "epoch": 0.12208403361344537, "grad_norm": 0.27994397282600403, "learning_rate": 4.992594215780758e-05, "loss": 0.2931, "step": 227 }, { "epoch": 0.1226218487394958, "grad_norm": 0.2986987233161926, "learning_rate": 4.992228741990834e-05, "loss": 0.3221, "step": 228 }, { "epoch": 0.12315966386554622, "grad_norm": 0.2799331247806549, "learning_rate": 4.9918544800878386e-05, "loss": 0.3141, "step": 229 }, { "epoch": 0.12369747899159664, "grad_norm": 0.22995354235172272, "learning_rate": 4.9914714313914954e-05, "loss": 0.2772, "step": 230 }, { "epoch": 0.12423529411764705, "grad_norm": 0.25273454189300537, "learning_rate": 4.991079597252514e-05, "loss": 0.291, "step": 231 }, { "epoch": 0.12477310924369749, "grad_norm": 0.2705329954624176, "learning_rate": 4.990678979052581e-05, "loss": 0.2552, "step": 232 }, { "epoch": 0.1253109243697479, "grad_norm": 0.2735231816768646, "learning_rate": 4.9902695782043605e-05, "loss": 0.2697, "step": 233 }, { "epoch": 0.1258487394957983, "grad_norm": 0.2653118073940277, "learning_rate": 4.989851396151485e-05, "loss": 0.2558, "step": 234 }, { "epoch": 0.12638655462184875, "grad_norm": 0.31132930517196655, "learning_rate": 4.989424434368549e-05, "loss": 0.2994, "step": 235 }, { "epoch": 0.12692436974789917, "grad_norm": 0.2758411467075348, "learning_rate": 4.988988694361109e-05, "loss": 0.2977, "step": 236 }, { "epoch": 0.12746218487394959, "grad_norm": 0.2935027778148651, "learning_rate": 4.9885441776656734e-05, "loss": 0.3054, "step": 237 }, { "epoch": 0.128, "grad_norm": 0.2782260775566101, "learning_rate": 4.9880908858497e-05, "loss": 0.3021, "step": 238 }, { "epoch": 0.12853781512605042, "grad_norm": 0.26450204849243164, "learning_rate": 4.987628820511591e-05, "loss": 0.2792, "step": 239 }, { "epoch": 0.12907563025210084, "grad_norm": 0.2428375482559204, "learning_rate": 4.987157983280681e-05, "loss": 0.2633, "step": 240 }, { "epoch": 0.12961344537815125, "grad_norm": 0.31785255670547485, "learning_rate": 4.986678375817242e-05, "loss": 0.3359, "step": 241 }, { "epoch": 0.13015126050420167, "grad_norm": 0.2536437511444092, "learning_rate": 4.986189999812468e-05, "loss": 0.2581, "step": 242 }, { "epoch": 0.1306890756302521, "grad_norm": 0.24715927243232727, "learning_rate": 4.985692856988473e-05, "loss": 0.2648, "step": 243 }, { "epoch": 0.13122689075630253, "grad_norm": 0.2812165915966034, "learning_rate": 4.985186949098287e-05, "loss": 0.3028, "step": 244 }, { "epoch": 0.13176470588235295, "grad_norm": 0.27538153529167175, "learning_rate": 4.984672277925844e-05, "loss": 0.3095, "step": 245 }, { "epoch": 0.13230252100840337, "grad_norm": 0.2800685167312622, "learning_rate": 4.984148845285981e-05, "loss": 0.2877, "step": 246 }, { "epoch": 0.13284033613445378, "grad_norm": 0.2819519340991974, "learning_rate": 4.98361665302443e-05, "loss": 0.3183, "step": 247 }, { "epoch": 0.1333781512605042, "grad_norm": 0.25874051451683044, "learning_rate": 4.983075703017811e-05, "loss": 0.261, "step": 248 }, { "epoch": 0.13391596638655462, "grad_norm": 0.27959126234054565, "learning_rate": 4.982525997173625e-05, "loss": 0.3111, "step": 249 }, { "epoch": 0.13445378151260504, "grad_norm": 0.24757210910320282, "learning_rate": 4.9819675374302475e-05, "loss": 0.2591, "step": 250 }, { "epoch": 0.13499159663865545, "grad_norm": 0.2646350860595703, "learning_rate": 4.981400325756925e-05, "loss": 0.2674, "step": 251 }, { "epoch": 0.13552941176470587, "grad_norm": 0.27526751160621643, "learning_rate": 4.980824364153761e-05, "loss": 0.2617, "step": 252 }, { "epoch": 0.13606722689075632, "grad_norm": 0.24986301362514496, "learning_rate": 4.980239654651716e-05, "loss": 0.275, "step": 253 }, { "epoch": 0.13660504201680673, "grad_norm": 0.25978991389274597, "learning_rate": 4.979646199312596e-05, "loss": 0.2893, "step": 254 }, { "epoch": 0.13714285714285715, "grad_norm": 0.2505474090576172, "learning_rate": 4.979044000229047e-05, "loss": 0.2597, "step": 255 }, { "epoch": 0.13768067226890757, "grad_norm": 0.29352056980133057, "learning_rate": 4.978433059524548e-05, "loss": 0.3432, "step": 256 }, { "epoch": 0.13821848739495798, "grad_norm": 0.22724519670009613, "learning_rate": 4.977813379353401e-05, "loss": 0.254, "step": 257 }, { "epoch": 0.1387563025210084, "grad_norm": 0.2791183292865753, "learning_rate": 4.9771849619007264e-05, "loss": 0.2568, "step": 258 }, { "epoch": 0.13929411764705882, "grad_norm": 0.2823403775691986, "learning_rate": 4.976547809382455e-05, "loss": 0.2951, "step": 259 }, { "epoch": 0.13983193277310924, "grad_norm": 0.2597982883453369, "learning_rate": 4.9759019240453166e-05, "loss": 0.3015, "step": 260 }, { "epoch": 0.14036974789915965, "grad_norm": 0.2825731039047241, "learning_rate": 4.975247308166838e-05, "loss": 0.2841, "step": 261 }, { "epoch": 0.1409075630252101, "grad_norm": 0.2528793513774872, "learning_rate": 4.9745839640553285e-05, "loss": 0.2525, "step": 262 }, { "epoch": 0.14144537815126051, "grad_norm": 0.27138614654541016, "learning_rate": 4.9739118940498766e-05, "loss": 0.2992, "step": 263 }, { "epoch": 0.14198319327731093, "grad_norm": 0.24798326194286346, "learning_rate": 4.97323110052034e-05, "loss": 0.261, "step": 264 }, { "epoch": 0.14252100840336135, "grad_norm": 0.27130061388015747, "learning_rate": 4.972541585867337e-05, "loss": 0.3285, "step": 265 }, { "epoch": 0.14305882352941177, "grad_norm": 0.2374134063720703, "learning_rate": 4.9718433525222395e-05, "loss": 0.2834, "step": 266 }, { "epoch": 0.14359663865546218, "grad_norm": 0.24421246349811554, "learning_rate": 4.971136402947162e-05, "loss": 0.2404, "step": 267 }, { "epoch": 0.1441344537815126, "grad_norm": 0.24347254633903503, "learning_rate": 4.970420739634954e-05, "loss": 0.2682, "step": 268 }, { "epoch": 0.14467226890756302, "grad_norm": 0.2449118196964264, "learning_rate": 4.9696963651091933e-05, "loss": 0.267, "step": 269 }, { "epoch": 0.14521008403361343, "grad_norm": 0.24414022266864777, "learning_rate": 4.968963281924173e-05, "loss": 0.2763, "step": 270 }, { "epoch": 0.14574789915966385, "grad_norm": 0.29989054799079895, "learning_rate": 4.968221492664896e-05, "loss": 0.2879, "step": 271 }, { "epoch": 0.1462857142857143, "grad_norm": 0.25095444917678833, "learning_rate": 4.967470999947064e-05, "loss": 0.2824, "step": 272 }, { "epoch": 0.1468235294117647, "grad_norm": 0.2392272651195526, "learning_rate": 4.96671180641707e-05, "loss": 0.2756, "step": 273 }, { "epoch": 0.14736134453781513, "grad_norm": 0.28284692764282227, "learning_rate": 4.9659439147519876e-05, "loss": 0.2889, "step": 274 }, { "epoch": 0.14789915966386555, "grad_norm": 0.24970386922359467, "learning_rate": 4.96516732765956e-05, "loss": 0.264, "step": 275 }, { "epoch": 0.14843697478991597, "grad_norm": 0.26660749316215515, "learning_rate": 4.964382047878195e-05, "loss": 0.2955, "step": 276 }, { "epoch": 0.14897478991596638, "grad_norm": 0.2941673696041107, "learning_rate": 4.9635880781769495e-05, "loss": 0.319, "step": 277 }, { "epoch": 0.1495126050420168, "grad_norm": 0.22710421681404114, "learning_rate": 4.9627854213555255e-05, "loss": 0.2481, "step": 278 }, { "epoch": 0.15005042016806722, "grad_norm": 0.3060409724712372, "learning_rate": 4.961974080244257e-05, "loss": 0.3255, "step": 279 }, { "epoch": 0.15058823529411763, "grad_norm": 0.2574678659439087, "learning_rate": 4.9611540577041006e-05, "loss": 0.3166, "step": 280 }, { "epoch": 0.15112605042016808, "grad_norm": 0.26686081290245056, "learning_rate": 4.9603253566266236e-05, "loss": 0.2745, "step": 281 }, { "epoch": 0.1516638655462185, "grad_norm": 0.25216084718704224, "learning_rate": 4.9594879799339976e-05, "loss": 0.2764, "step": 282 }, { "epoch": 0.1522016806722689, "grad_norm": 0.24703432619571686, "learning_rate": 4.9586419305789874e-05, "loss": 0.2812, "step": 283 }, { "epoch": 0.15273949579831933, "grad_norm": 0.2820897102355957, "learning_rate": 4.957787211544935e-05, "loss": 0.3062, "step": 284 }, { "epoch": 0.15327731092436975, "grad_norm": 0.23326194286346436, "learning_rate": 4.9569238258457586e-05, "loss": 0.2893, "step": 285 }, { "epoch": 0.15381512605042016, "grad_norm": 0.24942664802074432, "learning_rate": 4.9560517765259323e-05, "loss": 0.2688, "step": 286 }, { "epoch": 0.15435294117647058, "grad_norm": 0.253323495388031, "learning_rate": 4.9551710666604836e-05, "loss": 0.2775, "step": 287 }, { "epoch": 0.154890756302521, "grad_norm": 0.2614035904407501, "learning_rate": 4.9542816993549765e-05, "loss": 0.3235, "step": 288 }, { "epoch": 0.15542857142857142, "grad_norm": 0.252093642950058, "learning_rate": 4.953383677745505e-05, "loss": 0.2985, "step": 289 }, { "epoch": 0.15596638655462186, "grad_norm": 0.1951778680086136, "learning_rate": 4.952477004998676e-05, "loss": 0.234, "step": 290 }, { "epoch": 0.15650420168067228, "grad_norm": 0.27810660004615784, "learning_rate": 4.951561684311608e-05, "loss": 0.3028, "step": 291 }, { "epoch": 0.1570420168067227, "grad_norm": 0.24016375839710236, "learning_rate": 4.950637718911908e-05, "loss": 0.2457, "step": 292 }, { "epoch": 0.1575798319327731, "grad_norm": 0.2513678967952728, "learning_rate": 4.94970511205767e-05, "loss": 0.2926, "step": 293 }, { "epoch": 0.15811764705882353, "grad_norm": 0.26329195499420166, "learning_rate": 4.948763867037459e-05, "loss": 0.289, "step": 294 }, { "epoch": 0.15865546218487395, "grad_norm": 0.2394401580095291, "learning_rate": 4.947813987170298e-05, "loss": 0.2518, "step": 295 }, { "epoch": 0.15919327731092436, "grad_norm": 0.27771392464637756, "learning_rate": 4.9468554758056605e-05, "loss": 0.3295, "step": 296 }, { "epoch": 0.15973109243697478, "grad_norm": 0.257930189371109, "learning_rate": 4.945888336323455e-05, "loss": 0.2889, "step": 297 }, { "epoch": 0.1602689075630252, "grad_norm": 0.2435213178396225, "learning_rate": 4.9449125721340145e-05, "loss": 0.2773, "step": 298 }, { "epoch": 0.16080672268907564, "grad_norm": 0.22850783169269562, "learning_rate": 4.943928186678086e-05, "loss": 0.2413, "step": 299 }, { "epoch": 0.16134453781512606, "grad_norm": 0.23204346001148224, "learning_rate": 4.9429351834268145e-05, "loss": 0.2708, "step": 300 }, { "epoch": 0.16188235294117648, "grad_norm": 0.24790240824222565, "learning_rate": 4.941933565881736e-05, "loss": 0.2503, "step": 301 }, { "epoch": 0.1624201680672269, "grad_norm": 0.2386782020330429, "learning_rate": 4.94092333757476e-05, "loss": 0.2638, "step": 302 }, { "epoch": 0.1629579831932773, "grad_norm": 0.2794756591320038, "learning_rate": 4.93990450206816e-05, "loss": 0.2986, "step": 303 }, { "epoch": 0.16349579831932773, "grad_norm": 0.24725879728794098, "learning_rate": 4.93887706295456e-05, "loss": 0.3055, "step": 304 }, { "epoch": 0.16403361344537815, "grad_norm": 0.2441127747297287, "learning_rate": 4.937841023856923e-05, "loss": 0.2716, "step": 305 }, { "epoch": 0.16457142857142856, "grad_norm": 0.22461718320846558, "learning_rate": 4.936796388428537e-05, "loss": 0.2467, "step": 306 }, { "epoch": 0.16510924369747898, "grad_norm": 0.26115694642066956, "learning_rate": 4.935743160353002e-05, "loss": 0.3129, "step": 307 }, { "epoch": 0.16564705882352943, "grad_norm": 0.22991955280303955, "learning_rate": 4.934681343344217e-05, "loss": 0.2511, "step": 308 }, { "epoch": 0.16618487394957984, "grad_norm": 0.23744237422943115, "learning_rate": 4.9336109411463684e-05, "loss": 0.253, "step": 309 }, { "epoch": 0.16672268907563026, "grad_norm": 0.26600411534309387, "learning_rate": 4.932531957533916e-05, "loss": 0.309, "step": 310 }, { "epoch": 0.16726050420168068, "grad_norm": 0.2304258495569229, "learning_rate": 4.931444396311578e-05, "loss": 0.2719, "step": 311 }, { "epoch": 0.1677983193277311, "grad_norm": 0.28674209117889404, "learning_rate": 4.9303482613143194e-05, "loss": 0.3258, "step": 312 }, { "epoch": 0.1683361344537815, "grad_norm": 0.2850836515426636, "learning_rate": 4.929243556407339e-05, "loss": 0.2924, "step": 313 }, { "epoch": 0.16887394957983193, "grad_norm": 0.240781769156456, "learning_rate": 4.9281302854860545e-05, "loss": 0.2852, "step": 314 }, { "epoch": 0.16941176470588235, "grad_norm": 0.2284785658121109, "learning_rate": 4.9270084524760896e-05, "loss": 0.2449, "step": 315 }, { "epoch": 0.16994957983193276, "grad_norm": 0.27321118116378784, "learning_rate": 4.9258780613332575e-05, "loss": 0.2938, "step": 316 }, { "epoch": 0.1704873949579832, "grad_norm": 0.27485835552215576, "learning_rate": 4.924739116043552e-05, "loss": 0.2896, "step": 317 }, { "epoch": 0.17102521008403362, "grad_norm": 0.25155866146087646, "learning_rate": 4.923591620623129e-05, "loss": 0.2627, "step": 318 }, { "epoch": 0.17156302521008404, "grad_norm": 0.2596929371356964, "learning_rate": 4.9224355791182955e-05, "loss": 0.2793, "step": 319 }, { "epoch": 0.17210084033613446, "grad_norm": 0.2791350185871124, "learning_rate": 4.92127099560549e-05, "loss": 0.2801, "step": 320 }, { "epoch": 0.17263865546218488, "grad_norm": 0.2993616759777069, "learning_rate": 4.9200978741912755e-05, "loss": 0.3432, "step": 321 }, { "epoch": 0.1731764705882353, "grad_norm": 0.24144968390464783, "learning_rate": 4.918916219012321e-05, "loss": 0.2736, "step": 322 }, { "epoch": 0.1737142857142857, "grad_norm": 0.2616193890571594, "learning_rate": 4.9177260342353856e-05, "loss": 0.2652, "step": 323 }, { "epoch": 0.17425210084033613, "grad_norm": 0.2990521490573883, "learning_rate": 4.916527324057307e-05, "loss": 0.2985, "step": 324 }, { "epoch": 0.17478991596638654, "grad_norm": 0.27391839027404785, "learning_rate": 4.915320092704986e-05, "loss": 0.33, "step": 325 }, { "epoch": 0.17532773109243696, "grad_norm": 0.2536241114139557, "learning_rate": 4.9141043444353674e-05, "loss": 0.2866, "step": 326 }, { "epoch": 0.1758655462184874, "grad_norm": 0.249501571059227, "learning_rate": 4.912880083535432e-05, "loss": 0.2561, "step": 327 }, { "epoch": 0.17640336134453782, "grad_norm": 0.26134249567985535, "learning_rate": 4.9116473143221754e-05, "loss": 0.2755, "step": 328 }, { "epoch": 0.17694117647058824, "grad_norm": 0.27299821376800537, "learning_rate": 4.910406041142597e-05, "loss": 0.2976, "step": 329 }, { "epoch": 0.17747899159663866, "grad_norm": 0.2507427930831909, "learning_rate": 4.909156268373683e-05, "loss": 0.2765, "step": 330 }, { "epoch": 0.17801680672268907, "grad_norm": 0.2515356242656708, "learning_rate": 4.907898000422389e-05, "loss": 0.2716, "step": 331 }, { "epoch": 0.1785546218487395, "grad_norm": 0.25549665093421936, "learning_rate": 4.9066312417256284e-05, "loss": 0.2686, "step": 332 }, { "epoch": 0.1790924369747899, "grad_norm": 0.251264363527298, "learning_rate": 4.9053559967502535e-05, "loss": 0.2594, "step": 333 }, { "epoch": 0.17963025210084033, "grad_norm": 0.25811612606048584, "learning_rate": 4.9040722699930416e-05, "loss": 0.2737, "step": 334 }, { "epoch": 0.18016806722689074, "grad_norm": 0.2533648908138275, "learning_rate": 4.9027800659806775e-05, "loss": 0.2696, "step": 335 }, { "epoch": 0.1807058823529412, "grad_norm": 0.2561922073364258, "learning_rate": 4.90147938926974e-05, "loss": 0.2507, "step": 336 }, { "epoch": 0.1812436974789916, "grad_norm": 0.2774251103401184, "learning_rate": 4.9001702444466845e-05, "loss": 0.3131, "step": 337 }, { "epoch": 0.18178151260504202, "grad_norm": 0.23491545021533966, "learning_rate": 4.898852636127824e-05, "loss": 0.2718, "step": 338 }, { "epoch": 0.18231932773109244, "grad_norm": 0.26464974880218506, "learning_rate": 4.8975265689593195e-05, "loss": 0.2648, "step": 339 }, { "epoch": 0.18285714285714286, "grad_norm": 0.2524261474609375, "learning_rate": 4.896192047617156e-05, "loss": 0.2531, "step": 340 }, { "epoch": 0.18339495798319327, "grad_norm": 0.2577720582485199, "learning_rate": 4.894849076807132e-05, "loss": 0.2733, "step": 341 }, { "epoch": 0.1839327731092437, "grad_norm": 0.2304072231054306, "learning_rate": 4.89349766126484e-05, "loss": 0.2641, "step": 342 }, { "epoch": 0.1844705882352941, "grad_norm": 0.2800414264202118, "learning_rate": 4.89213780575565e-05, "loss": 0.3031, "step": 343 }, { "epoch": 0.18500840336134453, "grad_norm": 0.2762930393218994, "learning_rate": 4.890769515074693e-05, "loss": 0.3139, "step": 344 }, { "epoch": 0.18554621848739497, "grad_norm": 0.2419080287218094, "learning_rate": 4.8893927940468444e-05, "loss": 0.2641, "step": 345 }, { "epoch": 0.1860840336134454, "grad_norm": 0.24814869463443756, "learning_rate": 4.8880076475267064e-05, "loss": 0.287, "step": 346 }, { "epoch": 0.1866218487394958, "grad_norm": 0.25348711013793945, "learning_rate": 4.886614080398594e-05, "loss": 0.2359, "step": 347 }, { "epoch": 0.18715966386554622, "grad_norm": 0.22392988204956055, "learning_rate": 4.885212097576509e-05, "loss": 0.2736, "step": 348 }, { "epoch": 0.18769747899159664, "grad_norm": 0.22034569084644318, "learning_rate": 4.883801704004135e-05, "loss": 0.2619, "step": 349 }, { "epoch": 0.18823529411764706, "grad_norm": 0.2589433193206787, "learning_rate": 4.88238290465481e-05, "loss": 0.2782, "step": 350 }, { "epoch": 0.18877310924369747, "grad_norm": 0.2166265845298767, "learning_rate": 4.8809557045315135e-05, "loss": 0.2432, "step": 351 }, { "epoch": 0.1893109243697479, "grad_norm": 0.24867162108421326, "learning_rate": 4.879520108666847e-05, "loss": 0.2741, "step": 352 }, { "epoch": 0.1898487394957983, "grad_norm": 0.24785232543945312, "learning_rate": 4.87807612212302e-05, "loss": 0.2619, "step": 353 }, { "epoch": 0.19038655462184875, "grad_norm": 0.24923545122146606, "learning_rate": 4.8766237499918244e-05, "loss": 0.2502, "step": 354 }, { "epoch": 0.19092436974789917, "grad_norm": 0.26826900243759155, "learning_rate": 4.875162997394626e-05, "loss": 0.2793, "step": 355 }, { "epoch": 0.1914621848739496, "grad_norm": 0.26143020391464233, "learning_rate": 4.87369386948234e-05, "loss": 0.2741, "step": 356 }, { "epoch": 0.192, "grad_norm": 0.24153444170951843, "learning_rate": 4.872216371435414e-05, "loss": 0.2383, "step": 357 }, { "epoch": 0.19253781512605042, "grad_norm": 0.233461394906044, "learning_rate": 4.870730508463811e-05, "loss": 0.259, "step": 358 }, { "epoch": 0.19307563025210084, "grad_norm": 0.23585747182369232, "learning_rate": 4.869236285806992e-05, "loss": 0.2533, "step": 359 }, { "epoch": 0.19361344537815126, "grad_norm": 0.2478472888469696, "learning_rate": 4.8677337087338926e-05, "loss": 0.2511, "step": 360 }, { "epoch": 0.19415126050420167, "grad_norm": 0.27451691031455994, "learning_rate": 4.866222782542912e-05, "loss": 0.3086, "step": 361 }, { "epoch": 0.1946890756302521, "grad_norm": 0.24595822393894196, "learning_rate": 4.8647035125618855e-05, "loss": 0.2848, "step": 362 }, { "epoch": 0.19522689075630253, "grad_norm": 0.2632058560848236, "learning_rate": 4.863175904148074e-05, "loss": 0.2599, "step": 363 }, { "epoch": 0.19576470588235295, "grad_norm": 0.23471085727214813, "learning_rate": 4.861639962688141e-05, "loss": 0.2439, "step": 364 }, { "epoch": 0.19630252100840337, "grad_norm": 0.22981446981430054, "learning_rate": 4.860095693598131e-05, "loss": 0.2255, "step": 365 }, { "epoch": 0.19684033613445379, "grad_norm": 0.22001126408576965, "learning_rate": 4.8585431023234584e-05, "loss": 0.246, "step": 366 }, { "epoch": 0.1973781512605042, "grad_norm": 0.25503072142601013, "learning_rate": 4.856982194338878e-05, "loss": 0.2912, "step": 367 }, { "epoch": 0.19791596638655462, "grad_norm": 0.22565297782421112, "learning_rate": 4.855412975148475e-05, "loss": 0.2254, "step": 368 }, { "epoch": 0.19845378151260504, "grad_norm": 0.23742322623729706, "learning_rate": 4.85383545028564e-05, "loss": 0.2609, "step": 369 }, { "epoch": 0.19899159663865545, "grad_norm": 0.22145989537239075, "learning_rate": 4.85224962531305e-05, "loss": 0.2342, "step": 370 }, { "epoch": 0.19952941176470587, "grad_norm": 0.26108118891716003, "learning_rate": 4.850655505822654e-05, "loss": 0.2787, "step": 371 }, { "epoch": 0.2000672268907563, "grad_norm": 0.2858121693134308, "learning_rate": 4.849053097435644e-05, "loss": 0.3284, "step": 372 }, { "epoch": 0.20060504201680673, "grad_norm": 0.2194736748933792, "learning_rate": 4.8474424058024444e-05, "loss": 0.2303, "step": 373 }, { "epoch": 0.20114285714285715, "grad_norm": 0.24387770891189575, "learning_rate": 4.845823436602685e-05, "loss": 0.2772, "step": 374 }, { "epoch": 0.20168067226890757, "grad_norm": 0.21552203595638275, "learning_rate": 4.8441961955451865e-05, "loss": 0.2609, "step": 375 }, { "epoch": 0.20221848739495799, "grad_norm": 0.23177099227905273, "learning_rate": 4.842560688367937e-05, "loss": 0.2772, "step": 376 }, { "epoch": 0.2027563025210084, "grad_norm": 0.244123637676239, "learning_rate": 4.840916920838071e-05, "loss": 0.2652, "step": 377 }, { "epoch": 0.20329411764705882, "grad_norm": 0.240333691239357, "learning_rate": 4.839264898751854e-05, "loss": 0.2611, "step": 378 }, { "epoch": 0.20383193277310924, "grad_norm": 0.24081739783287048, "learning_rate": 4.837604627934655e-05, "loss": 0.2717, "step": 379 }, { "epoch": 0.20436974789915965, "grad_norm": 0.21801015734672546, "learning_rate": 4.835936114240934e-05, "loss": 0.24, "step": 380 }, { "epoch": 0.20490756302521007, "grad_norm": 0.25821009278297424, "learning_rate": 4.834259363554213e-05, "loss": 0.267, "step": 381 }, { "epoch": 0.20544537815126052, "grad_norm": 0.209644615650177, "learning_rate": 4.8325743817870614e-05, "loss": 0.2493, "step": 382 }, { "epoch": 0.20598319327731093, "grad_norm": 0.23463033139705658, "learning_rate": 4.830881174881073e-05, "loss": 0.2529, "step": 383 }, { "epoch": 0.20652100840336135, "grad_norm": 0.24982471764087677, "learning_rate": 4.829179748806846e-05, "loss": 0.3002, "step": 384 }, { "epoch": 0.20705882352941177, "grad_norm": 0.26289087533950806, "learning_rate": 4.827470109563957e-05, "loss": 0.2592, "step": 385 }, { "epoch": 0.20759663865546218, "grad_norm": 0.20384639501571655, "learning_rate": 4.82575226318095e-05, "loss": 0.2477, "step": 386 }, { "epoch": 0.2081344537815126, "grad_norm": 0.2661588788032532, "learning_rate": 4.824026215715304e-05, "loss": 0.2852, "step": 387 }, { "epoch": 0.20867226890756302, "grad_norm": 0.2528963088989258, "learning_rate": 4.8222919732534205e-05, "loss": 0.2603, "step": 388 }, { "epoch": 0.20921008403361344, "grad_norm": 0.2350187748670578, "learning_rate": 4.820549541910595e-05, "loss": 0.2602, "step": 389 }, { "epoch": 0.20974789915966385, "grad_norm": 0.24749146401882172, "learning_rate": 4.8187989278310005e-05, "loss": 0.263, "step": 390 }, { "epoch": 0.2102857142857143, "grad_norm": 0.2626173794269562, "learning_rate": 4.817040137187664e-05, "loss": 0.3036, "step": 391 }, { "epoch": 0.21082352941176472, "grad_norm": 0.25105804204940796, "learning_rate": 4.8152731761824444e-05, "loss": 0.3187, "step": 392 }, { "epoch": 0.21136134453781513, "grad_norm": 0.24493566155433655, "learning_rate": 4.81349805104601e-05, "loss": 0.2856, "step": 393 }, { "epoch": 0.21189915966386555, "grad_norm": 0.23525816202163696, "learning_rate": 4.811714768037821e-05, "loss": 0.2793, "step": 394 }, { "epoch": 0.21243697478991597, "grad_norm": 0.2570784389972687, "learning_rate": 4.809923333446098e-05, "loss": 0.2937, "step": 395 }, { "epoch": 0.21297478991596638, "grad_norm": 0.19580571353435516, "learning_rate": 4.8081237535878116e-05, "loss": 0.2382, "step": 396 }, { "epoch": 0.2135126050420168, "grad_norm": 0.21526862680912018, "learning_rate": 4.806316034808651e-05, "loss": 0.2706, "step": 397 }, { "epoch": 0.21405042016806722, "grad_norm": 0.2531112730503082, "learning_rate": 4.804500183483007e-05, "loss": 0.293, "step": 398 }, { "epoch": 0.21458823529411764, "grad_norm": 0.25029081106185913, "learning_rate": 4.802676206013945e-05, "loss": 0.302, "step": 399 }, { "epoch": 0.21512605042016808, "grad_norm": 0.23896951973438263, "learning_rate": 4.800844108833186e-05, "loss": 0.2814, "step": 400 }, { "epoch": 0.2156638655462185, "grad_norm": 0.23661303520202637, "learning_rate": 4.7990038984010837e-05, "loss": 0.2888, "step": 401 }, { "epoch": 0.21620168067226891, "grad_norm": 0.23976142704486847, "learning_rate": 4.7971555812065994e-05, "loss": 0.2748, "step": 402 }, { "epoch": 0.21673949579831933, "grad_norm": 0.26892197132110596, "learning_rate": 4.795299163767282e-05, "loss": 0.3043, "step": 403 }, { "epoch": 0.21727731092436975, "grad_norm": 0.25174662470817566, "learning_rate": 4.7934346526292414e-05, "loss": 0.2531, "step": 404 }, { "epoch": 0.21781512605042017, "grad_norm": 0.2701696753501892, "learning_rate": 4.79156205436713e-05, "loss": 0.337, "step": 405 }, { "epoch": 0.21835294117647058, "grad_norm": 0.25641125440597534, "learning_rate": 4.789681375584116e-05, "loss": 0.2952, "step": 406 }, { "epoch": 0.218890756302521, "grad_norm": 0.29483330249786377, "learning_rate": 4.78779262291186e-05, "loss": 0.2758, "step": 407 }, { "epoch": 0.21942857142857142, "grad_norm": 0.21791832149028778, "learning_rate": 4.7858958030104935e-05, "loss": 0.2796, "step": 408 }, { "epoch": 0.21996638655462186, "grad_norm": 0.24035575985908508, "learning_rate": 4.7839909225685966e-05, "loss": 0.2346, "step": 409 }, { "epoch": 0.22050420168067228, "grad_norm": 0.22041240334510803, "learning_rate": 4.7820779883031696e-05, "loss": 0.238, "step": 410 }, { "epoch": 0.2210420168067227, "grad_norm": 0.23774303495883942, "learning_rate": 4.780157006959615e-05, "loss": 0.2393, "step": 411 }, { "epoch": 0.2215798319327731, "grad_norm": 0.20871755480766296, "learning_rate": 4.7782279853117084e-05, "loss": 0.2217, "step": 412 }, { "epoch": 0.22211764705882353, "grad_norm": 0.33093753457069397, "learning_rate": 4.776290930161579e-05, "loss": 0.3435, "step": 413 }, { "epoch": 0.22265546218487395, "grad_norm": 0.2343863695859909, "learning_rate": 4.774345848339683e-05, "loss": 0.2687, "step": 414 }, { "epoch": 0.22319327731092437, "grad_norm": 0.2601449489593506, "learning_rate": 4.7723927467047814e-05, "loss": 0.2478, "step": 415 }, { "epoch": 0.22373109243697478, "grad_norm": 0.2045518159866333, "learning_rate": 4.7704316321439124e-05, "loss": 0.223, "step": 416 }, { "epoch": 0.2242689075630252, "grad_norm": 0.25932803750038147, "learning_rate": 4.768462511572371e-05, "loss": 0.2751, "step": 417 }, { "epoch": 0.22480672268907562, "grad_norm": 0.2314649373292923, "learning_rate": 4.7664853919336835e-05, "loss": 0.2868, "step": 418 }, { "epoch": 0.22534453781512606, "grad_norm": 0.2388240098953247, "learning_rate": 4.764500280199581e-05, "loss": 0.2795, "step": 419 }, { "epoch": 0.22588235294117648, "grad_norm": 0.23004719614982605, "learning_rate": 4.7625071833699795e-05, "loss": 0.2327, "step": 420 }, { "epoch": 0.2264201680672269, "grad_norm": 0.2778248190879822, "learning_rate": 4.760506108472947e-05, "loss": 0.3281, "step": 421 }, { "epoch": 0.2269579831932773, "grad_norm": 0.23967599868774414, "learning_rate": 4.7584970625646884e-05, "loss": 0.2661, "step": 422 }, { "epoch": 0.22749579831932773, "grad_norm": 0.23860499262809753, "learning_rate": 4.756480052729514e-05, "loss": 0.2843, "step": 423 }, { "epoch": 0.22803361344537815, "grad_norm": 0.20034274458885193, "learning_rate": 4.7544550860798177e-05, "loss": 0.2269, "step": 424 }, { "epoch": 0.22857142857142856, "grad_norm": 0.23515743017196655, "learning_rate": 4.752422169756048e-05, "loss": 0.2658, "step": 425 }, { "epoch": 0.22910924369747898, "grad_norm": 0.224516823887825, "learning_rate": 4.750381310926689e-05, "loss": 0.2644, "step": 426 }, { "epoch": 0.2296470588235294, "grad_norm": 0.2596588730812073, "learning_rate": 4.748332516788231e-05, "loss": 0.3037, "step": 427 }, { "epoch": 0.23018487394957984, "grad_norm": 0.23460553586483002, "learning_rate": 4.7462757945651426e-05, "loss": 0.2579, "step": 428 }, { "epoch": 0.23072268907563026, "grad_norm": 0.2203722447156906, "learning_rate": 4.744211151509854e-05, "loss": 0.2613, "step": 429 }, { "epoch": 0.23126050420168068, "grad_norm": 0.2298227697610855, "learning_rate": 4.7421385949027205e-05, "loss": 0.2445, "step": 430 }, { "epoch": 0.2317983193277311, "grad_norm": 0.25603994727134705, "learning_rate": 4.7400581320520055e-05, "loss": 0.2458, "step": 431 }, { "epoch": 0.2323361344537815, "grad_norm": 0.262251615524292, "learning_rate": 4.737969770293851e-05, "loss": 0.2954, "step": 432 }, { "epoch": 0.23287394957983193, "grad_norm": 0.2465902715921402, "learning_rate": 4.73587351699225e-05, "loss": 0.259, "step": 433 }, { "epoch": 0.23341176470588235, "grad_norm": 0.24431520700454712, "learning_rate": 4.733769379539026e-05, "loss": 0.2791, "step": 434 }, { "epoch": 0.23394957983193276, "grad_norm": 0.24327078461647034, "learning_rate": 4.731657365353803e-05, "loss": 0.2947, "step": 435 }, { "epoch": 0.23448739495798318, "grad_norm": 0.2557367980480194, "learning_rate": 4.7295374818839764e-05, "loss": 0.2803, "step": 436 }, { "epoch": 0.23502521008403363, "grad_norm": 0.2142622172832489, "learning_rate": 4.727409736604694e-05, "loss": 0.2508, "step": 437 }, { "epoch": 0.23556302521008404, "grad_norm": 0.21318228542804718, "learning_rate": 4.725274137018826e-05, "loss": 0.2504, "step": 438 }, { "epoch": 0.23610084033613446, "grad_norm": 0.21114876866340637, "learning_rate": 4.7231306906569375e-05, "loss": 0.2314, "step": 439 }, { "epoch": 0.23663865546218488, "grad_norm": 0.20804953575134277, "learning_rate": 4.7209794050772605e-05, "loss": 0.2299, "step": 440 }, { "epoch": 0.2371764705882353, "grad_norm": 0.2472643107175827, "learning_rate": 4.718820287865674e-05, "loss": 0.2743, "step": 441 }, { "epoch": 0.2377142857142857, "grad_norm": 0.2286311239004135, "learning_rate": 4.716653346635671e-05, "loss": 0.2591, "step": 442 }, { "epoch": 0.23825210084033613, "grad_norm": 0.23159442842006683, "learning_rate": 4.714478589028333e-05, "loss": 0.2593, "step": 443 }, { "epoch": 0.23878991596638655, "grad_norm": 0.22817808389663696, "learning_rate": 4.712296022712305e-05, "loss": 0.2478, "step": 444 }, { "epoch": 0.23932773109243696, "grad_norm": 0.24384620785713196, "learning_rate": 4.7101056553837665e-05, "loss": 0.3145, "step": 445 }, { "epoch": 0.2398655462184874, "grad_norm": 0.23113542795181274, "learning_rate": 4.7079074947664036e-05, "loss": 0.2788, "step": 446 }, { "epoch": 0.24040336134453782, "grad_norm": 0.25968456268310547, "learning_rate": 4.7057015486113856e-05, "loss": 0.273, "step": 447 }, { "epoch": 0.24094117647058824, "grad_norm": 0.24511998891830444, "learning_rate": 4.703487824697333e-05, "loss": 0.271, "step": 448 }, { "epoch": 0.24147899159663866, "grad_norm": 0.23048298060894012, "learning_rate": 4.7012663308302954e-05, "loss": 0.2895, "step": 449 }, { "epoch": 0.24201680672268908, "grad_norm": 0.25282594561576843, "learning_rate": 4.6990370748437175e-05, "loss": 0.2701, "step": 450 }, { "epoch": 0.2425546218487395, "grad_norm": 0.2358527034521103, "learning_rate": 4.6968000645984156e-05, "loss": 0.2811, "step": 451 }, { "epoch": 0.2430924369747899, "grad_norm": 0.2355646938085556, "learning_rate": 4.694555307982551e-05, "loss": 0.2599, "step": 452 }, { "epoch": 0.24363025210084033, "grad_norm": 0.21464508771896362, "learning_rate": 4.692302812911598e-05, "loss": 0.26, "step": 453 }, { "epoch": 0.24416806722689074, "grad_norm": 0.23232926428318024, "learning_rate": 4.690042587328319e-05, "loss": 0.2532, "step": 454 }, { "epoch": 0.2447058823529412, "grad_norm": 0.2313256710767746, "learning_rate": 4.6877746392027366e-05, "loss": 0.271, "step": 455 }, { "epoch": 0.2452436974789916, "grad_norm": 0.24588103592395782, "learning_rate": 4.685498976532104e-05, "loss": 0.2865, "step": 456 }, { "epoch": 0.24578151260504202, "grad_norm": 0.25031155347824097, "learning_rate": 4.683215607340878e-05, "loss": 0.3119, "step": 457 }, { "epoch": 0.24631932773109244, "grad_norm": 0.23389014601707458, "learning_rate": 4.680924539680689e-05, "loss": 0.3226, "step": 458 }, { "epoch": 0.24685714285714286, "grad_norm": 0.27950188517570496, "learning_rate": 4.678625781630315e-05, "loss": 0.3086, "step": 459 }, { "epoch": 0.24739495798319328, "grad_norm": 0.20302145183086395, "learning_rate": 4.676319341295652e-05, "loss": 0.2295, "step": 460 }, { "epoch": 0.2479327731092437, "grad_norm": 0.21615168452262878, "learning_rate": 4.674005226809684e-05, "loss": 0.2513, "step": 461 }, { "epoch": 0.2484705882352941, "grad_norm": 0.26412224769592285, "learning_rate": 4.671683446332457e-05, "loss": 0.3067, "step": 462 }, { "epoch": 0.24900840336134453, "grad_norm": 0.2427605390548706, "learning_rate": 4.669354008051047e-05, "loss": 0.2939, "step": 463 }, { "epoch": 0.24954621848739497, "grad_norm": 0.24122793972492218, "learning_rate": 4.6670169201795355e-05, "loss": 0.3041, "step": 464 }, { "epoch": 0.2500840336134454, "grad_norm": 0.23274767398834229, "learning_rate": 4.664672190958977e-05, "loss": 0.2465, "step": 465 }, { "epoch": 0.2506218487394958, "grad_norm": 0.21548829972743988, "learning_rate": 4.662319828657371e-05, "loss": 0.2346, "step": 466 }, { "epoch": 0.2511596638655462, "grad_norm": 0.2341853380203247, "learning_rate": 4.659959841569631e-05, "loss": 0.2715, "step": 467 }, { "epoch": 0.2516974789915966, "grad_norm": 0.2317124456167221, "learning_rate": 4.6575922380175596e-05, "loss": 0.2702, "step": 468 }, { "epoch": 0.25223529411764706, "grad_norm": 0.2621871829032898, "learning_rate": 4.655217026349817e-05, "loss": 0.3183, "step": 469 }, { "epoch": 0.2527731092436975, "grad_norm": 0.23459269106388092, "learning_rate": 4.6528342149418876e-05, "loss": 0.2662, "step": 470 }, { "epoch": 0.2533109243697479, "grad_norm": 0.24387896060943604, "learning_rate": 4.650443812196058e-05, "loss": 0.2733, "step": 471 }, { "epoch": 0.25384873949579834, "grad_norm": 0.22349336743354797, "learning_rate": 4.648045826541382e-05, "loss": 0.2205, "step": 472 }, { "epoch": 0.2543865546218487, "grad_norm": 0.2531421184539795, "learning_rate": 4.645640266433651e-05, "loss": 0.2917, "step": 473 }, { "epoch": 0.25492436974789917, "grad_norm": 0.2512775957584381, "learning_rate": 4.643227140355366e-05, "loss": 0.2713, "step": 474 }, { "epoch": 0.25546218487394956, "grad_norm": 0.24001802504062653, "learning_rate": 4.64080645681571e-05, "loss": 0.2734, "step": 475 }, { "epoch": 0.256, "grad_norm": 0.22391103208065033, "learning_rate": 4.638378224350511e-05, "loss": 0.2432, "step": 476 }, { "epoch": 0.2565378151260504, "grad_norm": 0.2442803829908371, "learning_rate": 4.635942451522219e-05, "loss": 0.2506, "step": 477 }, { "epoch": 0.25707563025210084, "grad_norm": 0.24446138739585876, "learning_rate": 4.6334991469198714e-05, "loss": 0.2716, "step": 478 }, { "epoch": 0.2576134453781513, "grad_norm": 0.2534659802913666, "learning_rate": 4.631048319159065e-05, "loss": 0.3181, "step": 479 }, { "epoch": 0.2581512605042017, "grad_norm": 0.23951131105422974, "learning_rate": 4.628589976881923e-05, "loss": 0.2685, "step": 480 }, { "epoch": 0.2586890756302521, "grad_norm": 0.24787181615829468, "learning_rate": 4.6261241287570703e-05, "loss": 0.295, "step": 481 }, { "epoch": 0.2592268907563025, "grad_norm": 0.2636100947856903, "learning_rate": 4.623650783479595e-05, "loss": 0.2771, "step": 482 }, { "epoch": 0.25976470588235295, "grad_norm": 0.20473726093769073, "learning_rate": 4.6211699497710225e-05, "loss": 0.2338, "step": 483 }, { "epoch": 0.26030252100840334, "grad_norm": 0.24425745010375977, "learning_rate": 4.6186816363792865e-05, "loss": 0.2867, "step": 484 }, { "epoch": 0.2608403361344538, "grad_norm": 0.2235567420721054, "learning_rate": 4.6161858520786926e-05, "loss": 0.2493, "step": 485 }, { "epoch": 0.2613781512605042, "grad_norm": 0.23400655388832092, "learning_rate": 4.6136826056698915e-05, "loss": 0.2639, "step": 486 }, { "epoch": 0.2619159663865546, "grad_norm": 0.22992226481437683, "learning_rate": 4.6111719059798466e-05, "loss": 0.2564, "step": 487 }, { "epoch": 0.26245378151260507, "grad_norm": 0.21632078289985657, "learning_rate": 4.608653761861804e-05, "loss": 0.2499, "step": 488 }, { "epoch": 0.26299159663865546, "grad_norm": 0.2274123877286911, "learning_rate": 4.606128182195259e-05, "loss": 0.266, "step": 489 }, { "epoch": 0.2635294117647059, "grad_norm": 0.25762587785720825, "learning_rate": 4.603595175885929e-05, "loss": 0.2974, "step": 490 }, { "epoch": 0.2640672268907563, "grad_norm": 0.2280481606721878, "learning_rate": 4.601054751865714e-05, "loss": 0.242, "step": 491 }, { "epoch": 0.26460504201680674, "grad_norm": 0.22591246664524078, "learning_rate": 4.598506919092676e-05, "loss": 0.2483, "step": 492 }, { "epoch": 0.2651428571428571, "grad_norm": 0.23084098100662231, "learning_rate": 4.595951686550998e-05, "loss": 0.2567, "step": 493 }, { "epoch": 0.26568067226890757, "grad_norm": 0.2260267585515976, "learning_rate": 4.593389063250958e-05, "loss": 0.2592, "step": 494 }, { "epoch": 0.26621848739495796, "grad_norm": 0.24005655944347382, "learning_rate": 4.590819058228896e-05, "loss": 0.2819, "step": 495 }, { "epoch": 0.2667563025210084, "grad_norm": 0.23080764710903168, "learning_rate": 4.5882416805471775e-05, "loss": 0.2682, "step": 496 }, { "epoch": 0.26729411764705885, "grad_norm": 0.2611956000328064, "learning_rate": 4.585656939294171e-05, "loss": 0.2586, "step": 497 }, { "epoch": 0.26783193277310924, "grad_norm": 0.21191535890102386, "learning_rate": 4.5830648435842056e-05, "loss": 0.2232, "step": 498 }, { "epoch": 0.2683697478991597, "grad_norm": 0.18608246743679047, "learning_rate": 4.5804654025575475e-05, "loss": 0.2139, "step": 499 }, { "epoch": 0.2689075630252101, "grad_norm": 0.23022042214870453, "learning_rate": 4.577858625380362e-05, "loss": 0.3, "step": 500 }, { "epoch": 0.2694453781512605, "grad_norm": 0.24489882588386536, "learning_rate": 4.5752445212446836e-05, "loss": 0.2823, "step": 501 }, { "epoch": 0.2699831932773109, "grad_norm": 0.21551187336444855, "learning_rate": 4.572623099368384e-05, "loss": 0.2391, "step": 502 }, { "epoch": 0.27052100840336135, "grad_norm": 0.23923662304878235, "learning_rate": 4.5699943689951355e-05, "loss": 0.2901, "step": 503 }, { "epoch": 0.27105882352941174, "grad_norm": 0.2115108072757721, "learning_rate": 4.5673583393943866e-05, "loss": 0.2319, "step": 504 }, { "epoch": 0.2715966386554622, "grad_norm": 0.1953652948141098, "learning_rate": 4.564715019861321e-05, "loss": 0.2544, "step": 505 }, { "epoch": 0.27213445378151263, "grad_norm": 0.25083816051483154, "learning_rate": 4.5620644197168286e-05, "loss": 0.2897, "step": 506 }, { "epoch": 0.272672268907563, "grad_norm": 0.24621397256851196, "learning_rate": 4.559406548307473e-05, "loss": 0.3167, "step": 507 }, { "epoch": 0.27321008403361347, "grad_norm": 0.20257635414600372, "learning_rate": 4.556741415005459e-05, "loss": 0.222, "step": 508 }, { "epoch": 0.27374789915966385, "grad_norm": 0.2511012554168701, "learning_rate": 4.5540690292085944e-05, "loss": 0.3315, "step": 509 }, { "epoch": 0.2742857142857143, "grad_norm": 0.2311190813779831, "learning_rate": 4.551389400340265e-05, "loss": 0.2583, "step": 510 }, { "epoch": 0.2748235294117647, "grad_norm": 0.26484665274620056, "learning_rate": 4.548702537849394e-05, "loss": 0.281, "step": 511 }, { "epoch": 0.27536134453781513, "grad_norm": 0.2264309674501419, "learning_rate": 4.546008451210414e-05, "loss": 0.2815, "step": 512 }, { "epoch": 0.2758991596638655, "grad_norm": 0.23666879534721375, "learning_rate": 4.543307149923231e-05, "loss": 0.2595, "step": 513 }, { "epoch": 0.27643697478991597, "grad_norm": 0.2525842785835266, "learning_rate": 4.540598643513191e-05, "loss": 0.2966, "step": 514 }, { "epoch": 0.2769747899159664, "grad_norm": 0.25806301832199097, "learning_rate": 4.5378829415310465e-05, "loss": 0.2918, "step": 515 }, { "epoch": 0.2775126050420168, "grad_norm": 0.2552969455718994, "learning_rate": 4.535160053552924e-05, "loss": 0.294, "step": 516 }, { "epoch": 0.27805042016806725, "grad_norm": 0.20627938210964203, "learning_rate": 4.5324299891802867e-05, "loss": 0.2399, "step": 517 }, { "epoch": 0.27858823529411764, "grad_norm": 0.22996152937412262, "learning_rate": 4.529692758039908e-05, "loss": 0.2485, "step": 518 }, { "epoch": 0.2791260504201681, "grad_norm": 0.2616472542285919, "learning_rate": 4.5269483697838276e-05, "loss": 0.2984, "step": 519 }, { "epoch": 0.27966386554621847, "grad_norm": 0.2364635169506073, "learning_rate": 4.524196834089326e-05, "loss": 0.2756, "step": 520 }, { "epoch": 0.2802016806722689, "grad_norm": 0.24075715243816376, "learning_rate": 4.521438160658887e-05, "loss": 0.2988, "step": 521 }, { "epoch": 0.2807394957983193, "grad_norm": 0.22384031116962433, "learning_rate": 4.518672359220161e-05, "loss": 0.2532, "step": 522 }, { "epoch": 0.28127731092436975, "grad_norm": 0.22723133862018585, "learning_rate": 4.5158994395259356e-05, "loss": 0.2491, "step": 523 }, { "epoch": 0.2818151260504202, "grad_norm": 0.21520480513572693, "learning_rate": 4.5131194113540995e-05, "loss": 0.2662, "step": 524 }, { "epoch": 0.2823529411764706, "grad_norm": 0.2153734713792801, "learning_rate": 4.5103322845076036e-05, "loss": 0.2487, "step": 525 }, { "epoch": 0.28289075630252103, "grad_norm": 0.23241375386714935, "learning_rate": 4.507538068814434e-05, "loss": 0.2687, "step": 526 }, { "epoch": 0.2834285714285714, "grad_norm": 0.26121076941490173, "learning_rate": 4.5047367741275715e-05, "loss": 0.3061, "step": 527 }, { "epoch": 0.28396638655462186, "grad_norm": 0.26131686568260193, "learning_rate": 4.5019284103249604e-05, "loss": 0.3332, "step": 528 }, { "epoch": 0.28450420168067225, "grad_norm": 0.2623176574707031, "learning_rate": 4.49911298730947e-05, "loss": 0.2923, "step": 529 }, { "epoch": 0.2850420168067227, "grad_norm": 0.22432950139045715, "learning_rate": 4.4962905150088644e-05, "loss": 0.2713, "step": 530 }, { "epoch": 0.2855798319327731, "grad_norm": 0.21896281838417053, "learning_rate": 4.493461003375762e-05, "loss": 0.2522, "step": 531 }, { "epoch": 0.28611764705882353, "grad_norm": 0.23661963641643524, "learning_rate": 4.490624462387607e-05, "loss": 0.2554, "step": 532 }, { "epoch": 0.286655462184874, "grad_norm": 0.2241566926240921, "learning_rate": 4.4877809020466265e-05, "loss": 0.246, "step": 533 }, { "epoch": 0.28719327731092437, "grad_norm": 0.23797310888767242, "learning_rate": 4.484930332379803e-05, "loss": 0.2758, "step": 534 }, { "epoch": 0.2877310924369748, "grad_norm": 0.2378690540790558, "learning_rate": 4.4820727634388324e-05, "loss": 0.297, "step": 535 }, { "epoch": 0.2882689075630252, "grad_norm": 0.27658143639564514, "learning_rate": 4.479208205300094e-05, "loss": 0.3039, "step": 536 }, { "epoch": 0.28880672268907565, "grad_norm": 0.23069649934768677, "learning_rate": 4.4763366680646104e-05, "loss": 0.2708, "step": 537 }, { "epoch": 0.28934453781512603, "grad_norm": 0.23312151432037354, "learning_rate": 4.473458161858015e-05, "loss": 0.3019, "step": 538 }, { "epoch": 0.2898823529411765, "grad_norm": 0.22876974940299988, "learning_rate": 4.470572696830515e-05, "loss": 0.2672, "step": 539 }, { "epoch": 0.29042016806722687, "grad_norm": 0.22691677510738373, "learning_rate": 4.467680283156854e-05, "loss": 0.2442, "step": 540 }, { "epoch": 0.2909579831932773, "grad_norm": 0.2252989113330841, "learning_rate": 4.4647809310362834e-05, "loss": 0.2782, "step": 541 }, { "epoch": 0.2914957983193277, "grad_norm": 0.24823607504367828, "learning_rate": 4.461874650692514e-05, "loss": 0.2935, "step": 542 }, { "epoch": 0.29203361344537815, "grad_norm": 0.2391696274280548, "learning_rate": 4.458961452373692e-05, "loss": 0.2711, "step": 543 }, { "epoch": 0.2925714285714286, "grad_norm": 0.23067960143089294, "learning_rate": 4.456041346352356e-05, "loss": 0.2485, "step": 544 }, { "epoch": 0.293109243697479, "grad_norm": 0.22453716397285461, "learning_rate": 4.453114342925402e-05, "loss": 0.3041, "step": 545 }, { "epoch": 0.2936470588235294, "grad_norm": 0.25891563296318054, "learning_rate": 4.450180452414049e-05, "loss": 0.263, "step": 546 }, { "epoch": 0.2941848739495798, "grad_norm": 0.20902924239635468, "learning_rate": 4.447239685163801e-05, "loss": 0.2292, "step": 547 }, { "epoch": 0.29472268907563026, "grad_norm": 0.23664672672748566, "learning_rate": 4.4442920515444094e-05, "loss": 0.2704, "step": 548 }, { "epoch": 0.29526050420168065, "grad_norm": 0.201388880610466, "learning_rate": 4.4413375619498385e-05, "loss": 0.2239, "step": 549 }, { "epoch": 0.2957983193277311, "grad_norm": 0.21629491448402405, "learning_rate": 4.438376226798231e-05, "loss": 0.2364, "step": 550 }, { "epoch": 0.2963361344537815, "grad_norm": 0.27691522240638733, "learning_rate": 4.435408056531864e-05, "loss": 0.3032, "step": 551 }, { "epoch": 0.29687394957983193, "grad_norm": 0.19841383397579193, "learning_rate": 4.432433061617119e-05, "loss": 0.2466, "step": 552 }, { "epoch": 0.2974117647058824, "grad_norm": 0.252935528755188, "learning_rate": 4.429451252544442e-05, "loss": 0.2804, "step": 553 }, { "epoch": 0.29794957983193276, "grad_norm": 0.23959507048130035, "learning_rate": 4.4264626398283085e-05, "loss": 0.2757, "step": 554 }, { "epoch": 0.2984873949579832, "grad_norm": 0.22644482553005219, "learning_rate": 4.4234672340071824e-05, "loss": 0.2568, "step": 555 }, { "epoch": 0.2990252100840336, "grad_norm": 0.24373210966587067, "learning_rate": 4.420465045643485e-05, "loss": 0.2497, "step": 556 }, { "epoch": 0.29956302521008404, "grad_norm": 0.23361137509346008, "learning_rate": 4.4174560853235505e-05, "loss": 0.2557, "step": 557 }, { "epoch": 0.30010084033613443, "grad_norm": 0.23983043432235718, "learning_rate": 4.414440363657595e-05, "loss": 0.258, "step": 558 }, { "epoch": 0.3006386554621849, "grad_norm": 0.2104366570711136, "learning_rate": 4.411417891279675e-05, "loss": 0.2216, "step": 559 }, { "epoch": 0.30117647058823527, "grad_norm": 0.20856517553329468, "learning_rate": 4.408388678847654e-05, "loss": 0.251, "step": 560 }, { "epoch": 0.3017142857142857, "grad_norm": 0.23048540949821472, "learning_rate": 4.405352737043158e-05, "loss": 0.2622, "step": 561 }, { "epoch": 0.30225210084033616, "grad_norm": 0.2341282069683075, "learning_rate": 4.4023100765715455e-05, "loss": 0.251, "step": 562 }, { "epoch": 0.30278991596638655, "grad_norm": 0.21298907697200775, "learning_rate": 4.399260708161866e-05, "loss": 0.2571, "step": 563 }, { "epoch": 0.303327731092437, "grad_norm": 0.2516660988330841, "learning_rate": 4.396204642566821e-05, "loss": 0.2818, "step": 564 }, { "epoch": 0.3038655462184874, "grad_norm": 0.21346484124660492, "learning_rate": 4.3931418905627305e-05, "loss": 0.2239, "step": 565 }, { "epoch": 0.3044033613445378, "grad_norm": 0.23531799018383026, "learning_rate": 4.390072462949489e-05, "loss": 0.2745, "step": 566 }, { "epoch": 0.3049411764705882, "grad_norm": 0.22887246310710907, "learning_rate": 4.3869963705505326e-05, "loss": 0.2597, "step": 567 }, { "epoch": 0.30547899159663866, "grad_norm": 0.23474571108818054, "learning_rate": 4.3839136242127977e-05, "loss": 0.313, "step": 568 }, { "epoch": 0.30601680672268905, "grad_norm": 0.22880013287067413, "learning_rate": 4.3808242348066844e-05, "loss": 0.2723, "step": 569 }, { "epoch": 0.3065546218487395, "grad_norm": 0.2300909012556076, "learning_rate": 4.377728213226017e-05, "loss": 0.2488, "step": 570 }, { "epoch": 0.30709243697478994, "grad_norm": 0.24805885553359985, "learning_rate": 4.374625570388008e-05, "loss": 0.338, "step": 571 }, { "epoch": 0.30763025210084033, "grad_norm": 0.24362607300281525, "learning_rate": 4.3715163172332156e-05, "loss": 0.2721, "step": 572 }, { "epoch": 0.3081680672268908, "grad_norm": 0.21831436455249786, "learning_rate": 4.368400464725509e-05, "loss": 0.2425, "step": 573 }, { "epoch": 0.30870588235294116, "grad_norm": 0.21941527724266052, "learning_rate": 4.365278023852027e-05, "loss": 0.268, "step": 574 }, { "epoch": 0.3092436974789916, "grad_norm": 0.2200327217578888, "learning_rate": 4.3621490056231416e-05, "loss": 0.2485, "step": 575 }, { "epoch": 0.309781512605042, "grad_norm": 0.23773083090782166, "learning_rate": 4.359013421072415e-05, "loss": 0.2845, "step": 576 }, { "epoch": 0.31031932773109244, "grad_norm": 0.21691930294036865, "learning_rate": 4.3558712812565686e-05, "loss": 0.2583, "step": 577 }, { "epoch": 0.31085714285714283, "grad_norm": 0.21579815447330475, "learning_rate": 4.352722597255434e-05, "loss": 0.2745, "step": 578 }, { "epoch": 0.3113949579831933, "grad_norm": 0.21122020483016968, "learning_rate": 4.3495673801719216e-05, "loss": 0.2502, "step": 579 }, { "epoch": 0.3119327731092437, "grad_norm": 0.22288072109222412, "learning_rate": 4.3464056411319784e-05, "loss": 0.2465, "step": 580 }, { "epoch": 0.3124705882352941, "grad_norm": 0.23311074078083038, "learning_rate": 4.343237391284548e-05, "loss": 0.2677, "step": 581 }, { "epoch": 0.31300840336134456, "grad_norm": 0.19925762712955475, "learning_rate": 4.340062641801535e-05, "loss": 0.2223, "step": 582 }, { "epoch": 0.31354621848739495, "grad_norm": 0.241774782538414, "learning_rate": 4.3368814038777604e-05, "loss": 0.2817, "step": 583 }, { "epoch": 0.3140840336134454, "grad_norm": 0.20146672427654266, "learning_rate": 4.333693688730926e-05, "loss": 0.2273, "step": 584 }, { "epoch": 0.3146218487394958, "grad_norm": 0.24930235743522644, "learning_rate": 4.330499507601575e-05, "loss": 0.3094, "step": 585 }, { "epoch": 0.3151596638655462, "grad_norm": 0.17968884110450745, "learning_rate": 4.3272988717530484e-05, "loss": 0.2194, "step": 586 }, { "epoch": 0.3156974789915966, "grad_norm": 0.21017666161060333, "learning_rate": 4.324091792471448e-05, "loss": 0.2806, "step": 587 }, { "epoch": 0.31623529411764706, "grad_norm": 0.21745620667934418, "learning_rate": 4.320878281065598e-05, "loss": 0.2739, "step": 588 }, { "epoch": 0.3167731092436975, "grad_norm": 0.20234698057174683, "learning_rate": 4.317658348867005e-05, "loss": 0.2572, "step": 589 }, { "epoch": 0.3173109243697479, "grad_norm": 0.22659555077552795, "learning_rate": 4.314432007229812e-05, "loss": 0.2684, "step": 590 }, { "epoch": 0.31784873949579834, "grad_norm": 0.26212990283966064, "learning_rate": 4.3111992675307664e-05, "loss": 0.2723, "step": 591 }, { "epoch": 0.3183865546218487, "grad_norm": 0.2004905343055725, "learning_rate": 4.3079601411691775e-05, "loss": 0.2272, "step": 592 }, { "epoch": 0.3189243697478992, "grad_norm": 0.20839561522006989, "learning_rate": 4.3047146395668716e-05, "loss": 0.2337, "step": 593 }, { "epoch": 0.31946218487394956, "grad_norm": 0.23353835940361023, "learning_rate": 4.301462774168158e-05, "loss": 0.2516, "step": 594 }, { "epoch": 0.32, "grad_norm": 0.23760321736335754, "learning_rate": 4.298204556439785e-05, "loss": 0.2633, "step": 595 }, { "epoch": 0.3205378151260504, "grad_norm": 0.24506376683712006, "learning_rate": 4.2949399978709026e-05, "loss": 0.3018, "step": 596 }, { "epoch": 0.32107563025210084, "grad_norm": 0.21284717321395874, "learning_rate": 4.2916691099730165e-05, "loss": 0.2445, "step": 597 }, { "epoch": 0.3216134453781513, "grad_norm": 0.2108844816684723, "learning_rate": 4.288391904279954e-05, "loss": 0.247, "step": 598 }, { "epoch": 0.3221512605042017, "grad_norm": 0.21289974451065063, "learning_rate": 4.2851083923478186e-05, "loss": 0.2446, "step": 599 }, { "epoch": 0.3226890756302521, "grad_norm": 0.2509615123271942, "learning_rate": 4.28181858575495e-05, "loss": 0.299, "step": 600 }, { "epoch": 0.3232268907563025, "grad_norm": 0.2101736217737198, "learning_rate": 4.278522496101887e-05, "loss": 0.2405, "step": 601 }, { "epoch": 0.32376470588235295, "grad_norm": 0.2126046121120453, "learning_rate": 4.275220135011322e-05, "loss": 0.2385, "step": 602 }, { "epoch": 0.32430252100840334, "grad_norm": 0.25800108909606934, "learning_rate": 4.271911514128061e-05, "loss": 0.2752, "step": 603 }, { "epoch": 0.3248403361344538, "grad_norm": 0.24176011979579926, "learning_rate": 4.268596645118986e-05, "loss": 0.2663, "step": 604 }, { "epoch": 0.3253781512605042, "grad_norm": 0.19101281464099884, "learning_rate": 4.2652755396730076e-05, "loss": 0.2198, "step": 605 }, { "epoch": 0.3259159663865546, "grad_norm": 0.25465914607048035, "learning_rate": 4.26194820950103e-05, "loss": 0.2616, "step": 606 }, { "epoch": 0.32645378151260507, "grad_norm": 0.2296343445777893, "learning_rate": 4.258614666335907e-05, "loss": 0.2506, "step": 607 }, { "epoch": 0.32699159663865546, "grad_norm": 0.22720451653003693, "learning_rate": 4.2552749219323993e-05, "loss": 0.2324, "step": 608 }, { "epoch": 0.3275294117647059, "grad_norm": 0.21041442453861237, "learning_rate": 4.251928988067135e-05, "loss": 0.2363, "step": 609 }, { "epoch": 0.3280672268907563, "grad_norm": 0.23050595819950104, "learning_rate": 4.248576876538568e-05, "loss": 0.249, "step": 610 }, { "epoch": 0.32860504201680674, "grad_norm": 0.22091218829154968, "learning_rate": 4.245218599166937e-05, "loss": 0.2566, "step": 611 }, { "epoch": 0.3291428571428571, "grad_norm": 0.22190944850444794, "learning_rate": 4.2418541677942185e-05, "loss": 0.2359, "step": 612 }, { "epoch": 0.32968067226890757, "grad_norm": 0.24113474786281586, "learning_rate": 4.238483594284094e-05, "loss": 0.2773, "step": 613 }, { "epoch": 0.33021848739495796, "grad_norm": 0.23700454831123352, "learning_rate": 4.235106890521901e-05, "loss": 0.2709, "step": 614 }, { "epoch": 0.3307563025210084, "grad_norm": 0.24181799590587616, "learning_rate": 4.231724068414594e-05, "loss": 0.2592, "step": 615 }, { "epoch": 0.33129411764705885, "grad_norm": 0.22947299480438232, "learning_rate": 4.228335139890703e-05, "loss": 0.294, "step": 616 }, { "epoch": 0.33183193277310924, "grad_norm": 0.23711158335208893, "learning_rate": 4.224940116900289e-05, "loss": 0.2711, "step": 617 }, { "epoch": 0.3323697478991597, "grad_norm": 0.19718888401985168, "learning_rate": 4.221539011414904e-05, "loss": 0.2418, "step": 618 }, { "epoch": 0.3329075630252101, "grad_norm": 0.22448423504829407, "learning_rate": 4.218131835427548e-05, "loss": 0.2565, "step": 619 }, { "epoch": 0.3334453781512605, "grad_norm": 0.2309676706790924, "learning_rate": 4.214718600952627e-05, "loss": 0.267, "step": 620 }, { "epoch": 0.3339831932773109, "grad_norm": 0.22953958809375763, "learning_rate": 4.2112993200259096e-05, "loss": 0.2431, "step": 621 }, { "epoch": 0.33452100840336135, "grad_norm": 0.2264927625656128, "learning_rate": 4.2078740047044884e-05, "loss": 0.271, "step": 622 }, { "epoch": 0.33505882352941174, "grad_norm": 0.24121220409870148, "learning_rate": 4.204442667066731e-05, "loss": 0.2776, "step": 623 }, { "epoch": 0.3355966386554622, "grad_norm": 0.2514519989490509, "learning_rate": 4.2010053192122425e-05, "loss": 0.2725, "step": 624 }, { "epoch": 0.33613445378151263, "grad_norm": 0.24246841669082642, "learning_rate": 4.19756197326182e-05, "loss": 0.2848, "step": 625 }, { "epoch": 0.336672268907563, "grad_norm": 0.19577960669994354, "learning_rate": 4.194112641357414e-05, "loss": 0.205, "step": 626 }, { "epoch": 0.33721008403361347, "grad_norm": 0.21389280259609222, "learning_rate": 4.1906573356620795e-05, "loss": 0.2497, "step": 627 }, { "epoch": 0.33774789915966386, "grad_norm": 0.2255650758743286, "learning_rate": 4.1871960683599396e-05, "loss": 0.25, "step": 628 }, { "epoch": 0.3382857142857143, "grad_norm": 0.21241484582424164, "learning_rate": 4.1837288516561356e-05, "loss": 0.2441, "step": 629 }, { "epoch": 0.3388235294117647, "grad_norm": 0.2422211468219757, "learning_rate": 4.180255697776791e-05, "loss": 0.2686, "step": 630 }, { "epoch": 0.33936134453781513, "grad_norm": 0.22701707482337952, "learning_rate": 4.1767766189689604e-05, "loss": 0.2614, "step": 631 }, { "epoch": 0.3398991596638655, "grad_norm": 0.22917112708091736, "learning_rate": 4.1732916275005976e-05, "loss": 0.2804, "step": 632 }, { "epoch": 0.34043697478991597, "grad_norm": 0.2061920017004013, "learning_rate": 4.1698007356604996e-05, "loss": 0.2446, "step": 633 }, { "epoch": 0.3409747899159664, "grad_norm": 0.22303032875061035, "learning_rate": 4.1663039557582725e-05, "loss": 0.247, "step": 634 }, { "epoch": 0.3415126050420168, "grad_norm": 0.21316547691822052, "learning_rate": 4.162801300124285e-05, "loss": 0.2395, "step": 635 }, { "epoch": 0.34205042016806725, "grad_norm": 0.23198041319847107, "learning_rate": 4.1592927811096226e-05, "loss": 0.2831, "step": 636 }, { "epoch": 0.34258823529411764, "grad_norm": 0.22692051529884338, "learning_rate": 4.155778411086048e-05, "loss": 0.2775, "step": 637 }, { "epoch": 0.3431260504201681, "grad_norm": 0.22588621079921722, "learning_rate": 4.152258202445956e-05, "loss": 0.26, "step": 638 }, { "epoch": 0.34366386554621847, "grad_norm": 0.2259419560432434, "learning_rate": 4.1487321676023286e-05, "loss": 0.2261, "step": 639 }, { "epoch": 0.3442016806722689, "grad_norm": 0.23023352026939392, "learning_rate": 4.1452003189886935e-05, "loss": 0.2785, "step": 640 }, { "epoch": 0.3447394957983193, "grad_norm": 0.22596915066242218, "learning_rate": 4.141662669059076e-05, "loss": 0.2607, "step": 641 }, { "epoch": 0.34527731092436975, "grad_norm": 0.20227885246276855, "learning_rate": 4.138119230287962e-05, "loss": 0.2121, "step": 642 }, { "epoch": 0.34581512605042014, "grad_norm": 0.22664810717105865, "learning_rate": 4.134570015170248e-05, "loss": 0.2425, "step": 643 }, { "epoch": 0.3463529411764706, "grad_norm": 0.2297787368297577, "learning_rate": 4.131015036221198e-05, "loss": 0.248, "step": 644 }, { "epoch": 0.34689075630252103, "grad_norm": 0.2577529847621918, "learning_rate": 4.127454305976404e-05, "loss": 0.2918, "step": 645 }, { "epoch": 0.3474285714285714, "grad_norm": 0.19339817762374878, "learning_rate": 4.123887836991734e-05, "loss": 0.2096, "step": 646 }, { "epoch": 0.34796638655462186, "grad_norm": 0.23412401974201202, "learning_rate": 4.120315641843294e-05, "loss": 0.2701, "step": 647 }, { "epoch": 0.34850420168067225, "grad_norm": 0.23786845803260803, "learning_rate": 4.1167377331273825e-05, "loss": 0.2698, "step": 648 }, { "epoch": 0.3490420168067227, "grad_norm": 0.22617590427398682, "learning_rate": 4.113154123460444e-05, "loss": 0.2606, "step": 649 }, { "epoch": 0.3495798319327731, "grad_norm": 0.23543404042720795, "learning_rate": 4.109564825479025e-05, "loss": 0.2592, "step": 650 }, { "epoch": 0.35011764705882353, "grad_norm": 0.23986509442329407, "learning_rate": 4.105969851839733e-05, "loss": 0.2779, "step": 651 }, { "epoch": 0.3506554621848739, "grad_norm": 0.19377322494983673, "learning_rate": 4.102369215219186e-05, "loss": 0.2269, "step": 652 }, { "epoch": 0.35119327731092437, "grad_norm": 0.2443757802248001, "learning_rate": 4.098762928313972e-05, "loss": 0.2612, "step": 653 }, { "epoch": 0.3517310924369748, "grad_norm": 0.2074122577905655, "learning_rate": 4.095151003840604e-05, "loss": 0.2058, "step": 654 }, { "epoch": 0.3522689075630252, "grad_norm": 0.22986437380313873, "learning_rate": 4.0915334545354734e-05, "loss": 0.2547, "step": 655 }, { "epoch": 0.35280672268907565, "grad_norm": 0.21097266674041748, "learning_rate": 4.087910293154805e-05, "loss": 0.2615, "step": 656 }, { "epoch": 0.35334453781512604, "grad_norm": 0.21959158778190613, "learning_rate": 4.084281532474614e-05, "loss": 0.2604, "step": 657 }, { "epoch": 0.3538823529411765, "grad_norm": 0.245167076587677, "learning_rate": 4.080647185290661e-05, "loss": 0.2573, "step": 658 }, { "epoch": 0.35442016806722687, "grad_norm": 0.23227927088737488, "learning_rate": 4.077007264418403e-05, "loss": 0.2531, "step": 659 }, { "epoch": 0.3549579831932773, "grad_norm": 0.20602072775363922, "learning_rate": 4.073361782692953e-05, "loss": 0.2464, "step": 660 }, { "epoch": 0.3554957983193277, "grad_norm": 0.20451411604881287, "learning_rate": 4.069710752969031e-05, "loss": 0.2293, "step": 661 }, { "epoch": 0.35603361344537815, "grad_norm": 0.24936050176620483, "learning_rate": 4.066054188120924e-05, "loss": 0.267, "step": 662 }, { "epoch": 0.3565714285714286, "grad_norm": 0.24026912450790405, "learning_rate": 4.062392101042434e-05, "loss": 0.2843, "step": 663 }, { "epoch": 0.357109243697479, "grad_norm": 0.21311888098716736, "learning_rate": 4.058724504646834e-05, "loss": 0.2549, "step": 664 }, { "epoch": 0.35764705882352943, "grad_norm": 0.23989063501358032, "learning_rate": 4.055051411866828e-05, "loss": 0.2687, "step": 665 }, { "epoch": 0.3581848739495798, "grad_norm": 0.20522049069404602, "learning_rate": 4.051372835654499e-05, "loss": 0.2214, "step": 666 }, { "epoch": 0.35872268907563026, "grad_norm": 0.23206378519535065, "learning_rate": 4.047688788981267e-05, "loss": 0.2823, "step": 667 }, { "epoch": 0.35926050420168065, "grad_norm": 0.21977598965168, "learning_rate": 4.0439992848378385e-05, "loss": 0.2413, "step": 668 }, { "epoch": 0.3597983193277311, "grad_norm": 0.22605201601982117, "learning_rate": 4.04030433623417e-05, "loss": 0.2396, "step": 669 }, { "epoch": 0.3603361344537815, "grad_norm": 0.2182280272245407, "learning_rate": 4.036603956199411e-05, "loss": 0.2595, "step": 670 }, { "epoch": 0.36087394957983193, "grad_norm": 0.19659273326396942, "learning_rate": 4.032898157781866e-05, "loss": 0.2176, "step": 671 }, { "epoch": 0.3614117647058824, "grad_norm": 0.22537723183631897, "learning_rate": 4.0291869540489455e-05, "loss": 0.2926, "step": 672 }, { "epoch": 0.36194957983193277, "grad_norm": 0.2213325798511505, "learning_rate": 4.025470358087121e-05, "loss": 0.2628, "step": 673 }, { "epoch": 0.3624873949579832, "grad_norm": 0.22804518043994904, "learning_rate": 4.021748383001875e-05, "loss": 0.2634, "step": 674 }, { "epoch": 0.3630252100840336, "grad_norm": 0.2389008104801178, "learning_rate": 4.018021041917662e-05, "loss": 0.2786, "step": 675 }, { "epoch": 0.36356302521008405, "grad_norm": 0.20193496346473694, "learning_rate": 4.0142883479778555e-05, "loss": 0.2192, "step": 676 }, { "epoch": 0.36410084033613443, "grad_norm": 0.22615692019462585, "learning_rate": 4.010550314344704e-05, "loss": 0.2523, "step": 677 }, { "epoch": 0.3646386554621849, "grad_norm": 0.19591794908046722, "learning_rate": 4.006806954199287e-05, "loss": 0.2243, "step": 678 }, { "epoch": 0.36517647058823527, "grad_norm": 0.2186879813671112, "learning_rate": 4.0030582807414655e-05, "loss": 0.2513, "step": 679 }, { "epoch": 0.3657142857142857, "grad_norm": 0.20144857466220856, "learning_rate": 3.9993043071898344e-05, "loss": 0.2492, "step": 680 }, { "epoch": 0.36625210084033616, "grad_norm": 0.21831588447093964, "learning_rate": 3.9955450467816816e-05, "loss": 0.2307, "step": 681 }, { "epoch": 0.36678991596638655, "grad_norm": 0.23408401012420654, "learning_rate": 3.9917805127729336e-05, "loss": 0.2869, "step": 682 }, { "epoch": 0.367327731092437, "grad_norm": 0.22690241038799286, "learning_rate": 3.988010718438115e-05, "loss": 0.2661, "step": 683 }, { "epoch": 0.3678655462184874, "grad_norm": 0.21780212223529816, "learning_rate": 3.9842356770702996e-05, "loss": 0.2849, "step": 684 }, { "epoch": 0.3684033613445378, "grad_norm": 0.2135433703660965, "learning_rate": 3.9804554019810626e-05, "loss": 0.2491, "step": 685 }, { "epoch": 0.3689411764705882, "grad_norm": 0.20458251237869263, "learning_rate": 3.9766699065004335e-05, "loss": 0.2348, "step": 686 }, { "epoch": 0.36947899159663866, "grad_norm": 0.22119994461536407, "learning_rate": 3.972879203976851e-05, "loss": 0.2557, "step": 687 }, { "epoch": 0.37001680672268905, "grad_norm": 0.2051679491996765, "learning_rate": 3.969083307777115e-05, "loss": 0.2307, "step": 688 }, { "epoch": 0.3705546218487395, "grad_norm": 0.20608171820640564, "learning_rate": 3.96528223128634e-05, "loss": 0.2215, "step": 689 }, { "epoch": 0.37109243697478994, "grad_norm": 0.20970328152179718, "learning_rate": 3.9614759879079057e-05, "loss": 0.2274, "step": 690 }, { "epoch": 0.37163025210084033, "grad_norm": 0.2343398630619049, "learning_rate": 3.9576645910634117e-05, "loss": 0.2628, "step": 691 }, { "epoch": 0.3721680672268908, "grad_norm": 0.2425430417060852, "learning_rate": 3.9538480541926306e-05, "loss": 0.2764, "step": 692 }, { "epoch": 0.37270588235294116, "grad_norm": 0.21542011201381683, "learning_rate": 3.95002639075346e-05, "loss": 0.2429, "step": 693 }, { "epoch": 0.3732436974789916, "grad_norm": 0.23112311959266663, "learning_rate": 3.946199614221873e-05, "loss": 0.2526, "step": 694 }, { "epoch": 0.373781512605042, "grad_norm": 0.2480165958404541, "learning_rate": 3.9423677380918737e-05, "loss": 0.2806, "step": 695 }, { "epoch": 0.37431932773109244, "grad_norm": 0.25499626994132996, "learning_rate": 3.93853077587545e-05, "loss": 0.2789, "step": 696 }, { "epoch": 0.37485714285714283, "grad_norm": 0.22191846370697021, "learning_rate": 3.934688741102521e-05, "loss": 0.2418, "step": 697 }, { "epoch": 0.3753949579831933, "grad_norm": 0.1952284723520279, "learning_rate": 3.930841647320895e-05, "loss": 0.2072, "step": 698 }, { "epoch": 0.3759327731092437, "grad_norm": 0.24460534751415253, "learning_rate": 3.92698950809622e-05, "loss": 0.2596, "step": 699 }, { "epoch": 0.3764705882352941, "grad_norm": 0.22935707867145538, "learning_rate": 3.9231323370119335e-05, "loss": 0.2439, "step": 700 }, { "epoch": 0.37700840336134456, "grad_norm": 0.24943925440311432, "learning_rate": 3.919270147669216e-05, "loss": 0.2478, "step": 701 }, { "epoch": 0.37754621848739495, "grad_norm": 0.24079573154449463, "learning_rate": 3.915402953686946e-05, "loss": 0.2898, "step": 702 }, { "epoch": 0.3780840336134454, "grad_norm": 0.1994202733039856, "learning_rate": 3.911530768701648e-05, "loss": 0.2236, "step": 703 }, { "epoch": 0.3786218487394958, "grad_norm": 0.21470992267131805, "learning_rate": 3.907653606367444e-05, "loss": 0.2697, "step": 704 }, { "epoch": 0.3791596638655462, "grad_norm": 0.22520503401756287, "learning_rate": 3.90377148035601e-05, "loss": 0.2528, "step": 705 }, { "epoch": 0.3796974789915966, "grad_norm": 0.18367807567119598, "learning_rate": 3.899884404356524e-05, "loss": 0.2072, "step": 706 }, { "epoch": 0.38023529411764706, "grad_norm": 0.24990661442279816, "learning_rate": 3.8959923920756183e-05, "loss": 0.2974, "step": 707 }, { "epoch": 0.3807731092436975, "grad_norm": 0.221461683511734, "learning_rate": 3.89209545723733e-05, "loss": 0.2412, "step": 708 }, { "epoch": 0.3813109243697479, "grad_norm": 0.23815631866455078, "learning_rate": 3.8881936135830586e-05, "loss": 0.2672, "step": 709 }, { "epoch": 0.38184873949579834, "grad_norm": 0.27707576751708984, "learning_rate": 3.8842868748715095e-05, "loss": 0.3383, "step": 710 }, { "epoch": 0.38238655462184873, "grad_norm": 0.22076308727264404, "learning_rate": 3.880375254878649e-05, "loss": 0.2533, "step": 711 }, { "epoch": 0.3829243697478992, "grad_norm": 0.22608336806297302, "learning_rate": 3.876458767397657e-05, "loss": 0.2624, "step": 712 }, { "epoch": 0.38346218487394956, "grad_norm": 0.22405767440795898, "learning_rate": 3.872537426238878e-05, "loss": 0.2615, "step": 713 }, { "epoch": 0.384, "grad_norm": 0.23655055463314056, "learning_rate": 3.868611245229769e-05, "loss": 0.2806, "step": 714 }, { "epoch": 0.3845378151260504, "grad_norm": 0.23618574440479279, "learning_rate": 3.864680238214857e-05, "loss": 0.2968, "step": 715 }, { "epoch": 0.38507563025210084, "grad_norm": 0.22632290422916412, "learning_rate": 3.8607444190556825e-05, "loss": 0.2665, "step": 716 }, { "epoch": 0.3856134453781513, "grad_norm": 0.2065638303756714, "learning_rate": 3.8568038016307564e-05, "loss": 0.2133, "step": 717 }, { "epoch": 0.3861512605042017, "grad_norm": 0.21590805053710938, "learning_rate": 3.8528583998355094e-05, "loss": 0.2422, "step": 718 }, { "epoch": 0.3866890756302521, "grad_norm": 0.25086694955825806, "learning_rate": 3.8489082275822443e-05, "loss": 0.252, "step": 719 }, { "epoch": 0.3872268907563025, "grad_norm": 0.21799249947071075, "learning_rate": 3.844953298800081e-05, "loss": 0.2444, "step": 720 }, { "epoch": 0.38776470588235296, "grad_norm": 0.2219514101743698, "learning_rate": 3.840993627434918e-05, "loss": 0.2568, "step": 721 }, { "epoch": 0.38830252100840335, "grad_norm": 0.2293611615896225, "learning_rate": 3.8370292274493724e-05, "loss": 0.261, "step": 722 }, { "epoch": 0.3888403361344538, "grad_norm": 0.2900821566581726, "learning_rate": 3.833060112822737e-05, "loss": 0.3559, "step": 723 }, { "epoch": 0.3893781512605042, "grad_norm": 0.23205965757369995, "learning_rate": 3.82908629755093e-05, "loss": 0.2841, "step": 724 }, { "epoch": 0.3899159663865546, "grad_norm": 0.19430388510227203, "learning_rate": 3.825107795646444e-05, "loss": 0.2316, "step": 725 }, { "epoch": 0.39045378151260507, "grad_norm": 0.22875846922397614, "learning_rate": 3.821124621138299e-05, "loss": 0.2551, "step": 726 }, { "epoch": 0.39099159663865546, "grad_norm": 0.2332560122013092, "learning_rate": 3.817136788071989e-05, "loss": 0.2462, "step": 727 }, { "epoch": 0.3915294117647059, "grad_norm": 0.20361079275608063, "learning_rate": 3.813144310509438e-05, "loss": 0.2288, "step": 728 }, { "epoch": 0.3920672268907563, "grad_norm": 0.21833625435829163, "learning_rate": 3.809147202528946e-05, "loss": 0.2735, "step": 729 }, { "epoch": 0.39260504201680674, "grad_norm": 0.20534344017505646, "learning_rate": 3.805145478225141e-05, "loss": 0.2563, "step": 730 }, { "epoch": 0.3931428571428571, "grad_norm": 0.26870402693748474, "learning_rate": 3.8011391517089276e-05, "loss": 0.3072, "step": 731 }, { "epoch": 0.39368067226890757, "grad_norm": 0.20677410066127777, "learning_rate": 3.797128237107441e-05, "loss": 0.2282, "step": 732 }, { "epoch": 0.39421848739495796, "grad_norm": 0.22139576077461243, "learning_rate": 3.7931127485639964e-05, "loss": 0.2835, "step": 733 }, { "epoch": 0.3947563025210084, "grad_norm": 0.2173648327589035, "learning_rate": 3.789092700238032e-05, "loss": 0.2778, "step": 734 }, { "epoch": 0.3952941176470588, "grad_norm": 0.22610820829868317, "learning_rate": 3.785068106305071e-05, "loss": 0.2526, "step": 735 }, { "epoch": 0.39583193277310924, "grad_norm": 0.20715831220149994, "learning_rate": 3.7810389809566596e-05, "loss": 0.2242, "step": 736 }, { "epoch": 0.3963697478991597, "grad_norm": 0.21988730132579803, "learning_rate": 3.777005338400328e-05, "loss": 0.2597, "step": 737 }, { "epoch": 0.3969075630252101, "grad_norm": 0.27025946974754333, "learning_rate": 3.772967192859532e-05, "loss": 0.2855, "step": 738 }, { "epoch": 0.3974453781512605, "grad_norm": 0.2487526535987854, "learning_rate": 3.768924558573606e-05, "loss": 0.2615, "step": 739 }, { "epoch": 0.3979831932773109, "grad_norm": 0.21603506803512573, "learning_rate": 3.7648774497977134e-05, "loss": 0.2328, "step": 740 }, { "epoch": 0.39852100840336135, "grad_norm": 0.23843330144882202, "learning_rate": 3.760825880802795e-05, "loss": 0.3074, "step": 741 }, { "epoch": 0.39905882352941174, "grad_norm": 0.23884864151477814, "learning_rate": 3.75676986587552e-05, "loss": 0.2777, "step": 742 }, { "epoch": 0.3995966386554622, "grad_norm": 0.20210275053977966, "learning_rate": 3.752709419318234e-05, "loss": 0.2448, "step": 743 }, { "epoch": 0.4001344537815126, "grad_norm": 0.2400791347026825, "learning_rate": 3.74864455544891e-05, "loss": 0.2762, "step": 744 }, { "epoch": 0.400672268907563, "grad_norm": 0.21968227624893188, "learning_rate": 3.7445752886010986e-05, "loss": 0.2613, "step": 745 }, { "epoch": 0.40121008403361347, "grad_norm": 0.2179841548204422, "learning_rate": 3.740501633123872e-05, "loss": 0.2658, "step": 746 }, { "epoch": 0.40174789915966386, "grad_norm": 0.1984151154756546, "learning_rate": 3.736423603381782e-05, "loss": 0.2254, "step": 747 }, { "epoch": 0.4022857142857143, "grad_norm": 0.22127607464790344, "learning_rate": 3.732341213754804e-05, "loss": 0.2651, "step": 748 }, { "epoch": 0.4028235294117647, "grad_norm": 0.23441387712955475, "learning_rate": 3.728254478638285e-05, "loss": 0.2582, "step": 749 }, { "epoch": 0.40336134453781514, "grad_norm": 0.20126119256019592, "learning_rate": 3.724163412442898e-05, "loss": 0.2092, "step": 750 }, { "epoch": 0.4038991596638655, "grad_norm": 0.1961725950241089, "learning_rate": 3.7200680295945875e-05, "loss": 0.2299, "step": 751 }, { "epoch": 0.40443697478991597, "grad_norm": 0.22056607902050018, "learning_rate": 3.715968344534517e-05, "loss": 0.2557, "step": 752 }, { "epoch": 0.40497478991596636, "grad_norm": 0.2034258246421814, "learning_rate": 3.711864371719024e-05, "loss": 0.2358, "step": 753 }, { "epoch": 0.4055126050420168, "grad_norm": 0.2452882081270218, "learning_rate": 3.707756125619564e-05, "loss": 0.2945, "step": 754 }, { "epoch": 0.40605042016806725, "grad_norm": 0.21332290768623352, "learning_rate": 3.703643620722659e-05, "loss": 0.2239, "step": 755 }, { "epoch": 0.40658823529411764, "grad_norm": 0.25961923599243164, "learning_rate": 3.699526871529853e-05, "loss": 0.2844, "step": 756 }, { "epoch": 0.4071260504201681, "grad_norm": 0.260526567697525, "learning_rate": 3.695405892557652e-05, "loss": 0.2855, "step": 757 }, { "epoch": 0.4076638655462185, "grad_norm": 0.21954157948493958, "learning_rate": 3.691280698337478e-05, "loss": 0.2377, "step": 758 }, { "epoch": 0.4082016806722689, "grad_norm": 0.22621572017669678, "learning_rate": 3.6871513034156193e-05, "loss": 0.2762, "step": 759 }, { "epoch": 0.4087394957983193, "grad_norm": 0.20203298330307007, "learning_rate": 3.683017722353173e-05, "loss": 0.2065, "step": 760 }, { "epoch": 0.40927731092436975, "grad_norm": 0.24239514768123627, "learning_rate": 3.678879969726001e-05, "loss": 0.2951, "step": 761 }, { "epoch": 0.40981512605042014, "grad_norm": 0.22806671261787415, "learning_rate": 3.6747380601246726e-05, "loss": 0.2626, "step": 762 }, { "epoch": 0.4103529411764706, "grad_norm": 0.2218102365732193, "learning_rate": 3.6705920081544165e-05, "loss": 0.2108, "step": 763 }, { "epoch": 0.41089075630252103, "grad_norm": 0.2203376591205597, "learning_rate": 3.666441828435067e-05, "loss": 0.2487, "step": 764 }, { "epoch": 0.4114285714285714, "grad_norm": 0.21736861765384674, "learning_rate": 3.662287535601015e-05, "loss": 0.2404, "step": 765 }, { "epoch": 0.41196638655462187, "grad_norm": 0.19187769293785095, "learning_rate": 3.658129144301155e-05, "loss": 0.2023, "step": 766 }, { "epoch": 0.41250420168067226, "grad_norm": 0.22862669825553894, "learning_rate": 3.6539666691988336e-05, "loss": 0.283, "step": 767 }, { "epoch": 0.4130420168067227, "grad_norm": 0.19601936638355255, "learning_rate": 3.649800124971796e-05, "loss": 0.2049, "step": 768 }, { "epoch": 0.4135798319327731, "grad_norm": 0.20116636157035828, "learning_rate": 3.645629526312139e-05, "loss": 0.2438, "step": 769 }, { "epoch": 0.41411764705882353, "grad_norm": 0.23367391526699066, "learning_rate": 3.641454887926254e-05, "loss": 0.2881, "step": 770 }, { "epoch": 0.4146554621848739, "grad_norm": 0.21525628864765167, "learning_rate": 3.637276224534777e-05, "loss": 0.259, "step": 771 }, { "epoch": 0.41519327731092437, "grad_norm": 0.18986177444458008, "learning_rate": 3.633093550872538e-05, "loss": 0.2278, "step": 772 }, { "epoch": 0.4157310924369748, "grad_norm": 0.22620441019535065, "learning_rate": 3.628906881688509e-05, "loss": 0.2428, "step": 773 }, { "epoch": 0.4162689075630252, "grad_norm": 0.20173411071300507, "learning_rate": 3.624716231745749e-05, "loss": 0.2318, "step": 774 }, { "epoch": 0.41680672268907565, "grad_norm": 0.20827507972717285, "learning_rate": 3.6205216158213544e-05, "loss": 0.2511, "step": 775 }, { "epoch": 0.41734453781512604, "grad_norm": 0.2318229228258133, "learning_rate": 3.616323048706408e-05, "loss": 0.2749, "step": 776 }, { "epoch": 0.4178823529411765, "grad_norm": 0.20369243621826172, "learning_rate": 3.612120545205924e-05, "loss": 0.2301, "step": 777 }, { "epoch": 0.41842016806722687, "grad_norm": 0.22705359756946564, "learning_rate": 3.607914120138796e-05, "loss": 0.2558, "step": 778 }, { "epoch": 0.4189579831932773, "grad_norm": 0.20380060374736786, "learning_rate": 3.603703788337749e-05, "loss": 0.2797, "step": 779 }, { "epoch": 0.4194957983193277, "grad_norm": 0.21266338229179382, "learning_rate": 3.5994895646492825e-05, "loss": 0.2581, "step": 780 }, { "epoch": 0.42003361344537815, "grad_norm": 0.20806504786014557, "learning_rate": 3.595271463933617e-05, "loss": 0.2324, "step": 781 }, { "epoch": 0.4205714285714286, "grad_norm": 0.20767484605312347, "learning_rate": 3.591049501064648e-05, "loss": 0.2339, "step": 782 }, { "epoch": 0.421109243697479, "grad_norm": 0.24052263796329498, "learning_rate": 3.586823690929888e-05, "loss": 0.2594, "step": 783 }, { "epoch": 0.42164705882352943, "grad_norm": 0.2176881730556488, "learning_rate": 3.582594048430417e-05, "loss": 0.2418, "step": 784 }, { "epoch": 0.4221848739495798, "grad_norm": 0.21665115654468536, "learning_rate": 3.5783605884808246e-05, "loss": 0.2303, "step": 785 }, { "epoch": 0.42272268907563026, "grad_norm": 0.21690556406974792, "learning_rate": 3.5741233260091685e-05, "loss": 0.2356, "step": 786 }, { "epoch": 0.42326050420168065, "grad_norm": 0.23979604244232178, "learning_rate": 3.569882275956908e-05, "loss": 0.2703, "step": 787 }, { "epoch": 0.4237983193277311, "grad_norm": 0.21392807364463806, "learning_rate": 3.5656374532788625e-05, "loss": 0.251, "step": 788 }, { "epoch": 0.4243361344537815, "grad_norm": 0.20479917526245117, "learning_rate": 3.5613888729431525e-05, "loss": 0.2327, "step": 789 }, { "epoch": 0.42487394957983193, "grad_norm": 0.20234209299087524, "learning_rate": 3.55713654993115e-05, "loss": 0.2181, "step": 790 }, { "epoch": 0.4254117647058824, "grad_norm": 0.20433494448661804, "learning_rate": 3.5528804992374235e-05, "loss": 0.2172, "step": 791 }, { "epoch": 0.42594957983193277, "grad_norm": 0.2267509549856186, "learning_rate": 3.548620735869687e-05, "loss": 0.2864, "step": 792 }, { "epoch": 0.4264873949579832, "grad_norm": 0.21119089424610138, "learning_rate": 3.5443572748487444e-05, "loss": 0.2516, "step": 793 }, { "epoch": 0.4270252100840336, "grad_norm": 0.1937139332294464, "learning_rate": 3.54009013120844e-05, "loss": 0.2314, "step": 794 }, { "epoch": 0.42756302521008405, "grad_norm": 0.20377981662750244, "learning_rate": 3.5358193199956036e-05, "loss": 0.2305, "step": 795 }, { "epoch": 0.42810084033613444, "grad_norm": 0.21441389620304108, "learning_rate": 3.5315448562699974e-05, "loss": 0.2495, "step": 796 }, { "epoch": 0.4286386554621849, "grad_norm": 0.22648885846138, "learning_rate": 3.5272667551042625e-05, "loss": 0.2374, "step": 797 }, { "epoch": 0.42917647058823527, "grad_norm": 0.2615983784198761, "learning_rate": 3.5229850315838673e-05, "loss": 0.2897, "step": 798 }, { "epoch": 0.4297142857142857, "grad_norm": 0.23039905726909637, "learning_rate": 3.518699700807052e-05, "loss": 0.2625, "step": 799 }, { "epoch": 0.43025210084033616, "grad_norm": 0.22674910724163055, "learning_rate": 3.514410777884778e-05, "loss": 0.2578, "step": 800 }, { "epoch": 0.43078991596638655, "grad_norm": 0.21222665905952454, "learning_rate": 3.5101182779406724e-05, "loss": 0.2427, "step": 801 }, { "epoch": 0.431327731092437, "grad_norm": 0.20142942667007446, "learning_rate": 3.505822216110979e-05, "loss": 0.2305, "step": 802 }, { "epoch": 0.4318655462184874, "grad_norm": 0.2520016133785248, "learning_rate": 3.5015226075444954e-05, "loss": 0.2881, "step": 803 }, { "epoch": 0.43240336134453783, "grad_norm": 0.2250577211380005, "learning_rate": 3.497219467402531e-05, "loss": 0.2571, "step": 804 }, { "epoch": 0.4329411764705882, "grad_norm": 0.1884489506483078, "learning_rate": 3.492912810858845e-05, "loss": 0.206, "step": 805 }, { "epoch": 0.43347899159663866, "grad_norm": 0.206516832113266, "learning_rate": 3.488602653099599e-05, "loss": 0.2246, "step": 806 }, { "epoch": 0.43401680672268905, "grad_norm": 0.2352733016014099, "learning_rate": 3.484289009323298e-05, "loss": 0.2635, "step": 807 }, { "epoch": 0.4345546218487395, "grad_norm": 0.2153131663799286, "learning_rate": 3.479971894740741e-05, "loss": 0.2304, "step": 808 }, { "epoch": 0.43509243697478994, "grad_norm": 0.22394916415214539, "learning_rate": 3.475651324574965e-05, "loss": 0.2596, "step": 809 }, { "epoch": 0.43563025210084033, "grad_norm": 0.21885313093662262, "learning_rate": 3.4713273140611926e-05, "loss": 0.2669, "step": 810 }, { "epoch": 0.4361680672268908, "grad_norm": 0.1914963722229004, "learning_rate": 3.466999878446777e-05, "loss": 0.2336, "step": 811 }, { "epoch": 0.43670588235294117, "grad_norm": 0.21499374508857727, "learning_rate": 3.4626690329911504e-05, "loss": 0.2456, "step": 812 }, { "epoch": 0.4372436974789916, "grad_norm": 0.22554565966129303, "learning_rate": 3.458334792965768e-05, "loss": 0.26, "step": 813 }, { "epoch": 0.437781512605042, "grad_norm": 0.22755029797554016, "learning_rate": 3.453997173654055e-05, "loss": 0.2734, "step": 814 }, { "epoch": 0.43831932773109245, "grad_norm": 0.23045405745506287, "learning_rate": 3.4496561903513515e-05, "loss": 0.2523, "step": 815 }, { "epoch": 0.43885714285714283, "grad_norm": 0.22664926946163177, "learning_rate": 3.445311858364862e-05, "loss": 0.244, "step": 816 }, { "epoch": 0.4393949579831933, "grad_norm": 0.2309415340423584, "learning_rate": 3.440964193013597e-05, "loss": 0.2971, "step": 817 }, { "epoch": 0.4399327731092437, "grad_norm": 0.23493921756744385, "learning_rate": 3.436613209628323e-05, "loss": 0.2484, "step": 818 }, { "epoch": 0.4404705882352941, "grad_norm": 0.1917898803949356, "learning_rate": 3.432258923551505e-05, "loss": 0.2215, "step": 819 }, { "epoch": 0.44100840336134456, "grad_norm": 0.2035529613494873, "learning_rate": 3.427901350137256e-05, "loss": 0.2214, "step": 820 }, { "epoch": 0.44154621848739495, "grad_norm": 0.22623182833194733, "learning_rate": 3.423540504751278e-05, "loss": 0.2649, "step": 821 }, { "epoch": 0.4420840336134454, "grad_norm": 0.22934243083000183, "learning_rate": 3.419176402770813e-05, "loss": 0.2549, "step": 822 }, { "epoch": 0.4426218487394958, "grad_norm": 0.23597899079322815, "learning_rate": 3.414809059584585e-05, "loss": 0.2527, "step": 823 }, { "epoch": 0.4431596638655462, "grad_norm": 0.24756953120231628, "learning_rate": 3.4104384905927475e-05, "loss": 0.3129, "step": 824 }, { "epoch": 0.4436974789915966, "grad_norm": 0.21132510900497437, "learning_rate": 3.40606471120683e-05, "loss": 0.2339, "step": 825 }, { "epoch": 0.44423529411764706, "grad_norm": 0.2312280833721161, "learning_rate": 3.4016877368496805e-05, "loss": 0.2601, "step": 826 }, { "epoch": 0.4447731092436975, "grad_norm": 0.1933688074350357, "learning_rate": 3.397307582955416e-05, "loss": 0.2228, "step": 827 }, { "epoch": 0.4453109243697479, "grad_norm": 0.22260862588882446, "learning_rate": 3.392924264969361e-05, "loss": 0.2588, "step": 828 }, { "epoch": 0.44584873949579834, "grad_norm": 0.23882415890693665, "learning_rate": 3.388537798348002e-05, "loss": 0.2592, "step": 829 }, { "epoch": 0.44638655462184873, "grad_norm": 0.25822776556015015, "learning_rate": 3.384148198558924e-05, "loss": 0.3121, "step": 830 }, { "epoch": 0.4469243697478992, "grad_norm": 0.20021331310272217, "learning_rate": 3.3797554810807636e-05, "loss": 0.2237, "step": 831 }, { "epoch": 0.44746218487394956, "grad_norm": 0.22066788375377655, "learning_rate": 3.375359661403149e-05, "loss": 0.2575, "step": 832 }, { "epoch": 0.448, "grad_norm": 0.25471699237823486, "learning_rate": 3.3709607550266476e-05, "loss": 0.3002, "step": 833 }, { "epoch": 0.4485378151260504, "grad_norm": 0.206255242228508, "learning_rate": 3.366558777462711e-05, "loss": 0.237, "step": 834 }, { "epoch": 0.44907563025210084, "grad_norm": 0.21138547360897064, "learning_rate": 3.362153744233622e-05, "loss": 0.2245, "step": 835 }, { "epoch": 0.44961344537815123, "grad_norm": 0.2394358068704605, "learning_rate": 3.357745670872437e-05, "loss": 0.2922, "step": 836 }, { "epoch": 0.4501512605042017, "grad_norm": 0.1949562430381775, "learning_rate": 3.3533345729229315e-05, "loss": 0.2212, "step": 837 }, { "epoch": 0.4506890756302521, "grad_norm": 0.2181372046470642, "learning_rate": 3.3489204659395494e-05, "loss": 0.25, "step": 838 }, { "epoch": 0.4512268907563025, "grad_norm": 0.2028977870941162, "learning_rate": 3.3445033654873426e-05, "loss": 0.2392, "step": 839 }, { "epoch": 0.45176470588235296, "grad_norm": 0.2228553295135498, "learning_rate": 3.340083287141918e-05, "loss": 0.2482, "step": 840 }, { "epoch": 0.45230252100840335, "grad_norm": 0.24964722990989685, "learning_rate": 3.335660246489387e-05, "loss": 0.3043, "step": 841 }, { "epoch": 0.4528403361344538, "grad_norm": 0.2323354184627533, "learning_rate": 3.3312342591263005e-05, "loss": 0.2494, "step": 842 }, { "epoch": 0.4533781512605042, "grad_norm": 0.21759991347789764, "learning_rate": 3.326805340659606e-05, "loss": 0.235, "step": 843 }, { "epoch": 0.4539159663865546, "grad_norm": 0.20892301201820374, "learning_rate": 3.322373506706584e-05, "loss": 0.2368, "step": 844 }, { "epoch": 0.454453781512605, "grad_norm": 0.23253467679023743, "learning_rate": 3.3179387728947976e-05, "loss": 0.2779, "step": 845 }, { "epoch": 0.45499159663865546, "grad_norm": 0.19273951649665833, "learning_rate": 3.313501154862031e-05, "loss": 0.2208, "step": 846 }, { "epoch": 0.4555294117647059, "grad_norm": 0.22905100882053375, "learning_rate": 3.3090606682562426e-05, "loss": 0.2768, "step": 847 }, { "epoch": 0.4560672268907563, "grad_norm": 0.22161675989627838, "learning_rate": 3.304617328735505e-05, "loss": 0.2324, "step": 848 }, { "epoch": 0.45660504201680674, "grad_norm": 0.22560031712055206, "learning_rate": 3.3001711519679504e-05, "loss": 0.2639, "step": 849 }, { "epoch": 0.45714285714285713, "grad_norm": 0.2228708416223526, "learning_rate": 3.295722153631718e-05, "loss": 0.2557, "step": 850 }, { "epoch": 0.4576806722689076, "grad_norm": 0.2297031134366989, "learning_rate": 3.291270349414891e-05, "loss": 0.2476, "step": 851 }, { "epoch": 0.45821848739495796, "grad_norm": 0.2247052639722824, "learning_rate": 3.286815755015452e-05, "loss": 0.2429, "step": 852 }, { "epoch": 0.4587563025210084, "grad_norm": 0.1788255125284195, "learning_rate": 3.282358386141221e-05, "loss": 0.1964, "step": 853 }, { "epoch": 0.4592941176470588, "grad_norm": 0.23308522999286652, "learning_rate": 3.2778982585098007e-05, "loss": 0.2628, "step": 854 }, { "epoch": 0.45983193277310924, "grad_norm": 0.23418858647346497, "learning_rate": 3.2734353878485206e-05, "loss": 0.2707, "step": 855 }, { "epoch": 0.4603697478991597, "grad_norm": 0.24291405081748962, "learning_rate": 3.268969789894386e-05, "loss": 0.2532, "step": 856 }, { "epoch": 0.4609075630252101, "grad_norm": 0.24038593471050262, "learning_rate": 3.264501480394016e-05, "loss": 0.2931, "step": 857 }, { "epoch": 0.4614453781512605, "grad_norm": 0.2002495974302292, "learning_rate": 3.260030475103592e-05, "loss": 0.2257, "step": 858 }, { "epoch": 0.4619831932773109, "grad_norm": 0.22154302895069122, "learning_rate": 3.255556789788803e-05, "loss": 0.2826, "step": 859 }, { "epoch": 0.46252100840336136, "grad_norm": 0.20094019174575806, "learning_rate": 3.251080440224786e-05, "loss": 0.2458, "step": 860 }, { "epoch": 0.46305882352941174, "grad_norm": 0.2181379795074463, "learning_rate": 3.246601442196074e-05, "loss": 0.2414, "step": 861 }, { "epoch": 0.4635966386554622, "grad_norm": 0.18043340742588043, "learning_rate": 3.242119811496539e-05, "loss": 0.2129, "step": 862 }, { "epoch": 0.4641344537815126, "grad_norm": 0.23722627758979797, "learning_rate": 3.237635563929336e-05, "loss": 0.2732, "step": 863 }, { "epoch": 0.464672268907563, "grad_norm": 0.21756771206855774, "learning_rate": 3.2331487153068474e-05, "loss": 0.2379, "step": 864 }, { "epoch": 0.46521008403361347, "grad_norm": 0.22130249440670013, "learning_rate": 3.228659281450628e-05, "loss": 0.2634, "step": 865 }, { "epoch": 0.46574789915966386, "grad_norm": 0.1850215643644333, "learning_rate": 3.2241672781913485e-05, "loss": 0.2083, "step": 866 }, { "epoch": 0.4662857142857143, "grad_norm": 0.22892746329307556, "learning_rate": 3.2196727213687395e-05, "loss": 0.2353, "step": 867 }, { "epoch": 0.4668235294117647, "grad_norm": 0.20422805845737457, "learning_rate": 3.215175626831537e-05, "loss": 0.2515, "step": 868 }, { "epoch": 0.46736134453781514, "grad_norm": 0.2293919324874878, "learning_rate": 3.210676010437423e-05, "loss": 0.2618, "step": 869 }, { "epoch": 0.4678991596638655, "grad_norm": 0.2424861341714859, "learning_rate": 3.206173888052977e-05, "loss": 0.2514, "step": 870 }, { "epoch": 0.46843697478991597, "grad_norm": 0.2245480716228485, "learning_rate": 3.201669275553609e-05, "loss": 0.2327, "step": 871 }, { "epoch": 0.46897478991596636, "grad_norm": 0.22477881610393524, "learning_rate": 3.197162188823514e-05, "loss": 0.248, "step": 872 }, { "epoch": 0.4695126050420168, "grad_norm": 0.21355709433555603, "learning_rate": 3.192652643755609e-05, "loss": 0.2462, "step": 873 }, { "epoch": 0.47005042016806725, "grad_norm": 0.24495093524456024, "learning_rate": 3.188140656251484e-05, "loss": 0.2551, "step": 874 }, { "epoch": 0.47058823529411764, "grad_norm": 0.23664867877960205, "learning_rate": 3.1836262422213345e-05, "loss": 0.2908, "step": 875 }, { "epoch": 0.4711260504201681, "grad_norm": 0.2471204400062561, "learning_rate": 3.17910941758392e-05, "loss": 0.3007, "step": 876 }, { "epoch": 0.4716638655462185, "grad_norm": 0.21095740795135498, "learning_rate": 3.174590198266493e-05, "loss": 0.2414, "step": 877 }, { "epoch": 0.4722016806722689, "grad_norm": 0.2130434811115265, "learning_rate": 3.170068600204758e-05, "loss": 0.2336, "step": 878 }, { "epoch": 0.4727394957983193, "grad_norm": 0.22079569101333618, "learning_rate": 3.1655446393427994e-05, "loss": 0.2948, "step": 879 }, { "epoch": 0.47327731092436975, "grad_norm": 0.22780220210552216, "learning_rate": 3.16101833163304e-05, "loss": 0.2587, "step": 880 }, { "epoch": 0.47381512605042014, "grad_norm": 0.21617624163627625, "learning_rate": 3.156489693036174e-05, "loss": 0.2328, "step": 881 }, { "epoch": 0.4743529411764706, "grad_norm": 0.23635557293891907, "learning_rate": 3.151958739521116e-05, "loss": 0.2718, "step": 882 }, { "epoch": 0.47489075630252103, "grad_norm": 0.22298604249954224, "learning_rate": 3.147425487064943e-05, "loss": 0.2496, "step": 883 }, { "epoch": 0.4754285714285714, "grad_norm": 0.20172730088233948, "learning_rate": 3.1428899516528407e-05, "loss": 0.2088, "step": 884 }, { "epoch": 0.47596638655462187, "grad_norm": 0.203935444355011, "learning_rate": 3.1383521492780424e-05, "loss": 0.2235, "step": 885 }, { "epoch": 0.47650420168067226, "grad_norm": 0.22618639469146729, "learning_rate": 3.133812095941775e-05, "loss": 0.2558, "step": 886 }, { "epoch": 0.4770420168067227, "grad_norm": 0.22726255655288696, "learning_rate": 3.129269807653206e-05, "loss": 0.258, "step": 887 }, { "epoch": 0.4775798319327731, "grad_norm": 0.2320665568113327, "learning_rate": 3.1247253004293806e-05, "loss": 0.2523, "step": 888 }, { "epoch": 0.47811764705882354, "grad_norm": 0.22346971929073334, "learning_rate": 3.12017859029517e-05, "loss": 0.272, "step": 889 }, { "epoch": 0.4786554621848739, "grad_norm": 0.20072266459465027, "learning_rate": 3.1156296932832114e-05, "loss": 0.2395, "step": 890 }, { "epoch": 0.47919327731092437, "grad_norm": 0.21371999382972717, "learning_rate": 3.1110786254338565e-05, "loss": 0.2844, "step": 891 }, { "epoch": 0.4797310924369748, "grad_norm": 0.2330796867609024, "learning_rate": 3.106525402795109e-05, "loss": 0.2481, "step": 892 }, { "epoch": 0.4802689075630252, "grad_norm": 0.24404597282409668, "learning_rate": 3.101970041422572e-05, "loss": 0.2555, "step": 893 }, { "epoch": 0.48080672268907565, "grad_norm": 0.22130556404590607, "learning_rate": 3.097412557379391e-05, "loss": 0.2355, "step": 894 }, { "epoch": 0.48134453781512604, "grad_norm": 0.2214861512184143, "learning_rate": 3.092852966736195e-05, "loss": 0.2756, "step": 895 }, { "epoch": 0.4818823529411765, "grad_norm": 0.2220049351453781, "learning_rate": 3.088291285571042e-05, "loss": 0.2416, "step": 896 }, { "epoch": 0.4824201680672269, "grad_norm": 0.25326985120773315, "learning_rate": 3.083727529969362e-05, "loss": 0.3023, "step": 897 }, { "epoch": 0.4829579831932773, "grad_norm": 0.18248715996742249, "learning_rate": 3.079161716023899e-05, "loss": 0.1934, "step": 898 }, { "epoch": 0.4834957983193277, "grad_norm": 0.2314782589673996, "learning_rate": 3.074593859834656e-05, "loss": 0.2648, "step": 899 }, { "epoch": 0.48403361344537815, "grad_norm": 0.22096309065818787, "learning_rate": 3.0700239775088355e-05, "loss": 0.2571, "step": 900 }, { "epoch": 0.4845714285714286, "grad_norm": 0.2369704395532608, "learning_rate": 3.0654520851607885e-05, "loss": 0.2628, "step": 901 }, { "epoch": 0.485109243697479, "grad_norm": 0.23117917776107788, "learning_rate": 3.06087819891195e-05, "loss": 0.2798, "step": 902 }, { "epoch": 0.48564705882352943, "grad_norm": 0.20888295769691467, "learning_rate": 3.056302334890786e-05, "loss": 0.2467, "step": 903 }, { "epoch": 0.4861848739495798, "grad_norm": 0.24168580770492554, "learning_rate": 3.05172450923274e-05, "loss": 0.279, "step": 904 }, { "epoch": 0.48672268907563027, "grad_norm": 0.23302188515663147, "learning_rate": 3.047144738080169e-05, "loss": 0.2515, "step": 905 }, { "epoch": 0.48726050420168066, "grad_norm": 0.22314533591270447, "learning_rate": 3.0425630375822922e-05, "loss": 0.2352, "step": 906 }, { "epoch": 0.4877983193277311, "grad_norm": 0.20311039686203003, "learning_rate": 3.037979423895131e-05, "loss": 0.2634, "step": 907 }, { "epoch": 0.4883361344537815, "grad_norm": 0.2151939570903778, "learning_rate": 3.0333939131814537e-05, "loss": 0.2654, "step": 908 }, { "epoch": 0.48887394957983193, "grad_norm": 0.24597501754760742, "learning_rate": 3.028806521610718e-05, "loss": 0.274, "step": 909 }, { "epoch": 0.4894117647058824, "grad_norm": 0.3262347877025604, "learning_rate": 3.0242172653590134e-05, "loss": 0.236, "step": 910 }, { "epoch": 0.48994957983193277, "grad_norm": 0.2061283141374588, "learning_rate": 3.019626160609005e-05, "loss": 0.2267, "step": 911 }, { "epoch": 0.4904873949579832, "grad_norm": 0.21239404380321503, "learning_rate": 3.0150332235498757e-05, "loss": 0.2249, "step": 912 }, { "epoch": 0.4910252100840336, "grad_norm": 0.2279185801744461, "learning_rate": 3.0104384703772702e-05, "loss": 0.2528, "step": 913 }, { "epoch": 0.49156302521008405, "grad_norm": 0.22870685160160065, "learning_rate": 3.0058419172932366e-05, "loss": 0.2414, "step": 914 }, { "epoch": 0.49210084033613444, "grad_norm": 0.19835229218006134, "learning_rate": 3.0012435805061712e-05, "loss": 0.2028, "step": 915 }, { "epoch": 0.4926386554621849, "grad_norm": 0.2232629656791687, "learning_rate": 2.9966434762307567e-05, "loss": 0.2567, "step": 916 }, { "epoch": 0.49317647058823527, "grad_norm": 0.21358704566955566, "learning_rate": 2.9920416206879116e-05, "loss": 0.2177, "step": 917 }, { "epoch": 0.4937142857142857, "grad_norm": 0.2150295078754425, "learning_rate": 2.9874380301047285e-05, "loss": 0.2614, "step": 918 }, { "epoch": 0.49425210084033616, "grad_norm": 0.2134786695241928, "learning_rate": 2.9828327207144185e-05, "loss": 0.2382, "step": 919 }, { "epoch": 0.49478991596638655, "grad_norm": 0.2189902812242508, "learning_rate": 2.9782257087562533e-05, "loss": 0.2694, "step": 920 }, { "epoch": 0.495327731092437, "grad_norm": 0.1887131780385971, "learning_rate": 2.9736170104755075e-05, "loss": 0.2155, "step": 921 }, { "epoch": 0.4958655462184874, "grad_norm": 0.2382410764694214, "learning_rate": 2.969006642123403e-05, "loss": 0.2781, "step": 922 }, { "epoch": 0.49640336134453783, "grad_norm": 0.24150177836418152, "learning_rate": 2.9643946199570512e-05, "loss": 0.261, "step": 923 }, { "epoch": 0.4969411764705882, "grad_norm": 0.20043762028217316, "learning_rate": 2.9597809602393933e-05, "loss": 0.204, "step": 924 }, { "epoch": 0.49747899159663866, "grad_norm": 0.23058444261550903, "learning_rate": 2.955165679239147e-05, "loss": 0.2579, "step": 925 }, { "epoch": 0.49801680672268905, "grad_norm": 0.23902669548988342, "learning_rate": 2.9505487932307456e-05, "loss": 0.2663, "step": 926 }, { "epoch": 0.4985546218487395, "grad_norm": 0.24433819949626923, "learning_rate": 2.9459303184942816e-05, "loss": 0.2727, "step": 927 }, { "epoch": 0.49909243697478994, "grad_norm": 0.19602841138839722, "learning_rate": 2.9413102713154504e-05, "loss": 0.2122, "step": 928 }, { "epoch": 0.49963025210084033, "grad_norm": 0.22457140684127808, "learning_rate": 2.936688667985494e-05, "loss": 0.2203, "step": 929 }, { "epoch": 0.5001680672268908, "grad_norm": 0.2408713847398758, "learning_rate": 2.932065524801138e-05, "loss": 0.2643, "step": 930 }, { "epoch": 0.5007058823529412, "grad_norm": 0.21281445026397705, "learning_rate": 2.9274408580645406e-05, "loss": 0.2344, "step": 931 }, { "epoch": 0.5012436974789916, "grad_norm": 0.20826272666454315, "learning_rate": 2.9228146840832322e-05, "loss": 0.2295, "step": 932 }, { "epoch": 0.501781512605042, "grad_norm": 0.24118486046791077, "learning_rate": 2.9181870191700555e-05, "loss": 0.284, "step": 933 }, { "epoch": 0.5023193277310924, "grad_norm": 0.21370923519134521, "learning_rate": 2.913557879643113e-05, "loss": 0.2519, "step": 934 }, { "epoch": 0.5028571428571429, "grad_norm": 0.23686231672763824, "learning_rate": 2.9089272818257073e-05, "loss": 0.2845, "step": 935 }, { "epoch": 0.5033949579831932, "grad_norm": 0.2226634919643402, "learning_rate": 2.904295242046281e-05, "loss": 0.3049, "step": 936 }, { "epoch": 0.5039327731092437, "grad_norm": 0.2291756123304367, "learning_rate": 2.899661776638364e-05, "loss": 0.2557, "step": 937 }, { "epoch": 0.5044705882352941, "grad_norm": 0.21768632531166077, "learning_rate": 2.8950269019405106e-05, "loss": 0.2254, "step": 938 }, { "epoch": 0.5050084033613446, "grad_norm": 0.19672738015651703, "learning_rate": 2.890390634296247e-05, "loss": 0.222, "step": 939 }, { "epoch": 0.505546218487395, "grad_norm": 0.22243821620941162, "learning_rate": 2.8857529900540092e-05, "loss": 0.2583, "step": 940 }, { "epoch": 0.5060840336134453, "grad_norm": 0.19144150614738464, "learning_rate": 2.8811139855670894e-05, "loss": 0.2106, "step": 941 }, { "epoch": 0.5066218487394958, "grad_norm": 0.21289044618606567, "learning_rate": 2.8764736371935736e-05, "loss": 0.2278, "step": 942 }, { "epoch": 0.5071596638655462, "grad_norm": 0.21704189479351044, "learning_rate": 2.8718319612962895e-05, "loss": 0.2714, "step": 943 }, { "epoch": 0.5076974789915967, "grad_norm": 0.21943902969360352, "learning_rate": 2.8671889742427443e-05, "loss": 0.2452, "step": 944 }, { "epoch": 0.508235294117647, "grad_norm": 0.2035403996706009, "learning_rate": 2.862544692405068e-05, "loss": 0.2415, "step": 945 }, { "epoch": 0.5087731092436975, "grad_norm": 0.2092832773923874, "learning_rate": 2.857899132159958e-05, "loss": 0.2569, "step": 946 }, { "epoch": 0.5093109243697479, "grad_norm": 0.20701830089092255, "learning_rate": 2.8532523098886195e-05, "loss": 0.2521, "step": 947 }, { "epoch": 0.5098487394957983, "grad_norm": 0.20691035687923431, "learning_rate": 2.8486042419767055e-05, "loss": 0.2589, "step": 948 }, { "epoch": 0.5103865546218488, "grad_norm": 0.21525749564170837, "learning_rate": 2.8439549448142644e-05, "loss": 0.2245, "step": 949 }, { "epoch": 0.5109243697478991, "grad_norm": 0.2348656803369522, "learning_rate": 2.8393044347956775e-05, "loss": 0.2523, "step": 950 }, { "epoch": 0.5114621848739496, "grad_norm": 0.22317996621131897, "learning_rate": 2.8346527283196034e-05, "loss": 0.2169, "step": 951 }, { "epoch": 0.512, "grad_norm": 0.2134958952665329, "learning_rate": 2.8299998417889184e-05, "loss": 0.2414, "step": 952 }, { "epoch": 0.5125378151260505, "grad_norm": 0.2569315731525421, "learning_rate": 2.8253457916106623e-05, "loss": 0.2737, "step": 953 }, { "epoch": 0.5130756302521008, "grad_norm": 0.21213652193546295, "learning_rate": 2.820690594195976e-05, "loss": 0.2194, "step": 954 }, { "epoch": 0.5136134453781512, "grad_norm": 0.19984906911849976, "learning_rate": 2.816034265960047e-05, "loss": 0.2353, "step": 955 }, { "epoch": 0.5141512605042017, "grad_norm": 0.24562713503837585, "learning_rate": 2.811376823322051e-05, "loss": 0.2401, "step": 956 }, { "epoch": 0.5146890756302521, "grad_norm": 0.17969459295272827, "learning_rate": 2.806718282705091e-05, "loss": 0.2055, "step": 957 }, { "epoch": 0.5152268907563026, "grad_norm": 0.20778468251228333, "learning_rate": 2.8020586605361426e-05, "loss": 0.2225, "step": 958 }, { "epoch": 0.5157647058823529, "grad_norm": 0.20480383932590485, "learning_rate": 2.797397973245997e-05, "loss": 0.2417, "step": 959 }, { "epoch": 0.5163025210084033, "grad_norm": 0.23860029876232147, "learning_rate": 2.792736237269199e-05, "loss": 0.2521, "step": 960 }, { "epoch": 0.5168403361344538, "grad_norm": 0.23347683250904083, "learning_rate": 2.788073469043992e-05, "loss": 0.2814, "step": 961 }, { "epoch": 0.5173781512605042, "grad_norm": 0.22045038640499115, "learning_rate": 2.7834096850122592e-05, "loss": 0.2478, "step": 962 }, { "epoch": 0.5179159663865546, "grad_norm": 0.23359179496765137, "learning_rate": 2.7787449016194665e-05, "loss": 0.2483, "step": 963 }, { "epoch": 0.518453781512605, "grad_norm": 0.21868960559368134, "learning_rate": 2.7740791353146033e-05, "loss": 0.2471, "step": 964 }, { "epoch": 0.5189915966386555, "grad_norm": 0.17689087986946106, "learning_rate": 2.7694124025501244e-05, "loss": 0.2065, "step": 965 }, { "epoch": 0.5195294117647059, "grad_norm": 0.23092396557331085, "learning_rate": 2.764744719781893e-05, "loss": 0.2777, "step": 966 }, { "epoch": 0.5200672268907564, "grad_norm": 0.2203717678785324, "learning_rate": 2.760076103469123e-05, "loss": 0.266, "step": 967 }, { "epoch": 0.5206050420168067, "grad_norm": 0.22807644307613373, "learning_rate": 2.755406570074318e-05, "loss": 0.2451, "step": 968 }, { "epoch": 0.5211428571428571, "grad_norm": 0.20408713817596436, "learning_rate": 2.7507361360632182e-05, "loss": 0.2214, "step": 969 }, { "epoch": 0.5216806722689076, "grad_norm": 0.2311539351940155, "learning_rate": 2.7460648179047367e-05, "loss": 0.2533, "step": 970 }, { "epoch": 0.522218487394958, "grad_norm": 0.19921916723251343, "learning_rate": 2.7413926320709072e-05, "loss": 0.2296, "step": 971 }, { "epoch": 0.5227563025210084, "grad_norm": 0.213759645819664, "learning_rate": 2.7367195950368196e-05, "loss": 0.2297, "step": 972 }, { "epoch": 0.5232941176470588, "grad_norm": 0.18288227915763855, "learning_rate": 2.7320457232805697e-05, "loss": 0.1879, "step": 973 }, { "epoch": 0.5238319327731092, "grad_norm": 0.21016623079776764, "learning_rate": 2.7273710332831927e-05, "loss": 0.2414, "step": 974 }, { "epoch": 0.5243697478991597, "grad_norm": 0.20843663811683655, "learning_rate": 2.7226955415286104e-05, "loss": 0.2274, "step": 975 }, { "epoch": 0.5249075630252101, "grad_norm": 0.214103564620018, "learning_rate": 2.7180192645035728e-05, "loss": 0.2785, "step": 976 }, { "epoch": 0.5254453781512605, "grad_norm": 0.21055221557617188, "learning_rate": 2.713342218697598e-05, "loss": 0.2538, "step": 977 }, { "epoch": 0.5259831932773109, "grad_norm": 0.24100837111473083, "learning_rate": 2.708664420602915e-05, "loss": 0.2761, "step": 978 }, { "epoch": 0.5265210084033614, "grad_norm": 0.23125532269477844, "learning_rate": 2.7039858867144057e-05, "loss": 0.2648, "step": 979 }, { "epoch": 0.5270588235294118, "grad_norm": 0.23040735721588135, "learning_rate": 2.699306633529546e-05, "loss": 0.2436, "step": 980 }, { "epoch": 0.5275966386554621, "grad_norm": 0.2521499693393707, "learning_rate": 2.694626677548349e-05, "loss": 0.2924, "step": 981 }, { "epoch": 0.5281344537815126, "grad_norm": 0.18516916036605835, "learning_rate": 2.6899460352733064e-05, "loss": 0.2008, "step": 982 }, { "epoch": 0.528672268907563, "grad_norm": 0.2103269249200821, "learning_rate": 2.685264723209328e-05, "loss": 0.2332, "step": 983 }, { "epoch": 0.5292100840336135, "grad_norm": 0.2094351351261139, "learning_rate": 2.680582757863687e-05, "loss": 0.2598, "step": 984 }, { "epoch": 0.5297478991596639, "grad_norm": 0.22945097088813782, "learning_rate": 2.6759001557459608e-05, "loss": 0.2911, "step": 985 }, { "epoch": 0.5302857142857142, "grad_norm": 0.19157052040100098, "learning_rate": 2.67121693336797e-05, "loss": 0.2174, "step": 986 }, { "epoch": 0.5308235294117647, "grad_norm": 0.19951434433460236, "learning_rate": 2.6665331072437233e-05, "loss": 0.3272, "step": 987 }, { "epoch": 0.5313613445378151, "grad_norm": 0.22069033980369568, "learning_rate": 2.66184869388936e-05, "loss": 0.2408, "step": 988 }, { "epoch": 0.5318991596638656, "grad_norm": 0.24270892143249512, "learning_rate": 2.6571637098230868e-05, "loss": 0.2808, "step": 989 }, { "epoch": 0.5324369747899159, "grad_norm": 0.2430240660905838, "learning_rate": 2.6524781715651265e-05, "loss": 0.2311, "step": 990 }, { "epoch": 0.5329747899159664, "grad_norm": 0.22191749513149261, "learning_rate": 2.647792095637654e-05, "loss": 0.2416, "step": 991 }, { "epoch": 0.5335126050420168, "grad_norm": 0.22174769639968872, "learning_rate": 2.64310549856474e-05, "loss": 0.26, "step": 992 }, { "epoch": 0.5340504201680673, "grad_norm": 0.18611428141593933, "learning_rate": 2.6384183968722936e-05, "loss": 0.2161, "step": 993 }, { "epoch": 0.5345882352941177, "grad_norm": 0.24197791516780853, "learning_rate": 2.633730807088003e-05, "loss": 0.275, "step": 994 }, { "epoch": 0.535126050420168, "grad_norm": 0.20925883948802948, "learning_rate": 2.629042745741277e-05, "loss": 0.2365, "step": 995 }, { "epoch": 0.5356638655462185, "grad_norm": 0.21936658024787903, "learning_rate": 2.6243542293631884e-05, "loss": 0.2522, "step": 996 }, { "epoch": 0.5362016806722689, "grad_norm": 0.27038827538490295, "learning_rate": 2.6196652744864143e-05, "loss": 0.2889, "step": 997 }, { "epoch": 0.5367394957983194, "grad_norm": 0.21431496739387512, "learning_rate": 2.6149758976451765e-05, "loss": 0.2449, "step": 998 }, { "epoch": 0.5372773109243697, "grad_norm": 0.2593373954296112, "learning_rate": 2.6102861153751866e-05, "loss": 0.2768, "step": 999 }, { "epoch": 0.5378151260504201, "grad_norm": 0.21531513333320618, "learning_rate": 2.6055959442135857e-05, "loss": 0.1951, "step": 1000 }, { "epoch": 0.5383529411764706, "grad_norm": 0.2271542251110077, "learning_rate": 2.600905400698884e-05, "loss": 0.2599, "step": 1001 }, { "epoch": 0.538890756302521, "grad_norm": 0.19336895644664764, "learning_rate": 2.5962145013709076e-05, "loss": 0.2165, "step": 1002 }, { "epoch": 0.5394285714285715, "grad_norm": 0.21949465572834015, "learning_rate": 2.5915232627707365e-05, "loss": 0.2227, "step": 1003 }, { "epoch": 0.5399663865546218, "grad_norm": 0.21503470838069916, "learning_rate": 2.5868317014406446e-05, "loss": 0.2605, "step": 1004 }, { "epoch": 0.5405042016806723, "grad_norm": 0.22388999164104462, "learning_rate": 2.582139833924047e-05, "loss": 0.2483, "step": 1005 }, { "epoch": 0.5410420168067227, "grad_norm": 0.22500364482402802, "learning_rate": 2.5774476767654366e-05, "loss": 0.2142, "step": 1006 }, { "epoch": 0.5415798319327731, "grad_norm": 0.21488697826862335, "learning_rate": 2.5727552465103287e-05, "loss": 0.252, "step": 1007 }, { "epoch": 0.5421176470588235, "grad_norm": 0.2063259482383728, "learning_rate": 2.568062559705202e-05, "loss": 0.2242, "step": 1008 }, { "epoch": 0.5426554621848739, "grad_norm": 0.21058394014835358, "learning_rate": 2.563369632897437e-05, "loss": 0.228, "step": 1009 }, { "epoch": 0.5431932773109244, "grad_norm": 0.1984008252620697, "learning_rate": 2.5586764826352643e-05, "loss": 0.2166, "step": 1010 }, { "epoch": 0.5437310924369748, "grad_norm": 0.2384747862815857, "learning_rate": 2.5539831254676994e-05, "loss": 0.2604, "step": 1011 }, { "epoch": 0.5442689075630253, "grad_norm": 0.2315310388803482, "learning_rate": 2.5492895779444893e-05, "loss": 0.2778, "step": 1012 }, { "epoch": 0.5448067226890756, "grad_norm": 0.22062067687511444, "learning_rate": 2.5445958566160517e-05, "loss": 0.2335, "step": 1013 }, { "epoch": 0.545344537815126, "grad_norm": 0.2174912542104721, "learning_rate": 2.5399019780334162e-05, "loss": 0.2453, "step": 1014 }, { "epoch": 0.5458823529411765, "grad_norm": 0.24750244617462158, "learning_rate": 2.5352079587481686e-05, "loss": 0.2929, "step": 1015 }, { "epoch": 0.5464201680672269, "grad_norm": 0.2094195932149887, "learning_rate": 2.5305138153123892e-05, "loss": 0.2394, "step": 1016 }, { "epoch": 0.5469579831932773, "grad_norm": 0.20794717967510223, "learning_rate": 2.525819564278598e-05, "loss": 0.24, "step": 1017 }, { "epoch": 0.5474957983193277, "grad_norm": 0.24174070358276367, "learning_rate": 2.5211252221996923e-05, "loss": 0.2537, "step": 1018 }, { "epoch": 0.5480336134453782, "grad_norm": 0.2292274534702301, "learning_rate": 2.5164308056288915e-05, "loss": 0.251, "step": 1019 }, { "epoch": 0.5485714285714286, "grad_norm": 0.2468942552804947, "learning_rate": 2.511736331119679e-05, "loss": 0.2699, "step": 1020 }, { "epoch": 0.549109243697479, "grad_norm": 0.2423298954963684, "learning_rate": 2.507041815225739e-05, "loss": 0.2799, "step": 1021 }, { "epoch": 0.5496470588235294, "grad_norm": 0.24724631011486053, "learning_rate": 2.5023472745009047e-05, "loss": 0.3059, "step": 1022 }, { "epoch": 0.5501848739495798, "grad_norm": 0.22058610618114471, "learning_rate": 2.4976527254990962e-05, "loss": 0.2358, "step": 1023 }, { "epoch": 0.5507226890756303, "grad_norm": 0.2078189253807068, "learning_rate": 2.492958184774262e-05, "loss": 0.2241, "step": 1024 }, { "epoch": 0.5512605042016807, "grad_norm": 0.21902219951152802, "learning_rate": 2.488263668880322e-05, "loss": 0.2414, "step": 1025 }, { "epoch": 0.551798319327731, "grad_norm": 0.22969570755958557, "learning_rate": 2.483569194371109e-05, "loss": 0.2809, "step": 1026 }, { "epoch": 0.5523361344537815, "grad_norm": 0.20817683637142181, "learning_rate": 2.4788747778003083e-05, "loss": 0.229, "step": 1027 }, { "epoch": 0.5528739495798319, "grad_norm": 0.2252126783132553, "learning_rate": 2.4741804357214026e-05, "loss": 0.2581, "step": 1028 }, { "epoch": 0.5534117647058824, "grad_norm": 0.2113388478755951, "learning_rate": 2.469486184687611e-05, "loss": 0.2618, "step": 1029 }, { "epoch": 0.5539495798319328, "grad_norm": 0.21714220941066742, "learning_rate": 2.4647920412518323e-05, "loss": 0.2289, "step": 1030 }, { "epoch": 0.5544873949579832, "grad_norm": 0.23396624624729156, "learning_rate": 2.460098021966584e-05, "loss": 0.2852, "step": 1031 }, { "epoch": 0.5550252100840336, "grad_norm": 0.2347496598958969, "learning_rate": 2.4554041433839492e-05, "loss": 0.2835, "step": 1032 }, { "epoch": 0.555563025210084, "grad_norm": 0.24235248565673828, "learning_rate": 2.4507104220555106e-05, "loss": 0.2836, "step": 1033 }, { "epoch": 0.5561008403361345, "grad_norm": 0.19914649426937103, "learning_rate": 2.4460168745323002e-05, "loss": 0.2054, "step": 1034 }, { "epoch": 0.5566386554621848, "grad_norm": 0.21990041434764862, "learning_rate": 2.441323517364737e-05, "loss": 0.2566, "step": 1035 }, { "epoch": 0.5571764705882353, "grad_norm": 0.19117048382759094, "learning_rate": 2.4366303671025636e-05, "loss": 0.1995, "step": 1036 }, { "epoch": 0.5577142857142857, "grad_norm": 0.2162487506866455, "learning_rate": 2.431937440294799e-05, "loss": 0.2345, "step": 1037 }, { "epoch": 0.5582521008403362, "grad_norm": 0.2511918842792511, "learning_rate": 2.4272447534896715e-05, "loss": 0.2682, "step": 1038 }, { "epoch": 0.5587899159663866, "grad_norm": 0.20426541566848755, "learning_rate": 2.422552323234564e-05, "loss": 0.2323, "step": 1039 }, { "epoch": 0.5593277310924369, "grad_norm": 0.24103893339633942, "learning_rate": 2.4178601660759535e-05, "loss": 0.2556, "step": 1040 }, { "epoch": 0.5598655462184874, "grad_norm": 0.22688975930213928, "learning_rate": 2.4131682985593557e-05, "loss": 0.2774, "step": 1041 }, { "epoch": 0.5604033613445378, "grad_norm": 0.23102036118507385, "learning_rate": 2.408476737229264e-05, "loss": 0.2471, "step": 1042 }, { "epoch": 0.5609411764705883, "grad_norm": 0.1971779763698578, "learning_rate": 2.403785498629092e-05, "loss": 0.2289, "step": 1043 }, { "epoch": 0.5614789915966386, "grad_norm": 0.2453945279121399, "learning_rate": 2.399094599301116e-05, "loss": 0.2651, "step": 1044 }, { "epoch": 0.562016806722689, "grad_norm": 0.23492689430713654, "learning_rate": 2.3944040557864145e-05, "loss": 0.2666, "step": 1045 }, { "epoch": 0.5625546218487395, "grad_norm": 0.21835936605930328, "learning_rate": 2.389713884624814e-05, "loss": 0.2434, "step": 1046 }, { "epoch": 0.56309243697479, "grad_norm": 0.22446410357952118, "learning_rate": 2.385024102354824e-05, "loss": 0.2437, "step": 1047 }, { "epoch": 0.5636302521008404, "grad_norm": 0.2274772971868515, "learning_rate": 2.3803347255135863e-05, "loss": 0.2274, "step": 1048 }, { "epoch": 0.5641680672268907, "grad_norm": 0.22732731699943542, "learning_rate": 2.3756457706368118e-05, "loss": 0.2693, "step": 1049 }, { "epoch": 0.5647058823529412, "grad_norm": 0.2063349336385727, "learning_rate": 2.3709572542587237e-05, "loss": 0.2519, "step": 1050 }, { "epoch": 0.5652436974789916, "grad_norm": 0.21152758598327637, "learning_rate": 2.366269192911998e-05, "loss": 0.2352, "step": 1051 }, { "epoch": 0.5657815126050421, "grad_norm": 0.23566892743110657, "learning_rate": 2.361581603127707e-05, "loss": 0.2829, "step": 1052 }, { "epoch": 0.5663193277310924, "grad_norm": 0.20706325769424438, "learning_rate": 2.3568945014352604e-05, "loss": 0.2191, "step": 1053 }, { "epoch": 0.5668571428571428, "grad_norm": 0.2380368411540985, "learning_rate": 2.3522079043623463e-05, "loss": 0.2494, "step": 1054 }, { "epoch": 0.5673949579831933, "grad_norm": 0.22715315222740173, "learning_rate": 2.3475218284348734e-05, "loss": 0.217, "step": 1055 }, { "epoch": 0.5679327731092437, "grad_norm": 0.20253589749336243, "learning_rate": 2.342836290176913e-05, "loss": 0.2445, "step": 1056 }, { "epoch": 0.5684705882352942, "grad_norm": 0.19837456941604614, "learning_rate": 2.338151306110641e-05, "loss": 0.2355, "step": 1057 }, { "epoch": 0.5690084033613445, "grad_norm": 0.19975750148296356, "learning_rate": 2.3334668927562776e-05, "loss": 0.2122, "step": 1058 }, { "epoch": 0.569546218487395, "grad_norm": 0.2294699102640152, "learning_rate": 2.328783066632031e-05, "loss": 0.2518, "step": 1059 }, { "epoch": 0.5700840336134454, "grad_norm": 0.22241781651973724, "learning_rate": 2.32409984425404e-05, "loss": 0.231, "step": 1060 }, { "epoch": 0.5706218487394958, "grad_norm": 0.22617857158184052, "learning_rate": 2.3194172421363132e-05, "loss": 0.2562, "step": 1061 }, { "epoch": 0.5711596638655462, "grad_norm": 0.21668554842472076, "learning_rate": 2.3147352767906728e-05, "loss": 0.244, "step": 1062 }, { "epoch": 0.5716974789915966, "grad_norm": 0.240121528506279, "learning_rate": 2.3100539647266942e-05, "loss": 0.2751, "step": 1063 }, { "epoch": 0.5722352941176471, "grad_norm": 0.20091429352760315, "learning_rate": 2.3053733224516514e-05, "loss": 0.2238, "step": 1064 }, { "epoch": 0.5727731092436975, "grad_norm": 0.21165336668491364, "learning_rate": 2.3006933664704543e-05, "loss": 0.2281, "step": 1065 }, { "epoch": 0.573310924369748, "grad_norm": 0.23653531074523926, "learning_rate": 2.296014113285595e-05, "loss": 0.2567, "step": 1066 }, { "epoch": 0.5738487394957983, "grad_norm": 0.22431451082229614, "learning_rate": 2.2913355793970846e-05, "loss": 0.2533, "step": 1067 }, { "epoch": 0.5743865546218487, "grad_norm": 0.21738441288471222, "learning_rate": 2.2866577813024018e-05, "loss": 0.231, "step": 1068 }, { "epoch": 0.5749243697478992, "grad_norm": 0.20502355694770813, "learning_rate": 2.281980735496427e-05, "loss": 0.2382, "step": 1069 }, { "epoch": 0.5754621848739496, "grad_norm": 0.21591916680335999, "learning_rate": 2.277304458471391e-05, "loss": 0.245, "step": 1070 }, { "epoch": 0.576, "grad_norm": 0.21414639055728912, "learning_rate": 2.2726289667168086e-05, "loss": 0.2621, "step": 1071 }, { "epoch": 0.5765378151260504, "grad_norm": 0.2304551601409912, "learning_rate": 2.2679542767194315e-05, "loss": 0.2376, "step": 1072 }, { "epoch": 0.5770756302521008, "grad_norm": 0.21633359789848328, "learning_rate": 2.2632804049631813e-05, "loss": 0.2512, "step": 1073 }, { "epoch": 0.5776134453781513, "grad_norm": 0.25273945927619934, "learning_rate": 2.2586073679290937e-05, "loss": 0.2912, "step": 1074 }, { "epoch": 0.5781512605042017, "grad_norm": 0.2021268755197525, "learning_rate": 2.2539351820952636e-05, "loss": 0.2068, "step": 1075 }, { "epoch": 0.5786890756302521, "grad_norm": 0.2192612737417221, "learning_rate": 2.2492638639367824e-05, "loss": 0.2224, "step": 1076 }, { "epoch": 0.5792268907563025, "grad_norm": 0.22460715472698212, "learning_rate": 2.2445934299256823e-05, "loss": 0.266, "step": 1077 }, { "epoch": 0.579764705882353, "grad_norm": 0.21565912663936615, "learning_rate": 2.2399238965308774e-05, "loss": 0.2514, "step": 1078 }, { "epoch": 0.5803025210084034, "grad_norm": 0.23067344725131989, "learning_rate": 2.2352552802181067e-05, "loss": 0.2701, "step": 1079 }, { "epoch": 0.5808403361344537, "grad_norm": 0.2300942838191986, "learning_rate": 2.2305875974498755e-05, "loss": 0.2498, "step": 1080 }, { "epoch": 0.5813781512605042, "grad_norm": 0.18332509696483612, "learning_rate": 2.225920864685397e-05, "loss": 0.2172, "step": 1081 }, { "epoch": 0.5819159663865546, "grad_norm": 0.2268993854522705, "learning_rate": 2.2212550983805337e-05, "loss": 0.2353, "step": 1082 }, { "epoch": 0.5824537815126051, "grad_norm": 0.20722606778144836, "learning_rate": 2.2165903149877414e-05, "loss": 0.2255, "step": 1083 }, { "epoch": 0.5829915966386554, "grad_norm": 0.19004997611045837, "learning_rate": 2.2119265309560087e-05, "loss": 0.2225, "step": 1084 }, { "epoch": 0.5835294117647059, "grad_norm": 0.2042532116174698, "learning_rate": 2.2072637627308017e-05, "loss": 0.2143, "step": 1085 }, { "epoch": 0.5840672268907563, "grad_norm": 0.2110351026058197, "learning_rate": 2.202602026754004e-05, "loss": 0.2204, "step": 1086 }, { "epoch": 0.5846050420168067, "grad_norm": 0.21320883929729462, "learning_rate": 2.1979413394638577e-05, "loss": 0.2381, "step": 1087 }, { "epoch": 0.5851428571428572, "grad_norm": 0.19492696225643158, "learning_rate": 2.1932817172949096e-05, "loss": 0.2293, "step": 1088 }, { "epoch": 0.5856806722689075, "grad_norm": 0.22972171008586884, "learning_rate": 2.1886231766779495e-05, "loss": 0.2574, "step": 1089 }, { "epoch": 0.586218487394958, "grad_norm": 0.2597947120666504, "learning_rate": 2.183965734039953e-05, "loss": 0.3091, "step": 1090 }, { "epoch": 0.5867563025210084, "grad_norm": 0.24778687953948975, "learning_rate": 2.1793094058040243e-05, "loss": 0.2615, "step": 1091 }, { "epoch": 0.5872941176470589, "grad_norm": 0.21720808744430542, "learning_rate": 2.1746542083893383e-05, "loss": 0.2393, "step": 1092 }, { "epoch": 0.5878319327731092, "grad_norm": 0.2302917242050171, "learning_rate": 2.1700001582110828e-05, "loss": 0.2424, "step": 1093 }, { "epoch": 0.5883697478991596, "grad_norm": 0.22795067727565765, "learning_rate": 2.1653472716803978e-05, "loss": 0.256, "step": 1094 }, { "epoch": 0.5889075630252101, "grad_norm": 0.20264248549938202, "learning_rate": 2.1606955652043234e-05, "loss": 0.2376, "step": 1095 }, { "epoch": 0.5894453781512605, "grad_norm": 0.19771194458007812, "learning_rate": 2.1560450551857365e-05, "loss": 0.2393, "step": 1096 }, { "epoch": 0.589983193277311, "grad_norm": 0.24479633569717407, "learning_rate": 2.151395758023295e-05, "loss": 0.2725, "step": 1097 }, { "epoch": 0.5905210084033613, "grad_norm": 0.2119426131248474, "learning_rate": 2.1467476901113814e-05, "loss": 0.2315, "step": 1098 }, { "epoch": 0.5910588235294117, "grad_norm": 0.23920418322086334, "learning_rate": 2.1421008678400423e-05, "loss": 0.255, "step": 1099 }, { "epoch": 0.5915966386554622, "grad_norm": 0.20457926392555237, "learning_rate": 2.1374553075949326e-05, "loss": 0.284, "step": 1100 }, { "epoch": 0.5921344537815126, "grad_norm": 0.21653254330158234, "learning_rate": 2.132811025757256e-05, "loss": 0.232, "step": 1101 }, { "epoch": 0.592672268907563, "grad_norm": 0.21075958013534546, "learning_rate": 2.1281680387037108e-05, "loss": 0.2418, "step": 1102 }, { "epoch": 0.5932100840336134, "grad_norm": 0.20594708621501923, "learning_rate": 2.123526362806426e-05, "loss": 0.2157, "step": 1103 }, { "epoch": 0.5937478991596639, "grad_norm": 0.20811708271503448, "learning_rate": 2.118886014432911e-05, "loss": 0.1964, "step": 1104 }, { "epoch": 0.5942857142857143, "grad_norm": 0.16754086315631866, "learning_rate": 2.1142470099459914e-05, "loss": 0.1981, "step": 1105 }, { "epoch": 0.5948235294117648, "grad_norm": 0.21783669292926788, "learning_rate": 2.1096093657037542e-05, "loss": 0.2707, "step": 1106 }, { "epoch": 0.5953613445378151, "grad_norm": 0.20705778896808624, "learning_rate": 2.1049730980594903e-05, "loss": 0.2131, "step": 1107 }, { "epoch": 0.5958991596638655, "grad_norm": 0.22853803634643555, "learning_rate": 2.1003382233616366e-05, "loss": 0.2419, "step": 1108 }, { "epoch": 0.596436974789916, "grad_norm": 0.2247644066810608, "learning_rate": 2.0957047579537196e-05, "loss": 0.2542, "step": 1109 }, { "epoch": 0.5969747899159664, "grad_norm": 0.2529137432575226, "learning_rate": 2.0910727181742933e-05, "loss": 0.2584, "step": 1110 }, { "epoch": 0.5975126050420168, "grad_norm": 0.19232851266860962, "learning_rate": 2.0864421203568872e-05, "loss": 0.1835, "step": 1111 }, { "epoch": 0.5980504201680672, "grad_norm": 0.23242990672588348, "learning_rate": 2.081812980829945e-05, "loss": 0.2626, "step": 1112 }, { "epoch": 0.5985882352941176, "grad_norm": 0.2416951209306717, "learning_rate": 2.0771853159167687e-05, "loss": 0.26, "step": 1113 }, { "epoch": 0.5991260504201681, "grad_norm": 0.24070553481578827, "learning_rate": 2.0725591419354593e-05, "loss": 0.2618, "step": 1114 }, { "epoch": 0.5996638655462185, "grad_norm": 0.21647585928440094, "learning_rate": 2.0679344751988618e-05, "loss": 0.2363, "step": 1115 }, { "epoch": 0.6002016806722689, "grad_norm": 0.20408564805984497, "learning_rate": 2.063311332014506e-05, "loss": 0.2226, "step": 1116 }, { "epoch": 0.6007394957983193, "grad_norm": 0.23719154298305511, "learning_rate": 2.0586897286845498e-05, "loss": 0.2524, "step": 1117 }, { "epoch": 0.6012773109243698, "grad_norm": 0.19723276793956757, "learning_rate": 2.0540696815057193e-05, "loss": 0.2145, "step": 1118 }, { "epoch": 0.6018151260504202, "grad_norm": 0.2164805680513382, "learning_rate": 2.0494512067692557e-05, "loss": 0.2383, "step": 1119 }, { "epoch": 0.6023529411764705, "grad_norm": 0.1979767233133316, "learning_rate": 2.0448343207608537e-05, "loss": 0.2041, "step": 1120 }, { "epoch": 0.602890756302521, "grad_norm": 0.2336210459470749, "learning_rate": 2.0402190397606073e-05, "loss": 0.2378, "step": 1121 }, { "epoch": 0.6034285714285714, "grad_norm": 0.16756337881088257, "learning_rate": 2.0356053800429494e-05, "loss": 0.1715, "step": 1122 }, { "epoch": 0.6039663865546219, "grad_norm": 0.19642971456050873, "learning_rate": 2.0309933578765975e-05, "loss": 0.2114, "step": 1123 }, { "epoch": 0.6045042016806723, "grad_norm": 0.2448231279850006, "learning_rate": 2.026382989524493e-05, "loss": 0.2618, "step": 1124 }, { "epoch": 0.6050420168067226, "grad_norm": 0.20517228543758392, "learning_rate": 2.0217742912437477e-05, "loss": 0.2388, "step": 1125 }, { "epoch": 0.6055798319327731, "grad_norm": 0.22515802085399628, "learning_rate": 2.017167279285582e-05, "loss": 0.2649, "step": 1126 }, { "epoch": 0.6061176470588235, "grad_norm": 0.23204569518566132, "learning_rate": 2.0125619698952717e-05, "loss": 0.2432, "step": 1127 }, { "epoch": 0.606655462184874, "grad_norm": 0.23449774086475372, "learning_rate": 2.0079583793120896e-05, "loss": 0.2663, "step": 1128 }, { "epoch": 0.6071932773109243, "grad_norm": 0.21482518315315247, "learning_rate": 2.0033565237692446e-05, "loss": 0.2158, "step": 1129 }, { "epoch": 0.6077310924369748, "grad_norm": 0.2162722647190094, "learning_rate": 1.9987564194938303e-05, "loss": 0.2453, "step": 1130 }, { "epoch": 0.6082689075630252, "grad_norm": 0.21342892944812775, "learning_rate": 1.994158082706764e-05, "loss": 0.2267, "step": 1131 }, { "epoch": 0.6088067226890757, "grad_norm": 0.2349112629890442, "learning_rate": 1.9895615296227304e-05, "loss": 0.2343, "step": 1132 }, { "epoch": 0.6093445378151261, "grad_norm": 0.2144494652748108, "learning_rate": 1.9849667764501252e-05, "loss": 0.2687, "step": 1133 }, { "epoch": 0.6098823529411764, "grad_norm": 0.21101562678813934, "learning_rate": 1.980373839390996e-05, "loss": 0.2315, "step": 1134 }, { "epoch": 0.6104201680672269, "grad_norm": 0.19724448025226593, "learning_rate": 1.9757827346409868e-05, "loss": 0.2032, "step": 1135 }, { "epoch": 0.6109579831932773, "grad_norm": 0.21851415932178497, "learning_rate": 1.971193478389282e-05, "loss": 0.231, "step": 1136 }, { "epoch": 0.6114957983193278, "grad_norm": 0.2385670691728592, "learning_rate": 1.9666060868185466e-05, "loss": 0.2414, "step": 1137 }, { "epoch": 0.6120336134453781, "grad_norm": 0.22299398481845856, "learning_rate": 1.9620205761048694e-05, "loss": 0.2402, "step": 1138 }, { "epoch": 0.6125714285714285, "grad_norm": 0.20440271496772766, "learning_rate": 1.957436962417708e-05, "loss": 0.2317, "step": 1139 }, { "epoch": 0.613109243697479, "grad_norm": 0.2320059984922409, "learning_rate": 1.952855261919832e-05, "loss": 0.302, "step": 1140 }, { "epoch": 0.6136470588235294, "grad_norm": 0.23930199444293976, "learning_rate": 1.9482754907672612e-05, "loss": 0.2852, "step": 1141 }, { "epoch": 0.6141848739495799, "grad_norm": 0.19173762202262878, "learning_rate": 1.9436976651092144e-05, "loss": 0.2026, "step": 1142 }, { "epoch": 0.6147226890756302, "grad_norm": 0.21921592950820923, "learning_rate": 1.9391218010880512e-05, "loss": 0.2435, "step": 1143 }, { "epoch": 0.6152605042016807, "grad_norm": 0.21169905364513397, "learning_rate": 1.934547914839212e-05, "loss": 0.2297, "step": 1144 }, { "epoch": 0.6157983193277311, "grad_norm": 0.24218642711639404, "learning_rate": 1.9299760224911644e-05, "loss": 0.2467, "step": 1145 }, { "epoch": 0.6163361344537815, "grad_norm": 0.24206547439098358, "learning_rate": 1.9254061401653448e-05, "loss": 0.2921, "step": 1146 }, { "epoch": 0.6168739495798319, "grad_norm": 0.21156887710094452, "learning_rate": 1.9208382839761015e-05, "loss": 0.2168, "step": 1147 }, { "epoch": 0.6174117647058823, "grad_norm": 0.19622036814689636, "learning_rate": 1.9162724700306384e-05, "loss": 0.2023, "step": 1148 }, { "epoch": 0.6179495798319328, "grad_norm": 0.23689432442188263, "learning_rate": 1.911708714428958e-05, "loss": 0.2422, "step": 1149 }, { "epoch": 0.6184873949579832, "grad_norm": 0.24284087121486664, "learning_rate": 1.9071470332638054e-05, "loss": 0.282, "step": 1150 }, { "epoch": 0.6190252100840337, "grad_norm": 0.23501574993133545, "learning_rate": 1.902587442620609e-05, "loss": 0.2576, "step": 1151 }, { "epoch": 0.619563025210084, "grad_norm": 0.22928939759731293, "learning_rate": 1.8980299585774287e-05, "loss": 0.2458, "step": 1152 }, { "epoch": 0.6201008403361344, "grad_norm": 0.19916066527366638, "learning_rate": 1.8934745972048922e-05, "loss": 0.2298, "step": 1153 }, { "epoch": 0.6206386554621849, "grad_norm": 0.24378034472465515, "learning_rate": 1.8889213745661448e-05, "loss": 0.2474, "step": 1154 }, { "epoch": 0.6211764705882353, "grad_norm": 0.23293298482894897, "learning_rate": 1.8843703067167892e-05, "loss": 0.2394, "step": 1155 }, { "epoch": 0.6217142857142857, "grad_norm": 0.2416652888059616, "learning_rate": 1.8798214097048313e-05, "loss": 0.2768, "step": 1156 }, { "epoch": 0.6222521008403361, "grad_norm": 0.22462652623653412, "learning_rate": 1.87527469957062e-05, "loss": 0.2447, "step": 1157 }, { "epoch": 0.6227899159663866, "grad_norm": 0.19404210150241852, "learning_rate": 1.8707301923467942e-05, "loss": 0.2134, "step": 1158 }, { "epoch": 0.623327731092437, "grad_norm": 0.22166766226291656, "learning_rate": 1.8661879040582254e-05, "loss": 0.2508, "step": 1159 }, { "epoch": 0.6238655462184874, "grad_norm": 0.2694026231765747, "learning_rate": 1.8616478507219585e-05, "loss": 0.3165, "step": 1160 }, { "epoch": 0.6244033613445378, "grad_norm": 0.21316049993038177, "learning_rate": 1.85711004834716e-05, "loss": 0.2312, "step": 1161 }, { "epoch": 0.6249411764705882, "grad_norm": 0.2301514744758606, "learning_rate": 1.8525745129350565e-05, "loss": 0.2594, "step": 1162 }, { "epoch": 0.6254789915966387, "grad_norm": 0.21092134714126587, "learning_rate": 1.8480412604788856e-05, "loss": 0.2217, "step": 1163 }, { "epoch": 0.6260168067226891, "grad_norm": 0.19549231231212616, "learning_rate": 1.8435103069638272e-05, "loss": 0.2103, "step": 1164 }, { "epoch": 0.6265546218487394, "grad_norm": 0.24326828122138977, "learning_rate": 1.8389816683669608e-05, "loss": 0.2561, "step": 1165 }, { "epoch": 0.6270924369747899, "grad_norm": 0.19743697345256805, "learning_rate": 1.834455360657201e-05, "loss": 0.2153, "step": 1166 }, { "epoch": 0.6276302521008403, "grad_norm": 0.21845899522304535, "learning_rate": 1.8299313997952434e-05, "loss": 0.2337, "step": 1167 }, { "epoch": 0.6281680672268908, "grad_norm": 0.21383164823055267, "learning_rate": 1.825409801733507e-05, "loss": 0.2241, "step": 1168 }, { "epoch": 0.6287058823529412, "grad_norm": 0.24857626855373383, "learning_rate": 1.820890582416081e-05, "loss": 0.2662, "step": 1169 }, { "epoch": 0.6292436974789916, "grad_norm": 0.18927183747291565, "learning_rate": 1.8163737577786654e-05, "loss": 0.2018, "step": 1170 }, { "epoch": 0.629781512605042, "grad_norm": 0.2225828319787979, "learning_rate": 1.8118593437485172e-05, "loss": 0.2429, "step": 1171 }, { "epoch": 0.6303193277310924, "grad_norm": 0.25112175941467285, "learning_rate": 1.8073473562443906e-05, "loss": 0.2618, "step": 1172 }, { "epoch": 0.6308571428571429, "grad_norm": 0.21214666962623596, "learning_rate": 1.8028378111764864e-05, "loss": 0.2564, "step": 1173 }, { "epoch": 0.6313949579831932, "grad_norm": 0.2133854478597641, "learning_rate": 1.7983307244463914e-05, "loss": 0.2349, "step": 1174 }, { "epoch": 0.6319327731092437, "grad_norm": 0.23252110183238983, "learning_rate": 1.7938261119470244e-05, "loss": 0.2716, "step": 1175 }, { "epoch": 0.6324705882352941, "grad_norm": 0.21583209931850433, "learning_rate": 1.7893239895625767e-05, "loss": 0.2709, "step": 1176 }, { "epoch": 0.6330084033613446, "grad_norm": 0.19374023377895355, "learning_rate": 1.7848243731684638e-05, "loss": 0.2107, "step": 1177 }, { "epoch": 0.633546218487395, "grad_norm": 0.23478300869464874, "learning_rate": 1.7803272786312607e-05, "loss": 0.2575, "step": 1178 }, { "epoch": 0.6340840336134453, "grad_norm": 0.2129848152399063, "learning_rate": 1.7758327218086518e-05, "loss": 0.2316, "step": 1179 }, { "epoch": 0.6346218487394958, "grad_norm": 0.24132247269153595, "learning_rate": 1.7713407185493725e-05, "loss": 0.2856, "step": 1180 }, { "epoch": 0.6351596638655462, "grad_norm": 0.239977166056633, "learning_rate": 1.766851284693153e-05, "loss": 0.2617, "step": 1181 }, { "epoch": 0.6356974789915967, "grad_norm": 0.23282906413078308, "learning_rate": 1.762364436070664e-05, "loss": 0.2613, "step": 1182 }, { "epoch": 0.636235294117647, "grad_norm": 0.22992181777954102, "learning_rate": 1.7578801885034614e-05, "loss": 0.2596, "step": 1183 }, { "epoch": 0.6367731092436975, "grad_norm": 0.24672624468803406, "learning_rate": 1.7533985578039262e-05, "loss": 0.28, "step": 1184 }, { "epoch": 0.6373109243697479, "grad_norm": 0.19815582036972046, "learning_rate": 1.7489195597752142e-05, "loss": 0.215, "step": 1185 }, { "epoch": 0.6378487394957983, "grad_norm": 0.2164713740348816, "learning_rate": 1.7444432102111973e-05, "loss": 0.2273, "step": 1186 }, { "epoch": 0.6383865546218488, "grad_norm": 0.22566668689250946, "learning_rate": 1.7399695248964086e-05, "loss": 0.2408, "step": 1187 }, { "epoch": 0.6389243697478991, "grad_norm": 0.2275109738111496, "learning_rate": 1.7354985196059848e-05, "loss": 0.2436, "step": 1188 }, { "epoch": 0.6394621848739496, "grad_norm": 0.21384531259536743, "learning_rate": 1.7310302101056148e-05, "loss": 0.2433, "step": 1189 }, { "epoch": 0.64, "grad_norm": 0.19996559619903564, "learning_rate": 1.7265646121514793e-05, "loss": 0.2066, "step": 1190 }, { "epoch": 0.6405378151260505, "grad_norm": 0.21316596865653992, "learning_rate": 1.7221017414902003e-05, "loss": 0.221, "step": 1191 }, { "epoch": 0.6410756302521008, "grad_norm": 0.2140304446220398, "learning_rate": 1.717641613858779e-05, "loss": 0.2569, "step": 1192 }, { "epoch": 0.6416134453781512, "grad_norm": 0.22504128515720367, "learning_rate": 1.7131842449845482e-05, "loss": 0.2452, "step": 1193 }, { "epoch": 0.6421512605042017, "grad_norm": 0.2319757640361786, "learning_rate": 1.7087296505851096e-05, "loss": 0.259, "step": 1194 }, { "epoch": 0.6426890756302521, "grad_norm": 0.24861328303813934, "learning_rate": 1.704277846368283e-05, "loss": 0.25, "step": 1195 }, { "epoch": 0.6432268907563026, "grad_norm": 0.24296881258487701, "learning_rate": 1.699828848032049e-05, "loss": 0.2569, "step": 1196 }, { "epoch": 0.6437647058823529, "grad_norm": 0.23796342313289642, "learning_rate": 1.6953826712644948e-05, "loss": 0.2563, "step": 1197 }, { "epoch": 0.6443025210084034, "grad_norm": 0.23866625130176544, "learning_rate": 1.6909393317437586e-05, "loss": 0.253, "step": 1198 }, { "epoch": 0.6448403361344538, "grad_norm": 0.26671093702316284, "learning_rate": 1.6864988451379703e-05, "loss": 0.3072, "step": 1199 }, { "epoch": 0.6453781512605042, "grad_norm": 0.2149304896593094, "learning_rate": 1.6820612271052036e-05, "loss": 0.2504, "step": 1200 }, { "epoch": 0.6459159663865546, "grad_norm": 0.19960154592990875, "learning_rate": 1.677626493293416e-05, "loss": 0.2136, "step": 1201 }, { "epoch": 0.646453781512605, "grad_norm": 0.20982013642787933, "learning_rate": 1.6731946593403945e-05, "loss": 0.2409, "step": 1202 }, { "epoch": 0.6469915966386555, "grad_norm": 0.26075857877731323, "learning_rate": 1.6687657408736997e-05, "loss": 0.2766, "step": 1203 }, { "epoch": 0.6475294117647059, "grad_norm": 0.23471295833587646, "learning_rate": 1.664339753510614e-05, "loss": 0.2664, "step": 1204 }, { "epoch": 0.6480672268907564, "grad_norm": 0.20021267235279083, "learning_rate": 1.6599167128580816e-05, "loss": 0.2142, "step": 1205 }, { "epoch": 0.6486050420168067, "grad_norm": 0.22755801677703857, "learning_rate": 1.6554966345126576e-05, "loss": 0.2585, "step": 1206 }, { "epoch": 0.6491428571428571, "grad_norm": 0.228998601436615, "learning_rate": 1.6510795340604505e-05, "loss": 0.2468, "step": 1207 }, { "epoch": 0.6496806722689076, "grad_norm": 0.22214056551456451, "learning_rate": 1.646665427077068e-05, "loss": 0.2303, "step": 1208 }, { "epoch": 0.650218487394958, "grad_norm": 0.21495211124420166, "learning_rate": 1.642254329127563e-05, "loss": 0.2505, "step": 1209 }, { "epoch": 0.6507563025210084, "grad_norm": 0.21766765415668488, "learning_rate": 1.6378462557663787e-05, "loss": 0.2355, "step": 1210 }, { "epoch": 0.6512941176470588, "grad_norm": 0.22449415922164917, "learning_rate": 1.6334412225372896e-05, "loss": 0.2419, "step": 1211 }, { "epoch": 0.6518319327731092, "grad_norm": 0.21600310504436493, "learning_rate": 1.6290392449733536e-05, "loss": 0.2239, "step": 1212 }, { "epoch": 0.6523697478991597, "grad_norm": 0.1893143355846405, "learning_rate": 1.624640338596852e-05, "loss": 0.2041, "step": 1213 }, { "epoch": 0.6529075630252101, "grad_norm": 0.19268107414245605, "learning_rate": 1.620244518919237e-05, "loss": 0.1923, "step": 1214 }, { "epoch": 0.6534453781512605, "grad_norm": 0.22330577671527863, "learning_rate": 1.6158518014410762e-05, "loss": 0.2504, "step": 1215 }, { "epoch": 0.6539831932773109, "grad_norm": 0.2710134983062744, "learning_rate": 1.6114622016519985e-05, "loss": 0.2456, "step": 1216 }, { "epoch": 0.6545210084033614, "grad_norm": 0.20601332187652588, "learning_rate": 1.6070757350306394e-05, "loss": 0.235, "step": 1217 }, { "epoch": 0.6550588235294118, "grad_norm": 0.2137424498796463, "learning_rate": 1.602692417044585e-05, "loss": 0.2388, "step": 1218 }, { "epoch": 0.6555966386554621, "grad_norm": 0.20844164490699768, "learning_rate": 1.59831226315032e-05, "loss": 0.2292, "step": 1219 }, { "epoch": 0.6561344537815126, "grad_norm": 0.23910173773765564, "learning_rate": 1.5939352887931707e-05, "loss": 0.2802, "step": 1220 }, { "epoch": 0.656672268907563, "grad_norm": 0.20287171006202698, "learning_rate": 1.589561509407253e-05, "loss": 0.221, "step": 1221 }, { "epoch": 0.6572100840336135, "grad_norm": 0.233629047870636, "learning_rate": 1.5851909404154162e-05, "loss": 0.244, "step": 1222 }, { "epoch": 0.6577478991596639, "grad_norm": 0.2525360584259033, "learning_rate": 1.5808235972291886e-05, "loss": 0.263, "step": 1223 }, { "epoch": 0.6582857142857143, "grad_norm": 0.22428973019123077, "learning_rate": 1.576459495248723e-05, "loss": 0.2546, "step": 1224 }, { "epoch": 0.6588235294117647, "grad_norm": 0.20047806203365326, "learning_rate": 1.5720986498627448e-05, "loss": 0.2176, "step": 1225 }, { "epoch": 0.6593613445378151, "grad_norm": 0.19986417889595032, "learning_rate": 1.5677410764484952e-05, "loss": 0.2255, "step": 1226 }, { "epoch": 0.6598991596638656, "grad_norm": 0.23886221647262573, "learning_rate": 1.5633867903716778e-05, "loss": 0.2599, "step": 1227 }, { "epoch": 0.6604369747899159, "grad_norm": 0.22238244116306305, "learning_rate": 1.5590358069864037e-05, "loss": 0.2195, "step": 1228 }, { "epoch": 0.6609747899159664, "grad_norm": 0.2169061005115509, "learning_rate": 1.5546881416351385e-05, "loss": 0.2368, "step": 1229 }, { "epoch": 0.6615126050420168, "grad_norm": 0.2144898921251297, "learning_rate": 1.550343809648649e-05, "loss": 0.2161, "step": 1230 }, { "epoch": 0.6620504201680673, "grad_norm": 0.21709373593330383, "learning_rate": 1.5460028263459455e-05, "loss": 0.2413, "step": 1231 }, { "epoch": 0.6625882352941177, "grad_norm": 0.2197798490524292, "learning_rate": 1.541665207034232e-05, "loss": 0.248, "step": 1232 }, { "epoch": 0.663126050420168, "grad_norm": 0.2238740622997284, "learning_rate": 1.537330967008849e-05, "loss": 0.2621, "step": 1233 }, { "epoch": 0.6636638655462185, "grad_norm": 0.24656018614768982, "learning_rate": 1.5330001215532243e-05, "loss": 0.2592, "step": 1234 }, { "epoch": 0.6642016806722689, "grad_norm": 0.2097335159778595, "learning_rate": 1.5286726859388087e-05, "loss": 0.2084, "step": 1235 }, { "epoch": 0.6647394957983194, "grad_norm": 0.24797068536281586, "learning_rate": 1.5243486754250363e-05, "loss": 0.2698, "step": 1236 }, { "epoch": 0.6652773109243697, "grad_norm": 0.2215716540813446, "learning_rate": 1.5200281052592596e-05, "loss": 0.2223, "step": 1237 }, { "epoch": 0.6658151260504201, "grad_norm": 0.23980914056301117, "learning_rate": 1.5157109906767026e-05, "loss": 0.2778, "step": 1238 }, { "epoch": 0.6663529411764706, "grad_norm": 0.21101805567741394, "learning_rate": 1.5113973469004014e-05, "loss": 0.2191, "step": 1239 }, { "epoch": 0.666890756302521, "grad_norm": 0.23726236820220947, "learning_rate": 1.507087189141155e-05, "loss": 0.2556, "step": 1240 }, { "epoch": 0.6674285714285715, "grad_norm": 0.21893130242824554, "learning_rate": 1.5027805325974694e-05, "loss": 0.2453, "step": 1241 }, { "epoch": 0.6679663865546218, "grad_norm": 0.2251564860343933, "learning_rate": 1.4984773924555046e-05, "loss": 0.2485, "step": 1242 }, { "epoch": 0.6685042016806723, "grad_norm": 0.22909890115261078, "learning_rate": 1.4941777838890215e-05, "loss": 0.2739, "step": 1243 }, { "epoch": 0.6690420168067227, "grad_norm": 0.2488081008195877, "learning_rate": 1.489881722059327e-05, "loss": 0.276, "step": 1244 }, { "epoch": 0.6695798319327732, "grad_norm": 0.21804067492485046, "learning_rate": 1.485589222115223e-05, "loss": 0.2273, "step": 1245 }, { "epoch": 0.6701176470588235, "grad_norm": 0.24628224968910217, "learning_rate": 1.4813002991929492e-05, "loss": 0.2483, "step": 1246 }, { "epoch": 0.6706554621848739, "grad_norm": 0.21196956932544708, "learning_rate": 1.4770149684161339e-05, "loss": 0.2433, "step": 1247 }, { "epoch": 0.6711932773109244, "grad_norm": 0.21633121371269226, "learning_rate": 1.4727332448957382e-05, "loss": 0.2401, "step": 1248 }, { "epoch": 0.6717310924369748, "grad_norm": 0.21294204890727997, "learning_rate": 1.4684551437300032e-05, "loss": 0.2587, "step": 1249 }, { "epoch": 0.6722689075630253, "grad_norm": 0.22414787113666534, "learning_rate": 1.4641806800043966e-05, "loss": 0.2276, "step": 1250 }, { "epoch": 0.6728067226890756, "grad_norm": 0.21335649490356445, "learning_rate": 1.4599098687915603e-05, "loss": 0.2446, "step": 1251 }, { "epoch": 0.673344537815126, "grad_norm": 0.2575361728668213, "learning_rate": 1.4556427251512562e-05, "loss": 0.2938, "step": 1252 }, { "epoch": 0.6738823529411765, "grad_norm": 0.22107447683811188, "learning_rate": 1.4513792641303133e-05, "loss": 0.2672, "step": 1253 }, { "epoch": 0.6744201680672269, "grad_norm": 0.19980645179748535, "learning_rate": 1.447119500762577e-05, "loss": 0.1933, "step": 1254 }, { "epoch": 0.6749579831932773, "grad_norm": 0.20437587797641754, "learning_rate": 1.4428634500688498e-05, "loss": 0.2151, "step": 1255 }, { "epoch": 0.6754957983193277, "grad_norm": 0.20928694307804108, "learning_rate": 1.4386111270568477e-05, "loss": 0.2633, "step": 1256 }, { "epoch": 0.6760336134453782, "grad_norm": 0.22099100053310394, "learning_rate": 1.4343625467211386e-05, "loss": 0.2482, "step": 1257 }, { "epoch": 0.6765714285714286, "grad_norm": 0.24198603630065918, "learning_rate": 1.4301177240430924e-05, "loss": 0.2731, "step": 1258 }, { "epoch": 0.677109243697479, "grad_norm": 0.23167122900485992, "learning_rate": 1.4258766739908325e-05, "loss": 0.24, "step": 1259 }, { "epoch": 0.6776470588235294, "grad_norm": 0.2559681534767151, "learning_rate": 1.421639411519175e-05, "loss": 0.2687, "step": 1260 }, { "epoch": 0.6781848739495798, "grad_norm": 0.2112700492143631, "learning_rate": 1.4174059515695842e-05, "loss": 0.2423, "step": 1261 }, { "epoch": 0.6787226890756303, "grad_norm": 0.25045645236968994, "learning_rate": 1.4131763090701116e-05, "loss": 0.2355, "step": 1262 }, { "epoch": 0.6792605042016807, "grad_norm": 0.1972346305847168, "learning_rate": 1.4089504989353524e-05, "loss": 0.1818, "step": 1263 }, { "epoch": 0.679798319327731, "grad_norm": 0.21904651820659637, "learning_rate": 1.4047285360663831e-05, "loss": 0.2437, "step": 1264 }, { "epoch": 0.6803361344537815, "grad_norm": 0.2096264362335205, "learning_rate": 1.4005104353507185e-05, "loss": 0.2404, "step": 1265 }, { "epoch": 0.6808739495798319, "grad_norm": 0.25896573066711426, "learning_rate": 1.3962962116622504e-05, "loss": 0.2925, "step": 1266 }, { "epoch": 0.6814117647058824, "grad_norm": 0.22476674616336823, "learning_rate": 1.392085879861204e-05, "loss": 0.2422, "step": 1267 }, { "epoch": 0.6819495798319328, "grad_norm": 0.22652891278266907, "learning_rate": 1.3878794547940765e-05, "loss": 0.2495, "step": 1268 }, { "epoch": 0.6824873949579832, "grad_norm": 0.20481696724891663, "learning_rate": 1.3836769512935927e-05, "loss": 0.2342, "step": 1269 }, { "epoch": 0.6830252100840336, "grad_norm": 0.23722721636295319, "learning_rate": 1.3794783841786465e-05, "loss": 0.2663, "step": 1270 }, { "epoch": 0.683563025210084, "grad_norm": 0.2346620112657547, "learning_rate": 1.375283768254252e-05, "loss": 0.2332, "step": 1271 }, { "epoch": 0.6841008403361345, "grad_norm": 0.2143581509590149, "learning_rate": 1.3710931183114919e-05, "loss": 0.2537, "step": 1272 }, { "epoch": 0.6846386554621848, "grad_norm": 0.23463256657123566, "learning_rate": 1.3669064491274623e-05, "loss": 0.2517, "step": 1273 }, { "epoch": 0.6851764705882353, "grad_norm": 0.250056654214859, "learning_rate": 1.362723775465224e-05, "loss": 0.2773, "step": 1274 }, { "epoch": 0.6857142857142857, "grad_norm": 0.33989107608795166, "learning_rate": 1.3585451120737467e-05, "loss": 0.265, "step": 1275 }, { "epoch": 0.6862521008403362, "grad_norm": 0.24132364988327026, "learning_rate": 1.3543704736878614e-05, "loss": 0.2428, "step": 1276 }, { "epoch": 0.6867899159663865, "grad_norm": 0.23876482248306274, "learning_rate": 1.3501998750282039e-05, "loss": 0.2513, "step": 1277 }, { "epoch": 0.6873277310924369, "grad_norm": 0.2151450663805008, "learning_rate": 1.3460333308011663e-05, "loss": 0.2287, "step": 1278 }, { "epoch": 0.6878655462184874, "grad_norm": 0.2007814347743988, "learning_rate": 1.3418708556988452e-05, "loss": 0.2095, "step": 1279 }, { "epoch": 0.6884033613445378, "grad_norm": 0.23487450182437897, "learning_rate": 1.3377124643989858e-05, "loss": 0.2366, "step": 1280 }, { "epoch": 0.6889411764705883, "grad_norm": 0.21814358234405518, "learning_rate": 1.3335581715649344e-05, "loss": 0.2348, "step": 1281 }, { "epoch": 0.6894789915966386, "grad_norm": 0.2367352843284607, "learning_rate": 1.3294079918455849e-05, "loss": 0.2705, "step": 1282 }, { "epoch": 0.6900168067226891, "grad_norm": 0.25812456011772156, "learning_rate": 1.3252619398753285e-05, "loss": 0.3323, "step": 1283 }, { "epoch": 0.6905546218487395, "grad_norm": 0.20507705211639404, "learning_rate": 1.3211200302739995e-05, "loss": 0.2164, "step": 1284 }, { "epoch": 0.69109243697479, "grad_norm": 0.23781126737594604, "learning_rate": 1.3169822776468268e-05, "loss": 0.2558, "step": 1285 }, { "epoch": 0.6916302521008403, "grad_norm": 0.23376110196113586, "learning_rate": 1.3128486965843817e-05, "loss": 0.237, "step": 1286 }, { "epoch": 0.6921680672268907, "grad_norm": 0.187261164188385, "learning_rate": 1.308719301662522e-05, "loss": 0.2142, "step": 1287 }, { "epoch": 0.6927058823529412, "grad_norm": 0.19427524507045746, "learning_rate": 1.304594107442349e-05, "loss": 0.2483, "step": 1288 }, { "epoch": 0.6932436974789916, "grad_norm": 0.1909186840057373, "learning_rate": 1.3004731284701471e-05, "loss": 0.2064, "step": 1289 }, { "epoch": 0.6937815126050421, "grad_norm": 0.21653415262699127, "learning_rate": 1.2963563792773409e-05, "loss": 0.2467, "step": 1290 }, { "epoch": 0.6943193277310924, "grad_norm": 0.23199012875556946, "learning_rate": 1.2922438743804363e-05, "loss": 0.2237, "step": 1291 }, { "epoch": 0.6948571428571428, "grad_norm": 0.21452416479587555, "learning_rate": 1.2881356282809764e-05, "loss": 0.2307, "step": 1292 }, { "epoch": 0.6953949579831933, "grad_norm": 0.22919467091560364, "learning_rate": 1.2840316554654835e-05, "loss": 0.2555, "step": 1293 }, { "epoch": 0.6959327731092437, "grad_norm": 0.2175435870885849, "learning_rate": 1.279931970405413e-05, "loss": 0.2322, "step": 1294 }, { "epoch": 0.6964705882352941, "grad_norm": 0.23672932386398315, "learning_rate": 1.2758365875571024e-05, "loss": 0.251, "step": 1295 }, { "epoch": 0.6970084033613445, "grad_norm": 0.23005543649196625, "learning_rate": 1.2717455213617152e-05, "loss": 0.2801, "step": 1296 }, { "epoch": 0.697546218487395, "grad_norm": 0.21484732627868652, "learning_rate": 1.2676587862451968e-05, "loss": 0.2355, "step": 1297 }, { "epoch": 0.6980840336134454, "grad_norm": 0.2306678146123886, "learning_rate": 1.263576396618218e-05, "loss": 0.2656, "step": 1298 }, { "epoch": 0.6986218487394958, "grad_norm": 0.18540635704994202, "learning_rate": 1.2594983668761286e-05, "loss": 0.2152, "step": 1299 }, { "epoch": 0.6991596638655462, "grad_norm": 0.25577017664909363, "learning_rate": 1.2554247113989021e-05, "loss": 0.2993, "step": 1300 }, { "epoch": 0.6996974789915966, "grad_norm": 0.19862250983715057, "learning_rate": 1.25135544455109e-05, "loss": 0.2318, "step": 1301 }, { "epoch": 0.7002352941176471, "grad_norm": 0.21865324676036835, "learning_rate": 1.2472905806817655e-05, "loss": 0.2311, "step": 1302 }, { "epoch": 0.7007731092436975, "grad_norm": 0.2108997255563736, "learning_rate": 1.2432301341244804e-05, "loss": 0.2391, "step": 1303 }, { "epoch": 0.7013109243697478, "grad_norm": 0.20589980483055115, "learning_rate": 1.239174119197206e-05, "loss": 0.2052, "step": 1304 }, { "epoch": 0.7018487394957983, "grad_norm": 0.269837886095047, "learning_rate": 1.2351225502022873e-05, "loss": 0.3076, "step": 1305 }, { "epoch": 0.7023865546218487, "grad_norm": 0.2143421322107315, "learning_rate": 1.231075441426395e-05, "loss": 0.2344, "step": 1306 }, { "epoch": 0.7029243697478992, "grad_norm": 0.23600098490715027, "learning_rate": 1.2270328071404686e-05, "loss": 0.2502, "step": 1307 }, { "epoch": 0.7034621848739496, "grad_norm": 0.2107437700033188, "learning_rate": 1.2229946615996729e-05, "loss": 0.2188, "step": 1308 }, { "epoch": 0.704, "grad_norm": 0.19257639348506927, "learning_rate": 1.2189610190433406e-05, "loss": 0.1995, "step": 1309 }, { "epoch": 0.7045378151260504, "grad_norm": 0.2633324861526489, "learning_rate": 1.2149318936949306e-05, "loss": 0.3131, "step": 1310 }, { "epoch": 0.7050756302521008, "grad_norm": 0.23750655353069305, "learning_rate": 1.2109072997619683e-05, "loss": 0.2542, "step": 1311 }, { "epoch": 0.7056134453781513, "grad_norm": 0.22169284522533417, "learning_rate": 1.2068872514360038e-05, "loss": 0.2243, "step": 1312 }, { "epoch": 0.7061512605042016, "grad_norm": 0.23754152655601501, "learning_rate": 1.2028717628925587e-05, "loss": 0.2653, "step": 1313 }, { "epoch": 0.7066890756302521, "grad_norm": 0.2294033020734787, "learning_rate": 1.1988608482910726e-05, "loss": 0.2332, "step": 1314 }, { "epoch": 0.7072268907563025, "grad_norm": 0.22231736779212952, "learning_rate": 1.194854521774861e-05, "loss": 0.2219, "step": 1315 }, { "epoch": 0.707764705882353, "grad_norm": 0.19734542071819305, "learning_rate": 1.1908527974710548e-05, "loss": 0.231, "step": 1316 }, { "epoch": 0.7083025210084034, "grad_norm": 0.2109014093875885, "learning_rate": 1.186855689490563e-05, "loss": 0.2113, "step": 1317 }, { "epoch": 0.7088403361344537, "grad_norm": 0.19948117434978485, "learning_rate": 1.1828632119280115e-05, "loss": 0.2091, "step": 1318 }, { "epoch": 0.7093781512605042, "grad_norm": 0.22777283191680908, "learning_rate": 1.1788753788617013e-05, "loss": 0.2215, "step": 1319 }, { "epoch": 0.7099159663865546, "grad_norm": 0.23160848021507263, "learning_rate": 1.174892204353556e-05, "loss": 0.266, "step": 1320 }, { "epoch": 0.7104537815126051, "grad_norm": 0.25431686639785767, "learning_rate": 1.1709137024490696e-05, "loss": 0.2859, "step": 1321 }, { "epoch": 0.7109915966386554, "grad_norm": 0.20985522866249084, "learning_rate": 1.166939887177263e-05, "loss": 0.2268, "step": 1322 }, { "epoch": 0.7115294117647059, "grad_norm": 0.2005484402179718, "learning_rate": 1.1629707725506275e-05, "loss": 0.245, "step": 1323 }, { "epoch": 0.7120672268907563, "grad_norm": 0.2151431441307068, "learning_rate": 1.1590063725650824e-05, "loss": 0.2312, "step": 1324 }, { "epoch": 0.7126050420168067, "grad_norm": 0.2094922661781311, "learning_rate": 1.1550467011999186e-05, "loss": 0.2314, "step": 1325 }, { "epoch": 0.7131428571428572, "grad_norm": 0.18972283601760864, "learning_rate": 1.1510917724177567e-05, "loss": 0.2054, "step": 1326 }, { "epoch": 0.7136806722689075, "grad_norm": 0.20692692697048187, "learning_rate": 1.1471416001644911e-05, "loss": 0.2008, "step": 1327 }, { "epoch": 0.714218487394958, "grad_norm": 0.21696309745311737, "learning_rate": 1.143196198369244e-05, "loss": 0.2087, "step": 1328 }, { "epoch": 0.7147563025210084, "grad_norm": 0.2015358805656433, "learning_rate": 1.1392555809443184e-05, "loss": 0.2095, "step": 1329 }, { "epoch": 0.7152941176470589, "grad_norm": 0.23600055277347565, "learning_rate": 1.1353197617851433e-05, "loss": 0.2506, "step": 1330 }, { "epoch": 0.7158319327731092, "grad_norm": 0.2174774706363678, "learning_rate": 1.131388754770231e-05, "loss": 0.2298, "step": 1331 }, { "epoch": 0.7163697478991596, "grad_norm": 0.20797263085842133, "learning_rate": 1.1274625737611219e-05, "loss": 0.2236, "step": 1332 }, { "epoch": 0.7169075630252101, "grad_norm": 0.21804486215114594, "learning_rate": 1.1235412326023434e-05, "loss": 0.2177, "step": 1333 }, { "epoch": 0.7174453781512605, "grad_norm": 0.21548961102962494, "learning_rate": 1.1196247451213512e-05, "loss": 0.2372, "step": 1334 }, { "epoch": 0.717983193277311, "grad_norm": 0.22576172649860382, "learning_rate": 1.1157131251284914e-05, "loss": 0.2532, "step": 1335 }, { "epoch": 0.7185210084033613, "grad_norm": 0.22701843082904816, "learning_rate": 1.1118063864169415e-05, "loss": 0.2819, "step": 1336 }, { "epoch": 0.7190588235294118, "grad_norm": 0.21869680285453796, "learning_rate": 1.1079045427626703e-05, "loss": 0.2418, "step": 1337 }, { "epoch": 0.7195966386554622, "grad_norm": 0.24774786829948425, "learning_rate": 1.1040076079243824e-05, "loss": 0.2486, "step": 1338 }, { "epoch": 0.7201344537815126, "grad_norm": 0.20886799693107605, "learning_rate": 1.100115595643477e-05, "loss": 0.2218, "step": 1339 }, { "epoch": 0.720672268907563, "grad_norm": 0.2487592250108719, "learning_rate": 1.0962285196439911e-05, "loss": 0.2592, "step": 1340 }, { "epoch": 0.7212100840336134, "grad_norm": 0.21543507277965546, "learning_rate": 1.0923463936325568e-05, "loss": 0.2212, "step": 1341 }, { "epoch": 0.7217478991596639, "grad_norm": 0.23592127859592438, "learning_rate": 1.0884692312983535e-05, "loss": 0.2527, "step": 1342 }, { "epoch": 0.7222857142857143, "grad_norm": 0.2409650981426239, "learning_rate": 1.0845970463130541e-05, "loss": 0.2777, "step": 1343 }, { "epoch": 0.7228235294117648, "grad_norm": 0.23602500557899475, "learning_rate": 1.0807298523307846e-05, "loss": 0.2587, "step": 1344 }, { "epoch": 0.7233613445378151, "grad_norm": 0.21660150587558746, "learning_rate": 1.0768676629880673e-05, "loss": 0.2293, "step": 1345 }, { "epoch": 0.7238991596638655, "grad_norm": 0.24998168647289276, "learning_rate": 1.0730104919037799e-05, "loss": 0.2553, "step": 1346 }, { "epoch": 0.724436974789916, "grad_norm": 0.2412223368883133, "learning_rate": 1.0691583526791052e-05, "loss": 0.2424, "step": 1347 }, { "epoch": 0.7249747899159664, "grad_norm": 0.22280366718769073, "learning_rate": 1.0653112588974793e-05, "loss": 0.2329, "step": 1348 }, { "epoch": 0.7255126050420168, "grad_norm": 0.2040831297636032, "learning_rate": 1.0614692241245509e-05, "loss": 0.2111, "step": 1349 }, { "epoch": 0.7260504201680672, "grad_norm": 0.19195757806301117, "learning_rate": 1.057632261908126e-05, "loss": 0.2072, "step": 1350 }, { "epoch": 0.7265882352941176, "grad_norm": 0.23605941236019135, "learning_rate": 1.0538003857781287e-05, "loss": 0.2501, "step": 1351 }, { "epoch": 0.7271260504201681, "grad_norm": 0.23891092836856842, "learning_rate": 1.0499736092465412e-05, "loss": 0.2388, "step": 1352 }, { "epoch": 0.7276638655462185, "grad_norm": 0.20679929852485657, "learning_rate": 1.0461519458073695e-05, "loss": 0.2194, "step": 1353 }, { "epoch": 0.7282016806722689, "grad_norm": 0.23450228571891785, "learning_rate": 1.0423354089365891e-05, "loss": 0.2611, "step": 1354 }, { "epoch": 0.7287394957983193, "grad_norm": 0.2095617949962616, "learning_rate": 1.038524012092095e-05, "loss": 0.2293, "step": 1355 }, { "epoch": 0.7292773109243698, "grad_norm": 0.20834556221961975, "learning_rate": 1.0347177687136608e-05, "loss": 0.2186, "step": 1356 }, { "epoch": 0.7298151260504202, "grad_norm": 0.21957340836524963, "learning_rate": 1.0309166922228853e-05, "loss": 0.2409, "step": 1357 }, { "epoch": 0.7303529411764705, "grad_norm": 0.20854447782039642, "learning_rate": 1.02712079602315e-05, "loss": 0.2421, "step": 1358 }, { "epoch": 0.730890756302521, "grad_norm": 0.23083819448947906, "learning_rate": 1.0233300934995671e-05, "loss": 0.2371, "step": 1359 }, { "epoch": 0.7314285714285714, "grad_norm": 0.23289982974529266, "learning_rate": 1.0195445980189383e-05, "loss": 0.3107, "step": 1360 }, { "epoch": 0.7319663865546219, "grad_norm": 0.22691717743873596, "learning_rate": 1.0157643229297004e-05, "loss": 0.2481, "step": 1361 }, { "epoch": 0.7325042016806723, "grad_norm": 0.20459218323230743, "learning_rate": 1.0119892815618853e-05, "loss": 0.2304, "step": 1362 }, { "epoch": 0.7330420168067227, "grad_norm": 0.2276337593793869, "learning_rate": 1.0082194872270675e-05, "loss": 0.227, "step": 1363 }, { "epoch": 0.7335798319327731, "grad_norm": 0.20450882613658905, "learning_rate": 1.0044549532183192e-05, "loss": 0.2362, "step": 1364 }, { "epoch": 0.7341176470588235, "grad_norm": 0.19269534945487976, "learning_rate": 1.0006956928101663e-05, "loss": 0.2004, "step": 1365 }, { "epoch": 0.734655462184874, "grad_norm": 0.2340576946735382, "learning_rate": 9.969417192585351e-06, "loss": 0.2444, "step": 1366 }, { "epoch": 0.7351932773109243, "grad_norm": 0.21006999909877777, "learning_rate": 9.931930458007136e-06, "loss": 0.2224, "step": 1367 }, { "epoch": 0.7357310924369748, "grad_norm": 0.20837579667568207, "learning_rate": 9.894496856552963e-06, "loss": 0.2046, "step": 1368 }, { "epoch": 0.7362689075630252, "grad_norm": 0.2253601849079132, "learning_rate": 9.857116520221457e-06, "loss": 0.2238, "step": 1369 }, { "epoch": 0.7368067226890757, "grad_norm": 0.25170648097991943, "learning_rate": 9.819789580823382e-06, "loss": 0.2374, "step": 1370 }, { "epoch": 0.7373445378151261, "grad_norm": 0.2324812412261963, "learning_rate": 9.782516169981254e-06, "loss": 0.2215, "step": 1371 }, { "epoch": 0.7378823529411764, "grad_norm": 0.22452184557914734, "learning_rate": 9.745296419128794e-06, "loss": 0.2315, "step": 1372 }, { "epoch": 0.7384201680672269, "grad_norm": 0.2293904572725296, "learning_rate": 9.708130459510537e-06, "loss": 0.2547, "step": 1373 }, { "epoch": 0.7389579831932773, "grad_norm": 0.20795464515686035, "learning_rate": 9.67101842218135e-06, "loss": 0.207, "step": 1374 }, { "epoch": 0.7394957983193278, "grad_norm": 0.25670695304870605, "learning_rate": 9.633960438005899e-06, "loss": 0.2682, "step": 1375 }, { "epoch": 0.7400336134453781, "grad_norm": 0.22279313206672668, "learning_rate": 9.596956637658313e-06, "loss": 0.243, "step": 1376 }, { "epoch": 0.7405714285714285, "grad_norm": 0.22875022888183594, "learning_rate": 9.560007151621619e-06, "loss": 0.2321, "step": 1377 }, { "epoch": 0.741109243697479, "grad_norm": 0.2712250053882599, "learning_rate": 9.523112110187346e-06, "loss": 0.2661, "step": 1378 }, { "epoch": 0.7416470588235294, "grad_norm": 0.2451431155204773, "learning_rate": 9.486271643455014e-06, "loss": 0.26, "step": 1379 }, { "epoch": 0.7421848739495799, "grad_norm": 0.19108882546424866, "learning_rate": 9.449485881331719e-06, "loss": 0.2119, "step": 1380 }, { "epoch": 0.7427226890756302, "grad_norm": 0.205430805683136, "learning_rate": 9.412754953531663e-06, "loss": 0.2066, "step": 1381 }, { "epoch": 0.7432605042016807, "grad_norm": 0.22810019552707672, "learning_rate": 9.376078989575665e-06, "loss": 0.2483, "step": 1382 }, { "epoch": 0.7437983193277311, "grad_norm": 0.23313850164413452, "learning_rate": 9.339458118790761e-06, "loss": 0.2608, "step": 1383 }, { "epoch": 0.7443361344537816, "grad_norm": 0.20907914638519287, "learning_rate": 9.302892470309684e-06, "loss": 0.2402, "step": 1384 }, { "epoch": 0.7448739495798319, "grad_norm": 0.21873757243156433, "learning_rate": 9.266382173070479e-06, "loss": 0.2255, "step": 1385 }, { "epoch": 0.7454117647058823, "grad_norm": 0.2230088710784912, "learning_rate": 9.229927355815985e-06, "loss": 0.2524, "step": 1386 }, { "epoch": 0.7459495798319328, "grad_norm": 0.18229977786540985, "learning_rate": 9.193528147093396e-06, "loss": 0.1895, "step": 1387 }, { "epoch": 0.7464873949579832, "grad_norm": 0.2057199627161026, "learning_rate": 9.157184675253867e-06, "loss": 0.2297, "step": 1388 }, { "epoch": 0.7470252100840337, "grad_norm": 0.22640296816825867, "learning_rate": 9.120897068451956e-06, "loss": 0.2505, "step": 1389 }, { "epoch": 0.747563025210084, "grad_norm": 0.21385444700717926, "learning_rate": 9.084665454645275e-06, "loss": 0.2281, "step": 1390 }, { "epoch": 0.7481008403361344, "grad_norm": 0.24358543753623962, "learning_rate": 9.048489961593958e-06, "loss": 0.2685, "step": 1391 }, { "epoch": 0.7486386554621849, "grad_norm": 0.2382071614265442, "learning_rate": 9.012370716860285e-06, "loss": 0.2422, "step": 1392 }, { "epoch": 0.7491764705882353, "grad_norm": 0.20517347753047943, "learning_rate": 8.976307847808143e-06, "loss": 0.2349, "step": 1393 }, { "epoch": 0.7497142857142857, "grad_norm": 0.25024041533470154, "learning_rate": 8.940301481602679e-06, "loss": 0.2422, "step": 1394 }, { "epoch": 0.7502521008403361, "grad_norm": 0.219709575176239, "learning_rate": 8.90435174520975e-06, "loss": 0.2442, "step": 1395 }, { "epoch": 0.7507899159663866, "grad_norm": 0.2377159744501114, "learning_rate": 8.868458765395568e-06, "loss": 0.2421, "step": 1396 }, { "epoch": 0.751327731092437, "grad_norm": 0.2325834333896637, "learning_rate": 8.832622668726184e-06, "loss": 0.2803, "step": 1397 }, { "epoch": 0.7518655462184874, "grad_norm": 0.2121846228837967, "learning_rate": 8.796843581567063e-06, "loss": 0.2523, "step": 1398 }, { "epoch": 0.7524033613445378, "grad_norm": 0.20979849994182587, "learning_rate": 8.761121630082668e-06, "loss": 0.2302, "step": 1399 }, { "epoch": 0.7529411764705882, "grad_norm": 0.2370312511920929, "learning_rate": 8.725456940235963e-06, "loss": 0.2566, "step": 1400 }, { "epoch": 0.7534789915966387, "grad_norm": 0.22219379246234894, "learning_rate": 8.68984963778802e-06, "loss": 0.2263, "step": 1401 }, { "epoch": 0.7540168067226891, "grad_norm": 0.22913403809070587, "learning_rate": 8.65429984829752e-06, "loss": 0.2166, "step": 1402 }, { "epoch": 0.7545546218487394, "grad_norm": 0.2273648977279663, "learning_rate": 8.618807697120381e-06, "loss": 0.2395, "step": 1403 }, { "epoch": 0.7550924369747899, "grad_norm": 0.23049034178256989, "learning_rate": 8.58337330940924e-06, "loss": 0.2136, "step": 1404 }, { "epoch": 0.7556302521008403, "grad_norm": 0.22248324751853943, "learning_rate": 8.547996810113076e-06, "loss": 0.2321, "step": 1405 }, { "epoch": 0.7561680672268908, "grad_norm": 0.26518169045448303, "learning_rate": 8.512678323976716e-06, "loss": 0.3069, "step": 1406 }, { "epoch": 0.7567058823529412, "grad_norm": 0.24161212146282196, "learning_rate": 8.477417975540438e-06, "loss": 0.2873, "step": 1407 }, { "epoch": 0.7572436974789916, "grad_norm": 0.22573645412921906, "learning_rate": 8.442215889139524e-06, "loss": 0.2206, "step": 1408 }, { "epoch": 0.757781512605042, "grad_norm": 0.21155428886413574, "learning_rate": 8.407072188903783e-06, "loss": 0.2195, "step": 1409 }, { "epoch": 0.7583193277310925, "grad_norm": 0.20944969356060028, "learning_rate": 8.371986998757164e-06, "loss": 0.2283, "step": 1410 }, { "epoch": 0.7588571428571429, "grad_norm": 0.19222448766231537, "learning_rate": 8.33696044241728e-06, "loss": 0.2075, "step": 1411 }, { "epoch": 0.7593949579831932, "grad_norm": 0.26435086131095886, "learning_rate": 8.301992643395015e-06, "loss": 0.2693, "step": 1412 }, { "epoch": 0.7599327731092437, "grad_norm": 0.220671609044075, "learning_rate": 8.267083724994032e-06, "loss": 0.211, "step": 1413 }, { "epoch": 0.7604705882352941, "grad_norm": 0.2246689647436142, "learning_rate": 8.232233810310394e-06, "loss": 0.2522, "step": 1414 }, { "epoch": 0.7610084033613446, "grad_norm": 0.20782461762428284, "learning_rate": 8.197443022232104e-06, "loss": 0.2114, "step": 1415 }, { "epoch": 0.761546218487395, "grad_norm": 0.2566245496273041, "learning_rate": 8.162711483438645e-06, "loss": 0.2525, "step": 1416 }, { "epoch": 0.7620840336134453, "grad_norm": 0.24346865713596344, "learning_rate": 8.12803931640061e-06, "loss": 0.2287, "step": 1417 }, { "epoch": 0.7626218487394958, "grad_norm": 0.21621957421302795, "learning_rate": 8.093426643379199e-06, "loss": 0.2279, "step": 1418 }, { "epoch": 0.7631596638655462, "grad_norm": 0.2196330726146698, "learning_rate": 8.058873586425864e-06, "loss": 0.2287, "step": 1419 }, { "epoch": 0.7636974789915967, "grad_norm": 0.24028843641281128, "learning_rate": 8.024380267381799e-06, "loss": 0.2782, "step": 1420 }, { "epoch": 0.764235294117647, "grad_norm": 0.21746890246868134, "learning_rate": 7.989946807877586e-06, "loss": 0.2321, "step": 1421 }, { "epoch": 0.7647731092436975, "grad_norm": 0.22737929224967957, "learning_rate": 7.9555733293327e-06, "loss": 0.2565, "step": 1422 }, { "epoch": 0.7653109243697479, "grad_norm": 0.21071253716945648, "learning_rate": 7.921259952955118e-06, "loss": 0.2098, "step": 1423 }, { "epoch": 0.7658487394957983, "grad_norm": 0.22043201327323914, "learning_rate": 7.887006799740906e-06, "loss": 0.2533, "step": 1424 }, { "epoch": 0.7663865546218488, "grad_norm": 0.21582907438278198, "learning_rate": 7.852813990473734e-06, "loss": 0.2536, "step": 1425 }, { "epoch": 0.7669243697478991, "grad_norm": 0.2434009313583374, "learning_rate": 7.818681645724529e-06, "loss": 0.2607, "step": 1426 }, { "epoch": 0.7674621848739496, "grad_norm": 0.2228667140007019, "learning_rate": 7.784609885850962e-06, "loss": 0.2489, "step": 1427 }, { "epoch": 0.768, "grad_norm": 0.20169229805469513, "learning_rate": 7.750598830997114e-06, "loss": 0.2224, "step": 1428 }, { "epoch": 0.7685378151260505, "grad_norm": 0.21318617463111877, "learning_rate": 7.716648601092968e-06, "loss": 0.2293, "step": 1429 }, { "epoch": 0.7690756302521008, "grad_norm": 0.22522027790546417, "learning_rate": 7.68275931585406e-06, "loss": 0.2332, "step": 1430 }, { "epoch": 0.7696134453781512, "grad_norm": 0.2417115569114685, "learning_rate": 7.64893109478099e-06, "loss": 0.256, "step": 1431 }, { "epoch": 0.7701512605042017, "grad_norm": 0.20186994969844818, "learning_rate": 7.615164057159063e-06, "loss": 0.2149, "step": 1432 }, { "epoch": 0.7706890756302521, "grad_norm": 0.25581175088882446, "learning_rate": 7.581458322057825e-06, "loss": 0.292, "step": 1433 }, { "epoch": 0.7712268907563026, "grad_norm": 0.21164478361606598, "learning_rate": 7.547814008330639e-06, "loss": 0.2411, "step": 1434 }, { "epoch": 0.7717647058823529, "grad_norm": 0.2501096725463867, "learning_rate": 7.514231234614322e-06, "loss": 0.2599, "step": 1435 }, { "epoch": 0.7723025210084034, "grad_norm": 0.21771636605262756, "learning_rate": 7.4807101193286515e-06, "loss": 0.2225, "step": 1436 }, { "epoch": 0.7728403361344538, "grad_norm": 0.22658269107341766, "learning_rate": 7.447250780676016e-06, "loss": 0.2127, "step": 1437 }, { "epoch": 0.7733781512605042, "grad_norm": 0.23322594165802002, "learning_rate": 7.413853336640933e-06, "loss": 0.2684, "step": 1438 }, { "epoch": 0.7739159663865546, "grad_norm": 0.223537415266037, "learning_rate": 7.3805179049896975e-06, "loss": 0.2335, "step": 1439 }, { "epoch": 0.774453781512605, "grad_norm": 0.21184484660625458, "learning_rate": 7.347244603269929e-06, "loss": 0.2187, "step": 1440 }, { "epoch": 0.7749915966386555, "grad_norm": 0.2276788353919983, "learning_rate": 7.314033548810143e-06, "loss": 0.2526, "step": 1441 }, { "epoch": 0.7755294117647059, "grad_norm": 0.24016128480434418, "learning_rate": 7.280884858719389e-06, "loss": 0.2458, "step": 1442 }, { "epoch": 0.7760672268907564, "grad_norm": 0.18796221911907196, "learning_rate": 7.247798649886781e-06, "loss": 0.1867, "step": 1443 }, { "epoch": 0.7766050420168067, "grad_norm": 0.20421604812145233, "learning_rate": 7.214775038981139e-06, "loss": 0.1903, "step": 1444 }, { "epoch": 0.7771428571428571, "grad_norm": 0.23377616703510284, "learning_rate": 7.181814142450505e-06, "loss": 0.2401, "step": 1445 }, { "epoch": 0.7776806722689076, "grad_norm": 0.19119246304035187, "learning_rate": 7.148916076521828e-06, "loss": 0.2068, "step": 1446 }, { "epoch": 0.778218487394958, "grad_norm": 0.21441155672073364, "learning_rate": 7.116080957200464e-06, "loss": 0.2182, "step": 1447 }, { "epoch": 0.7787563025210084, "grad_norm": 0.22746725380420685, "learning_rate": 7.0833089002698335e-06, "loss": 0.2462, "step": 1448 }, { "epoch": 0.7792941176470588, "grad_norm": 0.21350964903831482, "learning_rate": 7.0506000212909785e-06, "loss": 0.2077, "step": 1449 }, { "epoch": 0.7798319327731092, "grad_norm": 0.23166045546531677, "learning_rate": 7.017954435602145e-06, "loss": 0.2394, "step": 1450 }, { "epoch": 0.7803697478991597, "grad_norm": 0.2207561731338501, "learning_rate": 6.985372258318426e-06, "loss": 0.2208, "step": 1451 }, { "epoch": 0.7809075630252101, "grad_norm": 0.21541151404380798, "learning_rate": 6.952853604331286e-06, "loss": 0.2541, "step": 1452 }, { "epoch": 0.7814453781512605, "grad_norm": 0.18041007220745087, "learning_rate": 6.920398588308233e-06, "loss": 0.1852, "step": 1453 }, { "epoch": 0.7819831932773109, "grad_norm": 0.2223946750164032, "learning_rate": 6.888007324692333e-06, "loss": 0.221, "step": 1454 }, { "epoch": 0.7825210084033614, "grad_norm": 0.2420312762260437, "learning_rate": 6.855679927701885e-06, "loss": 0.2394, "step": 1455 }, { "epoch": 0.7830588235294118, "grad_norm": 0.264152467250824, "learning_rate": 6.823416511329961e-06, "loss": 0.2752, "step": 1456 }, { "epoch": 0.7835966386554621, "grad_norm": 0.20937985181808472, "learning_rate": 6.791217189344018e-06, "loss": 0.1898, "step": 1457 }, { "epoch": 0.7841344537815126, "grad_norm": 0.19492729008197784, "learning_rate": 6.759082075285528e-06, "loss": 0.2004, "step": 1458 }, { "epoch": 0.784672268907563, "grad_norm": 0.21095407009124756, "learning_rate": 6.727011282469522e-06, "loss": 0.235, "step": 1459 }, { "epoch": 0.7852100840336135, "grad_norm": 0.21176856756210327, "learning_rate": 6.695004923984252e-06, "loss": 0.2294, "step": 1460 }, { "epoch": 0.7857478991596639, "grad_norm": 0.23757463693618774, "learning_rate": 6.663063112690735e-06, "loss": 0.243, "step": 1461 }, { "epoch": 0.7862857142857143, "grad_norm": 0.21689768135547638, "learning_rate": 6.6311859612224e-06, "loss": 0.2273, "step": 1462 }, { "epoch": 0.7868235294117647, "grad_norm": 0.23363135755062103, "learning_rate": 6.5993735819846526e-06, "loss": 0.2444, "step": 1463 }, { "epoch": 0.7873613445378151, "grad_norm": 0.19851571321487427, "learning_rate": 6.567626087154524e-06, "loss": 0.2102, "step": 1464 }, { "epoch": 0.7878991596638656, "grad_norm": 0.19374319911003113, "learning_rate": 6.535943588680221e-06, "loss": 0.2159, "step": 1465 }, { "epoch": 0.7884369747899159, "grad_norm": 0.23248833417892456, "learning_rate": 6.504326198280783e-06, "loss": 0.2388, "step": 1466 }, { "epoch": 0.7889747899159664, "grad_norm": 0.22389160096645355, "learning_rate": 6.4727740274456605e-06, "loss": 0.2474, "step": 1467 }, { "epoch": 0.7895126050420168, "grad_norm": 0.24470821022987366, "learning_rate": 6.441287187434317e-06, "loss": 0.2511, "step": 1468 }, { "epoch": 0.7900504201680673, "grad_norm": 0.19853001832962036, "learning_rate": 6.409865789275851e-06, "loss": 0.2157, "step": 1469 }, { "epoch": 0.7905882352941176, "grad_norm": 0.2385863959789276, "learning_rate": 6.37850994376859e-06, "loss": 0.2621, "step": 1470 }, { "epoch": 0.791126050420168, "grad_norm": 0.21084314584732056, "learning_rate": 6.347219761479733e-06, "loss": 0.2499, "step": 1471 }, { "epoch": 0.7916638655462185, "grad_norm": 0.2241710126399994, "learning_rate": 6.315995352744911e-06, "loss": 0.1933, "step": 1472 }, { "epoch": 0.7922016806722689, "grad_norm": 0.23332887887954712, "learning_rate": 6.2848368276678425e-06, "loss": 0.2489, "step": 1473 }, { "epoch": 0.7927394957983194, "grad_norm": 0.23100899159908295, "learning_rate": 6.2537442961199235e-06, "loss": 0.233, "step": 1474 }, { "epoch": 0.7932773109243697, "grad_norm": 0.22058477997779846, "learning_rate": 6.22271786773983e-06, "loss": 0.2212, "step": 1475 }, { "epoch": 0.7938151260504202, "grad_norm": 0.2501188814640045, "learning_rate": 6.191757651933164e-06, "loss": 0.2597, "step": 1476 }, { "epoch": 0.7943529411764706, "grad_norm": 0.20972594618797302, "learning_rate": 6.160863757872027e-06, "loss": 0.2106, "step": 1477 }, { "epoch": 0.794890756302521, "grad_norm": 0.27039819955825806, "learning_rate": 6.130036294494679e-06, "loss": 0.2533, "step": 1478 }, { "epoch": 0.7954285714285714, "grad_norm": 0.21589580178260803, "learning_rate": 6.099275370505114e-06, "loss": 0.2419, "step": 1479 }, { "epoch": 0.7959663865546218, "grad_norm": 0.23502160608768463, "learning_rate": 6.068581094372702e-06, "loss": 0.2457, "step": 1480 }, { "epoch": 0.7965042016806723, "grad_norm": 0.21391834318637848, "learning_rate": 6.0379535743317896e-06, "loss": 0.2398, "step": 1481 }, { "epoch": 0.7970420168067227, "grad_norm": 0.22789664566516876, "learning_rate": 6.007392918381344e-06, "loss": 0.2413, "step": 1482 }, { "epoch": 0.7975798319327732, "grad_norm": 0.2258307784795761, "learning_rate": 5.976899234284553e-06, "loss": 0.2376, "step": 1483 }, { "epoch": 0.7981176470588235, "grad_norm": 0.22115400433540344, "learning_rate": 5.9464726295684255e-06, "loss": 0.2389, "step": 1484 }, { "epoch": 0.7986554621848739, "grad_norm": 0.24049271643161774, "learning_rate": 5.916113211523472e-06, "loss": 0.2731, "step": 1485 }, { "epoch": 0.7991932773109244, "grad_norm": 0.24062862992286682, "learning_rate": 5.88582108720325e-06, "loss": 0.2619, "step": 1486 }, { "epoch": 0.7997310924369748, "grad_norm": 0.22618700563907623, "learning_rate": 5.855596363424057e-06, "loss": 0.2384, "step": 1487 }, { "epoch": 0.8002689075630252, "grad_norm": 0.23733404278755188, "learning_rate": 5.825439146764497e-06, "loss": 0.2864, "step": 1488 }, { "epoch": 0.8008067226890756, "grad_norm": 0.23445641994476318, "learning_rate": 5.7953495435651565e-06, "loss": 0.2563, "step": 1489 }, { "epoch": 0.801344537815126, "grad_norm": 0.20475512742996216, "learning_rate": 5.765327659928174e-06, "loss": 0.208, "step": 1490 }, { "epoch": 0.8018823529411765, "grad_norm": 0.1762407422065735, "learning_rate": 5.73537360171692e-06, "loss": 0.209, "step": 1491 }, { "epoch": 0.8024201680672269, "grad_norm": 0.20160692930221558, "learning_rate": 5.705487474555585e-06, "loss": 0.2077, "step": 1492 }, { "epoch": 0.8029579831932773, "grad_norm": 0.21895018219947815, "learning_rate": 5.675669383828816e-06, "loss": 0.22, "step": 1493 }, { "epoch": 0.8034957983193277, "grad_norm": 0.23681506514549255, "learning_rate": 5.645919434681368e-06, "loss": 0.25, "step": 1494 }, { "epoch": 0.8040336134453782, "grad_norm": 0.22392837703227997, "learning_rate": 5.616237732017693e-06, "loss": 0.249, "step": 1495 }, { "epoch": 0.8045714285714286, "grad_norm": 0.24463045597076416, "learning_rate": 5.586624380501615e-06, "loss": 0.288, "step": 1496 }, { "epoch": 0.8051092436974789, "grad_norm": 0.25662362575531006, "learning_rate": 5.557079484555913e-06, "loss": 0.3018, "step": 1497 }, { "epoch": 0.8056470588235294, "grad_norm": 0.21797876060009003, "learning_rate": 5.527603148361998e-06, "loss": 0.2435, "step": 1498 }, { "epoch": 0.8061848739495798, "grad_norm": 0.22247564792633057, "learning_rate": 5.4981954758595105e-06, "loss": 0.2111, "step": 1499 }, { "epoch": 0.8067226890756303, "grad_norm": 0.21992816030979156, "learning_rate": 5.468856570745979e-06, "loss": 0.2497, "step": 1500 }, { "epoch": 0.8072605042016807, "grad_norm": 0.233550027012825, "learning_rate": 5.439586536476443e-06, "loss": 0.2366, "step": 1501 }, { "epoch": 0.807798319327731, "grad_norm": 0.19975094497203827, "learning_rate": 5.410385476263078e-06, "loss": 0.2189, "step": 1502 }, { "epoch": 0.8083361344537815, "grad_norm": 0.23960763216018677, "learning_rate": 5.381253493074867e-06, "loss": 0.2656, "step": 1503 }, { "epoch": 0.8088739495798319, "grad_norm": 0.2354157418012619, "learning_rate": 5.3521906896371735e-06, "loss": 0.2376, "step": 1504 }, { "epoch": 0.8094117647058824, "grad_norm": 0.22402840852737427, "learning_rate": 5.3231971684314625e-06, "loss": 0.2452, "step": 1505 }, { "epoch": 0.8099495798319327, "grad_norm": 0.2157168835401535, "learning_rate": 5.2942730316948605e-06, "loss": 0.2396, "step": 1506 }, { "epoch": 0.8104873949579832, "grad_norm": 0.21612894535064697, "learning_rate": 5.265418381419854e-06, "loss": 0.2387, "step": 1507 }, { "epoch": 0.8110252100840336, "grad_norm": 0.22226586937904358, "learning_rate": 5.236633319353898e-06, "loss": 0.217, "step": 1508 }, { "epoch": 0.811563025210084, "grad_norm": 0.24272467195987701, "learning_rate": 5.207917946999058e-06, "loss": 0.2808, "step": 1509 }, { "epoch": 0.8121008403361345, "grad_norm": 0.22184070944786072, "learning_rate": 5.179272365611676e-06, "loss": 0.2492, "step": 1510 }, { "epoch": 0.8126386554621848, "grad_norm": 0.22865667939186096, "learning_rate": 5.1506966762019715e-06, "loss": 0.2321, "step": 1511 }, { "epoch": 0.8131764705882353, "grad_norm": 0.19640418887138367, "learning_rate": 5.122190979533736e-06, "loss": 0.1923, "step": 1512 }, { "epoch": 0.8137142857142857, "grad_norm": 0.2846081852912903, "learning_rate": 5.093755376123935e-06, "loss": 0.2699, "step": 1513 }, { "epoch": 0.8142521008403362, "grad_norm": 0.21900415420532227, "learning_rate": 5.0653899662423895e-06, "loss": 0.2418, "step": 1514 }, { "epoch": 0.8147899159663865, "grad_norm": 0.23036575317382812, "learning_rate": 5.037094849911367e-06, "loss": 0.2449, "step": 1515 }, { "epoch": 0.815327731092437, "grad_norm": 0.21140176057815552, "learning_rate": 5.0088701269053044e-06, "loss": 0.2532, "step": 1516 }, { "epoch": 0.8158655462184874, "grad_norm": 0.23016634583473206, "learning_rate": 4.980715896750407e-06, "loss": 0.2621, "step": 1517 }, { "epoch": 0.8164033613445378, "grad_norm": 0.22205528616905212, "learning_rate": 4.952632258724288e-06, "loss": 0.2376, "step": 1518 }, { "epoch": 0.8169411764705883, "grad_norm": 0.26285234093666077, "learning_rate": 4.924619311855669e-06, "loss": 0.2838, "step": 1519 }, { "epoch": 0.8174789915966386, "grad_norm": 0.23266997933387756, "learning_rate": 4.896677154923967e-06, "loss": 0.2349, "step": 1520 }, { "epoch": 0.8180168067226891, "grad_norm": 0.21935082972049713, "learning_rate": 4.868805886459013e-06, "loss": 0.2149, "step": 1521 }, { "epoch": 0.8185546218487395, "grad_norm": 0.2622581124305725, "learning_rate": 4.8410056047406425e-06, "loss": 0.2778, "step": 1522 }, { "epoch": 0.81909243697479, "grad_norm": 0.23702587187290192, "learning_rate": 4.813276407798395e-06, "loss": 0.2642, "step": 1523 }, { "epoch": 0.8196302521008403, "grad_norm": 0.207424595952034, "learning_rate": 4.785618393411134e-06, "loss": 0.2071, "step": 1524 }, { "epoch": 0.8201680672268907, "grad_norm": 0.21759329736232758, "learning_rate": 4.758031659106743e-06, "loss": 0.2528, "step": 1525 }, { "epoch": 0.8207058823529412, "grad_norm": 0.24914182722568512, "learning_rate": 4.730516302161733e-06, "loss": 0.2958, "step": 1526 }, { "epoch": 0.8212436974789916, "grad_norm": 0.27313870191574097, "learning_rate": 4.703072419600927e-06, "loss": 0.2958, "step": 1527 }, { "epoch": 0.8217815126050421, "grad_norm": 0.19802267849445343, "learning_rate": 4.675700108197137e-06, "loss": 0.256, "step": 1528 }, { "epoch": 0.8223193277310924, "grad_norm": 0.21961702406406403, "learning_rate": 4.648399464470768e-06, "loss": 0.2368, "step": 1529 }, { "epoch": 0.8228571428571428, "grad_norm": 0.21870605647563934, "learning_rate": 4.621170584689538e-06, "loss": 0.2276, "step": 1530 }, { "epoch": 0.8233949579831933, "grad_norm": 0.2372722178697586, "learning_rate": 4.594013564868091e-06, "loss": 0.2772, "step": 1531 }, { "epoch": 0.8239327731092437, "grad_norm": 0.22747667133808136, "learning_rate": 4.566928500767692e-06, "loss": 0.2441, "step": 1532 }, { "epoch": 0.8244705882352941, "grad_norm": 0.2576858699321747, "learning_rate": 4.53991548789586e-06, "loss": 0.3154, "step": 1533 }, { "epoch": 0.8250084033613445, "grad_norm": 0.21705657243728638, "learning_rate": 4.512974621506061e-06, "loss": 0.225, "step": 1534 }, { "epoch": 0.825546218487395, "grad_norm": 0.2172779142856598, "learning_rate": 4.486105996597359e-06, "loss": 0.224, "step": 1535 }, { "epoch": 0.8260840336134454, "grad_norm": 0.22584858536720276, "learning_rate": 4.459309707914058e-06, "loss": 0.2588, "step": 1536 }, { "epoch": 0.8266218487394958, "grad_norm": 0.22656598687171936, "learning_rate": 4.432585849945417e-06, "loss": 0.2315, "step": 1537 }, { "epoch": 0.8271596638655462, "grad_norm": 0.20971766114234924, "learning_rate": 4.40593451692527e-06, "loss": 0.23, "step": 1538 }, { "epoch": 0.8276974789915966, "grad_norm": 0.24355734884738922, "learning_rate": 4.379355802831722e-06, "loss": 0.2617, "step": 1539 }, { "epoch": 0.8282352941176471, "grad_norm": 0.2629246115684509, "learning_rate": 4.3528498013867965e-06, "loss": 0.2812, "step": 1540 }, { "epoch": 0.8287731092436975, "grad_norm": 0.21974530816078186, "learning_rate": 4.3264166060561355e-06, "loss": 0.2239, "step": 1541 }, { "epoch": 0.8293109243697478, "grad_norm": 0.22194775938987732, "learning_rate": 4.3000563100486475e-06, "loss": 0.2187, "step": 1542 }, { "epoch": 0.8298487394957983, "grad_norm": 0.2451111376285553, "learning_rate": 4.273769006316167e-06, "loss": 0.2648, "step": 1543 }, { "epoch": 0.8303865546218487, "grad_norm": 0.21923545002937317, "learning_rate": 4.247554787553165e-06, "loss": 0.228, "step": 1544 }, { "epoch": 0.8309243697478992, "grad_norm": 0.19128277897834778, "learning_rate": 4.221413746196379e-06, "loss": 0.2127, "step": 1545 }, { "epoch": 0.8314621848739496, "grad_norm": 0.2334584891796112, "learning_rate": 4.195345974424527e-06, "loss": 0.244, "step": 1546 }, { "epoch": 0.832, "grad_norm": 0.23098425567150116, "learning_rate": 4.169351564157945e-06, "loss": 0.2351, "step": 1547 }, { "epoch": 0.8325378151260504, "grad_norm": 0.2086673080921173, "learning_rate": 4.143430607058299e-06, "loss": 0.2249, "step": 1548 }, { "epoch": 0.8330756302521009, "grad_norm": 0.22773250937461853, "learning_rate": 4.117583194528233e-06, "loss": 0.2373, "step": 1549 }, { "epoch": 0.8336134453781513, "grad_norm": 0.22602087259292603, "learning_rate": 4.091809417711051e-06, "loss": 0.1947, "step": 1550 }, { "epoch": 0.8341512605042016, "grad_norm": 0.20870481431484222, "learning_rate": 4.066109367490426e-06, "loss": 0.2194, "step": 1551 }, { "epoch": 0.8346890756302521, "grad_norm": 0.232761949300766, "learning_rate": 4.040483134490023e-06, "loss": 0.2371, "step": 1552 }, { "epoch": 0.8352268907563025, "grad_norm": 0.21111628413200378, "learning_rate": 4.014930809073248e-06, "loss": 0.2314, "step": 1553 }, { "epoch": 0.835764705882353, "grad_norm": 0.2183760553598404, "learning_rate": 3.9894524813428616e-06, "loss": 0.224, "step": 1554 }, { "epoch": 0.8363025210084034, "grad_norm": 0.26334574818611145, "learning_rate": 3.964048241140719e-06, "loss": 0.2727, "step": 1555 }, { "epoch": 0.8368403361344537, "grad_norm": 0.21533888578414917, "learning_rate": 3.938718178047404e-06, "loss": 0.209, "step": 1556 }, { "epoch": 0.8373781512605042, "grad_norm": 0.2312881350517273, "learning_rate": 3.913462381381963e-06, "loss": 0.2316, "step": 1557 }, { "epoch": 0.8379159663865546, "grad_norm": 0.2307368963956833, "learning_rate": 3.888280940201536e-06, "loss": 0.2928, "step": 1558 }, { "epoch": 0.8384537815126051, "grad_norm": 0.21729280054569244, "learning_rate": 3.863173943301094e-06, "loss": 0.2172, "step": 1559 }, { "epoch": 0.8389915966386554, "grad_norm": 0.21327124536037445, "learning_rate": 3.838141479213078e-06, "loss": 0.212, "step": 1560 }, { "epoch": 0.8395294117647059, "grad_norm": 0.20937539637088776, "learning_rate": 3.8131836362071403e-06, "loss": 0.2024, "step": 1561 }, { "epoch": 0.8400672268907563, "grad_norm": 0.23199336230754852, "learning_rate": 3.7883005022897794e-06, "loss": 0.242, "step": 1562 }, { "epoch": 0.8406050420168067, "grad_norm": 0.19046266376972198, "learning_rate": 3.7634921652040578e-06, "loss": 0.2061, "step": 1563 }, { "epoch": 0.8411428571428572, "grad_norm": 0.22096723318099976, "learning_rate": 3.738758712429305e-06, "loss": 0.2167, "step": 1564 }, { "epoch": 0.8416806722689075, "grad_norm": 0.23634915053844452, "learning_rate": 3.7141002311807698e-06, "loss": 0.2367, "step": 1565 }, { "epoch": 0.842218487394958, "grad_norm": 0.2182723879814148, "learning_rate": 3.689516808409363e-06, "loss": 0.2211, "step": 1566 }, { "epoch": 0.8427563025210084, "grad_norm": 0.2343529760837555, "learning_rate": 3.665008530801292e-06, "loss": 0.2068, "step": 1567 }, { "epoch": 0.8432941176470589, "grad_norm": 0.25877806544303894, "learning_rate": 3.6405754847778112e-06, "loss": 0.2696, "step": 1568 }, { "epoch": 0.8438319327731092, "grad_norm": 0.18986301124095917, "learning_rate": 3.616217756494891e-06, "loss": 0.1917, "step": 1569 }, { "epoch": 0.8443697478991596, "grad_norm": 0.21341216564178467, "learning_rate": 3.591935431842902e-06, "loss": 0.2175, "step": 1570 }, { "epoch": 0.8449075630252101, "grad_norm": 0.2142014354467392, "learning_rate": 3.5677285964463374e-06, "loss": 0.1989, "step": 1571 }, { "epoch": 0.8454453781512605, "grad_norm": 0.22054918110370636, "learning_rate": 3.5435973356634928e-06, "loss": 0.2507, "step": 1572 }, { "epoch": 0.845983193277311, "grad_norm": 0.20724591612815857, "learning_rate": 3.5195417345861903e-06, "loss": 0.2258, "step": 1573 }, { "epoch": 0.8465210084033613, "grad_norm": 0.22371357679367065, "learning_rate": 3.495561878039422e-06, "loss": 0.2387, "step": 1574 }, { "epoch": 0.8470588235294118, "grad_norm": 0.25370118021965027, "learning_rate": 3.4716578505811226e-06, "loss": 0.2752, "step": 1575 }, { "epoch": 0.8475966386554622, "grad_norm": 0.2144588828086853, "learning_rate": 3.447829736501837e-06, "loss": 0.2106, "step": 1576 }, { "epoch": 0.8481344537815126, "grad_norm": 0.22446668148040771, "learning_rate": 3.424077619824401e-06, "loss": 0.2153, "step": 1577 }, { "epoch": 0.848672268907563, "grad_norm": 0.23380570113658905, "learning_rate": 3.4004015843036947e-06, "loss": 0.2559, "step": 1578 }, { "epoch": 0.8492100840336134, "grad_norm": 0.23459002375602722, "learning_rate": 3.3768017134262945e-06, "loss": 0.2481, "step": 1579 }, { "epoch": 0.8497478991596639, "grad_norm": 0.1978437453508377, "learning_rate": 3.353278090410231e-06, "loss": 0.191, "step": 1580 }, { "epoch": 0.8502857142857143, "grad_norm": 0.21822255849838257, "learning_rate": 3.329830798204642e-06, "loss": 0.2029, "step": 1581 }, { "epoch": 0.8508235294117648, "grad_norm": 0.24054054915905, "learning_rate": 3.3064599194895325e-06, "loss": 0.2836, "step": 1582 }, { "epoch": 0.8513613445378151, "grad_norm": 0.19145500659942627, "learning_rate": 3.2831655366754365e-06, "loss": 0.1955, "step": 1583 }, { "epoch": 0.8518991596638655, "grad_norm": 0.2138543426990509, "learning_rate": 3.2599477319031653e-06, "loss": 0.2341, "step": 1584 }, { "epoch": 0.852436974789916, "grad_norm": 0.24167557060718536, "learning_rate": 3.2368065870434866e-06, "loss": 0.287, "step": 1585 }, { "epoch": 0.8529747899159664, "grad_norm": 0.20971588790416718, "learning_rate": 3.2137421836968494e-06, "loss": 0.2235, "step": 1586 }, { "epoch": 0.8535126050420168, "grad_norm": 0.2678580582141876, "learning_rate": 3.1907546031931135e-06, "loss": 0.2918, "step": 1587 }, { "epoch": 0.8540504201680672, "grad_norm": 0.23543906211853027, "learning_rate": 3.167843926591224e-06, "loss": 0.2583, "step": 1588 }, { "epoch": 0.8545882352941176, "grad_norm": 0.24895963072776794, "learning_rate": 3.1450102346789634e-06, "loss": 0.2412, "step": 1589 }, { "epoch": 0.8551260504201681, "grad_norm": 0.23921562731266022, "learning_rate": 3.1222536079726387e-06, "loss": 0.2482, "step": 1590 }, { "epoch": 0.8556638655462185, "grad_norm": 0.23790937662124634, "learning_rate": 3.099574126716817e-06, "loss": 0.223, "step": 1591 }, { "epoch": 0.8562016806722689, "grad_norm": 0.20260941982269287, "learning_rate": 3.076971870884027e-06, "loss": 0.2175, "step": 1592 }, { "epoch": 0.8567394957983193, "grad_norm": 0.2615525424480438, "learning_rate": 3.0544469201744976e-06, "loss": 0.2689, "step": 1593 }, { "epoch": 0.8572773109243698, "grad_norm": 0.21394728124141693, "learning_rate": 3.0319993540158437e-06, "loss": 0.2115, "step": 1594 }, { "epoch": 0.8578151260504202, "grad_norm": 0.21572695672512054, "learning_rate": 3.009629251562826e-06, "loss": 0.2487, "step": 1595 }, { "epoch": 0.8583529411764705, "grad_norm": 0.23756852746009827, "learning_rate": 2.9873366916970513e-06, "loss": 0.226, "step": 1596 }, { "epoch": 0.858890756302521, "grad_norm": 0.21911032497882843, "learning_rate": 2.9651217530266674e-06, "loss": 0.2381, "step": 1597 }, { "epoch": 0.8594285714285714, "grad_norm": 0.2636600434780121, "learning_rate": 2.9429845138861515e-06, "loss": 0.2361, "step": 1598 }, { "epoch": 0.8599663865546219, "grad_norm": 0.24675433337688446, "learning_rate": 2.920925052335971e-06, "loss": 0.2221, "step": 1599 }, { "epoch": 0.8605042016806723, "grad_norm": 0.2457774579524994, "learning_rate": 2.898943446162347e-06, "loss": 0.2627, "step": 1600 }, { "epoch": 0.8610420168067227, "grad_norm": 0.23348650336265564, "learning_rate": 2.877039772876955e-06, "loss": 0.269, "step": 1601 }, { "epoch": 0.8615798319327731, "grad_norm": 0.23182415962219238, "learning_rate": 2.8552141097166695e-06, "loss": 0.231, "step": 1602 }, { "epoch": 0.8621176470588235, "grad_norm": 0.22337450087070465, "learning_rate": 2.8334665336432945e-06, "loss": 0.2316, "step": 1603 }, { "epoch": 0.862655462184874, "grad_norm": 0.19861933588981628, "learning_rate": 2.8117971213432596e-06, "loss": 0.2212, "step": 1604 }, { "epoch": 0.8631932773109243, "grad_norm": 0.2402956634759903, "learning_rate": 2.7902059492274007e-06, "loss": 0.2374, "step": 1605 }, { "epoch": 0.8637310924369748, "grad_norm": 0.23148778080940247, "learning_rate": 2.7686930934306325e-06, "loss": 0.25, "step": 1606 }, { "epoch": 0.8642689075630252, "grad_norm": 0.22591784596443176, "learning_rate": 2.7472586298117385e-06, "loss": 0.2439, "step": 1607 }, { "epoch": 0.8648067226890757, "grad_norm": 0.21380610764026642, "learning_rate": 2.725902633953059e-06, "loss": 0.2344, "step": 1608 }, { "epoch": 0.8653445378151261, "grad_norm": 0.21937958896160126, "learning_rate": 2.7046251811602392e-06, "loss": 0.2169, "step": 1609 }, { "epoch": 0.8658823529411764, "grad_norm": 0.2123061716556549, "learning_rate": 2.6834263464619786e-06, "loss": 0.2282, "step": 1610 }, { "epoch": 0.8664201680672269, "grad_norm": 0.259592741727829, "learning_rate": 2.662306204609738e-06, "loss": 0.2549, "step": 1611 }, { "epoch": 0.8669579831932773, "grad_norm": 0.23750977218151093, "learning_rate": 2.641264830077503e-06, "loss": 0.2577, "step": 1612 }, { "epoch": 0.8674957983193278, "grad_norm": 0.2017223984003067, "learning_rate": 2.620302297061497e-06, "loss": 0.2141, "step": 1613 }, { "epoch": 0.8680336134453781, "grad_norm": 0.23118527233600616, "learning_rate": 2.5994186794799506e-06, "loss": 0.2546, "step": 1614 }, { "epoch": 0.8685714285714285, "grad_norm": 0.22225260734558105, "learning_rate": 2.5786140509727997e-06, "loss": 0.2199, "step": 1615 }, { "epoch": 0.869109243697479, "grad_norm": 0.22218848764896393, "learning_rate": 2.557888484901469e-06, "loss": 0.2436, "step": 1616 }, { "epoch": 0.8696470588235294, "grad_norm": 0.21399237215518951, "learning_rate": 2.537242054348571e-06, "loss": 0.2308, "step": 1617 }, { "epoch": 0.8701848739495799, "grad_norm": 0.23866206407546997, "learning_rate": 2.516674832117699e-06, "loss": 0.2671, "step": 1618 }, { "epoch": 0.8707226890756302, "grad_norm": 0.2131393849849701, "learning_rate": 2.496186890733107e-06, "loss": 0.2251, "step": 1619 }, { "epoch": 0.8712605042016807, "grad_norm": 0.2193623185157776, "learning_rate": 2.475778302439524e-06, "loss": 0.22, "step": 1620 }, { "epoch": 0.8717983193277311, "grad_norm": 0.23298172652721405, "learning_rate": 2.4554491392018347e-06, "loss": 0.2441, "step": 1621 }, { "epoch": 0.8723361344537816, "grad_norm": 0.23090720176696777, "learning_rate": 2.4351994727048604e-06, "loss": 0.2152, "step": 1622 }, { "epoch": 0.8728739495798319, "grad_norm": 0.20195604860782623, "learning_rate": 2.4150293743531198e-06, "loss": 0.2342, "step": 1623 }, { "epoch": 0.8734117647058823, "grad_norm": 0.2462608516216278, "learning_rate": 2.394938915270534e-06, "loss": 0.2783, "step": 1624 }, { "epoch": 0.8739495798319328, "grad_norm": 0.23774537444114685, "learning_rate": 2.3749281663002137e-06, "loss": 0.2402, "step": 1625 }, { "epoch": 0.8744873949579832, "grad_norm": 0.2380041629076004, "learning_rate": 2.3549971980041887e-06, "loss": 0.2399, "step": 1626 }, { "epoch": 0.8750252100840337, "grad_norm": 0.20549987256526947, "learning_rate": 2.3351460806631723e-06, "loss": 0.2076, "step": 1627 }, { "epoch": 0.875563025210084, "grad_norm": 0.22334080934524536, "learning_rate": 2.3153748842762944e-06, "loss": 0.2348, "step": 1628 }, { "epoch": 0.8761008403361344, "grad_norm": 0.22682395577430725, "learning_rate": 2.295683678560881e-06, "loss": 0.2207, "step": 1629 }, { "epoch": 0.8766386554621849, "grad_norm": 0.18507802486419678, "learning_rate": 2.276072532952192e-06, "loss": 0.2001, "step": 1630 }, { "epoch": 0.8771764705882353, "grad_norm": 0.2676650881767273, "learning_rate": 2.2565415166031723e-06, "loss": 0.2696, "step": 1631 }, { "epoch": 0.8777142857142857, "grad_norm": 0.22603026032447815, "learning_rate": 2.2370906983842142e-06, "loss": 0.2422, "step": 1632 }, { "epoch": 0.8782521008403361, "grad_norm": 0.2527368664741516, "learning_rate": 2.2177201468829204e-06, "loss": 0.2517, "step": 1633 }, { "epoch": 0.8787899159663866, "grad_norm": 0.20313459634780884, "learning_rate": 2.1984299304038585e-06, "loss": 0.2267, "step": 1634 }, { "epoch": 0.879327731092437, "grad_norm": 0.22754457592964172, "learning_rate": 2.1792201169683085e-06, "loss": 0.2648, "step": 1635 }, { "epoch": 0.8798655462184874, "grad_norm": 0.21851439774036407, "learning_rate": 2.160090774314039e-06, "loss": 0.2299, "step": 1636 }, { "epoch": 0.8804033613445378, "grad_norm": 0.22871626913547516, "learning_rate": 2.141041969895069e-06, "loss": 0.2359, "step": 1637 }, { "epoch": 0.8809411764705882, "grad_norm": 0.23707148432731628, "learning_rate": 2.122073770881408e-06, "loss": 0.2814, "step": 1638 }, { "epoch": 0.8814789915966387, "grad_norm": 0.21599948406219482, "learning_rate": 2.103186244158847e-06, "loss": 0.2184, "step": 1639 }, { "epoch": 0.8820168067226891, "grad_norm": 0.22139613330364227, "learning_rate": 2.0843794563286974e-06, "loss": 0.2231, "step": 1640 }, { "epoch": 0.8825546218487395, "grad_norm": 0.2699647545814514, "learning_rate": 2.065653473707585e-06, "loss": 0.2629, "step": 1641 }, { "epoch": 0.8830924369747899, "grad_norm": 0.22359047830104828, "learning_rate": 2.0470083623271797e-06, "loss": 0.2046, "step": 1642 }, { "epoch": 0.8836302521008403, "grad_norm": 0.23867052793502808, "learning_rate": 2.028444187934006e-06, "loss": 0.3091, "step": 1643 }, { "epoch": 0.8841680672268908, "grad_norm": 0.22837474942207336, "learning_rate": 2.0099610159891676e-06, "loss": 0.2258, "step": 1644 }, { "epoch": 0.8847058823529412, "grad_norm": 0.21621833741664886, "learning_rate": 1.991558911668143e-06, "loss": 0.2166, "step": 1645 }, { "epoch": 0.8852436974789916, "grad_norm": 0.2181062400341034, "learning_rate": 1.973237939860559e-06, "loss": 0.2158, "step": 1646 }, { "epoch": 0.885781512605042, "grad_norm": 0.19660021364688873, "learning_rate": 1.9549981651699355e-06, "loss": 0.1926, "step": 1647 }, { "epoch": 0.8863193277310925, "grad_norm": 0.2174326330423355, "learning_rate": 1.9368396519134913e-06, "loss": 0.2299, "step": 1648 }, { "epoch": 0.8868571428571429, "grad_norm": 0.24688170850276947, "learning_rate": 1.918762464121887e-06, "loss": 0.2487, "step": 1649 }, { "epoch": 0.8873949579831932, "grad_norm": 0.25096395611763, "learning_rate": 1.9007666655390248e-06, "loss": 0.2813, "step": 1650 }, { "epoch": 0.8879327731092437, "grad_norm": 0.21914611756801605, "learning_rate": 1.8828523196218e-06, "loss": 0.2293, "step": 1651 }, { "epoch": 0.8884705882352941, "grad_norm": 0.21469822525978088, "learning_rate": 1.8650194895398986e-06, "loss": 0.2069, "step": 1652 }, { "epoch": 0.8890084033613446, "grad_norm": 0.2337588369846344, "learning_rate": 1.847268238175559e-06, "loss": 0.2556, "step": 1653 }, { "epoch": 0.889546218487395, "grad_norm": 0.1989423781633377, "learning_rate": 1.8295986281233635e-06, "loss": 0.2047, "step": 1654 }, { "epoch": 0.8900840336134453, "grad_norm": 0.22571295499801636, "learning_rate": 1.8120107216899996e-06, "loss": 0.235, "step": 1655 }, { "epoch": 0.8906218487394958, "grad_norm": 0.2377486228942871, "learning_rate": 1.794504580894052e-06, "loss": 0.2648, "step": 1656 }, { "epoch": 0.8911596638655462, "grad_norm": 0.23715262115001678, "learning_rate": 1.7770802674658e-06, "loss": 0.2423, "step": 1657 }, { "epoch": 0.8916974789915967, "grad_norm": 0.21224485337734222, "learning_rate": 1.7597378428469592e-06, "loss": 0.2514, "step": 1658 }, { "epoch": 0.892235294117647, "grad_norm": 0.2155575156211853, "learning_rate": 1.7424773681905065e-06, "loss": 0.2335, "step": 1659 }, { "epoch": 0.8927731092436975, "grad_norm": 0.22317753732204437, "learning_rate": 1.7252989043604312e-06, "loss": 0.2327, "step": 1660 }, { "epoch": 0.8933109243697479, "grad_norm": 0.24362388253211975, "learning_rate": 1.7082025119315504e-06, "loss": 0.286, "step": 1661 }, { "epoch": 0.8938487394957984, "grad_norm": 0.2243383526802063, "learning_rate": 1.6911882511892685e-06, "loss": 0.2531, "step": 1662 }, { "epoch": 0.8943865546218488, "grad_norm": 0.23072177171707153, "learning_rate": 1.6742561821293829e-06, "loss": 0.2416, "step": 1663 }, { "epoch": 0.8949243697478991, "grad_norm": 0.2165490984916687, "learning_rate": 1.6574063644578696e-06, "loss": 0.2422, "step": 1664 }, { "epoch": 0.8954621848739496, "grad_norm": 0.23180986940860748, "learning_rate": 1.6406388575906567e-06, "loss": 0.2413, "step": 1665 }, { "epoch": 0.896, "grad_norm": 0.24213367700576782, "learning_rate": 1.6239537206534483e-06, "loss": 0.2718, "step": 1666 }, { "epoch": 0.8965378151260505, "grad_norm": 0.24084459245204926, "learning_rate": 1.6073510124814646e-06, "loss": 0.2243, "step": 1667 }, { "epoch": 0.8970756302521008, "grad_norm": 0.2517223656177521, "learning_rate": 1.590830791619291e-06, "loss": 0.2601, "step": 1668 }, { "epoch": 0.8976134453781512, "grad_norm": 0.23291735351085663, "learning_rate": 1.5743931163206356e-06, "loss": 0.2353, "step": 1669 }, { "epoch": 0.8981512605042017, "grad_norm": 0.24009115993976593, "learning_rate": 1.5580380445481325e-06, "loss": 0.2435, "step": 1670 }, { "epoch": 0.8986890756302521, "grad_norm": 0.24174968898296356, "learning_rate": 1.5417656339731523e-06, "loss": 0.253, "step": 1671 }, { "epoch": 0.8992268907563025, "grad_norm": 0.21337029337882996, "learning_rate": 1.52557594197556e-06, "loss": 0.2433, "step": 1672 }, { "epoch": 0.8997647058823529, "grad_norm": 0.20432236790657043, "learning_rate": 1.5094690256435617e-06, "loss": 0.2047, "step": 1673 }, { "epoch": 0.9003025210084034, "grad_norm": 0.19069026410579681, "learning_rate": 1.4934449417734614e-06, "loss": 0.1895, "step": 1674 }, { "epoch": 0.9008403361344538, "grad_norm": 0.24225901067256927, "learning_rate": 1.477503746869499e-06, "loss": 0.2607, "step": 1675 }, { "epoch": 0.9013781512605042, "grad_norm": 0.2323104739189148, "learning_rate": 1.4616454971436071e-06, "loss": 0.2462, "step": 1676 }, { "epoch": 0.9019159663865546, "grad_norm": 0.22920486330986023, "learning_rate": 1.4458702485152549e-06, "loss": 0.2437, "step": 1677 }, { "epoch": 0.902453781512605, "grad_norm": 0.2416364997625351, "learning_rate": 1.4301780566112254e-06, "loss": 0.231, "step": 1678 }, { "epoch": 0.9029915966386555, "grad_norm": 0.2541874349117279, "learning_rate": 1.4145689767654208e-06, "loss": 0.2895, "step": 1679 }, { "epoch": 0.9035294117647059, "grad_norm": 0.2295549362897873, "learning_rate": 1.3990430640186875e-06, "loss": 0.2422, "step": 1680 }, { "epoch": 0.9040672268907562, "grad_norm": 0.22736386954784393, "learning_rate": 1.3836003731185932e-06, "loss": 0.2566, "step": 1681 }, { "epoch": 0.9046050420168067, "grad_norm": 0.21268194913864136, "learning_rate": 1.3682409585192596e-06, "loss": 0.2241, "step": 1682 }, { "epoch": 0.9051428571428571, "grad_norm": 0.21363449096679688, "learning_rate": 1.3529648743811463e-06, "loss": 0.2488, "step": 1683 }, { "epoch": 0.9056806722689076, "grad_norm": 0.26828962564468384, "learning_rate": 1.3377721745708894e-06, "loss": 0.3224, "step": 1684 }, { "epoch": 0.906218487394958, "grad_norm": 0.20662641525268555, "learning_rate": 1.322662912661074e-06, "loss": 0.2159, "step": 1685 }, { "epoch": 0.9067563025210084, "grad_norm": 0.20506830513477325, "learning_rate": 1.3076371419300876e-06, "loss": 0.2305, "step": 1686 }, { "epoch": 0.9072941176470588, "grad_norm": 0.22052201628684998, "learning_rate": 1.2926949153618917e-06, "loss": 0.2466, "step": 1687 }, { "epoch": 0.9078319327731093, "grad_norm": 0.20990587770938873, "learning_rate": 1.277836285645867e-06, "loss": 0.2169, "step": 1688 }, { "epoch": 0.9083697478991597, "grad_norm": 0.22852779924869537, "learning_rate": 1.2630613051766048e-06, "loss": 0.2341, "step": 1689 }, { "epoch": 0.90890756302521, "grad_norm": 0.26850247383117676, "learning_rate": 1.2483700260537412e-06, "loss": 0.3516, "step": 1690 }, { "epoch": 0.9094453781512605, "grad_norm": 0.23324328660964966, "learning_rate": 1.2337625000817616e-06, "loss": 0.257, "step": 1691 }, { "epoch": 0.9099831932773109, "grad_norm": 0.20621325075626373, "learning_rate": 1.2192387787698072e-06, "loss": 0.2151, "step": 1692 }, { "epoch": 0.9105210084033614, "grad_norm": 0.20394685864448547, "learning_rate": 1.2047989133315313e-06, "loss": 0.2068, "step": 1693 }, { "epoch": 0.9110588235294118, "grad_norm": 0.22420884668827057, "learning_rate": 1.19044295468487e-06, "loss": 0.2267, "step": 1694 }, { "epoch": 0.9115966386554621, "grad_norm": 0.2126203030347824, "learning_rate": 1.176170953451905e-06, "loss": 0.2113, "step": 1695 }, { "epoch": 0.9121344537815126, "grad_norm": 0.21117857098579407, "learning_rate": 1.1619829599586517e-06, "loss": 0.2024, "step": 1696 }, { "epoch": 0.912672268907563, "grad_norm": 0.22331954538822174, "learning_rate": 1.1478790242349074e-06, "loss": 0.2571, "step": 1697 }, { "epoch": 0.9132100840336135, "grad_norm": 0.23153391480445862, "learning_rate": 1.1338591960140672e-06, "loss": 0.2386, "step": 1698 }, { "epoch": 0.9137478991596638, "grad_norm": 0.22471420466899872, "learning_rate": 1.1199235247329326e-06, "loss": 0.2314, "step": 1699 }, { "epoch": 0.9142857142857143, "grad_norm": 0.2089022696018219, "learning_rate": 1.1060720595315626e-06, "loss": 0.2222, "step": 1700 }, { "epoch": 0.9148235294117647, "grad_norm": 0.23995298147201538, "learning_rate": 1.0923048492530757e-06, "loss": 0.2396, "step": 1701 }, { "epoch": 0.9153613445378151, "grad_norm": 0.21533532440662384, "learning_rate": 1.0786219424435112e-06, "loss": 0.2167, "step": 1702 }, { "epoch": 0.9158991596638656, "grad_norm": 0.24985277652740479, "learning_rate": 1.0650233873516046e-06, "loss": 0.2749, "step": 1703 }, { "epoch": 0.9164369747899159, "grad_norm": 0.2433941513299942, "learning_rate": 1.0515092319286824e-06, "loss": 0.3085, "step": 1704 }, { "epoch": 0.9169747899159664, "grad_norm": 0.22683164477348328, "learning_rate": 1.0380795238284446e-06, "loss": 0.2394, "step": 1705 }, { "epoch": 0.9175126050420168, "grad_norm": 0.2490832656621933, "learning_rate": 1.024734310406808e-06, "loss": 0.2628, "step": 1706 }, { "epoch": 0.9180504201680673, "grad_norm": 0.23683255910873413, "learning_rate": 1.0114736387217604e-06, "loss": 0.2832, "step": 1707 }, { "epoch": 0.9185882352941176, "grad_norm": 0.2833135426044464, "learning_rate": 9.982975555331591e-07, "loss": 0.3016, "step": 1708 }, { "epoch": 0.919126050420168, "grad_norm": 0.23371315002441406, "learning_rate": 9.852061073025997e-07, "loss": 0.2497, "step": 1709 }, { "epoch": 0.9196638655462185, "grad_norm": 0.22701528668403625, "learning_rate": 9.721993401932278e-07, "loss": 0.2159, "step": 1710 }, { "epoch": 0.9202016806722689, "grad_norm": 0.2701483368873596, "learning_rate": 9.592773000695915e-07, "loss": 0.2867, "step": 1711 }, { "epoch": 0.9207394957983194, "grad_norm": 0.20761577785015106, "learning_rate": 9.46440032497467e-07, "loss": 0.2099, "step": 1712 }, { "epoch": 0.9212773109243697, "grad_norm": 0.2346915453672409, "learning_rate": 9.336875827437197e-07, "loss": 0.2659, "step": 1713 }, { "epoch": 0.9218151260504202, "grad_norm": 0.2438836693763733, "learning_rate": 9.210199957761123e-07, "loss": 0.2515, "step": 1714 }, { "epoch": 0.9223529411764706, "grad_norm": 0.21617330610752106, "learning_rate": 9.084373162631748e-07, "loss": 0.2117, "step": 1715 }, { "epoch": 0.922890756302521, "grad_norm": 0.20329926908016205, "learning_rate": 8.959395885740323e-07, "loss": 0.225, "step": 1716 }, { "epoch": 0.9234285714285714, "grad_norm": 0.21277156472206116, "learning_rate": 8.835268567782495e-07, "loss": 0.22, "step": 1717 }, { "epoch": 0.9239663865546218, "grad_norm": 0.22986096143722534, "learning_rate": 8.71199164645689e-07, "loss": 0.2164, "step": 1718 }, { "epoch": 0.9245042016806723, "grad_norm": 0.21631069481372833, "learning_rate": 8.589565556463314e-07, "loss": 0.226, "step": 1719 }, { "epoch": 0.9250420168067227, "grad_norm": 0.24472613632678986, "learning_rate": 8.467990729501468e-07, "loss": 0.2824, "step": 1720 }, { "epoch": 0.9255798319327732, "grad_norm": 0.19186931848526, "learning_rate": 8.347267594269264e-07, "loss": 0.1921, "step": 1721 }, { "epoch": 0.9261176470588235, "grad_norm": 0.22971421480178833, "learning_rate": 8.22739657646146e-07, "loss": 0.2454, "step": 1722 }, { "epoch": 0.9266554621848739, "grad_norm": 0.21870923042297363, "learning_rate": 8.108378098767938e-07, "loss": 0.2152, "step": 1723 }, { "epoch": 0.9271932773109244, "grad_norm": 0.25115063786506653, "learning_rate": 7.990212580872458e-07, "loss": 0.2511, "step": 1724 }, { "epoch": 0.9277310924369748, "grad_norm": 0.19788485765457153, "learning_rate": 7.872900439451076e-07, "loss": 0.2148, "step": 1725 }, { "epoch": 0.9282689075630252, "grad_norm": 0.2199716866016388, "learning_rate": 7.75644208817053e-07, "loss": 0.2393, "step": 1726 }, { "epoch": 0.9288067226890756, "grad_norm": 0.2298024743795395, "learning_rate": 7.640837937687079e-07, "loss": 0.2254, "step": 1727 }, { "epoch": 0.929344537815126, "grad_norm": 0.23643100261688232, "learning_rate": 7.526088395644781e-07, "loss": 0.2728, "step": 1728 }, { "epoch": 0.9298823529411765, "grad_norm": 0.24257013201713562, "learning_rate": 7.412193866674271e-07, "loss": 0.2425, "step": 1729 }, { "epoch": 0.9304201680672269, "grad_norm": 0.2120896279811859, "learning_rate": 7.299154752391097e-07, "loss": 0.2258, "step": 1730 }, { "epoch": 0.9309579831932773, "grad_norm": 0.26428136229515076, "learning_rate": 7.186971451394548e-07, "loss": 0.2956, "step": 1731 }, { "epoch": 0.9314957983193277, "grad_norm": 0.2296888530254364, "learning_rate": 7.075644359266137e-07, "loss": 0.2356, "step": 1732 }, { "epoch": 0.9320336134453782, "grad_norm": 0.2267071008682251, "learning_rate": 6.965173868568098e-07, "loss": 0.201, "step": 1733 }, { "epoch": 0.9325714285714286, "grad_norm": 0.222958505153656, "learning_rate": 6.855560368842274e-07, "loss": 0.2291, "step": 1734 }, { "epoch": 0.9331092436974789, "grad_norm": 0.20006921887397766, "learning_rate": 6.746804246608452e-07, "loss": 0.2096, "step": 1735 }, { "epoch": 0.9336470588235294, "grad_norm": 0.22314222157001495, "learning_rate": 6.638905885363173e-07, "loss": 0.2377, "step": 1736 }, { "epoch": 0.9341848739495798, "grad_norm": 0.19797377288341522, "learning_rate": 6.53186566557834e-07, "loss": 0.2321, "step": 1737 }, { "epoch": 0.9347226890756303, "grad_norm": 0.22384101152420044, "learning_rate": 6.425683964699836e-07, "loss": 0.2316, "step": 1738 }, { "epoch": 0.9352605042016807, "grad_norm": 0.24058353900909424, "learning_rate": 6.32036115714632e-07, "loss": 0.2614, "step": 1739 }, { "epoch": 0.935798319327731, "grad_norm": 0.24436968564987183, "learning_rate": 6.215897614307686e-07, "loss": 0.2764, "step": 1740 }, { "epoch": 0.9363361344537815, "grad_norm": 0.22619466483592987, "learning_rate": 6.112293704544026e-07, "loss": 0.2557, "step": 1741 }, { "epoch": 0.9368739495798319, "grad_norm": 0.2286638617515564, "learning_rate": 6.00954979318405e-07, "loss": 0.2059, "step": 1742 }, { "epoch": 0.9374117647058824, "grad_norm": 0.21887068450450897, "learning_rate": 5.907666242524035e-07, "loss": 0.2312, "step": 1743 }, { "epoch": 0.9379495798319327, "grad_norm": 0.26828280091285706, "learning_rate": 5.806643411826407e-07, "loss": 0.2783, "step": 1744 }, { "epoch": 0.9384873949579832, "grad_norm": 0.2037743180990219, "learning_rate": 5.706481657318547e-07, "loss": 0.2133, "step": 1745 }, { "epoch": 0.9390252100840336, "grad_norm": 0.22839489579200745, "learning_rate": 5.607181332191458e-07, "loss": 0.2374, "step": 1746 }, { "epoch": 0.9395630252100841, "grad_norm": 0.24497352540493011, "learning_rate": 5.508742786598575e-07, "loss": 0.245, "step": 1747 }, { "epoch": 0.9401008403361345, "grad_norm": 0.2133018672466278, "learning_rate": 5.411166367654569e-07, "loss": 0.2248, "step": 1748 }, { "epoch": 0.9406386554621848, "grad_norm": 0.22339177131652832, "learning_rate": 5.314452419433985e-07, "loss": 0.2413, "step": 1749 }, { "epoch": 0.9411764705882353, "grad_norm": 0.22162359952926636, "learning_rate": 5.21860128297022e-07, "loss": 0.2512, "step": 1750 }, { "epoch": 0.9417142857142857, "grad_norm": 0.22483007609844208, "learning_rate": 5.123613296254132e-07, "loss": 0.2514, "step": 1751 }, { "epoch": 0.9422521008403362, "grad_norm": 0.21014715731143951, "learning_rate": 5.029488794233012e-07, "loss": 0.229, "step": 1752 }, { "epoch": 0.9427899159663865, "grad_norm": 0.22612863779067993, "learning_rate": 4.936228108809227e-07, "loss": 0.2322, "step": 1753 }, { "epoch": 0.943327731092437, "grad_norm": 0.23138056695461273, "learning_rate": 4.843831568839302e-07, "loss": 0.2358, "step": 1754 }, { "epoch": 0.9438655462184874, "grad_norm": 0.23847609758377075, "learning_rate": 4.7522995001324233e-07, "loss": 0.2408, "step": 1755 }, { "epoch": 0.9444033613445378, "grad_norm": 0.21127089858055115, "learning_rate": 4.661632225449603e-07, "loss": 0.2033, "step": 1756 }, { "epoch": 0.9449411764705883, "grad_norm": 0.24290789663791656, "learning_rate": 4.5718300645023747e-07, "loss": 0.2481, "step": 1757 }, { "epoch": 0.9454789915966386, "grad_norm": 0.20376421511173248, "learning_rate": 4.48289333395166e-07, "loss": 0.2367, "step": 1758 }, { "epoch": 0.9460168067226891, "grad_norm": 0.24479244649410248, "learning_rate": 4.394822347406791e-07, "loss": 0.2588, "step": 1759 }, { "epoch": 0.9465546218487395, "grad_norm": 0.22748157382011414, "learning_rate": 4.307617415424209e-07, "loss": 0.2604, "step": 1760 }, { "epoch": 0.94709243697479, "grad_norm": 0.2122928500175476, "learning_rate": 4.2212788455065213e-07, "loss": 0.2117, "step": 1761 }, { "epoch": 0.9476302521008403, "grad_norm": 0.2356414943933487, "learning_rate": 4.1358069421013337e-07, "loss": 0.2544, "step": 1762 }, { "epoch": 0.9481680672268907, "grad_norm": 0.22548024356365204, "learning_rate": 4.051202006600224e-07, "loss": 0.2517, "step": 1763 }, { "epoch": 0.9487058823529412, "grad_norm": 0.2310381531715393, "learning_rate": 3.9674643373376895e-07, "loss": 0.2605, "step": 1764 }, { "epoch": 0.9492436974789916, "grad_norm": 0.24583300948143005, "learning_rate": 3.8845942295900053e-07, "loss": 0.2598, "step": 1765 }, { "epoch": 0.9497815126050421, "grad_norm": 0.20774447917938232, "learning_rate": 3.80259197557431e-07, "loss": 0.2068, "step": 1766 }, { "epoch": 0.9503193277310924, "grad_norm": 0.21736493706703186, "learning_rate": 3.7214578644474695e-07, "loss": 0.227, "step": 1767 }, { "epoch": 0.9508571428571428, "grad_norm": 0.22823993861675262, "learning_rate": 3.641192182305103e-07, "loss": 0.2523, "step": 1768 }, { "epoch": 0.9513949579831933, "grad_norm": 0.25741177797317505, "learning_rate": 3.5617952121805853e-07, "loss": 0.2838, "step": 1769 }, { "epoch": 0.9519327731092437, "grad_norm": 0.22269926965236664, "learning_rate": 3.4832672340440187e-07, "loss": 0.2398, "step": 1770 }, { "epoch": 0.9524705882352941, "grad_norm": 0.22511686384677887, "learning_rate": 3.4056085248012627e-07, "loss": 0.2591, "step": 1771 }, { "epoch": 0.9530084033613445, "grad_norm": 0.22192563116550446, "learning_rate": 3.328819358292989e-07, "loss": 0.2237, "step": 1772 }, { "epoch": 0.953546218487395, "grad_norm": 0.2205609679222107, "learning_rate": 3.25290000529363e-07, "loss": 0.2205, "step": 1773 }, { "epoch": 0.9540840336134454, "grad_norm": 0.22576045989990234, "learning_rate": 3.1778507335104567e-07, "loss": 0.2308, "step": 1774 }, { "epoch": 0.9546218487394958, "grad_norm": 0.21366746723651886, "learning_rate": 3.1036718075827806e-07, "loss": 0.2423, "step": 1775 }, { "epoch": 0.9551596638655462, "grad_norm": 0.2084474414587021, "learning_rate": 3.030363489080729e-07, "loss": 0.2489, "step": 1776 }, { "epoch": 0.9556974789915966, "grad_norm": 0.21041955053806305, "learning_rate": 2.957926036504632e-07, "loss": 0.2638, "step": 1777 }, { "epoch": 0.9562352941176471, "grad_norm": 0.22481632232666016, "learning_rate": 2.8863597052838367e-07, "loss": 0.2418, "step": 1778 }, { "epoch": 0.9567731092436975, "grad_norm": 0.22829340398311615, "learning_rate": 2.8156647477760866e-07, "loss": 0.2496, "step": 1779 }, { "epoch": 0.9573109243697479, "grad_norm": 0.212696373462677, "learning_rate": 2.745841413266309e-07, "loss": 0.2404, "step": 1780 }, { "epoch": 0.9578487394957983, "grad_norm": 0.26517555117607117, "learning_rate": 2.6768899479660546e-07, "loss": 0.3042, "step": 1781 }, { "epoch": 0.9583865546218487, "grad_norm": 0.24199555814266205, "learning_rate": 2.6088105950123897e-07, "loss": 0.2451, "step": 1782 }, { "epoch": 0.9589243697478992, "grad_norm": 0.24772870540618896, "learning_rate": 2.5416035944672287e-07, "loss": 0.265, "step": 1783 }, { "epoch": 0.9594621848739496, "grad_norm": 0.2291071116924286, "learning_rate": 2.4752691833162535e-07, "loss": 0.2381, "step": 1784 }, { "epoch": 0.96, "grad_norm": 0.22111442685127258, "learning_rate": 2.4098075954683275e-07, "loss": 0.2277, "step": 1785 }, { "epoch": 0.9605378151260504, "grad_norm": 0.25378668308258057, "learning_rate": 2.3452190617545556e-07, "loss": 0.2537, "step": 1786 }, { "epoch": 0.9610756302521009, "grad_norm": 0.20316681265830994, "learning_rate": 2.2815038099273655e-07, "loss": 0.2315, "step": 1787 }, { "epoch": 0.9616134453781513, "grad_norm": 0.17736071348190308, "learning_rate": 2.2186620646599543e-07, "loss": 0.186, "step": 1788 }, { "epoch": 0.9621512605042016, "grad_norm": 0.20521606504917145, "learning_rate": 2.1566940475452602e-07, "loss": 0.2196, "step": 1789 }, { "epoch": 0.9626890756302521, "grad_norm": 0.25576522946357727, "learning_rate": 2.095599977095325e-07, "loss": 0.2994, "step": 1790 }, { "epoch": 0.9632268907563025, "grad_norm": 0.2438701093196869, "learning_rate": 2.0353800687404335e-07, "loss": 0.2764, "step": 1791 }, { "epoch": 0.963764705882353, "grad_norm": 0.20603539049625397, "learning_rate": 1.9760345348284192e-07, "loss": 0.2349, "step": 1792 }, { "epoch": 0.9643025210084034, "grad_norm": 0.23523761332035065, "learning_rate": 1.917563584623916e-07, "loss": 0.278, "step": 1793 }, { "epoch": 0.9648403361344537, "grad_norm": 0.2543935775756836, "learning_rate": 1.8599674243075237e-07, "loss": 0.2663, "step": 1794 }, { "epoch": 0.9653781512605042, "grad_norm": 0.20902074873447418, "learning_rate": 1.803246256975255e-07, "loss": 0.2172, "step": 1795 }, { "epoch": 0.9659159663865546, "grad_norm": 0.25048109889030457, "learning_rate": 1.7474002826375625e-07, "loss": 0.2653, "step": 1796 }, { "epoch": 0.9664537815126051, "grad_norm": 0.23244474828243256, "learning_rate": 1.6924296982189502e-07, "loss": 0.3035, "step": 1797 }, { "epoch": 0.9669915966386554, "grad_norm": 0.24267198145389557, "learning_rate": 1.6383346975570312e-07, "loss": 0.2631, "step": 1798 }, { "epoch": 0.9675294117647059, "grad_norm": 0.23444727063179016, "learning_rate": 1.5851154714019434e-07, "loss": 0.2533, "step": 1799 }, { "epoch": 0.9680672268907563, "grad_norm": 0.22417624294757843, "learning_rate": 1.5327722074156557e-07, "loss": 0.26, "step": 1800 }, { "epoch": 0.9686050420168067, "grad_norm": 0.23502689599990845, "learning_rate": 1.481305090171331e-07, "loss": 0.2431, "step": 1801 }, { "epoch": 0.9691428571428572, "grad_norm": 0.2579214572906494, "learning_rate": 1.4307143011526858e-07, "loss": 0.2839, "step": 1802 }, { "epoch": 0.9696806722689075, "grad_norm": 0.2253645658493042, "learning_rate": 1.3810000187532425e-07, "loss": 0.2475, "step": 1803 }, { "epoch": 0.970218487394958, "grad_norm": 0.21975766122341156, "learning_rate": 1.3321624182758287e-07, "loss": 0.2192, "step": 1804 }, { "epoch": 0.9707563025210084, "grad_norm": 0.19434669613838196, "learning_rate": 1.2842016719319117e-07, "loss": 0.19, "step": 1805 }, { "epoch": 0.9712941176470589, "grad_norm": 0.2523990273475647, "learning_rate": 1.2371179488409877e-07, "loss": 0.2566, "step": 1806 }, { "epoch": 0.9718319327731092, "grad_norm": 0.21272164583206177, "learning_rate": 1.1909114150299994e-07, "loss": 0.2141, "step": 1807 }, { "epoch": 0.9723697478991596, "grad_norm": 0.22276386618614197, "learning_rate": 1.145582233432696e-07, "loss": 0.224, "step": 1808 }, { "epoch": 0.9729075630252101, "grad_norm": 0.25298258662223816, "learning_rate": 1.1011305638891634e-07, "loss": 0.2691, "step": 1809 }, { "epoch": 0.9734453781512605, "grad_norm": 0.21504244208335876, "learning_rate": 1.057556563145129e-07, "loss": 0.2308, "step": 1810 }, { "epoch": 0.973983193277311, "grad_norm": 0.21459577977657318, "learning_rate": 1.0148603848515459e-07, "loss": 0.2194, "step": 1811 }, { "epoch": 0.9745210084033613, "grad_norm": 0.20404484868049622, "learning_rate": 9.730421795639266e-08, "loss": 0.2252, "step": 1812 }, { "epoch": 0.9750588235294118, "grad_norm": 0.22167102992534637, "learning_rate": 9.321020947419268e-08, "loss": 0.2474, "step": 1813 }, { "epoch": 0.9755966386554622, "grad_norm": 0.24937115609645844, "learning_rate": 8.920402747486789e-08, "loss": 0.2546, "step": 1814 }, { "epoch": 0.9761344537815126, "grad_norm": 0.22689059376716614, "learning_rate": 8.528568608505149e-08, "loss": 0.2457, "step": 1815 }, { "epoch": 0.976672268907563, "grad_norm": 0.2244778424501419, "learning_rate": 8.145519912161891e-08, "loss": 0.2581, "step": 1816 }, { "epoch": 0.9772100840336134, "grad_norm": 0.2267417013645172, "learning_rate": 7.771258009166004e-08, "loss": 0.2289, "step": 1817 }, { "epoch": 0.9777478991596639, "grad_norm": 0.2235623598098755, "learning_rate": 7.405784219241818e-08, "loss": 0.2374, "step": 1818 }, { "epoch": 0.9782857142857143, "grad_norm": 0.22993768751621246, "learning_rate": 7.049099831125949e-08, "loss": 0.2429, "step": 1819 }, { "epoch": 0.9788235294117648, "grad_norm": 0.17638865113258362, "learning_rate": 6.70120610256092e-08, "loss": 0.1738, "step": 1820 }, { "epoch": 0.9793613445378151, "grad_norm": 0.236169695854187, "learning_rate": 6.362104260291823e-08, "loss": 0.2476, "step": 1821 }, { "epoch": 0.9798991596638655, "grad_norm": 0.21264630556106567, "learning_rate": 6.031795500061887e-08, "loss": 0.2319, "step": 1822 }, { "epoch": 0.980436974789916, "grad_norm": 0.28466445207595825, "learning_rate": 5.7102809866077524e-08, "loss": 0.3017, "step": 1823 }, { "epoch": 0.9809747899159664, "grad_norm": 0.20969516038894653, "learning_rate": 5.3975618536561415e-08, "loss": 0.2327, "step": 1824 }, { "epoch": 0.9815126050420168, "grad_norm": 0.22848498821258545, "learning_rate": 5.0936392039191427e-08, "loss": 0.2448, "step": 1825 }, { "epoch": 0.9820504201680672, "grad_norm": 0.21331901848316193, "learning_rate": 4.7985141090908795e-08, "loss": 0.2098, "step": 1826 }, { "epoch": 0.9825882352941177, "grad_norm": 0.1938292384147644, "learning_rate": 4.512187609843343e-08, "loss": 0.2082, "step": 1827 }, { "epoch": 0.9831260504201681, "grad_norm": 0.21801802515983582, "learning_rate": 4.234660715823069e-08, "loss": 0.2205, "step": 1828 }, { "epoch": 0.9836638655462185, "grad_norm": 0.21089939773082733, "learning_rate": 3.965934405646687e-08, "loss": 0.244, "step": 1829 }, { "epoch": 0.9842016806722689, "grad_norm": 0.21720567345619202, "learning_rate": 3.706009626899265e-08, "loss": 0.2237, "step": 1830 }, { "epoch": 0.9847394957983193, "grad_norm": 0.21715302765369415, "learning_rate": 3.454887296129028e-08, "loss": 0.2369, "step": 1831 }, { "epoch": 0.9852773109243698, "grad_norm": 0.24087439477443695, "learning_rate": 3.212568298844865e-08, "loss": 0.2371, "step": 1832 }, { "epoch": 0.9858151260504202, "grad_norm": 0.2655474543571472, "learning_rate": 2.9790534895135504e-08, "loss": 0.2665, "step": 1833 }, { "epoch": 0.9863529411764705, "grad_norm": 0.226781964302063, "learning_rate": 2.7543436915572485e-08, "loss": 0.2381, "step": 1834 }, { "epoch": 0.986890756302521, "grad_norm": 0.22141613066196442, "learning_rate": 2.538439697348238e-08, "loss": 0.2213, "step": 1835 }, { "epoch": 0.9874285714285714, "grad_norm": 0.23227772116661072, "learning_rate": 2.3313422682091913e-08, "loss": 0.2112, "step": 1836 }, { "epoch": 0.9879663865546219, "grad_norm": 0.23344041407108307, "learning_rate": 2.133052134407898e-08, "loss": 0.2337, "step": 1837 }, { "epoch": 0.9885042016806723, "grad_norm": 0.21933989226818085, "learning_rate": 1.9435699951567132e-08, "loss": 0.233, "step": 1838 }, { "epoch": 0.9890420168067227, "grad_norm": 0.23509401082992554, "learning_rate": 1.76289651860867e-08, "loss": 0.2557, "step": 1839 }, { "epoch": 0.9895798319327731, "grad_norm": 0.2226860225200653, "learning_rate": 1.591032341855536e-08, "loss": 0.2359, "step": 1840 }, { "epoch": 0.9901176470588235, "grad_norm": 0.24388764798641205, "learning_rate": 1.4279780709261504e-08, "loss": 0.2764, "step": 1841 }, { "epoch": 0.990655462184874, "grad_norm": 0.22781908512115479, "learning_rate": 1.2737342807833674e-08, "loss": 0.2458, "step": 1842 }, { "epoch": 0.9911932773109243, "grad_norm": 0.22319217026233673, "learning_rate": 1.1283015153226717e-08, "loss": 0.2368, "step": 1843 }, { "epoch": 0.9917310924369748, "grad_norm": 0.199215367436409, "learning_rate": 9.916802873702336e-09, "loss": 0.1891, "step": 1844 }, { "epoch": 0.9922689075630252, "grad_norm": 0.20670108497142792, "learning_rate": 8.638710786804116e-09, "loss": 0.2259, "step": 1845 }, { "epoch": 0.9928067226890757, "grad_norm": 0.22759781777858734, "learning_rate": 7.4487433993492e-09, "loss": 0.2372, "step": 1846 }, { "epoch": 0.9933445378151261, "grad_norm": 0.22349266707897186, "learning_rate": 6.346904907408857e-09, "loss": 0.234, "step": 1847 }, { "epoch": 0.9938823529411764, "grad_norm": 0.22265370190143585, "learning_rate": 5.333199196291827e-09, "loss": 0.2794, "step": 1848 }, { "epoch": 0.9944201680672269, "grad_norm": 0.2153802216053009, "learning_rate": 4.407629840538774e-09, "loss": 0.2127, "step": 1849 }, { "epoch": 0.9949579831932773, "grad_norm": 0.20852942764759064, "learning_rate": 3.570200103891752e-09, "loss": 0.2093, "step": 1850 }, { "epoch": 0.9954957983193278, "grad_norm": 0.24076414108276367, "learning_rate": 2.8209129393025335e-09, "loss": 0.2546, "step": 1851 }, { "epoch": 0.9960336134453781, "grad_norm": 0.2030596137046814, "learning_rate": 2.1597709889159545e-09, "loss": 0.219, "step": 1852 }, { "epoch": 0.9965714285714286, "grad_norm": 0.20425300300121307, "learning_rate": 1.5867765840504868e-09, "loss": 0.2248, "step": 1853 }, { "epoch": 0.997109243697479, "grad_norm": 0.22505299746990204, "learning_rate": 1.1019317452065637e-09, "loss": 0.2513, "step": 1854 }, { "epoch": 0.9976470588235294, "grad_norm": 0.2130691111087799, "learning_rate": 7.052381820443766e-10, "loss": 0.2515, "step": 1855 }, { "epoch": 0.9981848739495799, "grad_norm": 0.20600520074367523, "learning_rate": 3.9669729338664975e-10, "loss": 0.2098, "step": 1856 }, { "epoch": 0.9987226890756302, "grad_norm": 0.2103237360715866, "learning_rate": 1.7631016721586512e-10, "loss": 0.2213, "step": 1857 }, { "epoch": 0.9992605042016807, "grad_norm": 0.2278960645198822, "learning_rate": 4.407758066038436e-11, "loss": 0.2131, "step": 1858 }, { "epoch": 0.9997983193277311, "grad_norm": 0.24840262532234192, "learning_rate": 0.0, "loss": 0.2408, "step": 1859 } ], "logging_steps": 1, "max_steps": 1859, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9079201331868262e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }