diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": 0.30425724387168884, "best_model_checkpoint": "./w2v-bert-2.0-luo_cv_fleurs_19h-v2/checkpoint-2000", - "epoch": 32.467532467532465, + "epoch": 38.96103896103896, "eval_steps": 1000, - "global_step": 5000, + "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -35057,6 +35057,7016 @@ "eval_steps_per_second": 0.541, "eval_wer": 0.3558201058201058, "step": 5000 + }, + { + "epoch": 32.47402597402598, + "grad_norm": 0.7455947399139404, + "learning_rate": 2.9231384615384617e-05, + "loss": 0.0142, + "step": 5001 + }, + { + "epoch": 32.48051948051948, + "grad_norm": 0.595359206199646, + "learning_rate": 2.9231076923076922e-05, + "loss": 0.0105, + "step": 5002 + }, + { + "epoch": 32.48701298701299, + "grad_norm": 0.23928628861904144, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0029, + "step": 5003 + }, + { + "epoch": 32.493506493506494, + "grad_norm": 1.417263388633728, + "learning_rate": 2.923046153846154e-05, + "loss": 0.0566, + "step": 5004 + }, + { + "epoch": 32.5, + "grad_norm": 0.9653884768486023, + "learning_rate": 2.9230153846153844e-05, + "loss": 0.1181, + "step": 5005 + }, + { + "epoch": 32.506493506493506, + "grad_norm": 0.5853801965713501, + "learning_rate": 2.9229846153846156e-05, + "loss": 0.0873, + "step": 5006 + }, + { + "epoch": 32.51298701298701, + "grad_norm": 0.4240158796310425, + "learning_rate": 2.9229538461538464e-05, + "loss": 0.0576, + "step": 5007 + }, + { + "epoch": 32.51948051948052, + "grad_norm": 0.4580501616001129, + "learning_rate": 2.922923076923077e-05, + "loss": 0.0544, + "step": 5008 + }, + { + "epoch": 32.52597402597402, + "grad_norm": 0.5052347183227539, + "learning_rate": 2.9228923076923077e-05, + "loss": 0.0341, + "step": 5009 + }, + { + "epoch": 32.532467532467535, + "grad_norm": 0.4432257115840912, + "learning_rate": 2.9228615384615385e-05, + "loss": 0.0338, + "step": 5010 + }, + { + "epoch": 32.53896103896104, + "grad_norm": 0.7629462480545044, + "learning_rate": 2.922830769230769e-05, + "loss": 0.0539, + "step": 5011 + }, + { + "epoch": 32.54545454545455, + "grad_norm": 0.4871881604194641, + "learning_rate": 2.9228e-05, + "loss": 0.0385, + "step": 5012 + }, + { + "epoch": 32.55194805194805, + "grad_norm": 0.3923622965812683, + "learning_rate": 2.922769230769231e-05, + "loss": 0.027, + "step": 5013 + }, + { + "epoch": 32.55844155844156, + "grad_norm": 0.6062350869178772, + "learning_rate": 2.9227384615384615e-05, + "loss": 0.0365, + "step": 5014 + }, + { + "epoch": 32.564935064935064, + "grad_norm": 0.34979578852653503, + "learning_rate": 2.9227076923076924e-05, + "loss": 0.0178, + "step": 5015 + }, + { + "epoch": 32.57142857142857, + "grad_norm": 0.3995114862918854, + "learning_rate": 2.9226769230769232e-05, + "loss": 0.0194, + "step": 5016 + }, + { + "epoch": 32.577922077922075, + "grad_norm": 0.3907703161239624, + "learning_rate": 2.9226461538461537e-05, + "loss": 0.0183, + "step": 5017 + }, + { + "epoch": 32.58441558441559, + "grad_norm": 0.539358377456665, + "learning_rate": 2.9226153846153845e-05, + "loss": 0.0212, + "step": 5018 + }, + { + "epoch": 32.59090909090909, + "grad_norm": 0.4485572278499603, + "learning_rate": 2.9225846153846154e-05, + "loss": 0.0207, + "step": 5019 + }, + { + "epoch": 32.5974025974026, + "grad_norm": 1.1342281103134155, + "learning_rate": 2.9225538461538465e-05, + "loss": 0.0144, + "step": 5020 + }, + { + "epoch": 32.603896103896105, + "grad_norm": 0.44675344228744507, + "learning_rate": 2.922523076923077e-05, + "loss": 0.0226, + "step": 5021 + }, + { + "epoch": 32.61038961038961, + "grad_norm": 0.8697386384010315, + "learning_rate": 2.922492307692308e-05, + "loss": 0.0206, + "step": 5022 + }, + { + "epoch": 32.616883116883116, + "grad_norm": 0.36490869522094727, + "learning_rate": 2.9224615384615387e-05, + "loss": 0.0135, + "step": 5023 + }, + { + "epoch": 32.62337662337662, + "grad_norm": 0.7864395976066589, + "learning_rate": 2.9224307692307692e-05, + "loss": 0.0176, + "step": 5024 + }, + { + "epoch": 32.62987012987013, + "grad_norm": 0.49690279364585876, + "learning_rate": 2.9224e-05, + "loss": 0.0286, + "step": 5025 + }, + { + "epoch": 32.63636363636363, + "grad_norm": 1.0221036672592163, + "learning_rate": 2.922369230769231e-05, + "loss": 0.0165, + "step": 5026 + }, + { + "epoch": 32.642857142857146, + "grad_norm": 0.5382166504859924, + "learning_rate": 2.9223384615384617e-05, + "loss": 0.0082, + "step": 5027 + }, + { + "epoch": 32.64935064935065, + "grad_norm": 0.6461249589920044, + "learning_rate": 2.9223076923076925e-05, + "loss": 0.011, + "step": 5028 + }, + { + "epoch": 32.65584415584416, + "grad_norm": 0.5846178531646729, + "learning_rate": 2.9222769230769233e-05, + "loss": 0.0136, + "step": 5029 + }, + { + "epoch": 32.66233766233766, + "grad_norm": 0.6709246039390564, + "learning_rate": 2.9222461538461538e-05, + "loss": 0.0154, + "step": 5030 + }, + { + "epoch": 32.66883116883117, + "grad_norm": 0.30806440114974976, + "learning_rate": 2.9222153846153847e-05, + "loss": 0.0053, + "step": 5031 + }, + { + "epoch": 32.675324675324674, + "grad_norm": 0.8055159449577332, + "learning_rate": 2.9221846153846155e-05, + "loss": 0.0099, + "step": 5032 + }, + { + "epoch": 32.68181818181818, + "grad_norm": 1.1743619441986084, + "learning_rate": 2.922153846153846e-05, + "loss": 0.0411, + "step": 5033 + }, + { + "epoch": 32.688311688311686, + "grad_norm": 0.1986626237630844, + "learning_rate": 2.922123076923077e-05, + "loss": 0.0031, + "step": 5034 + }, + { + "epoch": 32.6948051948052, + "grad_norm": 0.9511582851409912, + "learning_rate": 2.922092307692308e-05, + "loss": 0.0166, + "step": 5035 + }, + { + "epoch": 32.701298701298704, + "grad_norm": 0.5934857726097107, + "learning_rate": 2.9220615384615385e-05, + "loss": 0.012, + "step": 5036 + }, + { + "epoch": 32.70779220779221, + "grad_norm": 0.7282407283782959, + "learning_rate": 2.9220307692307693e-05, + "loss": 0.0188, + "step": 5037 + }, + { + "epoch": 32.714285714285715, + "grad_norm": 0.4278605282306671, + "learning_rate": 2.922e-05, + "loss": 0.0062, + "step": 5038 + }, + { + "epoch": 32.72077922077922, + "grad_norm": 0.5733248591423035, + "learning_rate": 2.9219692307692306e-05, + "loss": 0.0076, + "step": 5039 + }, + { + "epoch": 32.72727272727273, + "grad_norm": 1.7734631299972534, + "learning_rate": 2.9219384615384615e-05, + "loss": 0.0403, + "step": 5040 + }, + { + "epoch": 32.73376623376623, + "grad_norm": 0.2512581944465637, + "learning_rate": 2.9219076923076926e-05, + "loss": 0.0087, + "step": 5041 + }, + { + "epoch": 32.74025974025974, + "grad_norm": 0.7879403233528137, + "learning_rate": 2.921876923076923e-05, + "loss": 0.0106, + "step": 5042 + }, + { + "epoch": 32.746753246753244, + "grad_norm": 0.7762377858161926, + "learning_rate": 2.921846153846154e-05, + "loss": 0.1138, + "step": 5043 + }, + { + "epoch": 32.753246753246756, + "grad_norm": 0.6951083540916443, + "learning_rate": 2.9218153846153848e-05, + "loss": 0.0988, + "step": 5044 + }, + { + "epoch": 32.75974025974026, + "grad_norm": 0.4834299087524414, + "learning_rate": 2.9217846153846153e-05, + "loss": 0.0673, + "step": 5045 + }, + { + "epoch": 32.76623376623377, + "grad_norm": 0.4215543866157532, + "learning_rate": 2.921753846153846e-05, + "loss": 0.0537, + "step": 5046 + }, + { + "epoch": 32.77272727272727, + "grad_norm": 0.38221612572669983, + "learning_rate": 2.9217230769230773e-05, + "loss": 0.0475, + "step": 5047 + }, + { + "epoch": 32.77922077922078, + "grad_norm": 0.3835408687591553, + "learning_rate": 2.9216923076923078e-05, + "loss": 0.0276, + "step": 5048 + }, + { + "epoch": 32.785714285714285, + "grad_norm": 0.5472840070724487, + "learning_rate": 2.9216615384615386e-05, + "loss": 0.0318, + "step": 5049 + }, + { + "epoch": 32.79220779220779, + "grad_norm": 0.7388037443161011, + "learning_rate": 2.9216307692307695e-05, + "loss": 0.0408, + "step": 5050 + }, + { + "epoch": 32.798701298701296, + "grad_norm": 0.6615577936172485, + "learning_rate": 2.9216e-05, + "loss": 0.0607, + "step": 5051 + }, + { + "epoch": 32.8051948051948, + "grad_norm": 0.548747181892395, + "learning_rate": 2.9215692307692308e-05, + "loss": 0.0304, + "step": 5052 + }, + { + "epoch": 32.811688311688314, + "grad_norm": 0.38156527280807495, + "learning_rate": 2.9215384615384616e-05, + "loss": 0.0229, + "step": 5053 + }, + { + "epoch": 32.81818181818182, + "grad_norm": 0.6100485324859619, + "learning_rate": 2.9215076923076924e-05, + "loss": 0.0282, + "step": 5054 + }, + { + "epoch": 32.824675324675326, + "grad_norm": 0.5645389556884766, + "learning_rate": 2.9214769230769233e-05, + "loss": 0.0267, + "step": 5055 + }, + { + "epoch": 32.83116883116883, + "grad_norm": 0.7538372874259949, + "learning_rate": 2.921446153846154e-05, + "loss": 0.0214, + "step": 5056 + }, + { + "epoch": 32.83766233766234, + "grad_norm": 0.41211986541748047, + "learning_rate": 2.9214153846153846e-05, + "loss": 0.0205, + "step": 5057 + }, + { + "epoch": 32.84415584415584, + "grad_norm": 0.993869960308075, + "learning_rate": 2.9213846153846154e-05, + "loss": 0.0504, + "step": 5058 + }, + { + "epoch": 32.85064935064935, + "grad_norm": 0.5521100163459778, + "learning_rate": 2.9213538461538463e-05, + "loss": 0.0171, + "step": 5059 + }, + { + "epoch": 32.857142857142854, + "grad_norm": 0.49331873655319214, + "learning_rate": 2.9213230769230768e-05, + "loss": 0.0194, + "step": 5060 + }, + { + "epoch": 32.86363636363637, + "grad_norm": 0.4689992368221283, + "learning_rate": 2.921292307692308e-05, + "loss": 0.0138, + "step": 5061 + }, + { + "epoch": 32.87012987012987, + "grad_norm": 0.6267738342285156, + "learning_rate": 2.9212615384615388e-05, + "loss": 0.0147, + "step": 5062 + }, + { + "epoch": 32.87662337662338, + "grad_norm": 0.4092702865600586, + "learning_rate": 2.9212307692307693e-05, + "loss": 0.0121, + "step": 5063 + }, + { + "epoch": 32.883116883116884, + "grad_norm": 1.5005707740783691, + "learning_rate": 2.9212e-05, + "loss": 0.0232, + "step": 5064 + }, + { + "epoch": 32.88961038961039, + "grad_norm": 0.4179195165634155, + "learning_rate": 2.921169230769231e-05, + "loss": 0.0116, + "step": 5065 + }, + { + "epoch": 32.896103896103895, + "grad_norm": 0.9752940535545349, + "learning_rate": 2.9211384615384614e-05, + "loss": 0.0587, + "step": 5066 + }, + { + "epoch": 32.9025974025974, + "grad_norm": 0.7442163228988647, + "learning_rate": 2.9211076923076922e-05, + "loss": 0.0212, + "step": 5067 + }, + { + "epoch": 32.90909090909091, + "grad_norm": 0.6272023916244507, + "learning_rate": 2.9210769230769234e-05, + "loss": 0.0131, + "step": 5068 + }, + { + "epoch": 32.91558441558441, + "grad_norm": 0.5838685035705566, + "learning_rate": 2.921046153846154e-05, + "loss": 0.0153, + "step": 5069 + }, + { + "epoch": 32.922077922077925, + "grad_norm": 1.2788761854171753, + "learning_rate": 2.9210153846153847e-05, + "loss": 0.0132, + "step": 5070 + }, + { + "epoch": 32.92857142857143, + "grad_norm": 1.2704384326934814, + "learning_rate": 2.9209846153846156e-05, + "loss": 0.0072, + "step": 5071 + }, + { + "epoch": 32.935064935064936, + "grad_norm": 0.36931920051574707, + "learning_rate": 2.920953846153846e-05, + "loss": 0.0073, + "step": 5072 + }, + { + "epoch": 32.94155844155844, + "grad_norm": 0.43888309597969055, + "learning_rate": 2.920923076923077e-05, + "loss": 0.0107, + "step": 5073 + }, + { + "epoch": 32.94805194805195, + "grad_norm": 0.7088915705680847, + "learning_rate": 2.9208923076923077e-05, + "loss": 0.0527, + "step": 5074 + }, + { + "epoch": 32.95454545454545, + "grad_norm": 0.6745349764823914, + "learning_rate": 2.9208615384615386e-05, + "loss": 0.0059, + "step": 5075 + }, + { + "epoch": 32.96103896103896, + "grad_norm": 2.0108890533447266, + "learning_rate": 2.9208307692307694e-05, + "loss": 0.0225, + "step": 5076 + }, + { + "epoch": 32.967532467532465, + "grad_norm": 0.7361552715301514, + "learning_rate": 2.9208000000000002e-05, + "loss": 0.007, + "step": 5077 + }, + { + "epoch": 32.97402597402598, + "grad_norm": 0.6981684565544128, + "learning_rate": 2.9207692307692307e-05, + "loss": 0.0305, + "step": 5078 + }, + { + "epoch": 32.98051948051948, + "grad_norm": 0.6083940267562866, + "learning_rate": 2.9207384615384615e-05, + "loss": 0.0061, + "step": 5079 + }, + { + "epoch": 32.98701298701299, + "grad_norm": 1.8485974073410034, + "learning_rate": 2.9207076923076924e-05, + "loss": 0.0192, + "step": 5080 + }, + { + "epoch": 32.993506493506494, + "grad_norm": 0.7409580945968628, + "learning_rate": 2.920676923076923e-05, + "loss": 0.05, + "step": 5081 + }, + { + "epoch": 33.0, + "grad_norm": 0.5991113185882568, + "learning_rate": 2.920646153846154e-05, + "loss": 0.0249, + "step": 5082 + }, + { + "epoch": 33.006493506493506, + "grad_norm": 0.6193212866783142, + "learning_rate": 2.920615384615385e-05, + "loss": 0.0957, + "step": 5083 + }, + { + "epoch": 33.01298701298701, + "grad_norm": 0.5013349652290344, + "learning_rate": 2.9205846153846154e-05, + "loss": 0.0582, + "step": 5084 + }, + { + "epoch": 33.01948051948052, + "grad_norm": 0.4513569474220276, + "learning_rate": 2.9205538461538462e-05, + "loss": 0.0527, + "step": 5085 + }, + { + "epoch": 33.02597402597402, + "grad_norm": 0.36499112844467163, + "learning_rate": 2.920523076923077e-05, + "loss": 0.0495, + "step": 5086 + }, + { + "epoch": 33.032467532467535, + "grad_norm": 0.4244421422481537, + "learning_rate": 2.9204923076923075e-05, + "loss": 0.0411, + "step": 5087 + }, + { + "epoch": 33.03896103896104, + "grad_norm": 0.3226945698261261, + "learning_rate": 2.9204615384615384e-05, + "loss": 0.0255, + "step": 5088 + }, + { + "epoch": 33.04545454545455, + "grad_norm": 0.38724368810653687, + "learning_rate": 2.9204307692307695e-05, + "loss": 0.027, + "step": 5089 + }, + { + "epoch": 33.05194805194805, + "grad_norm": 0.49415165185928345, + "learning_rate": 2.9204e-05, + "loss": 0.0276, + "step": 5090 + }, + { + "epoch": 33.05844155844156, + "grad_norm": 0.41548120975494385, + "learning_rate": 2.920369230769231e-05, + "loss": 0.02, + "step": 5091 + }, + { + "epoch": 33.064935064935064, + "grad_norm": 0.6897282004356384, + "learning_rate": 2.9203384615384617e-05, + "loss": 0.0215, + "step": 5092 + }, + { + "epoch": 33.07142857142857, + "grad_norm": 0.4937240481376648, + "learning_rate": 2.9203076923076922e-05, + "loss": 0.0169, + "step": 5093 + }, + { + "epoch": 33.077922077922075, + "grad_norm": 0.49796339869499207, + "learning_rate": 2.920276923076923e-05, + "loss": 0.0172, + "step": 5094 + }, + { + "epoch": 33.08441558441559, + "grad_norm": 1.144960880279541, + "learning_rate": 2.920246153846154e-05, + "loss": 0.0228, + "step": 5095 + }, + { + "epoch": 33.09090909090909, + "grad_norm": 0.44828659296035767, + "learning_rate": 2.9202153846153847e-05, + "loss": 0.0095, + "step": 5096 + }, + { + "epoch": 33.0974025974026, + "grad_norm": 0.4165479838848114, + "learning_rate": 2.9201846153846155e-05, + "loss": 0.015, + "step": 5097 + }, + { + "epoch": 33.103896103896105, + "grad_norm": 0.4386371672153473, + "learning_rate": 2.9201538461538463e-05, + "loss": 0.0133, + "step": 5098 + }, + { + "epoch": 33.11038961038961, + "grad_norm": 0.3190487325191498, + "learning_rate": 2.9201230769230772e-05, + "loss": 0.0086, + "step": 5099 + }, + { + "epoch": 33.116883116883116, + "grad_norm": 0.6171437501907349, + "learning_rate": 2.9200923076923077e-05, + "loss": 0.0379, + "step": 5100 + }, + { + "epoch": 33.12337662337662, + "grad_norm": 0.5920683741569519, + "learning_rate": 2.9200615384615385e-05, + "loss": 0.0111, + "step": 5101 + }, + { + "epoch": 33.12987012987013, + "grad_norm": 0.40865978598594666, + "learning_rate": 2.9200307692307693e-05, + "loss": 0.0088, + "step": 5102 + }, + { + "epoch": 33.13636363636363, + "grad_norm": 0.3535599112510681, + "learning_rate": 2.92e-05, + "loss": 0.0072, + "step": 5103 + }, + { + "epoch": 33.142857142857146, + "grad_norm": 0.577937662601471, + "learning_rate": 2.919969230769231e-05, + "loss": 0.0268, + "step": 5104 + }, + { + "epoch": 33.14935064935065, + "grad_norm": 0.5617281198501587, + "learning_rate": 2.9199384615384618e-05, + "loss": 0.0088, + "step": 5105 + }, + { + "epoch": 33.15584415584416, + "grad_norm": 0.9031304717063904, + "learning_rate": 2.9199076923076923e-05, + "loss": 0.0129, + "step": 5106 + }, + { + "epoch": 33.16233766233766, + "grad_norm": 1.5352048873901367, + "learning_rate": 2.919876923076923e-05, + "loss": 0.015, + "step": 5107 + }, + { + "epoch": 33.16883116883117, + "grad_norm": 0.34195709228515625, + "learning_rate": 2.919846153846154e-05, + "loss": 0.0047, + "step": 5108 + }, + { + "epoch": 33.175324675324674, + "grad_norm": 0.4741317331790924, + "learning_rate": 2.9198153846153845e-05, + "loss": 0.0083, + "step": 5109 + }, + { + "epoch": 33.18181818181818, + "grad_norm": 0.6579654812812805, + "learning_rate": 2.9197846153846157e-05, + "loss": 0.013, + "step": 5110 + }, + { + "epoch": 33.188311688311686, + "grad_norm": 0.8151121139526367, + "learning_rate": 2.9197538461538465e-05, + "loss": 0.0158, + "step": 5111 + }, + { + "epoch": 33.1948051948052, + "grad_norm": 1.0251507759094238, + "learning_rate": 2.919723076923077e-05, + "loss": 0.0173, + "step": 5112 + }, + { + "epoch": 33.201298701298704, + "grad_norm": 0.5701705813407898, + "learning_rate": 2.9196923076923078e-05, + "loss": 0.0104, + "step": 5113 + }, + { + "epoch": 33.20779220779221, + "grad_norm": 0.7816833257675171, + "learning_rate": 2.9196615384615386e-05, + "loss": 0.0139, + "step": 5114 + }, + { + "epoch": 33.214285714285715, + "grad_norm": 0.4763857126235962, + "learning_rate": 2.919630769230769e-05, + "loss": 0.0077, + "step": 5115 + }, + { + "epoch": 33.22077922077922, + "grad_norm": 1.8317736387252808, + "learning_rate": 2.9196e-05, + "loss": 0.0161, + "step": 5116 + }, + { + "epoch": 33.22727272727273, + "grad_norm": 3.5923259258270264, + "learning_rate": 2.919569230769231e-05, + "loss": 0.0258, + "step": 5117 + }, + { + "epoch": 33.23376623376623, + "grad_norm": 0.6837280988693237, + "learning_rate": 2.9195384615384616e-05, + "loss": 0.0198, + "step": 5118 + }, + { + "epoch": 33.24025974025974, + "grad_norm": 0.8108289837837219, + "learning_rate": 2.9195076923076925e-05, + "loss": 0.0117, + "step": 5119 + }, + { + "epoch": 33.246753246753244, + "grad_norm": 0.9223237037658691, + "learning_rate": 2.9194769230769233e-05, + "loss": 0.0143, + "step": 5120 + }, + { + "epoch": 33.253246753246756, + "grad_norm": 0.6978357434272766, + "learning_rate": 2.9194461538461538e-05, + "loss": 0.105, + "step": 5121 + }, + { + "epoch": 33.25974025974026, + "grad_norm": 0.6224868297576904, + "learning_rate": 2.9194153846153846e-05, + "loss": 0.0686, + "step": 5122 + }, + { + "epoch": 33.26623376623377, + "grad_norm": 0.4567446708679199, + "learning_rate": 2.9193846153846155e-05, + "loss": 0.0441, + "step": 5123 + }, + { + "epoch": 33.27272727272727, + "grad_norm": 0.5249010920524597, + "learning_rate": 2.9193538461538463e-05, + "loss": 0.0485, + "step": 5124 + }, + { + "epoch": 33.27922077922078, + "grad_norm": 0.45523807406425476, + "learning_rate": 2.919323076923077e-05, + "loss": 0.0297, + "step": 5125 + }, + { + "epoch": 33.285714285714285, + "grad_norm": 0.31809040904045105, + "learning_rate": 2.919292307692308e-05, + "loss": 0.0234, + "step": 5126 + }, + { + "epoch": 33.29220779220779, + "grad_norm": 0.45942091941833496, + "learning_rate": 2.9192615384615384e-05, + "loss": 0.0294, + "step": 5127 + }, + { + "epoch": 33.298701298701296, + "grad_norm": 0.4964737892150879, + "learning_rate": 2.9192307692307693e-05, + "loss": 0.024, + "step": 5128 + }, + { + "epoch": 33.3051948051948, + "grad_norm": 0.6479532122612, + "learning_rate": 2.9192e-05, + "loss": 0.0563, + "step": 5129 + }, + { + "epoch": 33.311688311688314, + "grad_norm": 0.3270736634731293, + "learning_rate": 2.9191692307692306e-05, + "loss": 0.019, + "step": 5130 + }, + { + "epoch": 33.31818181818182, + "grad_norm": 0.35060492157936096, + "learning_rate": 2.9191384615384618e-05, + "loss": 0.0178, + "step": 5131 + }, + { + "epoch": 33.324675324675326, + "grad_norm": 0.8634567260742188, + "learning_rate": 2.9191076923076926e-05, + "loss": 0.0338, + "step": 5132 + }, + { + "epoch": 33.33116883116883, + "grad_norm": 0.4603855013847351, + "learning_rate": 2.919076923076923e-05, + "loss": 0.0202, + "step": 5133 + }, + { + "epoch": 33.33766233766234, + "grad_norm": 0.4173492193222046, + "learning_rate": 2.919046153846154e-05, + "loss": 0.0183, + "step": 5134 + }, + { + "epoch": 33.34415584415584, + "grad_norm": 0.34388092160224915, + "learning_rate": 2.9190153846153848e-05, + "loss": 0.009, + "step": 5135 + }, + { + "epoch": 33.35064935064935, + "grad_norm": 0.6773208975791931, + "learning_rate": 2.9189846153846153e-05, + "loss": 0.0389, + "step": 5136 + }, + { + "epoch": 33.357142857142854, + "grad_norm": 0.4459824562072754, + "learning_rate": 2.918953846153846e-05, + "loss": 0.0207, + "step": 5137 + }, + { + "epoch": 33.36363636363637, + "grad_norm": 0.47827303409576416, + "learning_rate": 2.9189230769230773e-05, + "loss": 0.0166, + "step": 5138 + }, + { + "epoch": 33.37012987012987, + "grad_norm": 0.592857301235199, + "learning_rate": 2.9188923076923077e-05, + "loss": 0.0133, + "step": 5139 + }, + { + "epoch": 33.37662337662338, + "grad_norm": 0.9384477734565735, + "learning_rate": 2.9188615384615386e-05, + "loss": 0.025, + "step": 5140 + }, + { + "epoch": 33.383116883116884, + "grad_norm": 0.3302897810935974, + "learning_rate": 2.9188307692307694e-05, + "loss": 0.007, + "step": 5141 + }, + { + "epoch": 33.38961038961039, + "grad_norm": 0.6045815348625183, + "learning_rate": 2.9188e-05, + "loss": 0.0134, + "step": 5142 + }, + { + "epoch": 33.396103896103895, + "grad_norm": 0.4486161768436432, + "learning_rate": 2.9187692307692307e-05, + "loss": 0.0072, + "step": 5143 + }, + { + "epoch": 33.4025974025974, + "grad_norm": 0.4483949542045593, + "learning_rate": 2.9187384615384616e-05, + "loss": 0.031, + "step": 5144 + }, + { + "epoch": 33.40909090909091, + "grad_norm": 0.522419810295105, + "learning_rate": 2.9187076923076924e-05, + "loss": 0.0122, + "step": 5145 + }, + { + "epoch": 33.41558441558441, + "grad_norm": 0.9663325548171997, + "learning_rate": 2.9186769230769232e-05, + "loss": 0.0206, + "step": 5146 + }, + { + "epoch": 33.422077922077925, + "grad_norm": 1.6213914155960083, + "learning_rate": 2.918646153846154e-05, + "loss": 0.0104, + "step": 5147 + }, + { + "epoch": 33.42857142857143, + "grad_norm": 1.0196571350097656, + "learning_rate": 2.9186153846153846e-05, + "loss": 0.0489, + "step": 5148 + }, + { + "epoch": 33.435064935064936, + "grad_norm": 0.45447394251823425, + "learning_rate": 2.9185846153846154e-05, + "loss": 0.0104, + "step": 5149 + }, + { + "epoch": 33.44155844155844, + "grad_norm": 0.46332013607025146, + "learning_rate": 2.9185538461538462e-05, + "loss": 0.0085, + "step": 5150 + }, + { + "epoch": 33.44805194805195, + "grad_norm": 0.2794162929058075, + "learning_rate": 2.9185230769230767e-05, + "loss": 0.0034, + "step": 5151 + }, + { + "epoch": 33.45454545454545, + "grad_norm": 1.2567662000656128, + "learning_rate": 2.918492307692308e-05, + "loss": 0.0438, + "step": 5152 + }, + { + "epoch": 33.46103896103896, + "grad_norm": 0.4975087344646454, + "learning_rate": 2.9184615384615387e-05, + "loss": 0.0046, + "step": 5153 + }, + { + "epoch": 33.467532467532465, + "grad_norm": 0.7620040774345398, + "learning_rate": 2.9184307692307692e-05, + "loss": 0.0128, + "step": 5154 + }, + { + "epoch": 33.47402597402598, + "grad_norm": 0.7590813636779785, + "learning_rate": 2.9184e-05, + "loss": 0.0075, + "step": 5155 + }, + { + "epoch": 33.48051948051948, + "grad_norm": 0.5330340266227722, + "learning_rate": 2.918369230769231e-05, + "loss": 0.005, + "step": 5156 + }, + { + "epoch": 33.48701298701299, + "grad_norm": 0.24444550275802612, + "learning_rate": 2.9183384615384614e-05, + "loss": 0.0033, + "step": 5157 + }, + { + "epoch": 33.493506493506494, + "grad_norm": 0.6891660690307617, + "learning_rate": 2.9183076923076922e-05, + "loss": 0.0116, + "step": 5158 + }, + { + "epoch": 33.5, + "grad_norm": 1.2010726928710938, + "learning_rate": 2.9182769230769234e-05, + "loss": 0.0934, + "step": 5159 + }, + { + "epoch": 33.506493506493506, + "grad_norm": 0.590378999710083, + "learning_rate": 2.918246153846154e-05, + "loss": 0.08, + "step": 5160 + }, + { + "epoch": 33.51298701298701, + "grad_norm": 0.9820928573608398, + "learning_rate": 2.9182153846153847e-05, + "loss": 0.0655, + "step": 5161 + }, + { + "epoch": 33.51948051948052, + "grad_norm": 0.4438777267932892, + "learning_rate": 2.9181846153846155e-05, + "loss": 0.0445, + "step": 5162 + }, + { + "epoch": 33.52597402597402, + "grad_norm": 0.5293325781822205, + "learning_rate": 2.918153846153846e-05, + "loss": 0.0502, + "step": 5163 + }, + { + "epoch": 33.532467532467535, + "grad_norm": 1.6670782566070557, + "learning_rate": 2.918123076923077e-05, + "loss": 0.0291, + "step": 5164 + }, + { + "epoch": 33.53896103896104, + "grad_norm": 0.37328842282295227, + "learning_rate": 2.9180923076923077e-05, + "loss": 0.0285, + "step": 5165 + }, + { + "epoch": 33.54545454545455, + "grad_norm": 0.4555578827857971, + "learning_rate": 2.9180615384615385e-05, + "loss": 0.0352, + "step": 5166 + }, + { + "epoch": 33.55194805194805, + "grad_norm": 0.5837788581848145, + "learning_rate": 2.9180307692307694e-05, + "loss": 0.0301, + "step": 5167 + }, + { + "epoch": 33.55844155844156, + "grad_norm": 0.43086379766464233, + "learning_rate": 2.9180000000000002e-05, + "loss": 0.0286, + "step": 5168 + }, + { + "epoch": 33.564935064935064, + "grad_norm": 0.3732292056083679, + "learning_rate": 2.9179692307692307e-05, + "loss": 0.0172, + "step": 5169 + }, + { + "epoch": 33.57142857142857, + "grad_norm": 0.3461165428161621, + "learning_rate": 2.9179384615384615e-05, + "loss": 0.0146, + "step": 5170 + }, + { + "epoch": 33.577922077922075, + "grad_norm": 0.4424782991409302, + "learning_rate": 2.9179076923076923e-05, + "loss": 0.0177, + "step": 5171 + }, + { + "epoch": 33.58441558441559, + "grad_norm": 0.5220151543617249, + "learning_rate": 2.917876923076923e-05, + "loss": 0.0206, + "step": 5172 + }, + { + "epoch": 33.59090909090909, + "grad_norm": 0.3175686001777649, + "learning_rate": 2.917846153846154e-05, + "loss": 0.0105, + "step": 5173 + }, + { + "epoch": 33.5974025974026, + "grad_norm": 0.6249086856842041, + "learning_rate": 2.917815384615385e-05, + "loss": 0.0199, + "step": 5174 + }, + { + "epoch": 33.603896103896105, + "grad_norm": 0.4141503572463989, + "learning_rate": 2.9177846153846153e-05, + "loss": 0.0144, + "step": 5175 + }, + { + "epoch": 33.61038961038961, + "grad_norm": 1.178198218345642, + "learning_rate": 2.917753846153846e-05, + "loss": 0.0204, + "step": 5176 + }, + { + "epoch": 33.616883116883116, + "grad_norm": 0.4076842963695526, + "learning_rate": 2.917723076923077e-05, + "loss": 0.0109, + "step": 5177 + }, + { + "epoch": 33.62337662337662, + "grad_norm": 0.4344927966594696, + "learning_rate": 2.9176923076923078e-05, + "loss": 0.013, + "step": 5178 + }, + { + "epoch": 33.62987012987013, + "grad_norm": 0.6804765462875366, + "learning_rate": 2.9176615384615383e-05, + "loss": 0.0176, + "step": 5179 + }, + { + "epoch": 33.63636363636363, + "grad_norm": 0.28606274724006653, + "learning_rate": 2.9176307692307695e-05, + "loss": 0.0036, + "step": 5180 + }, + { + "epoch": 33.642857142857146, + "grad_norm": 0.6881821155548096, + "learning_rate": 2.9176000000000003e-05, + "loss": 0.0242, + "step": 5181 + }, + { + "epoch": 33.64935064935065, + "grad_norm": 0.5474647879600525, + "learning_rate": 2.9175692307692308e-05, + "loss": 0.0108, + "step": 5182 + }, + { + "epoch": 33.65584415584416, + "grad_norm": 0.6660938858985901, + "learning_rate": 2.9175384615384616e-05, + "loss": 0.0139, + "step": 5183 + }, + { + "epoch": 33.66233766233766, + "grad_norm": 1.0489383935928345, + "learning_rate": 2.9175076923076925e-05, + "loss": 0.0155, + "step": 5184 + }, + { + "epoch": 33.66883116883117, + "grad_norm": 3.0114641189575195, + "learning_rate": 2.917476923076923e-05, + "loss": 0.012, + "step": 5185 + }, + { + "epoch": 33.675324675324674, + "grad_norm": 2.148083209991455, + "learning_rate": 2.9174461538461538e-05, + "loss": 0.0402, + "step": 5186 + }, + { + "epoch": 33.68181818181818, + "grad_norm": 0.9920523166656494, + "learning_rate": 2.917415384615385e-05, + "loss": 0.0179, + "step": 5187 + }, + { + "epoch": 33.688311688311686, + "grad_norm": 0.17630691826343536, + "learning_rate": 2.9173846153846155e-05, + "loss": 0.0032, + "step": 5188 + }, + { + "epoch": 33.6948051948052, + "grad_norm": 0.8171472549438477, + "learning_rate": 2.9173538461538463e-05, + "loss": 0.0143, + "step": 5189 + }, + { + "epoch": 33.701298701298704, + "grad_norm": 1.099873661994934, + "learning_rate": 2.917323076923077e-05, + "loss": 0.0133, + "step": 5190 + }, + { + "epoch": 33.70779220779221, + "grad_norm": 0.36660346388816833, + "learning_rate": 2.9172923076923076e-05, + "loss": 0.0056, + "step": 5191 + }, + { + "epoch": 33.714285714285715, + "grad_norm": 0.8074890375137329, + "learning_rate": 2.9172615384615385e-05, + "loss": 0.0117, + "step": 5192 + }, + { + "epoch": 33.72077922077922, + "grad_norm": 2.2832932472229004, + "learning_rate": 2.9172307692307693e-05, + "loss": 0.0322, + "step": 5193 + }, + { + "epoch": 33.72727272727273, + "grad_norm": 7.517980575561523, + "learning_rate": 2.9172e-05, + "loss": 0.0527, + "step": 5194 + }, + { + "epoch": 33.73376623376623, + "grad_norm": 0.6317906975746155, + "learning_rate": 2.917169230769231e-05, + "loss": 0.0106, + "step": 5195 + }, + { + "epoch": 33.74025974025974, + "grad_norm": 4.595775127410889, + "learning_rate": 2.9171384615384618e-05, + "loss": 0.1125, + "step": 5196 + }, + { + "epoch": 33.746753246753244, + "grad_norm": 0.6804779171943665, + "learning_rate": 2.9171076923076923e-05, + "loss": 0.106, + "step": 5197 + }, + { + "epoch": 33.753246753246756, + "grad_norm": 0.47375553846359253, + "learning_rate": 2.917076923076923e-05, + "loss": 0.0698, + "step": 5198 + }, + { + "epoch": 33.75974025974026, + "grad_norm": 0.39842429757118225, + "learning_rate": 2.917046153846154e-05, + "loss": 0.0505, + "step": 5199 + }, + { + "epoch": 33.76623376623377, + "grad_norm": 0.46608805656433105, + "learning_rate": 2.9170153846153844e-05, + "loss": 0.06, + "step": 5200 + }, + { + "epoch": 33.77272727272727, + "grad_norm": 0.4317260682582855, + "learning_rate": 2.9169846153846156e-05, + "loss": 0.0378, + "step": 5201 + }, + { + "epoch": 33.77922077922078, + "grad_norm": 0.43160080909729004, + "learning_rate": 2.9169538461538464e-05, + "loss": 0.0333, + "step": 5202 + }, + { + "epoch": 33.785714285714285, + "grad_norm": 0.5686500072479248, + "learning_rate": 2.916923076923077e-05, + "loss": 0.0332, + "step": 5203 + }, + { + "epoch": 33.79220779220779, + "grad_norm": 0.6219180226325989, + "learning_rate": 2.9168923076923078e-05, + "loss": 0.0534, + "step": 5204 + }, + { + "epoch": 33.798701298701296, + "grad_norm": 0.42976194620132446, + "learning_rate": 2.9168615384615386e-05, + "loss": 0.0217, + "step": 5205 + }, + { + "epoch": 33.8051948051948, + "grad_norm": 0.42062732577323914, + "learning_rate": 2.916830769230769e-05, + "loss": 0.0203, + "step": 5206 + }, + { + "epoch": 33.811688311688314, + "grad_norm": 0.33051377534866333, + "learning_rate": 2.9168e-05, + "loss": 0.0205, + "step": 5207 + }, + { + "epoch": 33.81818181818182, + "grad_norm": 0.40757471323013306, + "learning_rate": 2.916769230769231e-05, + "loss": 0.0239, + "step": 5208 + }, + { + "epoch": 33.824675324675326, + "grad_norm": 0.6034359335899353, + "learning_rate": 2.9167384615384616e-05, + "loss": 0.0274, + "step": 5209 + }, + { + "epoch": 33.83116883116883, + "grad_norm": 0.5356073379516602, + "learning_rate": 2.9167076923076924e-05, + "loss": 0.0202, + "step": 5210 + }, + { + "epoch": 33.83766233766234, + "grad_norm": 0.41984131932258606, + "learning_rate": 2.9166769230769233e-05, + "loss": 0.0147, + "step": 5211 + }, + { + "epoch": 33.84415584415584, + "grad_norm": 0.3775591254234314, + "learning_rate": 2.9166461538461537e-05, + "loss": 0.0125, + "step": 5212 + }, + { + "epoch": 33.85064935064935, + "grad_norm": 0.7033487558364868, + "learning_rate": 2.9166153846153846e-05, + "loss": 0.0216, + "step": 5213 + }, + { + "epoch": 33.857142857142854, + "grad_norm": 0.3927634358406067, + "learning_rate": 2.9165846153846154e-05, + "loss": 0.011, + "step": 5214 + }, + { + "epoch": 33.86363636363637, + "grad_norm": 0.7569650411605835, + "learning_rate": 2.9165538461538462e-05, + "loss": 0.0231, + "step": 5215 + }, + { + "epoch": 33.87012987012987, + "grad_norm": 0.4745166003704071, + "learning_rate": 2.916523076923077e-05, + "loss": 0.012, + "step": 5216 + }, + { + "epoch": 33.87662337662338, + "grad_norm": 0.3904447555541992, + "learning_rate": 2.916492307692308e-05, + "loss": 0.0082, + "step": 5217 + }, + { + "epoch": 33.883116883116884, + "grad_norm": 1.1203515529632568, + "learning_rate": 2.9164615384615384e-05, + "loss": 0.0299, + "step": 5218 + }, + { + "epoch": 33.88961038961039, + "grad_norm": 0.3816543519496918, + "learning_rate": 2.9164307692307692e-05, + "loss": 0.0064, + "step": 5219 + }, + { + "epoch": 33.896103896103895, + "grad_norm": 0.3861207067966461, + "learning_rate": 2.9164e-05, + "loss": 0.009, + "step": 5220 + }, + { + "epoch": 33.9025974025974, + "grad_norm": 1.6413307189941406, + "learning_rate": 2.9163692307692306e-05, + "loss": 0.0211, + "step": 5221 + }, + { + "epoch": 33.90909090909091, + "grad_norm": 0.36325445771217346, + "learning_rate": 2.9163384615384617e-05, + "loss": 0.0074, + "step": 5222 + }, + { + "epoch": 33.91558441558441, + "grad_norm": 1.0768922567367554, + "learning_rate": 2.9163076923076926e-05, + "loss": 0.0164, + "step": 5223 + }, + { + "epoch": 33.922077922077925, + "grad_norm": 0.779722273349762, + "learning_rate": 2.916276923076923e-05, + "loss": 0.0171, + "step": 5224 + }, + { + "epoch": 33.92857142857143, + "grad_norm": 1.2277405261993408, + "learning_rate": 2.916246153846154e-05, + "loss": 0.0506, + "step": 5225 + }, + { + "epoch": 33.935064935064936, + "grad_norm": 0.6585797071456909, + "learning_rate": 2.9162153846153847e-05, + "loss": 0.0156, + "step": 5226 + }, + { + "epoch": 33.94155844155844, + "grad_norm": 0.23704443871974945, + "learning_rate": 2.9161846153846152e-05, + "loss": 0.0035, + "step": 5227 + }, + { + "epoch": 33.94805194805195, + "grad_norm": 0.6500463485717773, + "learning_rate": 2.9161538461538464e-05, + "loss": 0.014, + "step": 5228 + }, + { + "epoch": 33.95454545454545, + "grad_norm": 0.13933050632476807, + "learning_rate": 2.9161230769230772e-05, + "loss": 0.0018, + "step": 5229 + }, + { + "epoch": 33.96103896103896, + "grad_norm": 0.36695221066474915, + "learning_rate": 2.9160923076923077e-05, + "loss": 0.0051, + "step": 5230 + }, + { + "epoch": 33.967532467532465, + "grad_norm": 2.7713279724121094, + "learning_rate": 2.9160615384615385e-05, + "loss": 0.0064, + "step": 5231 + }, + { + "epoch": 33.97402597402598, + "grad_norm": 0.7228372693061829, + "learning_rate": 2.9160307692307694e-05, + "loss": 0.0071, + "step": 5232 + }, + { + "epoch": 33.98051948051948, + "grad_norm": 1.5581496953964233, + "learning_rate": 2.916e-05, + "loss": 0.0288, + "step": 5233 + }, + { + "epoch": 33.98701298701299, + "grad_norm": 0.8226633667945862, + "learning_rate": 2.9159692307692307e-05, + "loss": 0.016, + "step": 5234 + }, + { + "epoch": 33.993506493506494, + "grad_norm": 0.46550416946411133, + "learning_rate": 2.915938461538462e-05, + "loss": 0.0324, + "step": 5235 + }, + { + "epoch": 34.0, + "grad_norm": 0.998394787311554, + "learning_rate": 2.9159076923076924e-05, + "loss": 0.0125, + "step": 5236 + }, + { + "epoch": 34.006493506493506, + "grad_norm": 0.6651492118835449, + "learning_rate": 2.9158769230769232e-05, + "loss": 0.0943, + "step": 5237 + }, + { + "epoch": 34.01298701298701, + "grad_norm": 0.46950453519821167, + "learning_rate": 2.915846153846154e-05, + "loss": 0.0543, + "step": 5238 + }, + { + "epoch": 34.01948051948052, + "grad_norm": 0.4727919399738312, + "learning_rate": 2.9158153846153845e-05, + "loss": 0.0381, + "step": 5239 + }, + { + "epoch": 34.02597402597402, + "grad_norm": 0.5461315512657166, + "learning_rate": 2.9157846153846153e-05, + "loss": 0.043, + "step": 5240 + }, + { + "epoch": 34.032467532467535, + "grad_norm": 0.47372451424598694, + "learning_rate": 2.9157538461538462e-05, + "loss": 0.0339, + "step": 5241 + }, + { + "epoch": 34.03896103896104, + "grad_norm": 0.39935654401779175, + "learning_rate": 2.915723076923077e-05, + "loss": 0.0299, + "step": 5242 + }, + { + "epoch": 34.04545454545455, + "grad_norm": 0.4123666286468506, + "learning_rate": 2.915692307692308e-05, + "loss": 0.0372, + "step": 5243 + }, + { + "epoch": 34.05194805194805, + "grad_norm": 0.3065956234931946, + "learning_rate": 2.9156615384615387e-05, + "loss": 0.0207, + "step": 5244 + }, + { + "epoch": 34.05844155844156, + "grad_norm": 0.7890008687973022, + "learning_rate": 2.9156307692307692e-05, + "loss": 0.0284, + "step": 5245 + }, + { + "epoch": 34.064935064935064, + "grad_norm": 0.31738463044166565, + "learning_rate": 2.9156e-05, + "loss": 0.0159, + "step": 5246 + }, + { + "epoch": 34.07142857142857, + "grad_norm": 0.4964287281036377, + "learning_rate": 2.915569230769231e-05, + "loss": 0.0137, + "step": 5247 + }, + { + "epoch": 34.077922077922075, + "grad_norm": 0.38780248165130615, + "learning_rate": 2.9155384615384613e-05, + "loss": 0.0238, + "step": 5248 + }, + { + "epoch": 34.08441558441559, + "grad_norm": 0.3760470151901245, + "learning_rate": 2.9155076923076925e-05, + "loss": 0.0147, + "step": 5249 + }, + { + "epoch": 34.09090909090909, + "grad_norm": 0.5653063654899597, + "learning_rate": 2.9154769230769233e-05, + "loss": 0.0126, + "step": 5250 + }, + { + "epoch": 34.0974025974026, + "grad_norm": 0.43375396728515625, + "learning_rate": 2.9154461538461538e-05, + "loss": 0.0147, + "step": 5251 + }, + { + "epoch": 34.103896103896105, + "grad_norm": 0.3634866178035736, + "learning_rate": 2.9154153846153847e-05, + "loss": 0.0116, + "step": 5252 + }, + { + "epoch": 34.11038961038961, + "grad_norm": 0.4312509298324585, + "learning_rate": 2.9153846153846155e-05, + "loss": 0.018, + "step": 5253 + }, + { + "epoch": 34.116883116883116, + "grad_norm": 0.5749011635780334, + "learning_rate": 2.915353846153846e-05, + "loss": 0.0118, + "step": 5254 + }, + { + "epoch": 34.12337662337662, + "grad_norm": 0.31237083673477173, + "learning_rate": 2.9153230769230768e-05, + "loss": 0.0087, + "step": 5255 + }, + { + "epoch": 34.12987012987013, + "grad_norm": 0.4380163550376892, + "learning_rate": 2.915292307692308e-05, + "loss": 0.0067, + "step": 5256 + }, + { + "epoch": 34.13636363636363, + "grad_norm": 0.3805018663406372, + "learning_rate": 2.9152615384615385e-05, + "loss": 0.0106, + "step": 5257 + }, + { + "epoch": 34.142857142857146, + "grad_norm": 0.3207034766674042, + "learning_rate": 2.9152307692307693e-05, + "loss": 0.0078, + "step": 5258 + }, + { + "epoch": 34.14935064935065, + "grad_norm": 1.258542776107788, + "learning_rate": 2.9152e-05, + "loss": 0.0154, + "step": 5259 + }, + { + "epoch": 34.15584415584416, + "grad_norm": 0.3034595847129822, + "learning_rate": 2.915169230769231e-05, + "loss": 0.0168, + "step": 5260 + }, + { + "epoch": 34.16233766233766, + "grad_norm": 0.4040125012397766, + "learning_rate": 2.9151384615384615e-05, + "loss": 0.006, + "step": 5261 + }, + { + "epoch": 34.16883116883117, + "grad_norm": 0.31128889322280884, + "learning_rate": 2.9151076923076923e-05, + "loss": 0.0033, + "step": 5262 + }, + { + "epoch": 34.175324675324674, + "grad_norm": 0.4169544279575348, + "learning_rate": 2.9150769230769235e-05, + "loss": 0.0066, + "step": 5263 + }, + { + "epoch": 34.18181818181818, + "grad_norm": 0.6150069832801819, + "learning_rate": 2.915046153846154e-05, + "loss": 0.0267, + "step": 5264 + }, + { + "epoch": 34.188311688311686, + "grad_norm": 0.4691615104675293, + "learning_rate": 2.9150153846153848e-05, + "loss": 0.0078, + "step": 5265 + }, + { + "epoch": 34.1948051948052, + "grad_norm": 0.6778148412704468, + "learning_rate": 2.9149846153846156e-05, + "loss": 0.007, + "step": 5266 + }, + { + "epoch": 34.201298701298704, + "grad_norm": 2.118201494216919, + "learning_rate": 2.914953846153846e-05, + "loss": 0.0118, + "step": 5267 + }, + { + "epoch": 34.20779220779221, + "grad_norm": 0.4020325243473053, + "learning_rate": 2.914923076923077e-05, + "loss": 0.0115, + "step": 5268 + }, + { + "epoch": 34.214285714285715, + "grad_norm": 1.1559373140335083, + "learning_rate": 2.9148923076923078e-05, + "loss": 0.0153, + "step": 5269 + }, + { + "epoch": 34.22077922077922, + "grad_norm": 0.6937928199768066, + "learning_rate": 2.9148615384615386e-05, + "loss": 0.0072, + "step": 5270 + }, + { + "epoch": 34.22727272727273, + "grad_norm": 2.118891954421997, + "learning_rate": 2.9148307692307694e-05, + "loss": 0.0251, + "step": 5271 + }, + { + "epoch": 34.23376623376623, + "grad_norm": 0.7974960207939148, + "learning_rate": 2.9148000000000003e-05, + "loss": 0.0154, + "step": 5272 + }, + { + "epoch": 34.24025974025974, + "grad_norm": 1.1007604598999023, + "learning_rate": 2.9147692307692308e-05, + "loss": 0.0225, + "step": 5273 + }, + { + "epoch": 34.246753246753244, + "grad_norm": 0.738494873046875, + "learning_rate": 2.9147384615384616e-05, + "loss": 0.011, + "step": 5274 + }, + { + "epoch": 34.253246753246756, + "grad_norm": 0.9713312387466431, + "learning_rate": 2.9147076923076924e-05, + "loss": 0.1203, + "step": 5275 + }, + { + "epoch": 34.25974025974026, + "grad_norm": 0.6073378920555115, + "learning_rate": 2.914676923076923e-05, + "loss": 0.0688, + "step": 5276 + }, + { + "epoch": 34.26623376623377, + "grad_norm": 0.4947046935558319, + "learning_rate": 2.914646153846154e-05, + "loss": 0.0502, + "step": 5277 + }, + { + "epoch": 34.27272727272727, + "grad_norm": 0.6308859586715698, + "learning_rate": 2.914615384615385e-05, + "loss": 0.0436, + "step": 5278 + }, + { + "epoch": 34.27922077922078, + "grad_norm": 0.5347867012023926, + "learning_rate": 2.9145846153846154e-05, + "loss": 0.0448, + "step": 5279 + }, + { + "epoch": 34.285714285714285, + "grad_norm": 0.5856087803840637, + "learning_rate": 2.9145538461538463e-05, + "loss": 0.0383, + "step": 5280 + }, + { + "epoch": 34.29220779220779, + "grad_norm": 0.738502025604248, + "learning_rate": 2.914523076923077e-05, + "loss": 0.0346, + "step": 5281 + }, + { + "epoch": 34.298701298701296, + "grad_norm": 0.5556287169456482, + "learning_rate": 2.9144923076923076e-05, + "loss": 0.043, + "step": 5282 + }, + { + "epoch": 34.3051948051948, + "grad_norm": 0.49851131439208984, + "learning_rate": 2.9144615384615384e-05, + "loss": 0.0451, + "step": 5283 + }, + { + "epoch": 34.311688311688314, + "grad_norm": 0.41321220993995667, + "learning_rate": 2.9144307692307696e-05, + "loss": 0.0217, + "step": 5284 + }, + { + "epoch": 34.31818181818182, + "grad_norm": 0.5214315056800842, + "learning_rate": 2.9144e-05, + "loss": 0.0206, + "step": 5285 + }, + { + "epoch": 34.324675324675326, + "grad_norm": 0.5181014537811279, + "learning_rate": 2.914369230769231e-05, + "loss": 0.0236, + "step": 5286 + }, + { + "epoch": 34.33116883116883, + "grad_norm": 0.8338339328765869, + "learning_rate": 2.9143384615384617e-05, + "loss": 0.0177, + "step": 5287 + }, + { + "epoch": 34.33766233766234, + "grad_norm": 0.3325013816356659, + "learning_rate": 2.9143076923076922e-05, + "loss": 0.0176, + "step": 5288 + }, + { + "epoch": 34.34415584415584, + "grad_norm": 0.39470797777175903, + "learning_rate": 2.914276923076923e-05, + "loss": 0.0153, + "step": 5289 + }, + { + "epoch": 34.35064935064935, + "grad_norm": 0.7602513432502747, + "learning_rate": 2.914246153846154e-05, + "loss": 0.0463, + "step": 5290 + }, + { + "epoch": 34.357142857142854, + "grad_norm": 0.6509610414505005, + "learning_rate": 2.9142153846153847e-05, + "loss": 0.0179, + "step": 5291 + }, + { + "epoch": 34.36363636363637, + "grad_norm": 0.2593560814857483, + "learning_rate": 2.9141846153846156e-05, + "loss": 0.0103, + "step": 5292 + }, + { + "epoch": 34.37012987012987, + "grad_norm": 0.6391077041625977, + "learning_rate": 2.9141538461538464e-05, + "loss": 0.0128, + "step": 5293 + }, + { + "epoch": 34.37662337662338, + "grad_norm": 0.27730414271354675, + "learning_rate": 2.914123076923077e-05, + "loss": 0.0069, + "step": 5294 + }, + { + "epoch": 34.383116883116884, + "grad_norm": 0.9945710301399231, + "learning_rate": 2.9140923076923077e-05, + "loss": 0.0208, + "step": 5295 + }, + { + "epoch": 34.38961038961039, + "grad_norm": 0.37668702006340027, + "learning_rate": 2.9140615384615386e-05, + "loss": 0.0103, + "step": 5296 + }, + { + "epoch": 34.396103896103895, + "grad_norm": 0.4021940529346466, + "learning_rate": 2.914030769230769e-05, + "loss": 0.009, + "step": 5297 + }, + { + "epoch": 34.4025974025974, + "grad_norm": 2.7613797187805176, + "learning_rate": 2.9140000000000002e-05, + "loss": 0.0199, + "step": 5298 + }, + { + "epoch": 34.40909090909091, + "grad_norm": 0.5716807842254639, + "learning_rate": 2.913969230769231e-05, + "loss": 0.0085, + "step": 5299 + }, + { + "epoch": 34.41558441558441, + "grad_norm": 1.032850980758667, + "learning_rate": 2.9139384615384615e-05, + "loss": 0.0168, + "step": 5300 + }, + { + "epoch": 34.422077922077925, + "grad_norm": 1.0856531858444214, + "learning_rate": 2.9139076923076924e-05, + "loss": 0.0158, + "step": 5301 + }, + { + "epoch": 34.42857142857143, + "grad_norm": 0.9900875687599182, + "learning_rate": 2.9138769230769232e-05, + "loss": 0.0166, + "step": 5302 + }, + { + "epoch": 34.435064935064936, + "grad_norm": 1.2374485731124878, + "learning_rate": 2.9138461538461537e-05, + "loss": 0.0177, + "step": 5303 + }, + { + "epoch": 34.44155844155844, + "grad_norm": 0.9744414687156677, + "learning_rate": 2.9138153846153845e-05, + "loss": 0.0197, + "step": 5304 + }, + { + "epoch": 34.44805194805195, + "grad_norm": 0.5119012594223022, + "learning_rate": 2.9137846153846157e-05, + "loss": 0.0248, + "step": 5305 + }, + { + "epoch": 34.45454545454545, + "grad_norm": 0.8543151617050171, + "learning_rate": 2.9137538461538462e-05, + "loss": 0.0175, + "step": 5306 + }, + { + "epoch": 34.46103896103896, + "grad_norm": 0.604468822479248, + "learning_rate": 2.913723076923077e-05, + "loss": 0.0076, + "step": 5307 + }, + { + "epoch": 34.467532467532465, + "grad_norm": 1.545064926147461, + "learning_rate": 2.913692307692308e-05, + "loss": 0.0263, + "step": 5308 + }, + { + "epoch": 34.47402597402598, + "grad_norm": 0.7411176562309265, + "learning_rate": 2.9136615384615384e-05, + "loss": 0.0097, + "step": 5309 + }, + { + "epoch": 34.48051948051948, + "grad_norm": 1.1311073303222656, + "learning_rate": 2.9136307692307692e-05, + "loss": 0.0079, + "step": 5310 + }, + { + "epoch": 34.48701298701299, + "grad_norm": 0.7269066572189331, + "learning_rate": 2.9136e-05, + "loss": 0.0101, + "step": 5311 + }, + { + "epoch": 34.493506493506494, + "grad_norm": 2.4829859733581543, + "learning_rate": 2.913569230769231e-05, + "loss": 0.0583, + "step": 5312 + }, + { + "epoch": 34.5, + "grad_norm": 0.8113126754760742, + "learning_rate": 2.9135384615384617e-05, + "loss": 0.1129, + "step": 5313 + }, + { + "epoch": 34.506493506493506, + "grad_norm": 0.629552960395813, + "learning_rate": 2.9135076923076925e-05, + "loss": 0.0795, + "step": 5314 + }, + { + "epoch": 34.51298701298701, + "grad_norm": 0.5981005430221558, + "learning_rate": 2.913476923076923e-05, + "loss": 0.0641, + "step": 5315 + }, + { + "epoch": 34.51948051948052, + "grad_norm": 0.49797946214675903, + "learning_rate": 2.913446153846154e-05, + "loss": 0.0532, + "step": 5316 + }, + { + "epoch": 34.52597402597402, + "grad_norm": 0.343521386384964, + "learning_rate": 2.9134153846153847e-05, + "loss": 0.0343, + "step": 5317 + }, + { + "epoch": 34.532467532467535, + "grad_norm": 0.444717139005661, + "learning_rate": 2.913384615384615e-05, + "loss": 0.0259, + "step": 5318 + }, + { + "epoch": 34.53896103896104, + "grad_norm": 0.5592092275619507, + "learning_rate": 2.9133538461538463e-05, + "loss": 0.0272, + "step": 5319 + }, + { + "epoch": 34.54545454545455, + "grad_norm": 0.3717966675758362, + "learning_rate": 2.9133230769230772e-05, + "loss": 0.0221, + "step": 5320 + }, + { + "epoch": 34.55194805194805, + "grad_norm": 0.44754713773727417, + "learning_rate": 2.9132923076923077e-05, + "loss": 0.0266, + "step": 5321 + }, + { + "epoch": 34.55844155844156, + "grad_norm": 0.5954673886299133, + "learning_rate": 2.9132615384615385e-05, + "loss": 0.0316, + "step": 5322 + }, + { + "epoch": 34.564935064935064, + "grad_norm": 0.48587605357170105, + "learning_rate": 2.9132307692307693e-05, + "loss": 0.0191, + "step": 5323 + }, + { + "epoch": 34.57142857142857, + "grad_norm": 0.4678362011909485, + "learning_rate": 2.9131999999999998e-05, + "loss": 0.0217, + "step": 5324 + }, + { + "epoch": 34.577922077922075, + "grad_norm": 0.432365745306015, + "learning_rate": 2.9131692307692307e-05, + "loss": 0.0195, + "step": 5325 + }, + { + "epoch": 34.58441558441559, + "grad_norm": 0.8224279284477234, + "learning_rate": 2.9131384615384618e-05, + "loss": 0.0181, + "step": 5326 + }, + { + "epoch": 34.59090909090909, + "grad_norm": 0.43323376774787903, + "learning_rate": 2.9131076923076923e-05, + "loss": 0.0289, + "step": 5327 + }, + { + "epoch": 34.5974025974026, + "grad_norm": 0.6326972246170044, + "learning_rate": 2.913076923076923e-05, + "loss": 0.0132, + "step": 5328 + }, + { + "epoch": 34.603896103896105, + "grad_norm": 0.557214081287384, + "learning_rate": 2.913046153846154e-05, + "loss": 0.0189, + "step": 5329 + }, + { + "epoch": 34.61038961038961, + "grad_norm": 0.5050615668296814, + "learning_rate": 2.9130153846153845e-05, + "loss": 0.0162, + "step": 5330 + }, + { + "epoch": 34.616883116883116, + "grad_norm": 0.27772578597068787, + "learning_rate": 2.9129846153846153e-05, + "loss": 0.0078, + "step": 5331 + }, + { + "epoch": 34.62337662337662, + "grad_norm": 1.832878828048706, + "learning_rate": 2.912953846153846e-05, + "loss": 0.0253, + "step": 5332 + }, + { + "epoch": 34.62987012987013, + "grad_norm": 0.5369575023651123, + "learning_rate": 2.912923076923077e-05, + "loss": 0.0113, + "step": 5333 + }, + { + "epoch": 34.63636363636363, + "grad_norm": 0.5768107771873474, + "learning_rate": 2.9128923076923078e-05, + "loss": 0.0155, + "step": 5334 + }, + { + "epoch": 34.642857142857146, + "grad_norm": 0.7064384818077087, + "learning_rate": 2.9128615384615386e-05, + "loss": 0.0141, + "step": 5335 + }, + { + "epoch": 34.64935064935065, + "grad_norm": 0.5787583589553833, + "learning_rate": 2.912830769230769e-05, + "loss": 0.0215, + "step": 5336 + }, + { + "epoch": 34.65584415584416, + "grad_norm": 0.33400586247444153, + "learning_rate": 2.9128e-05, + "loss": 0.0124, + "step": 5337 + }, + { + "epoch": 34.66233766233766, + "grad_norm": 0.31130489706993103, + "learning_rate": 2.9127692307692308e-05, + "loss": 0.0054, + "step": 5338 + }, + { + "epoch": 34.66883116883117, + "grad_norm": 0.5748124122619629, + "learning_rate": 2.9127384615384616e-05, + "loss": 0.0483, + "step": 5339 + }, + { + "epoch": 34.675324675324674, + "grad_norm": 0.3011142909526825, + "learning_rate": 2.9127076923076925e-05, + "loss": 0.0038, + "step": 5340 + }, + { + "epoch": 34.68181818181818, + "grad_norm": 1.5520853996276855, + "learning_rate": 2.9126769230769233e-05, + "loss": 0.0281, + "step": 5341 + }, + { + "epoch": 34.688311688311686, + "grad_norm": 0.6826903223991394, + "learning_rate": 2.912646153846154e-05, + "loss": 0.0054, + "step": 5342 + }, + { + "epoch": 34.6948051948052, + "grad_norm": 0.41030851006507874, + "learning_rate": 2.9126153846153846e-05, + "loss": 0.0091, + "step": 5343 + }, + { + "epoch": 34.701298701298704, + "grad_norm": 0.5148544907569885, + "learning_rate": 2.9125846153846154e-05, + "loss": 0.008, + "step": 5344 + }, + { + "epoch": 34.70779220779221, + "grad_norm": 0.8189192414283752, + "learning_rate": 2.9125538461538463e-05, + "loss": 0.0154, + "step": 5345 + }, + { + "epoch": 34.714285714285715, + "grad_norm": 0.5207547545433044, + "learning_rate": 2.9125230769230768e-05, + "loss": 0.0116, + "step": 5346 + }, + { + "epoch": 34.72077922077922, + "grad_norm": 0.8722161650657654, + "learning_rate": 2.912492307692308e-05, + "loss": 0.0145, + "step": 5347 + }, + { + "epoch": 34.72727272727273, + "grad_norm": 1.0792169570922852, + "learning_rate": 2.9124615384615388e-05, + "loss": 0.0093, + "step": 5348 + }, + { + "epoch": 34.73376623376623, + "grad_norm": 0.5235753059387207, + "learning_rate": 2.9124307692307693e-05, + "loss": 0.0189, + "step": 5349 + }, + { + "epoch": 34.74025974025974, + "grad_norm": 0.940521776676178, + "learning_rate": 2.9124e-05, + "loss": 0.0161, + "step": 5350 + }, + { + "epoch": 34.746753246753244, + "grad_norm": 0.7171229720115662, + "learning_rate": 2.912369230769231e-05, + "loss": 0.0966, + "step": 5351 + }, + { + "epoch": 34.753246753246756, + "grad_norm": 0.508734941482544, + "learning_rate": 2.9123384615384614e-05, + "loss": 0.0641, + "step": 5352 + }, + { + "epoch": 34.75974025974026, + "grad_norm": 0.49755552411079407, + "learning_rate": 2.9123076923076923e-05, + "loss": 0.0603, + "step": 5353 + }, + { + "epoch": 34.76623376623377, + "grad_norm": 0.417592853307724, + "learning_rate": 2.9122769230769234e-05, + "loss": 0.0433, + "step": 5354 + }, + { + "epoch": 34.77272727272727, + "grad_norm": 0.5044029951095581, + "learning_rate": 2.912246153846154e-05, + "loss": 0.051, + "step": 5355 + }, + { + "epoch": 34.77922077922078, + "grad_norm": 0.5635175704956055, + "learning_rate": 2.9122153846153848e-05, + "loss": 0.0323, + "step": 5356 + }, + { + "epoch": 34.785714285714285, + "grad_norm": 0.6393215656280518, + "learning_rate": 2.9121846153846156e-05, + "loss": 0.0404, + "step": 5357 + }, + { + "epoch": 34.79220779220779, + "grad_norm": 0.7523524761199951, + "learning_rate": 2.912153846153846e-05, + "loss": 0.033, + "step": 5358 + }, + { + "epoch": 34.798701298701296, + "grad_norm": 0.38730788230895996, + "learning_rate": 2.912123076923077e-05, + "loss": 0.0224, + "step": 5359 + }, + { + "epoch": 34.8051948051948, + "grad_norm": 0.45913487672805786, + "learning_rate": 2.9120923076923077e-05, + "loss": 0.0222, + "step": 5360 + }, + { + "epoch": 34.811688311688314, + "grad_norm": 0.3380180597305298, + "learning_rate": 2.9120615384615386e-05, + "loss": 0.0195, + "step": 5361 + }, + { + "epoch": 34.81818181818182, + "grad_norm": 0.2975977957248688, + "learning_rate": 2.9120307692307694e-05, + "loss": 0.0157, + "step": 5362 + }, + { + "epoch": 34.824675324675326, + "grad_norm": 0.3227158188819885, + "learning_rate": 2.9120000000000002e-05, + "loss": 0.018, + "step": 5363 + }, + { + "epoch": 34.83116883116883, + "grad_norm": 0.5855149626731873, + "learning_rate": 2.9119692307692307e-05, + "loss": 0.0173, + "step": 5364 + }, + { + "epoch": 34.83766233766234, + "grad_norm": 0.46049290895462036, + "learning_rate": 2.9119384615384616e-05, + "loss": 0.0155, + "step": 5365 + }, + { + "epoch": 34.84415584415584, + "grad_norm": 0.7055251598358154, + "learning_rate": 2.9119076923076924e-05, + "loss": 0.0163, + "step": 5366 + }, + { + "epoch": 34.85064935064935, + "grad_norm": 0.6814199090003967, + "learning_rate": 2.911876923076923e-05, + "loss": 0.0232, + "step": 5367 + }, + { + "epoch": 34.857142857142854, + "grad_norm": 0.5258548259735107, + "learning_rate": 2.911846153846154e-05, + "loss": 0.0164, + "step": 5368 + }, + { + "epoch": 34.86363636363637, + "grad_norm": 0.9523411989212036, + "learning_rate": 2.911815384615385e-05, + "loss": 0.0211, + "step": 5369 + }, + { + "epoch": 34.87012987012987, + "grad_norm": 0.5174683928489685, + "learning_rate": 2.9117846153846154e-05, + "loss": 0.0129, + "step": 5370 + }, + { + "epoch": 34.87662337662338, + "grad_norm": 0.561558187007904, + "learning_rate": 2.9117538461538462e-05, + "loss": 0.0147, + "step": 5371 + }, + { + "epoch": 34.883116883116884, + "grad_norm": 2.3458762168884277, + "learning_rate": 2.911723076923077e-05, + "loss": 0.0175, + "step": 5372 + }, + { + "epoch": 34.88961038961039, + "grad_norm": 0.3211030960083008, + "learning_rate": 2.9116923076923075e-05, + "loss": 0.0056, + "step": 5373 + }, + { + "epoch": 34.896103896103895, + "grad_norm": 0.8096930980682373, + "learning_rate": 2.9116615384615384e-05, + "loss": 0.0132, + "step": 5374 + }, + { + "epoch": 34.9025974025974, + "grad_norm": 0.5573617815971375, + "learning_rate": 2.9116307692307695e-05, + "loss": 0.0292, + "step": 5375 + }, + { + "epoch": 34.90909090909091, + "grad_norm": 0.47696778178215027, + "learning_rate": 2.9116e-05, + "loss": 0.0063, + "step": 5376 + }, + { + "epoch": 34.91558441558441, + "grad_norm": 1.0395174026489258, + "learning_rate": 2.911569230769231e-05, + "loss": 0.0169, + "step": 5377 + }, + { + "epoch": 34.922077922077925, + "grad_norm": 0.4949539005756378, + "learning_rate": 2.9115384615384617e-05, + "loss": 0.0058, + "step": 5378 + }, + { + "epoch": 34.92857142857143, + "grad_norm": 0.8717275857925415, + "learning_rate": 2.9115076923076922e-05, + "loss": 0.0383, + "step": 5379 + }, + { + "epoch": 34.935064935064936, + "grad_norm": 0.3513164818286896, + "learning_rate": 2.911476923076923e-05, + "loss": 0.0057, + "step": 5380 + }, + { + "epoch": 34.94155844155844, + "grad_norm": 0.31253522634506226, + "learning_rate": 2.911446153846154e-05, + "loss": 0.0042, + "step": 5381 + }, + { + "epoch": 34.94805194805195, + "grad_norm": 0.5153536796569824, + "learning_rate": 2.9114153846153847e-05, + "loss": 0.0461, + "step": 5382 + }, + { + "epoch": 34.95454545454545, + "grad_norm": 1.0771937370300293, + "learning_rate": 2.9113846153846155e-05, + "loss": 0.0393, + "step": 5383 + }, + { + "epoch": 34.96103896103896, + "grad_norm": 0.6515333652496338, + "learning_rate": 2.9113538461538464e-05, + "loss": 0.0106, + "step": 5384 + }, + { + "epoch": 34.967532467532465, + "grad_norm": 1.4070911407470703, + "learning_rate": 2.911323076923077e-05, + "loss": 0.0234, + "step": 5385 + }, + { + "epoch": 34.97402597402598, + "grad_norm": 1.3474302291870117, + "learning_rate": 2.9112923076923077e-05, + "loss": 0.0124, + "step": 5386 + }, + { + "epoch": 34.98051948051948, + "grad_norm": 1.4982504844665527, + "learning_rate": 2.9112615384615385e-05, + "loss": 0.0242, + "step": 5387 + }, + { + "epoch": 34.98701298701299, + "grad_norm": 0.6688902378082275, + "learning_rate": 2.911230769230769e-05, + "loss": 0.0087, + "step": 5388 + }, + { + "epoch": 34.993506493506494, + "grad_norm": 1.5412135124206543, + "learning_rate": 2.9112000000000002e-05, + "loss": 0.0358, + "step": 5389 + }, + { + "epoch": 35.0, + "grad_norm": 0.770210325717926, + "learning_rate": 2.911169230769231e-05, + "loss": 0.0153, + "step": 5390 + }, + { + "epoch": 35.006493506493506, + "grad_norm": 0.6208969354629517, + "learning_rate": 2.9111384615384615e-05, + "loss": 0.0743, + "step": 5391 + }, + { + "epoch": 35.01298701298701, + "grad_norm": 0.46510785818099976, + "learning_rate": 2.9111076923076923e-05, + "loss": 0.0492, + "step": 5392 + }, + { + "epoch": 35.01948051948052, + "grad_norm": 0.4651740789413452, + "learning_rate": 2.9110769230769232e-05, + "loss": 0.0589, + "step": 5393 + }, + { + "epoch": 35.02597402597402, + "grad_norm": 0.3621070981025696, + "learning_rate": 2.9110461538461537e-05, + "loss": 0.0424, + "step": 5394 + }, + { + "epoch": 35.032467532467535, + "grad_norm": 0.511978268623352, + "learning_rate": 2.9110153846153845e-05, + "loss": 0.0281, + "step": 5395 + }, + { + "epoch": 35.03896103896104, + "grad_norm": 0.3234733045101166, + "learning_rate": 2.9109846153846157e-05, + "loss": 0.024, + "step": 5396 + }, + { + "epoch": 35.04545454545455, + "grad_norm": 0.4431886076927185, + "learning_rate": 2.910953846153846e-05, + "loss": 0.0244, + "step": 5397 + }, + { + "epoch": 35.05194805194805, + "grad_norm": 0.544076144695282, + "learning_rate": 2.910923076923077e-05, + "loss": 0.0217, + "step": 5398 + }, + { + "epoch": 35.05844155844156, + "grad_norm": 0.5601357221603394, + "learning_rate": 2.9108923076923078e-05, + "loss": 0.0253, + "step": 5399 + }, + { + "epoch": 35.064935064935064, + "grad_norm": 0.6342061161994934, + "learning_rate": 2.9108615384615383e-05, + "loss": 0.0189, + "step": 5400 + }, + { + "epoch": 35.07142857142857, + "grad_norm": 0.5895953178405762, + "learning_rate": 2.910830769230769e-05, + "loss": 0.0215, + "step": 5401 + }, + { + "epoch": 35.077922077922075, + "grad_norm": 0.6751048564910889, + "learning_rate": 2.9108000000000003e-05, + "loss": 0.0158, + "step": 5402 + }, + { + "epoch": 35.08441558441559, + "grad_norm": 0.27848485112190247, + "learning_rate": 2.9107692307692308e-05, + "loss": 0.0107, + "step": 5403 + }, + { + "epoch": 35.09090909090909, + "grad_norm": 0.2510458827018738, + "learning_rate": 2.9107384615384616e-05, + "loss": 0.0096, + "step": 5404 + }, + { + "epoch": 35.0974025974026, + "grad_norm": 0.5650168061256409, + "learning_rate": 2.9107076923076925e-05, + "loss": 0.0135, + "step": 5405 + }, + { + "epoch": 35.103896103896105, + "grad_norm": 0.565434992313385, + "learning_rate": 2.910676923076923e-05, + "loss": 0.0219, + "step": 5406 + }, + { + "epoch": 35.11038961038961, + "grad_norm": 0.4377562701702118, + "learning_rate": 2.9106461538461538e-05, + "loss": 0.0101, + "step": 5407 + }, + { + "epoch": 35.116883116883116, + "grad_norm": 0.5619506239891052, + "learning_rate": 2.9106153846153846e-05, + "loss": 0.0117, + "step": 5408 + }, + { + "epoch": 35.12337662337662, + "grad_norm": 0.8575261831283569, + "learning_rate": 2.9105846153846155e-05, + "loss": 0.0224, + "step": 5409 + }, + { + "epoch": 35.12987012987013, + "grad_norm": 0.3889850080013275, + "learning_rate": 2.9105538461538463e-05, + "loss": 0.0048, + "step": 5410 + }, + { + "epoch": 35.13636363636363, + "grad_norm": 0.4632609188556671, + "learning_rate": 2.910523076923077e-05, + "loss": 0.0111, + "step": 5411 + }, + { + "epoch": 35.142857142857146, + "grad_norm": 0.3049633502960205, + "learning_rate": 2.9104923076923076e-05, + "loss": 0.0064, + "step": 5412 + }, + { + "epoch": 35.14935064935065, + "grad_norm": 0.44959554076194763, + "learning_rate": 2.9104615384615385e-05, + "loss": 0.0073, + "step": 5413 + }, + { + "epoch": 35.15584415584416, + "grad_norm": 0.7940086722373962, + "learning_rate": 2.9104307692307693e-05, + "loss": 0.0287, + "step": 5414 + }, + { + "epoch": 35.16233766233766, + "grad_norm": 0.401343435049057, + "learning_rate": 2.9103999999999998e-05, + "loss": 0.0206, + "step": 5415 + }, + { + "epoch": 35.16883116883117, + "grad_norm": 0.22474849224090576, + "learning_rate": 2.910369230769231e-05, + "loss": 0.0053, + "step": 5416 + }, + { + "epoch": 35.175324675324674, + "grad_norm": 1.3230173587799072, + "learning_rate": 2.9103384615384618e-05, + "loss": 0.048, + "step": 5417 + }, + { + "epoch": 35.18181818181818, + "grad_norm": 0.7631206512451172, + "learning_rate": 2.9103076923076923e-05, + "loss": 0.0121, + "step": 5418 + }, + { + "epoch": 35.188311688311686, + "grad_norm": 0.4835094213485718, + "learning_rate": 2.910276923076923e-05, + "loss": 0.0053, + "step": 5419 + }, + { + "epoch": 35.1948051948052, + "grad_norm": 0.7741308212280273, + "learning_rate": 2.910246153846154e-05, + "loss": 0.015, + "step": 5420 + }, + { + "epoch": 35.201298701298704, + "grad_norm": 0.5356241464614868, + "learning_rate": 2.9102153846153848e-05, + "loss": 0.0058, + "step": 5421 + }, + { + "epoch": 35.20779220779221, + "grad_norm": 0.994607150554657, + "learning_rate": 2.9101846153846153e-05, + "loss": 0.0225, + "step": 5422 + }, + { + "epoch": 35.214285714285715, + "grad_norm": 0.789943516254425, + "learning_rate": 2.9101538461538464e-05, + "loss": 0.0095, + "step": 5423 + }, + { + "epoch": 35.22077922077922, + "grad_norm": 1.6964573860168457, + "learning_rate": 2.9101230769230773e-05, + "loss": 0.0216, + "step": 5424 + }, + { + "epoch": 35.22727272727273, + "grad_norm": 21.47344207763672, + "learning_rate": 2.9100923076923078e-05, + "loss": 0.0557, + "step": 5425 + }, + { + "epoch": 35.23376623376623, + "grad_norm": 0.30064910650253296, + "learning_rate": 2.9100615384615386e-05, + "loss": 0.0032, + "step": 5426 + }, + { + "epoch": 35.24025974025974, + "grad_norm": 0.12497668713331223, + "learning_rate": 2.9100307692307694e-05, + "loss": 0.0017, + "step": 5427 + }, + { + "epoch": 35.246753246753244, + "grad_norm": 0.7868456840515137, + "learning_rate": 2.91e-05, + "loss": 0.0458, + "step": 5428 + }, + { + "epoch": 35.253246753246756, + "grad_norm": 1.1124745607376099, + "learning_rate": 2.9099692307692308e-05, + "loss": 0.1121, + "step": 5429 + }, + { + "epoch": 35.25974025974026, + "grad_norm": 0.5016067028045654, + "learning_rate": 2.909938461538462e-05, + "loss": 0.0568, + "step": 5430 + }, + { + "epoch": 35.26623376623377, + "grad_norm": 0.6188507676124573, + "learning_rate": 2.9099076923076924e-05, + "loss": 0.0366, + "step": 5431 + }, + { + "epoch": 35.27272727272727, + "grad_norm": 0.4792235493659973, + "learning_rate": 2.9098769230769232e-05, + "loss": 0.0424, + "step": 5432 + }, + { + "epoch": 35.27922077922078, + "grad_norm": 0.39298224449157715, + "learning_rate": 2.909846153846154e-05, + "loss": 0.0355, + "step": 5433 + }, + { + "epoch": 35.285714285714285, + "grad_norm": 0.3892764151096344, + "learning_rate": 2.9098153846153846e-05, + "loss": 0.0255, + "step": 5434 + }, + { + "epoch": 35.29220779220779, + "grad_norm": 0.39262932538986206, + "learning_rate": 2.9097846153846154e-05, + "loss": 0.0347, + "step": 5435 + }, + { + "epoch": 35.298701298701296, + "grad_norm": 0.4668377637863159, + "learning_rate": 2.9097538461538462e-05, + "loss": 0.0227, + "step": 5436 + }, + { + "epoch": 35.3051948051948, + "grad_norm": 0.5487108826637268, + "learning_rate": 2.909723076923077e-05, + "loss": 0.0213, + "step": 5437 + }, + { + "epoch": 35.311688311688314, + "grad_norm": 0.49824461340904236, + "learning_rate": 2.909692307692308e-05, + "loss": 0.0236, + "step": 5438 + }, + { + "epoch": 35.31818181818182, + "grad_norm": 0.7446942925453186, + "learning_rate": 2.9096615384615387e-05, + "loss": 0.0254, + "step": 5439 + }, + { + "epoch": 35.324675324675326, + "grad_norm": 0.4612240195274353, + "learning_rate": 2.9096307692307692e-05, + "loss": 0.0164, + "step": 5440 + }, + { + "epoch": 35.33116883116883, + "grad_norm": 0.514288604259491, + "learning_rate": 2.9096e-05, + "loss": 0.0197, + "step": 5441 + }, + { + "epoch": 35.33766233766234, + "grad_norm": 0.328241229057312, + "learning_rate": 2.909569230769231e-05, + "loss": 0.0089, + "step": 5442 + }, + { + "epoch": 35.34415584415584, + "grad_norm": 0.49577921628952026, + "learning_rate": 2.9095384615384614e-05, + "loss": 0.0097, + "step": 5443 + }, + { + "epoch": 35.35064935064935, + "grad_norm": 0.27970683574676514, + "learning_rate": 2.9095076923076926e-05, + "loss": 0.0095, + "step": 5444 + }, + { + "epoch": 35.357142857142854, + "grad_norm": 0.5652754306793213, + "learning_rate": 2.9094769230769234e-05, + "loss": 0.0114, + "step": 5445 + }, + { + "epoch": 35.36363636363637, + "grad_norm": 0.7900232672691345, + "learning_rate": 2.909446153846154e-05, + "loss": 0.0162, + "step": 5446 + }, + { + "epoch": 35.37012987012987, + "grad_norm": 0.5476013422012329, + "learning_rate": 2.9094153846153847e-05, + "loss": 0.0164, + "step": 5447 + }, + { + "epoch": 35.37662337662338, + "grad_norm": 0.32560667395591736, + "learning_rate": 2.9093846153846155e-05, + "loss": 0.0149, + "step": 5448 + }, + { + "epoch": 35.383116883116884, + "grad_norm": 0.2903243899345398, + "learning_rate": 2.909353846153846e-05, + "loss": 0.0053, + "step": 5449 + }, + { + "epoch": 35.38961038961039, + "grad_norm": 0.5862712264060974, + "learning_rate": 2.909323076923077e-05, + "loss": 0.0064, + "step": 5450 + }, + { + "epoch": 35.396103896103895, + "grad_norm": 0.44692492485046387, + "learning_rate": 2.909292307692308e-05, + "loss": 0.012, + "step": 5451 + }, + { + "epoch": 35.4025974025974, + "grad_norm": 0.35976046323776245, + "learning_rate": 2.9092615384615385e-05, + "loss": 0.0064, + "step": 5452 + }, + { + "epoch": 35.40909090909091, + "grad_norm": 0.5869482159614563, + "learning_rate": 2.9092307692307694e-05, + "loss": 0.0128, + "step": 5453 + }, + { + "epoch": 35.41558441558441, + "grad_norm": 0.48824265599250793, + "learning_rate": 2.9092000000000002e-05, + "loss": 0.0054, + "step": 5454 + }, + { + "epoch": 35.422077922077925, + "grad_norm": 0.7205779552459717, + "learning_rate": 2.9091692307692307e-05, + "loss": 0.0133, + "step": 5455 + }, + { + "epoch": 35.42857142857143, + "grad_norm": 0.9559537172317505, + "learning_rate": 2.9091384615384615e-05, + "loss": 0.0214, + "step": 5456 + }, + { + "epoch": 35.435064935064936, + "grad_norm": 0.31414738297462463, + "learning_rate": 2.9091076923076924e-05, + "loss": 0.0053, + "step": 5457 + }, + { + "epoch": 35.44155844155844, + "grad_norm": 0.20352095365524292, + "learning_rate": 2.9090769230769232e-05, + "loss": 0.0052, + "step": 5458 + }, + { + "epoch": 35.44805194805195, + "grad_norm": 0.8375682234764099, + "learning_rate": 2.909046153846154e-05, + "loss": 0.009, + "step": 5459 + }, + { + "epoch": 35.45454545454545, + "grad_norm": 0.6300340890884399, + "learning_rate": 2.909015384615385e-05, + "loss": 0.0037, + "step": 5460 + }, + { + "epoch": 35.46103896103896, + "grad_norm": 0.614241898059845, + "learning_rate": 2.9089846153846153e-05, + "loss": 0.0032, + "step": 5461 + }, + { + "epoch": 35.467532467532465, + "grad_norm": 0.464739590883255, + "learning_rate": 2.9089538461538462e-05, + "loss": 0.0052, + "step": 5462 + }, + { + "epoch": 35.47402597402598, + "grad_norm": 0.18681010603904724, + "learning_rate": 2.908923076923077e-05, + "loss": 0.0024, + "step": 5463 + }, + { + "epoch": 35.48051948051948, + "grad_norm": 0.9883227348327637, + "learning_rate": 2.9088923076923075e-05, + "loss": 0.029, + "step": 5464 + }, + { + "epoch": 35.48701298701299, + "grad_norm": 0.5163398385047913, + "learning_rate": 2.9088615384615387e-05, + "loss": 0.0048, + "step": 5465 + }, + { + "epoch": 35.493506493506494, + "grad_norm": 1.1780750751495361, + "learning_rate": 2.9088307692307695e-05, + "loss": 0.0088, + "step": 5466 + }, + { + "epoch": 35.5, + "grad_norm": 0.8101599812507629, + "learning_rate": 2.9088e-05, + "loss": 0.0873, + "step": 5467 + }, + { + "epoch": 35.506493506493506, + "grad_norm": 0.5987259745597839, + "learning_rate": 2.908769230769231e-05, + "loss": 0.0667, + "step": 5468 + }, + { + "epoch": 35.51298701298701, + "grad_norm": 0.44257867336273193, + "learning_rate": 2.9087384615384617e-05, + "loss": 0.0408, + "step": 5469 + }, + { + "epoch": 35.51948051948052, + "grad_norm": 0.38900884985923767, + "learning_rate": 2.908707692307692e-05, + "loss": 0.0362, + "step": 5470 + }, + { + "epoch": 35.52597402597402, + "grad_norm": 0.3826066851615906, + "learning_rate": 2.908676923076923e-05, + "loss": 0.0266, + "step": 5471 + }, + { + "epoch": 35.532467532467535, + "grad_norm": 0.3471587300300598, + "learning_rate": 2.908646153846154e-05, + "loss": 0.0224, + "step": 5472 + }, + { + "epoch": 35.53896103896104, + "grad_norm": 0.4912363588809967, + "learning_rate": 2.9086153846153847e-05, + "loss": 0.0274, + "step": 5473 + }, + { + "epoch": 35.54545454545455, + "grad_norm": 0.36949363350868225, + "learning_rate": 2.9085846153846155e-05, + "loss": 0.0218, + "step": 5474 + }, + { + "epoch": 35.55194805194805, + "grad_norm": 0.4779680371284485, + "learning_rate": 2.9085538461538463e-05, + "loss": 0.0205, + "step": 5475 + }, + { + "epoch": 35.55844155844156, + "grad_norm": 0.6375389695167542, + "learning_rate": 2.9085230769230768e-05, + "loss": 0.0166, + "step": 5476 + }, + { + "epoch": 35.564935064935064, + "grad_norm": 0.5356177687644958, + "learning_rate": 2.9084923076923076e-05, + "loss": 0.0197, + "step": 5477 + }, + { + "epoch": 35.57142857142857, + "grad_norm": 0.6952133178710938, + "learning_rate": 2.9084615384615385e-05, + "loss": 0.0178, + "step": 5478 + }, + { + "epoch": 35.577922077922075, + "grad_norm": 0.44713854789733887, + "learning_rate": 2.9084307692307693e-05, + "loss": 0.0203, + "step": 5479 + }, + { + "epoch": 35.58441558441559, + "grad_norm": 0.3580329418182373, + "learning_rate": 2.9084e-05, + "loss": 0.0058, + "step": 5480 + }, + { + "epoch": 35.59090909090909, + "grad_norm": 0.47033828496932983, + "learning_rate": 2.908369230769231e-05, + "loss": 0.0116, + "step": 5481 + }, + { + "epoch": 35.5974025974026, + "grad_norm": 0.6218892931938171, + "learning_rate": 2.9083384615384615e-05, + "loss": 0.035, + "step": 5482 + }, + { + "epoch": 35.603896103896105, + "grad_norm": 0.49907755851745605, + "learning_rate": 2.9083076923076923e-05, + "loss": 0.0155, + "step": 5483 + }, + { + "epoch": 35.61038961038961, + "grad_norm": 0.3479296565055847, + "learning_rate": 2.908276923076923e-05, + "loss": 0.0055, + "step": 5484 + }, + { + "epoch": 35.616883116883116, + "grad_norm": 0.3294694125652313, + "learning_rate": 2.9082461538461536e-05, + "loss": 0.0089, + "step": 5485 + }, + { + "epoch": 35.62337662337662, + "grad_norm": 0.5581619143486023, + "learning_rate": 2.9082153846153848e-05, + "loss": 0.0114, + "step": 5486 + }, + { + "epoch": 35.62987012987013, + "grad_norm": 0.15970663726329803, + "learning_rate": 2.9081846153846156e-05, + "loss": 0.003, + "step": 5487 + }, + { + "epoch": 35.63636363636363, + "grad_norm": 0.7730697989463806, + "learning_rate": 2.908153846153846e-05, + "loss": 0.0111, + "step": 5488 + }, + { + "epoch": 35.642857142857146, + "grad_norm": 0.6104624271392822, + "learning_rate": 2.908123076923077e-05, + "loss": 0.0113, + "step": 5489 + }, + { + "epoch": 35.64935064935065, + "grad_norm": 0.28909027576446533, + "learning_rate": 2.9080923076923078e-05, + "loss": 0.0163, + "step": 5490 + }, + { + "epoch": 35.65584415584416, + "grad_norm": 0.5803835988044739, + "learning_rate": 2.9080615384615383e-05, + "loss": 0.0087, + "step": 5491 + }, + { + "epoch": 35.66233766233766, + "grad_norm": 1.817765712738037, + "learning_rate": 2.908030769230769e-05, + "loss": 0.0166, + "step": 5492 + }, + { + "epoch": 35.66883116883117, + "grad_norm": 0.38326776027679443, + "learning_rate": 2.9080000000000003e-05, + "loss": 0.0056, + "step": 5493 + }, + { + "epoch": 35.675324675324674, + "grad_norm": 0.3740791976451874, + "learning_rate": 2.9079692307692308e-05, + "loss": 0.0037, + "step": 5494 + }, + { + "epoch": 35.68181818181818, + "grad_norm": 0.6539549231529236, + "learning_rate": 2.9079384615384616e-05, + "loss": 0.03, + "step": 5495 + }, + { + "epoch": 35.688311688311686, + "grad_norm": 1.2329083681106567, + "learning_rate": 2.9079076923076924e-05, + "loss": 0.0142, + "step": 5496 + }, + { + "epoch": 35.6948051948052, + "grad_norm": 0.6088599562644958, + "learning_rate": 2.9078769230769233e-05, + "loss": 0.0087, + "step": 5497 + }, + { + "epoch": 35.701298701298704, + "grad_norm": 0.40001776814460754, + "learning_rate": 2.9078461538461538e-05, + "loss": 0.007, + "step": 5498 + }, + { + "epoch": 35.70779220779221, + "grad_norm": 1.0654664039611816, + "learning_rate": 2.9078153846153846e-05, + "loss": 0.0232, + "step": 5499 + }, + { + "epoch": 35.714285714285715, + "grad_norm": 0.4179539084434509, + "learning_rate": 2.9077846153846158e-05, + "loss": 0.0076, + "step": 5500 + }, + { + "epoch": 35.72077922077922, + "grad_norm": 0.60594242811203, + "learning_rate": 2.9077538461538463e-05, + "loss": 0.0138, + "step": 5501 + }, + { + "epoch": 35.72727272727273, + "grad_norm": 0.6405591368675232, + "learning_rate": 2.907723076923077e-05, + "loss": 0.0122, + "step": 5502 + }, + { + "epoch": 35.73376623376623, + "grad_norm": 0.49801746010780334, + "learning_rate": 2.907692307692308e-05, + "loss": 0.0066, + "step": 5503 + }, + { + "epoch": 35.74025974025974, + "grad_norm": 1.2940011024475098, + "learning_rate": 2.9076615384615384e-05, + "loss": 0.0237, + "step": 5504 + }, + { + "epoch": 35.746753246753244, + "grad_norm": 0.5902006030082703, + "learning_rate": 2.9076307692307692e-05, + "loss": 0.0761, + "step": 5505 + }, + { + "epoch": 35.753246753246756, + "grad_norm": 0.5554302930831909, + "learning_rate": 2.9076e-05, + "loss": 0.075, + "step": 5506 + }, + { + "epoch": 35.75974025974026, + "grad_norm": 0.3603295087814331, + "learning_rate": 2.907569230769231e-05, + "loss": 0.0425, + "step": 5507 + }, + { + "epoch": 35.76623376623377, + "grad_norm": 0.6961526274681091, + "learning_rate": 2.9075384615384617e-05, + "loss": 0.0552, + "step": 5508 + }, + { + "epoch": 35.77272727272727, + "grad_norm": 0.3742905557155609, + "learning_rate": 2.9075076923076926e-05, + "loss": 0.0378, + "step": 5509 + }, + { + "epoch": 35.77922077922078, + "grad_norm": 0.3611226975917816, + "learning_rate": 2.907476923076923e-05, + "loss": 0.0259, + "step": 5510 + }, + { + "epoch": 35.785714285714285, + "grad_norm": 0.5611889958381653, + "learning_rate": 2.907446153846154e-05, + "loss": 0.0294, + "step": 5511 + }, + { + "epoch": 35.79220779220779, + "grad_norm": 0.6996146440505981, + "learning_rate": 2.9074153846153847e-05, + "loss": 0.0216, + "step": 5512 + }, + { + "epoch": 35.798701298701296, + "grad_norm": 0.4185541570186615, + "learning_rate": 2.9073846153846152e-05, + "loss": 0.0403, + "step": 5513 + }, + { + "epoch": 35.8051948051948, + "grad_norm": 0.40230315923690796, + "learning_rate": 2.9073538461538464e-05, + "loss": 0.0204, + "step": 5514 + }, + { + "epoch": 35.811688311688314, + "grad_norm": 0.7334201335906982, + "learning_rate": 2.9073230769230772e-05, + "loss": 0.0156, + "step": 5515 + }, + { + "epoch": 35.81818181818182, + "grad_norm": 0.35723817348480225, + "learning_rate": 2.9072923076923077e-05, + "loss": 0.0146, + "step": 5516 + }, + { + "epoch": 35.824675324675326, + "grad_norm": 0.25425267219543457, + "learning_rate": 2.9072615384615386e-05, + "loss": 0.0115, + "step": 5517 + }, + { + "epoch": 35.83116883116883, + "grad_norm": 0.3605209290981293, + "learning_rate": 2.9072307692307694e-05, + "loss": 0.0128, + "step": 5518 + }, + { + "epoch": 35.83766233766234, + "grad_norm": 0.25324591994285583, + "learning_rate": 2.9072e-05, + "loss": 0.0092, + "step": 5519 + }, + { + "epoch": 35.84415584415584, + "grad_norm": 0.31561002135276794, + "learning_rate": 2.9071692307692307e-05, + "loss": 0.0084, + "step": 5520 + }, + { + "epoch": 35.85064935064935, + "grad_norm": 0.800163209438324, + "learning_rate": 2.907138461538462e-05, + "loss": 0.018, + "step": 5521 + }, + { + "epoch": 35.857142857142854, + "grad_norm": 0.3143797516822815, + "learning_rate": 2.9071076923076924e-05, + "loss": 0.0061, + "step": 5522 + }, + { + "epoch": 35.86363636363637, + "grad_norm": 0.24487708508968353, + "learning_rate": 2.9070769230769232e-05, + "loss": 0.0049, + "step": 5523 + }, + { + "epoch": 35.87012987012987, + "grad_norm": 0.7393335103988647, + "learning_rate": 2.907046153846154e-05, + "loss": 0.0183, + "step": 5524 + }, + { + "epoch": 35.87662337662338, + "grad_norm": 0.8401263356208801, + "learning_rate": 2.9070153846153845e-05, + "loss": 0.0253, + "step": 5525 + }, + { + "epoch": 35.883116883116884, + "grad_norm": 0.506006121635437, + "learning_rate": 2.9069846153846154e-05, + "loss": 0.0067, + "step": 5526 + }, + { + "epoch": 35.88961038961039, + "grad_norm": 0.35310667753219604, + "learning_rate": 2.9069538461538462e-05, + "loss": 0.0097, + "step": 5527 + }, + { + "epoch": 35.896103896103895, + "grad_norm": 0.23659542202949524, + "learning_rate": 2.906923076923077e-05, + "loss": 0.0039, + "step": 5528 + }, + { + "epoch": 35.9025974025974, + "grad_norm": 0.3253742456436157, + "learning_rate": 2.906892307692308e-05, + "loss": 0.0087, + "step": 5529 + }, + { + "epoch": 35.90909090909091, + "grad_norm": 2.2575976848602295, + "learning_rate": 2.9068615384615387e-05, + "loss": 0.0206, + "step": 5530 + }, + { + "epoch": 35.91558441558441, + "grad_norm": 1.0390942096710205, + "learning_rate": 2.9068307692307692e-05, + "loss": 0.0126, + "step": 5531 + }, + { + "epoch": 35.922077922077925, + "grad_norm": 0.5909035801887512, + "learning_rate": 2.9068e-05, + "loss": 0.0133, + "step": 5532 + }, + { + "epoch": 35.92857142857143, + "grad_norm": 0.6804397106170654, + "learning_rate": 2.906769230769231e-05, + "loss": 0.0186, + "step": 5533 + }, + { + "epoch": 35.935064935064936, + "grad_norm": 1.6290465593338013, + "learning_rate": 2.9067384615384613e-05, + "loss": 0.0128, + "step": 5534 + }, + { + "epoch": 35.94155844155844, + "grad_norm": 1.1904802322387695, + "learning_rate": 2.9067076923076925e-05, + "loss": 0.0248, + "step": 5535 + }, + { + "epoch": 35.94805194805195, + "grad_norm": 0.44184446334838867, + "learning_rate": 2.9066769230769233e-05, + "loss": 0.0527, + "step": 5536 + }, + { + "epoch": 35.95454545454545, + "grad_norm": 0.5022152662277222, + "learning_rate": 2.906646153846154e-05, + "loss": 0.0157, + "step": 5537 + }, + { + "epoch": 35.96103896103896, + "grad_norm": 1.0548217296600342, + "learning_rate": 2.9066153846153847e-05, + "loss": 0.0117, + "step": 5538 + }, + { + "epoch": 35.967532467532465, + "grad_norm": 0.5167074203491211, + "learning_rate": 2.9065846153846155e-05, + "loss": 0.0069, + "step": 5539 + }, + { + "epoch": 35.97402597402598, + "grad_norm": 0.6413472294807434, + "learning_rate": 2.906553846153846e-05, + "loss": 0.0131, + "step": 5540 + }, + { + "epoch": 35.98051948051948, + "grad_norm": 1.2182412147521973, + "learning_rate": 2.906523076923077e-05, + "loss": 0.0132, + "step": 5541 + }, + { + "epoch": 35.98701298701299, + "grad_norm": 1.4319102764129639, + "learning_rate": 2.906492307692308e-05, + "loss": 0.0201, + "step": 5542 + }, + { + "epoch": 35.993506493506494, + "grad_norm": 0.5336865186691284, + "learning_rate": 2.9064615384615385e-05, + "loss": 0.0435, + "step": 5543 + }, + { + "epoch": 36.0, + "grad_norm": 0.4530264437198639, + "learning_rate": 2.9064307692307693e-05, + "loss": 0.0097, + "step": 5544 + }, + { + "epoch": 36.006493506493506, + "grad_norm": 0.5937870740890503, + "learning_rate": 2.9064e-05, + "loss": 0.071, + "step": 5545 + }, + { + "epoch": 36.01298701298701, + "grad_norm": 0.5256974697113037, + "learning_rate": 2.9063692307692307e-05, + "loss": 0.056, + "step": 5546 + }, + { + "epoch": 36.01948051948052, + "grad_norm": 0.3508014380931854, + "learning_rate": 2.9063384615384615e-05, + "loss": 0.0302, + "step": 5547 + }, + { + "epoch": 36.02597402597402, + "grad_norm": 0.34796005487442017, + "learning_rate": 2.9063076923076923e-05, + "loss": 0.0407, + "step": 5548 + }, + { + "epoch": 36.032467532467535, + "grad_norm": 0.380829393863678, + "learning_rate": 2.906276923076923e-05, + "loss": 0.0276, + "step": 5549 + }, + { + "epoch": 36.03896103896104, + "grad_norm": 0.3318963646888733, + "learning_rate": 2.906246153846154e-05, + "loss": 0.0161, + "step": 5550 + }, + { + "epoch": 36.04545454545455, + "grad_norm": 0.2507683336734772, + "learning_rate": 2.9062153846153848e-05, + "loss": 0.0171, + "step": 5551 + }, + { + "epoch": 36.05194805194805, + "grad_norm": 0.35036081075668335, + "learning_rate": 2.9061846153846153e-05, + "loss": 0.0201, + "step": 5552 + }, + { + "epoch": 36.05844155844156, + "grad_norm": 0.505987823009491, + "learning_rate": 2.906153846153846e-05, + "loss": 0.0217, + "step": 5553 + }, + { + "epoch": 36.064935064935064, + "grad_norm": 0.3023730218410492, + "learning_rate": 2.906123076923077e-05, + "loss": 0.012, + "step": 5554 + }, + { + "epoch": 36.07142857142857, + "grad_norm": 0.5266294479370117, + "learning_rate": 2.9060923076923075e-05, + "loss": 0.0156, + "step": 5555 + }, + { + "epoch": 36.077922077922075, + "grad_norm": 0.37526872754096985, + "learning_rate": 2.9060615384615386e-05, + "loss": 0.0146, + "step": 5556 + }, + { + "epoch": 36.08441558441559, + "grad_norm": 0.29122111201286316, + "learning_rate": 2.9060307692307695e-05, + "loss": 0.0101, + "step": 5557 + }, + { + "epoch": 36.09090909090909, + "grad_norm": 0.31415414810180664, + "learning_rate": 2.906e-05, + "loss": 0.0107, + "step": 5558 + }, + { + "epoch": 36.0974025974026, + "grad_norm": 0.6353932023048401, + "learning_rate": 2.9059692307692308e-05, + "loss": 0.0137, + "step": 5559 + }, + { + "epoch": 36.103896103896105, + "grad_norm": 0.5115739703178406, + "learning_rate": 2.9059384615384616e-05, + "loss": 0.0386, + "step": 5560 + }, + { + "epoch": 36.11038961038961, + "grad_norm": 0.42253628373146057, + "learning_rate": 2.905907692307692e-05, + "loss": 0.0111, + "step": 5561 + }, + { + "epoch": 36.116883116883116, + "grad_norm": 0.27850422263145447, + "learning_rate": 2.905876923076923e-05, + "loss": 0.0092, + "step": 5562 + }, + { + "epoch": 36.12337662337662, + "grad_norm": 0.20719462633132935, + "learning_rate": 2.905846153846154e-05, + "loss": 0.0054, + "step": 5563 + }, + { + "epoch": 36.12987012987013, + "grad_norm": 0.6023554801940918, + "learning_rate": 2.9058153846153846e-05, + "loss": 0.0092, + "step": 5564 + }, + { + "epoch": 36.13636363636363, + "grad_norm": 1.097813606262207, + "learning_rate": 2.9057846153846154e-05, + "loss": 0.0105, + "step": 5565 + }, + { + "epoch": 36.142857142857146, + "grad_norm": 0.5676377415657043, + "learning_rate": 2.9057538461538463e-05, + "loss": 0.0112, + "step": 5566 + }, + { + "epoch": 36.14935064935065, + "grad_norm": 0.5977246165275574, + "learning_rate": 2.9057230769230768e-05, + "loss": 0.0198, + "step": 5567 + }, + { + "epoch": 36.15584415584416, + "grad_norm": 0.29906946420669556, + "learning_rate": 2.9056923076923076e-05, + "loss": 0.0043, + "step": 5568 + }, + { + "epoch": 36.16233766233766, + "grad_norm": 0.433550089597702, + "learning_rate": 2.9056615384615384e-05, + "loss": 0.0094, + "step": 5569 + }, + { + "epoch": 36.16883116883117, + "grad_norm": 0.6595407128334045, + "learning_rate": 2.9056307692307693e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 36.175324675324674, + "grad_norm": 0.9656956791877747, + "learning_rate": 2.9056e-05, + "loss": 0.019, + "step": 5571 + }, + { + "epoch": 36.18181818181818, + "grad_norm": 0.16969792544841766, + "learning_rate": 2.905569230769231e-05, + "loss": 0.002, + "step": 5572 + }, + { + "epoch": 36.188311688311686, + "grad_norm": 2.333681344985962, + "learning_rate": 2.9055384615384614e-05, + "loss": 0.0094, + "step": 5573 + }, + { + "epoch": 36.1948051948052, + "grad_norm": 0.4919402599334717, + "learning_rate": 2.9055076923076923e-05, + "loss": 0.0072, + "step": 5574 + }, + { + "epoch": 36.201298701298704, + "grad_norm": 0.46083393692970276, + "learning_rate": 2.905476923076923e-05, + "loss": 0.005, + "step": 5575 + }, + { + "epoch": 36.20779220779221, + "grad_norm": 1.5113096237182617, + "learning_rate": 2.905446153846154e-05, + "loss": 0.0686, + "step": 5576 + }, + { + "epoch": 36.214285714285715, + "grad_norm": 0.5647264122962952, + "learning_rate": 2.9054153846153848e-05, + "loss": 0.0139, + "step": 5577 + }, + { + "epoch": 36.22077922077922, + "grad_norm": 2.3536794185638428, + "learning_rate": 2.9053846153846156e-05, + "loss": 0.0303, + "step": 5578 + }, + { + "epoch": 36.22727272727273, + "grad_norm": 0.43088576197624207, + "learning_rate": 2.9053538461538464e-05, + "loss": 0.0136, + "step": 5579 + }, + { + "epoch": 36.23376623376623, + "grad_norm": 1.1980865001678467, + "learning_rate": 2.905323076923077e-05, + "loss": 0.0175, + "step": 5580 + }, + { + "epoch": 36.24025974025974, + "grad_norm": 0.4099942147731781, + "learning_rate": 2.9052923076923077e-05, + "loss": 0.0035, + "step": 5581 + }, + { + "epoch": 36.246753246753244, + "grad_norm": 1.6205854415893555, + "learning_rate": 2.9052615384615386e-05, + "loss": 0.048, + "step": 5582 + }, + { + "epoch": 36.253246753246756, + "grad_norm": 0.5835689306259155, + "learning_rate": 2.9052307692307694e-05, + "loss": 0.0781, + "step": 5583 + }, + { + "epoch": 36.25974025974026, + "grad_norm": 0.3758178651332855, + "learning_rate": 2.9052000000000002e-05, + "loss": 0.04, + "step": 5584 + }, + { + "epoch": 36.26623376623377, + "grad_norm": 0.6773163676261902, + "learning_rate": 2.905169230769231e-05, + "loss": 0.0431, + "step": 5585 + }, + { + "epoch": 36.27272727272727, + "grad_norm": 0.3295516073703766, + "learning_rate": 2.9051384615384616e-05, + "loss": 0.0303, + "step": 5586 + }, + { + "epoch": 36.27922077922078, + "grad_norm": 0.27221164107322693, + "learning_rate": 2.9051076923076924e-05, + "loss": 0.023, + "step": 5587 + }, + { + "epoch": 36.285714285714285, + "grad_norm": 0.3228011429309845, + "learning_rate": 2.9050769230769232e-05, + "loss": 0.0187, + "step": 5588 + }, + { + "epoch": 36.29220779220779, + "grad_norm": 0.3541872799396515, + "learning_rate": 2.9050461538461537e-05, + "loss": 0.0202, + "step": 5589 + }, + { + "epoch": 36.298701298701296, + "grad_norm": 0.5636196136474609, + "learning_rate": 2.905015384615385e-05, + "loss": 0.02, + "step": 5590 + }, + { + "epoch": 36.3051948051948, + "grad_norm": 0.8093762397766113, + "learning_rate": 2.9049846153846157e-05, + "loss": 0.0235, + "step": 5591 + }, + { + "epoch": 36.311688311688314, + "grad_norm": 0.35847240686416626, + "learning_rate": 2.9049538461538462e-05, + "loss": 0.0154, + "step": 5592 + }, + { + "epoch": 36.31818181818182, + "grad_norm": 0.4368458390235901, + "learning_rate": 2.904923076923077e-05, + "loss": 0.0152, + "step": 5593 + }, + { + "epoch": 36.324675324675326, + "grad_norm": 0.595810055732727, + "learning_rate": 2.904892307692308e-05, + "loss": 0.0131, + "step": 5594 + }, + { + "epoch": 36.33116883116883, + "grad_norm": 0.2645629644393921, + "learning_rate": 2.9048615384615384e-05, + "loss": 0.0077, + "step": 5595 + }, + { + "epoch": 36.33766233766234, + "grad_norm": 0.3524000942707062, + "learning_rate": 2.9048307692307692e-05, + "loss": 0.01, + "step": 5596 + }, + { + "epoch": 36.34415584415584, + "grad_norm": 0.42889848351478577, + "learning_rate": 2.9048000000000004e-05, + "loss": 0.0136, + "step": 5597 + }, + { + "epoch": 36.35064935064935, + "grad_norm": 0.6208891868591309, + "learning_rate": 2.904769230769231e-05, + "loss": 0.0175, + "step": 5598 + }, + { + "epoch": 36.357142857142854, + "grad_norm": 0.2720721960067749, + "learning_rate": 2.9047384615384617e-05, + "loss": 0.0113, + "step": 5599 + }, + { + "epoch": 36.36363636363637, + "grad_norm": 0.3387434184551239, + "learning_rate": 2.9047076923076925e-05, + "loss": 0.0071, + "step": 5600 + }, + { + "epoch": 36.37012987012987, + "grad_norm": 0.7956691384315491, + "learning_rate": 2.904676923076923e-05, + "loss": 0.0107, + "step": 5601 + }, + { + "epoch": 36.37662337662338, + "grad_norm": 0.5840033292770386, + "learning_rate": 2.904646153846154e-05, + "loss": 0.0161, + "step": 5602 + }, + { + "epoch": 36.383116883116884, + "grad_norm": 0.8667775988578796, + "learning_rate": 2.9046153846153847e-05, + "loss": 0.0095, + "step": 5603 + }, + { + "epoch": 36.38961038961039, + "grad_norm": 0.5208715796470642, + "learning_rate": 2.9045846153846155e-05, + "loss": 0.0115, + "step": 5604 + }, + { + "epoch": 36.396103896103895, + "grad_norm": 1.3019874095916748, + "learning_rate": 2.9045538461538464e-05, + "loss": 0.0109, + "step": 5605 + }, + { + "epoch": 36.4025974025974, + "grad_norm": 0.451679527759552, + "learning_rate": 2.9045230769230772e-05, + "loss": 0.0107, + "step": 5606 + }, + { + "epoch": 36.40909090909091, + "grad_norm": 0.4819625914096832, + "learning_rate": 2.9044923076923077e-05, + "loss": 0.0259, + "step": 5607 + }, + { + "epoch": 36.41558441558441, + "grad_norm": 0.27623212337493896, + "learning_rate": 2.9044615384615385e-05, + "loss": 0.006, + "step": 5608 + }, + { + "epoch": 36.422077922077925, + "grad_norm": 0.35889360308647156, + "learning_rate": 2.9044307692307693e-05, + "loss": 0.0051, + "step": 5609 + }, + { + "epoch": 36.42857142857143, + "grad_norm": 1.6308268308639526, + "learning_rate": 2.9044e-05, + "loss": 0.0233, + "step": 5610 + }, + { + "epoch": 36.435064935064936, + "grad_norm": 0.20552177727222443, + "learning_rate": 2.904369230769231e-05, + "loss": 0.0043, + "step": 5611 + }, + { + "epoch": 36.44155844155844, + "grad_norm": 0.3932984173297882, + "learning_rate": 2.904338461538462e-05, + "loss": 0.0053, + "step": 5612 + }, + { + "epoch": 36.44805194805195, + "grad_norm": 0.4277825355529785, + "learning_rate": 2.9043076923076923e-05, + "loss": 0.004, + "step": 5613 + }, + { + "epoch": 36.45454545454545, + "grad_norm": 0.3922305107116699, + "learning_rate": 2.904276923076923e-05, + "loss": 0.0055, + "step": 5614 + }, + { + "epoch": 36.46103896103896, + "grad_norm": 0.24312107264995575, + "learning_rate": 2.904246153846154e-05, + "loss": 0.003, + "step": 5615 + }, + { + "epoch": 36.467532467532465, + "grad_norm": 0.5969083309173584, + "learning_rate": 2.9042153846153845e-05, + "loss": 0.0132, + "step": 5616 + }, + { + "epoch": 36.47402597402598, + "grad_norm": 1.956734299659729, + "learning_rate": 2.9041846153846153e-05, + "loss": 0.0543, + "step": 5617 + }, + { + "epoch": 36.48051948051948, + "grad_norm": 0.8999709486961365, + "learning_rate": 2.9041538461538465e-05, + "loss": 0.0255, + "step": 5618 + }, + { + "epoch": 36.48701298701299, + "grad_norm": 1.2347708940505981, + "learning_rate": 2.904123076923077e-05, + "loss": 0.0149, + "step": 5619 + }, + { + "epoch": 36.493506493506494, + "grad_norm": 1.7901315689086914, + "learning_rate": 2.9040923076923078e-05, + "loss": 0.0445, + "step": 5620 + }, + { + "epoch": 36.5, + "grad_norm": 0.559679388999939, + "learning_rate": 2.9040615384615387e-05, + "loss": 0.0729, + "step": 5621 + }, + { + "epoch": 36.506493506493506, + "grad_norm": 0.5174139142036438, + "learning_rate": 2.904030769230769e-05, + "loss": 0.0572, + "step": 5622 + }, + { + "epoch": 36.51298701298701, + "grad_norm": 0.38822513818740845, + "learning_rate": 2.904e-05, + "loss": 0.0365, + "step": 5623 + }, + { + "epoch": 36.51948051948052, + "grad_norm": 0.39593538641929626, + "learning_rate": 2.9039692307692308e-05, + "loss": 0.0293, + "step": 5624 + }, + { + "epoch": 36.52597402597402, + "grad_norm": 0.3647960424423218, + "learning_rate": 2.9039384615384616e-05, + "loss": 0.0294, + "step": 5625 + }, + { + "epoch": 36.532467532467535, + "grad_norm": 0.38199394941329956, + "learning_rate": 2.9039076923076925e-05, + "loss": 0.0213, + "step": 5626 + }, + { + "epoch": 36.53896103896104, + "grad_norm": 0.3603741526603699, + "learning_rate": 2.9038769230769233e-05, + "loss": 0.0297, + "step": 5627 + }, + { + "epoch": 36.54545454545455, + "grad_norm": 0.4830622375011444, + "learning_rate": 2.9038461538461538e-05, + "loss": 0.0294, + "step": 5628 + }, + { + "epoch": 36.55194805194805, + "grad_norm": 0.6599977612495422, + "learning_rate": 2.9038153846153846e-05, + "loss": 0.0442, + "step": 5629 + }, + { + "epoch": 36.55844155844156, + "grad_norm": 0.6063061952590942, + "learning_rate": 2.9037846153846155e-05, + "loss": 0.0222, + "step": 5630 + }, + { + "epoch": 36.564935064935064, + "grad_norm": 0.29563385248184204, + "learning_rate": 2.903753846153846e-05, + "loss": 0.0116, + "step": 5631 + }, + { + "epoch": 36.57142857142857, + "grad_norm": 1.1445544958114624, + "learning_rate": 2.903723076923077e-05, + "loss": 0.0135, + "step": 5632 + }, + { + "epoch": 36.577922077922075, + "grad_norm": 0.4297947585582733, + "learning_rate": 2.903692307692308e-05, + "loss": 0.0181, + "step": 5633 + }, + { + "epoch": 36.58441558441559, + "grad_norm": 0.5096610188484192, + "learning_rate": 2.9036615384615385e-05, + "loss": 0.0122, + "step": 5634 + }, + { + "epoch": 36.59090909090909, + "grad_norm": 0.44934797286987305, + "learning_rate": 2.9036307692307693e-05, + "loss": 0.0149, + "step": 5635 + }, + { + "epoch": 36.5974025974026, + "grad_norm": 0.49636566638946533, + "learning_rate": 2.9036e-05, + "loss": 0.0121, + "step": 5636 + }, + { + "epoch": 36.603896103896105, + "grad_norm": 0.2698199450969696, + "learning_rate": 2.9035692307692306e-05, + "loss": 0.0086, + "step": 5637 + }, + { + "epoch": 36.61038961038961, + "grad_norm": 0.36220136284828186, + "learning_rate": 2.9035384615384614e-05, + "loss": 0.0123, + "step": 5638 + }, + { + "epoch": 36.616883116883116, + "grad_norm": 0.6276205778121948, + "learning_rate": 2.9035076923076926e-05, + "loss": 0.0086, + "step": 5639 + }, + { + "epoch": 36.62337662337662, + "grad_norm": 0.764651358127594, + "learning_rate": 2.903476923076923e-05, + "loss": 0.0167, + "step": 5640 + }, + { + "epoch": 36.62987012987013, + "grad_norm": 0.7851477265357971, + "learning_rate": 2.903446153846154e-05, + "loss": 0.0115, + "step": 5641 + }, + { + "epoch": 36.63636363636363, + "grad_norm": 1.075822114944458, + "learning_rate": 2.9034153846153848e-05, + "loss": 0.0172, + "step": 5642 + }, + { + "epoch": 36.642857142857146, + "grad_norm": 0.6303055882453918, + "learning_rate": 2.9033846153846153e-05, + "loss": 0.0052, + "step": 5643 + }, + { + "epoch": 36.64935064935065, + "grad_norm": 0.6038342714309692, + "learning_rate": 2.903353846153846e-05, + "loss": 0.0121, + "step": 5644 + }, + { + "epoch": 36.65584415584416, + "grad_norm": 0.4720795452594757, + "learning_rate": 2.903323076923077e-05, + "loss": 0.0113, + "step": 5645 + }, + { + "epoch": 36.66233766233766, + "grad_norm": 0.2050342559814453, + "learning_rate": 2.9032923076923078e-05, + "loss": 0.0026, + "step": 5646 + }, + { + "epoch": 36.66883116883117, + "grad_norm": 0.5154380202293396, + "learning_rate": 2.9032615384615386e-05, + "loss": 0.011, + "step": 5647 + }, + { + "epoch": 36.675324675324674, + "grad_norm": 0.7150618433952332, + "learning_rate": 2.9032307692307694e-05, + "loss": 0.0178, + "step": 5648 + }, + { + "epoch": 36.68181818181818, + "grad_norm": 1.0069153308868408, + "learning_rate": 2.9032e-05, + "loss": 0.0278, + "step": 5649 + }, + { + "epoch": 36.688311688311686, + "grad_norm": 0.29059624671936035, + "learning_rate": 2.9031692307692308e-05, + "loss": 0.0033, + "step": 5650 + }, + { + "epoch": 36.6948051948052, + "grad_norm": 1.0125490427017212, + "learning_rate": 2.9031384615384616e-05, + "loss": 0.0095, + "step": 5651 + }, + { + "epoch": 36.701298701298704, + "grad_norm": 0.28927430510520935, + "learning_rate": 2.903107692307692e-05, + "loss": 0.0035, + "step": 5652 + }, + { + "epoch": 36.70779220779221, + "grad_norm": 0.606968104839325, + "learning_rate": 2.9030769230769232e-05, + "loss": 0.0083, + "step": 5653 + }, + { + "epoch": 36.714285714285715, + "grad_norm": 0.8843885660171509, + "learning_rate": 2.903046153846154e-05, + "loss": 0.0048, + "step": 5654 + }, + { + "epoch": 36.72077922077922, + "grad_norm": 0.5820101499557495, + "learning_rate": 2.9030153846153846e-05, + "loss": 0.014, + "step": 5655 + }, + { + "epoch": 36.72727272727273, + "grad_norm": 1.4354671239852905, + "learning_rate": 2.9029846153846154e-05, + "loss": 0.0212, + "step": 5656 + }, + { + "epoch": 36.73376623376623, + "grad_norm": 0.6979734301567078, + "learning_rate": 2.9029538461538462e-05, + "loss": 0.0113, + "step": 5657 + }, + { + "epoch": 36.74025974025974, + "grad_norm": 1.1822783946990967, + "learning_rate": 2.902923076923077e-05, + "loss": 0.0163, + "step": 5658 + }, + { + "epoch": 36.746753246753244, + "grad_norm": 0.6402063965797424, + "learning_rate": 2.9028923076923076e-05, + "loss": 0.0871, + "step": 5659 + }, + { + "epoch": 36.753246753246756, + "grad_norm": 0.6479465365409851, + "learning_rate": 2.9028615384615387e-05, + "loss": 0.0684, + "step": 5660 + }, + { + "epoch": 36.75974025974026, + "grad_norm": 0.43468889594078064, + "learning_rate": 2.9028307692307696e-05, + "loss": 0.0542, + "step": 5661 + }, + { + "epoch": 36.76623376623377, + "grad_norm": 0.41597333550453186, + "learning_rate": 2.9028e-05, + "loss": 0.0447, + "step": 5662 + }, + { + "epoch": 36.77272727272727, + "grad_norm": 0.3900293707847595, + "learning_rate": 2.902769230769231e-05, + "loss": 0.0328, + "step": 5663 + }, + { + "epoch": 36.77922077922078, + "grad_norm": 0.35357940196990967, + "learning_rate": 2.9027384615384617e-05, + "loss": 0.0254, + "step": 5664 + }, + { + "epoch": 36.785714285714285, + "grad_norm": 0.46574369072914124, + "learning_rate": 2.9027076923076922e-05, + "loss": 0.026, + "step": 5665 + }, + { + "epoch": 36.79220779220779, + "grad_norm": 0.6147619485855103, + "learning_rate": 2.902676923076923e-05, + "loss": 0.0242, + "step": 5666 + }, + { + "epoch": 36.798701298701296, + "grad_norm": 0.32326582074165344, + "learning_rate": 2.9026461538461542e-05, + "loss": 0.0141, + "step": 5667 + }, + { + "epoch": 36.8051948051948, + "grad_norm": 0.42057231068611145, + "learning_rate": 2.9026153846153847e-05, + "loss": 0.0205, + "step": 5668 + }, + { + "epoch": 36.811688311688314, + "grad_norm": 0.33616119623184204, + "learning_rate": 2.9025846153846155e-05, + "loss": 0.016, + "step": 5669 + }, + { + "epoch": 36.81818181818182, + "grad_norm": 0.46913179755210876, + "learning_rate": 2.9025538461538464e-05, + "loss": 0.0241, + "step": 5670 + }, + { + "epoch": 36.824675324675326, + "grad_norm": 0.43245717883110046, + "learning_rate": 2.902523076923077e-05, + "loss": 0.019, + "step": 5671 + }, + { + "epoch": 36.83116883116883, + "grad_norm": 0.30958181619644165, + "learning_rate": 2.9024923076923077e-05, + "loss": 0.0105, + "step": 5672 + }, + { + "epoch": 36.83766233766234, + "grad_norm": 0.5329145193099976, + "learning_rate": 2.9024615384615385e-05, + "loss": 0.0145, + "step": 5673 + }, + { + "epoch": 36.84415584415584, + "grad_norm": 0.5764246582984924, + "learning_rate": 2.9024307692307694e-05, + "loss": 0.0239, + "step": 5674 + }, + { + "epoch": 36.85064935064935, + "grad_norm": 0.43031397461891174, + "learning_rate": 2.9024000000000002e-05, + "loss": 0.0109, + "step": 5675 + }, + { + "epoch": 36.857142857142854, + "grad_norm": 0.3727981448173523, + "learning_rate": 2.902369230769231e-05, + "loss": 0.0124, + "step": 5676 + }, + { + "epoch": 36.86363636363637, + "grad_norm": 0.4733588695526123, + "learning_rate": 2.9023384615384615e-05, + "loss": 0.0181, + "step": 5677 + }, + { + "epoch": 36.87012987012987, + "grad_norm": 0.2856893241405487, + "learning_rate": 2.9023076923076924e-05, + "loss": 0.0085, + "step": 5678 + }, + { + "epoch": 36.87662337662338, + "grad_norm": 0.5714840888977051, + "learning_rate": 2.9022769230769232e-05, + "loss": 0.0158, + "step": 5679 + }, + { + "epoch": 36.883116883116884, + "grad_norm": 0.22427834570407867, + "learning_rate": 2.9022461538461537e-05, + "loss": 0.005, + "step": 5680 + }, + { + "epoch": 36.88961038961039, + "grad_norm": 0.3562869429588318, + "learning_rate": 2.902215384615385e-05, + "loss": 0.0059, + "step": 5681 + }, + { + "epoch": 36.896103896103895, + "grad_norm": 0.6766841411590576, + "learning_rate": 2.9021846153846157e-05, + "loss": 0.0111, + "step": 5682 + }, + { + "epoch": 36.9025974025974, + "grad_norm": 1.1050662994384766, + "learning_rate": 2.9021538461538462e-05, + "loss": 0.039, + "step": 5683 + }, + { + "epoch": 36.90909090909091, + "grad_norm": 0.3368038833141327, + "learning_rate": 2.902123076923077e-05, + "loss": 0.0071, + "step": 5684 + }, + { + "epoch": 36.91558441558441, + "grad_norm": 0.31755903363227844, + "learning_rate": 2.902092307692308e-05, + "loss": 0.0048, + "step": 5685 + }, + { + "epoch": 36.922077922077925, + "grad_norm": 0.7562558650970459, + "learning_rate": 2.9020615384615383e-05, + "loss": 0.0708, + "step": 5686 + }, + { + "epoch": 36.92857142857143, + "grad_norm": 0.35310858488082886, + "learning_rate": 2.902030769230769e-05, + "loss": 0.0162, + "step": 5687 + }, + { + "epoch": 36.935064935064936, + "grad_norm": 0.4592399299144745, + "learning_rate": 2.9020000000000003e-05, + "loss": 0.006, + "step": 5688 + }, + { + "epoch": 36.94155844155844, + "grad_norm": 0.07205262780189514, + "learning_rate": 2.9019692307692308e-05, + "loss": 0.0012, + "step": 5689 + }, + { + "epoch": 36.94805194805195, + "grad_norm": 0.21239978075027466, + "learning_rate": 2.9019384615384617e-05, + "loss": 0.0031, + "step": 5690 + }, + { + "epoch": 36.95454545454545, + "grad_norm": 0.6470236778259277, + "learning_rate": 2.9019076923076925e-05, + "loss": 0.0073, + "step": 5691 + }, + { + "epoch": 36.96103896103896, + "grad_norm": 0.5057578086853027, + "learning_rate": 2.901876923076923e-05, + "loss": 0.0057, + "step": 5692 + }, + { + "epoch": 36.967532467532465, + "grad_norm": 1.3637326955795288, + "learning_rate": 2.9018461538461538e-05, + "loss": 0.0118, + "step": 5693 + }, + { + "epoch": 36.97402597402598, + "grad_norm": 0.8590590357780457, + "learning_rate": 2.9018153846153847e-05, + "loss": 0.0245, + "step": 5694 + }, + { + "epoch": 36.98051948051948, + "grad_norm": 0.49676382541656494, + "learning_rate": 2.9017846153846155e-05, + "loss": 0.0071, + "step": 5695 + }, + { + "epoch": 36.98701298701299, + "grad_norm": 0.8142293691635132, + "learning_rate": 2.9017538461538463e-05, + "loss": 0.0197, + "step": 5696 + }, + { + "epoch": 36.993506493506494, + "grad_norm": 0.39492130279541016, + "learning_rate": 2.901723076923077e-05, + "loss": 0.0233, + "step": 5697 + }, + { + "epoch": 37.0, + "grad_norm": 0.42088377475738525, + "learning_rate": 2.9016923076923076e-05, + "loss": 0.0082, + "step": 5698 + }, + { + "epoch": 37.006493506493506, + "grad_norm": 0.5682960152626038, + "learning_rate": 2.9016615384615385e-05, + "loss": 0.068, + "step": 5699 + }, + { + "epoch": 37.01298701298701, + "grad_norm": 0.4266335368156433, + "learning_rate": 2.9016307692307693e-05, + "loss": 0.0336, + "step": 5700 + }, + { + "epoch": 37.01948051948052, + "grad_norm": 0.45381370186805725, + "learning_rate": 2.9015999999999998e-05, + "loss": 0.0341, + "step": 5701 + }, + { + "epoch": 37.02597402597402, + "grad_norm": 0.3741210699081421, + "learning_rate": 2.901569230769231e-05, + "loss": 0.0339, + "step": 5702 + }, + { + "epoch": 37.032467532467535, + "grad_norm": 0.3010711073875427, + "learning_rate": 2.9015384615384618e-05, + "loss": 0.0234, + "step": 5703 + }, + { + "epoch": 37.03896103896104, + "grad_norm": 0.33665797114372253, + "learning_rate": 2.9015076923076923e-05, + "loss": 0.0206, + "step": 5704 + }, + { + "epoch": 37.04545454545455, + "grad_norm": 0.2654063105583191, + "learning_rate": 2.901476923076923e-05, + "loss": 0.0144, + "step": 5705 + }, + { + "epoch": 37.05194805194805, + "grad_norm": 0.3056391477584839, + "learning_rate": 2.901446153846154e-05, + "loss": 0.0139, + "step": 5706 + }, + { + "epoch": 37.05844155844156, + "grad_norm": 0.25488588213920593, + "learning_rate": 2.9014153846153845e-05, + "loss": 0.0097, + "step": 5707 + }, + { + "epoch": 37.064935064935064, + "grad_norm": 0.25274279713630676, + "learning_rate": 2.9013846153846153e-05, + "loss": 0.0102, + "step": 5708 + }, + { + "epoch": 37.07142857142857, + "grad_norm": 0.47891965508461, + "learning_rate": 2.9013538461538465e-05, + "loss": 0.012, + "step": 5709 + }, + { + "epoch": 37.077922077922075, + "grad_norm": 0.4401334822177887, + "learning_rate": 2.901323076923077e-05, + "loss": 0.0136, + "step": 5710 + }, + { + "epoch": 37.08441558441559, + "grad_norm": 0.2747070789337158, + "learning_rate": 2.9012923076923078e-05, + "loss": 0.0069, + "step": 5711 + }, + { + "epoch": 37.09090909090909, + "grad_norm": 0.3973609209060669, + "learning_rate": 2.9012615384615386e-05, + "loss": 0.0087, + "step": 5712 + }, + { + "epoch": 37.0974025974026, + "grad_norm": 0.3059113323688507, + "learning_rate": 2.901230769230769e-05, + "loss": 0.0101, + "step": 5713 + }, + { + "epoch": 37.103896103896105, + "grad_norm": 0.263389527797699, + "learning_rate": 2.9012e-05, + "loss": 0.007, + "step": 5714 + }, + { + "epoch": 37.11038961038961, + "grad_norm": 0.24985991418361664, + "learning_rate": 2.9011692307692308e-05, + "loss": 0.0081, + "step": 5715 + }, + { + "epoch": 37.116883116883116, + "grad_norm": 0.6577467918395996, + "learning_rate": 2.9011384615384616e-05, + "loss": 0.01, + "step": 5716 + }, + { + "epoch": 37.12337662337662, + "grad_norm": 0.6368363499641418, + "learning_rate": 2.9011076923076924e-05, + "loss": 0.0062, + "step": 5717 + }, + { + "epoch": 37.12987012987013, + "grad_norm": 0.2599269151687622, + "learning_rate": 2.9010769230769233e-05, + "loss": 0.0045, + "step": 5718 + }, + { + "epoch": 37.13636363636363, + "grad_norm": 1.2361116409301758, + "learning_rate": 2.9010461538461538e-05, + "loss": 0.0121, + "step": 5719 + }, + { + "epoch": 37.142857142857146, + "grad_norm": 0.44706419110298157, + "learning_rate": 2.9010153846153846e-05, + "loss": 0.0052, + "step": 5720 + }, + { + "epoch": 37.14935064935065, + "grad_norm": 0.28249335289001465, + "learning_rate": 2.9009846153846154e-05, + "loss": 0.0042, + "step": 5721 + }, + { + "epoch": 37.15584415584416, + "grad_norm": 1.5646660327911377, + "learning_rate": 2.900953846153846e-05, + "loss": 0.0205, + "step": 5722 + }, + { + "epoch": 37.16233766233766, + "grad_norm": 0.5014433264732361, + "learning_rate": 2.900923076923077e-05, + "loss": 0.0103, + "step": 5723 + }, + { + "epoch": 37.16883116883117, + "grad_norm": 0.6213587522506714, + "learning_rate": 2.900892307692308e-05, + "loss": 0.0103, + "step": 5724 + }, + { + "epoch": 37.175324675324674, + "grad_norm": 1.0360206365585327, + "learning_rate": 2.9008615384615384e-05, + "loss": 0.0268, + "step": 5725 + }, + { + "epoch": 37.18181818181818, + "grad_norm": 0.5155927538871765, + "learning_rate": 2.9008307692307692e-05, + "loss": 0.0406, + "step": 5726 + }, + { + "epoch": 37.188311688311686, + "grad_norm": 0.4905499517917633, + "learning_rate": 2.9008e-05, + "loss": 0.0203, + "step": 5727 + }, + { + "epoch": 37.1948051948052, + "grad_norm": 0.4875847399234772, + "learning_rate": 2.9007692307692306e-05, + "loss": 0.0081, + "step": 5728 + }, + { + "epoch": 37.201298701298704, + "grad_norm": 0.6300015449523926, + "learning_rate": 2.9007384615384614e-05, + "loss": 0.009, + "step": 5729 + }, + { + "epoch": 37.20779220779221, + "grad_norm": 1.02634596824646, + "learning_rate": 2.9007076923076926e-05, + "loss": 0.0157, + "step": 5730 + }, + { + "epoch": 37.214285714285715, + "grad_norm": 0.350734144449234, + "learning_rate": 2.900676923076923e-05, + "loss": 0.0023, + "step": 5731 + }, + { + "epoch": 37.22077922077922, + "grad_norm": 0.9716202020645142, + "learning_rate": 2.900646153846154e-05, + "loss": 0.0041, + "step": 5732 + }, + { + "epoch": 37.22727272727273, + "grad_norm": 1.7610501050949097, + "learning_rate": 2.9006153846153847e-05, + "loss": 0.0101, + "step": 5733 + }, + { + "epoch": 37.23376623376623, + "grad_norm": 1.2568671703338623, + "learning_rate": 2.9005846153846152e-05, + "loss": 0.0426, + "step": 5734 + }, + { + "epoch": 37.24025974025974, + "grad_norm": 0.8857156038284302, + "learning_rate": 2.900553846153846e-05, + "loss": 0.0056, + "step": 5735 + }, + { + "epoch": 37.246753246753244, + "grad_norm": 0.18274565041065216, + "learning_rate": 2.900523076923077e-05, + "loss": 0.0026, + "step": 5736 + }, + { + "epoch": 37.253246753246756, + "grad_norm": 0.590124785900116, + "learning_rate": 2.9004923076923077e-05, + "loss": 0.0624, + "step": 5737 + }, + { + "epoch": 37.25974025974026, + "grad_norm": 0.4975883960723877, + "learning_rate": 2.9004615384615386e-05, + "loss": 0.0502, + "step": 5738 + }, + { + "epoch": 37.26623376623377, + "grad_norm": 0.43369370698928833, + "learning_rate": 2.9004307692307694e-05, + "loss": 0.0497, + "step": 5739 + }, + { + "epoch": 37.27272727272727, + "grad_norm": 0.3413119912147522, + "learning_rate": 2.9004000000000002e-05, + "loss": 0.0339, + "step": 5740 + }, + { + "epoch": 37.27922077922078, + "grad_norm": 0.32237881422042847, + "learning_rate": 2.9003692307692307e-05, + "loss": 0.0245, + "step": 5741 + }, + { + "epoch": 37.285714285714285, + "grad_norm": 0.38169053196907043, + "learning_rate": 2.9003384615384615e-05, + "loss": 0.0196, + "step": 5742 + }, + { + "epoch": 37.29220779220779, + "grad_norm": 0.3442230224609375, + "learning_rate": 2.9003076923076924e-05, + "loss": 0.0194, + "step": 5743 + }, + { + "epoch": 37.298701298701296, + "grad_norm": 0.2957189679145813, + "learning_rate": 2.9002769230769232e-05, + "loss": 0.015, + "step": 5744 + }, + { + "epoch": 37.3051948051948, + "grad_norm": 0.40390369296073914, + "learning_rate": 2.900246153846154e-05, + "loss": 0.0183, + "step": 5745 + }, + { + "epoch": 37.311688311688314, + "grad_norm": 0.5869516134262085, + "learning_rate": 2.900215384615385e-05, + "loss": 0.0383, + "step": 5746 + }, + { + "epoch": 37.31818181818182, + "grad_norm": 0.5072347521781921, + "learning_rate": 2.9001846153846154e-05, + "loss": 0.0131, + "step": 5747 + }, + { + "epoch": 37.324675324675326, + "grad_norm": 0.357571005821228, + "learning_rate": 2.9001538461538462e-05, + "loss": 0.0095, + "step": 5748 + }, + { + "epoch": 37.33116883116883, + "grad_norm": 0.430806964635849, + "learning_rate": 2.900123076923077e-05, + "loss": 0.0153, + "step": 5749 + }, + { + "epoch": 37.33766233766234, + "grad_norm": 0.35799238085746765, + "learning_rate": 2.900092307692308e-05, + "loss": 0.0105, + "step": 5750 + }, + { + "epoch": 37.34415584415584, + "grad_norm": 0.3252030313014984, + "learning_rate": 2.9000615384615387e-05, + "loss": 0.0064, + "step": 5751 + }, + { + "epoch": 37.35064935064935, + "grad_norm": 0.6061626076698303, + "learning_rate": 2.9000307692307695e-05, + "loss": 0.009, + "step": 5752 + }, + { + "epoch": 37.357142857142854, + "grad_norm": 0.3573189675807953, + "learning_rate": 2.9e-05, + "loss": 0.0077, + "step": 5753 + }, + { + "epoch": 37.36363636363637, + "grad_norm": 1.481142282485962, + "learning_rate": 2.899969230769231e-05, + "loss": 0.0168, + "step": 5754 + }, + { + "epoch": 37.37012987012987, + "grad_norm": 0.536189079284668, + "learning_rate": 2.8999384615384617e-05, + "loss": 0.007, + "step": 5755 + }, + { + "epoch": 37.37662337662338, + "grad_norm": 1.1793863773345947, + "learning_rate": 2.8999076923076922e-05, + "loss": 0.0278, + "step": 5756 + }, + { + "epoch": 37.383116883116884, + "grad_norm": 1.4678716659545898, + "learning_rate": 2.8998769230769233e-05, + "loss": 0.0197, + "step": 5757 + }, + { + "epoch": 37.38961038961039, + "grad_norm": 0.29317334294319153, + "learning_rate": 2.8998461538461542e-05, + "loss": 0.0067, + "step": 5758 + }, + { + "epoch": 37.396103896103895, + "grad_norm": 1.279661774635315, + "learning_rate": 2.8998153846153847e-05, + "loss": 0.0254, + "step": 5759 + }, + { + "epoch": 37.4025974025974, + "grad_norm": 0.7066517472267151, + "learning_rate": 2.8997846153846155e-05, + "loss": 0.0226, + "step": 5760 + }, + { + "epoch": 37.40909090909091, + "grad_norm": 0.8013560175895691, + "learning_rate": 2.8997538461538463e-05, + "loss": 0.02, + "step": 5761 + }, + { + "epoch": 37.41558441558441, + "grad_norm": 0.4228562116622925, + "learning_rate": 2.8997230769230768e-05, + "loss": 0.0058, + "step": 5762 + }, + { + "epoch": 37.422077922077925, + "grad_norm": 0.484175443649292, + "learning_rate": 2.8996923076923077e-05, + "loss": 0.0057, + "step": 5763 + }, + { + "epoch": 37.42857142857143, + "grad_norm": 0.30538302659988403, + "learning_rate": 2.899661538461539e-05, + "loss": 0.0056, + "step": 5764 + }, + { + "epoch": 37.435064935064936, + "grad_norm": 0.9660945534706116, + "learning_rate": 2.8996307692307693e-05, + "loss": 0.0221, + "step": 5765 + }, + { + "epoch": 37.44155844155844, + "grad_norm": 1.1904863119125366, + "learning_rate": 2.8996e-05, + "loss": 0.0125, + "step": 5766 + }, + { + "epoch": 37.44805194805195, + "grad_norm": 0.4259716272354126, + "learning_rate": 2.899569230769231e-05, + "loss": 0.0049, + "step": 5767 + }, + { + "epoch": 37.45454545454545, + "grad_norm": 0.23206937313079834, + "learning_rate": 2.8995384615384615e-05, + "loss": 0.0027, + "step": 5768 + }, + { + "epoch": 37.46103896103896, + "grad_norm": 0.4441584646701813, + "learning_rate": 2.8995076923076923e-05, + "loss": 0.004, + "step": 5769 + }, + { + "epoch": 37.467532467532465, + "grad_norm": 0.5857951641082764, + "learning_rate": 2.899476923076923e-05, + "loss": 0.0243, + "step": 5770 + }, + { + "epoch": 37.47402597402598, + "grad_norm": 0.31035560369491577, + "learning_rate": 2.899446153846154e-05, + "loss": 0.0033, + "step": 5771 + }, + { + "epoch": 37.48051948051948, + "grad_norm": 0.9182817339897156, + "learning_rate": 2.8994153846153848e-05, + "loss": 0.0065, + "step": 5772 + }, + { + "epoch": 37.48701298701299, + "grad_norm": 2.1770718097686768, + "learning_rate": 2.8993846153846156e-05, + "loss": 0.0168, + "step": 5773 + }, + { + "epoch": 37.493506493506494, + "grad_norm": 2.2369048595428467, + "learning_rate": 2.899353846153846e-05, + "loss": 0.0343, + "step": 5774 + }, + { + "epoch": 37.5, + "grad_norm": 0.6820853352546692, + "learning_rate": 2.899323076923077e-05, + "loss": 0.0905, + "step": 5775 + }, + { + "epoch": 37.506493506493506, + "grad_norm": 0.47322067618370056, + "learning_rate": 2.8992923076923078e-05, + "loss": 0.0595, + "step": 5776 + }, + { + "epoch": 37.51298701298701, + "grad_norm": 0.4924643039703369, + "learning_rate": 2.8992615384615383e-05, + "loss": 0.0374, + "step": 5777 + }, + { + "epoch": 37.51948051948052, + "grad_norm": 0.39143550395965576, + "learning_rate": 2.8992307692307695e-05, + "loss": 0.0302, + "step": 5778 + }, + { + "epoch": 37.52597402597402, + "grad_norm": 0.6751034259796143, + "learning_rate": 2.8992000000000003e-05, + "loss": 0.0334, + "step": 5779 + }, + { + "epoch": 37.532467532467535, + "grad_norm": 0.35889044404029846, + "learning_rate": 2.8991692307692308e-05, + "loss": 0.0224, + "step": 5780 + }, + { + "epoch": 37.53896103896104, + "grad_norm": 0.5877482891082764, + "learning_rate": 2.8991384615384616e-05, + "loss": 0.0326, + "step": 5781 + }, + { + "epoch": 37.54545454545455, + "grad_norm": 0.6370552182197571, + "learning_rate": 2.8991076923076925e-05, + "loss": 0.0331, + "step": 5782 + }, + { + "epoch": 37.55194805194805, + "grad_norm": 0.5590326189994812, + "learning_rate": 2.899076923076923e-05, + "loss": 0.0231, + "step": 5783 + }, + { + "epoch": 37.55844155844156, + "grad_norm": 0.4057650864124298, + "learning_rate": 2.8990461538461538e-05, + "loss": 0.0218, + "step": 5784 + }, + { + "epoch": 37.564935064935064, + "grad_norm": 0.39709484577178955, + "learning_rate": 2.899015384615385e-05, + "loss": 0.016, + "step": 5785 + }, + { + "epoch": 37.57142857142857, + "grad_norm": 0.4472017288208008, + "learning_rate": 2.8989846153846154e-05, + "loss": 0.0198, + "step": 5786 + }, + { + "epoch": 37.577922077922075, + "grad_norm": 0.46619942784309387, + "learning_rate": 2.8989538461538463e-05, + "loss": 0.0143, + "step": 5787 + }, + { + "epoch": 37.58441558441559, + "grad_norm": 0.37539732456207275, + "learning_rate": 2.898923076923077e-05, + "loss": 0.0151, + "step": 5788 + }, + { + "epoch": 37.59090909090909, + "grad_norm": 0.5927746295928955, + "learning_rate": 2.8988923076923076e-05, + "loss": 0.0139, + "step": 5789 + }, + { + "epoch": 37.5974025974026, + "grad_norm": 0.7920203804969788, + "learning_rate": 2.8988615384615384e-05, + "loss": 0.0391, + "step": 5790 + }, + { + "epoch": 37.603896103896105, + "grad_norm": 0.5341692566871643, + "learning_rate": 2.8988307692307693e-05, + "loss": 0.0177, + "step": 5791 + }, + { + "epoch": 37.61038961038961, + "grad_norm": 0.21405696868896484, + "learning_rate": 2.8988e-05, + "loss": 0.0067, + "step": 5792 + }, + { + "epoch": 37.616883116883116, + "grad_norm": 0.36822637915611267, + "learning_rate": 2.898769230769231e-05, + "loss": 0.0077, + "step": 5793 + }, + { + "epoch": 37.62337662337662, + "grad_norm": 0.31057003140449524, + "learning_rate": 2.8987384615384618e-05, + "loss": 0.0079, + "step": 5794 + }, + { + "epoch": 37.62987012987013, + "grad_norm": 0.8950670957565308, + "learning_rate": 2.8987076923076923e-05, + "loss": 0.0106, + "step": 5795 + }, + { + "epoch": 37.63636363636363, + "grad_norm": 0.32633891701698303, + "learning_rate": 2.898676923076923e-05, + "loss": 0.0066, + "step": 5796 + }, + { + "epoch": 37.642857142857146, + "grad_norm": 0.6017307043075562, + "learning_rate": 2.898646153846154e-05, + "loss": 0.0125, + "step": 5797 + }, + { + "epoch": 37.64935064935065, + "grad_norm": 0.5148427486419678, + "learning_rate": 2.8986153846153844e-05, + "loss": 0.0135, + "step": 5798 + }, + { + "epoch": 37.65584415584416, + "grad_norm": 0.5627039670944214, + "learning_rate": 2.8985846153846156e-05, + "loss": 0.0081, + "step": 5799 + }, + { + "epoch": 37.66233766233766, + "grad_norm": 0.7895825505256653, + "learning_rate": 2.8985538461538464e-05, + "loss": 0.0159, + "step": 5800 + }, + { + "epoch": 37.66883116883117, + "grad_norm": 2.810176372528076, + "learning_rate": 2.898523076923077e-05, + "loss": 0.0147, + "step": 5801 + }, + { + "epoch": 37.675324675324674, + "grad_norm": 1.2674649953842163, + "learning_rate": 2.8984923076923077e-05, + "loss": 0.031, + "step": 5802 + }, + { + "epoch": 37.68181818181818, + "grad_norm": 0.717071533203125, + "learning_rate": 2.8984615384615386e-05, + "loss": 0.0096, + "step": 5803 + }, + { + "epoch": 37.688311688311686, + "grad_norm": 0.44485998153686523, + "learning_rate": 2.898430769230769e-05, + "loss": 0.0099, + "step": 5804 + }, + { + "epoch": 37.6948051948052, + "grad_norm": 0.44072046875953674, + "learning_rate": 2.8984e-05, + "loss": 0.0084, + "step": 5805 + }, + { + "epoch": 37.701298701298704, + "grad_norm": 1.7065715789794922, + "learning_rate": 2.898369230769231e-05, + "loss": 0.0554, + "step": 5806 + }, + { + "epoch": 37.70779220779221, + "grad_norm": 5.627781867980957, + "learning_rate": 2.8983384615384616e-05, + "loss": 0.0125, + "step": 5807 + }, + { + "epoch": 37.714285714285715, + "grad_norm": 1.938300371170044, + "learning_rate": 2.8983076923076924e-05, + "loss": 0.017, + "step": 5808 + }, + { + "epoch": 37.72077922077922, + "grad_norm": 0.5042718052864075, + "learning_rate": 2.8982769230769232e-05, + "loss": 0.0036, + "step": 5809 + }, + { + "epoch": 37.72727272727273, + "grad_norm": 0.9170762896537781, + "learning_rate": 2.8982461538461537e-05, + "loss": 0.0091, + "step": 5810 + }, + { + "epoch": 37.73376623376623, + "grad_norm": 1.4180288314819336, + "learning_rate": 2.8982153846153846e-05, + "loss": 0.013, + "step": 5811 + }, + { + "epoch": 37.74025974025974, + "grad_norm": 1.590042233467102, + "learning_rate": 2.8981846153846154e-05, + "loss": 0.0321, + "step": 5812 + }, + { + "epoch": 37.746753246753244, + "grad_norm": 0.664558470249176, + "learning_rate": 2.8981538461538462e-05, + "loss": 0.0704, + "step": 5813 + }, + { + "epoch": 37.753246753246756, + "grad_norm": 0.5193811058998108, + "learning_rate": 2.898123076923077e-05, + "loss": 0.0571, + "step": 5814 + }, + { + "epoch": 37.75974025974026, + "grad_norm": 0.39887556433677673, + "learning_rate": 2.898092307692308e-05, + "loss": 0.044, + "step": 5815 + }, + { + "epoch": 37.76623376623377, + "grad_norm": 0.4859084188938141, + "learning_rate": 2.8980615384615384e-05, + "loss": 0.0472, + "step": 5816 + }, + { + "epoch": 37.77272727272727, + "grad_norm": 0.4902481734752655, + "learning_rate": 2.8980307692307692e-05, + "loss": 0.0341, + "step": 5817 + }, + { + "epoch": 37.77922077922078, + "grad_norm": 0.4954281151294708, + "learning_rate": 2.898e-05, + "loss": 0.0294, + "step": 5818 + }, + { + "epoch": 37.785714285714285, + "grad_norm": 0.4322144091129303, + "learning_rate": 2.897969230769231e-05, + "loss": 0.0255, + "step": 5819 + }, + { + "epoch": 37.79220779220779, + "grad_norm": 0.42813196778297424, + "learning_rate": 2.8979384615384617e-05, + "loss": 0.0258, + "step": 5820 + }, + { + "epoch": 37.798701298701296, + "grad_norm": 0.3918134272098541, + "learning_rate": 2.8979076923076925e-05, + "loss": 0.0158, + "step": 5821 + }, + { + "epoch": 37.8051948051948, + "grad_norm": 0.7231109142303467, + "learning_rate": 2.8978769230769234e-05, + "loss": 0.0252, + "step": 5822 + }, + { + "epoch": 37.811688311688314, + "grad_norm": 0.40218809247016907, + "learning_rate": 2.897846153846154e-05, + "loss": 0.012, + "step": 5823 + }, + { + "epoch": 37.81818181818182, + "grad_norm": 0.5218591094017029, + "learning_rate": 2.8978153846153847e-05, + "loss": 0.0114, + "step": 5824 + }, + { + "epoch": 37.824675324675326, + "grad_norm": 0.9542838335037231, + "learning_rate": 2.8977846153846155e-05, + "loss": 0.0213, + "step": 5825 + }, + { + "epoch": 37.83116883116883, + "grad_norm": 0.29812315106391907, + "learning_rate": 2.897753846153846e-05, + "loss": 0.0156, + "step": 5826 + }, + { + "epoch": 37.83766233766234, + "grad_norm": 0.5161244869232178, + "learning_rate": 2.8977230769230772e-05, + "loss": 0.0194, + "step": 5827 + }, + { + "epoch": 37.84415584415584, + "grad_norm": 0.4323521852493286, + "learning_rate": 2.897692307692308e-05, + "loss": 0.0173, + "step": 5828 + }, + { + "epoch": 37.85064935064935, + "grad_norm": 0.27389127016067505, + "learning_rate": 2.8976615384615385e-05, + "loss": 0.0072, + "step": 5829 + }, + { + "epoch": 37.857142857142854, + "grad_norm": 1.3271141052246094, + "learning_rate": 2.8976307692307693e-05, + "loss": 0.0233, + "step": 5830 + }, + { + "epoch": 37.86363636363637, + "grad_norm": 1.1395535469055176, + "learning_rate": 2.8976000000000002e-05, + "loss": 0.0099, + "step": 5831 + }, + { + "epoch": 37.87012987012987, + "grad_norm": 0.7338045239448547, + "learning_rate": 2.8975692307692307e-05, + "loss": 0.0156, + "step": 5832 + }, + { + "epoch": 37.87662337662338, + "grad_norm": 0.4919627606868744, + "learning_rate": 2.8975384615384615e-05, + "loss": 0.0075, + "step": 5833 + }, + { + "epoch": 37.883116883116884, + "grad_norm": 0.8875814080238342, + "learning_rate": 2.8975076923076927e-05, + "loss": 0.0189, + "step": 5834 + }, + { + "epoch": 37.88961038961039, + "grad_norm": 0.5302218198776245, + "learning_rate": 2.897476923076923e-05, + "loss": 0.0103, + "step": 5835 + }, + { + "epoch": 37.896103896103895, + "grad_norm": 0.46994951367378235, + "learning_rate": 2.897446153846154e-05, + "loss": 0.0063, + "step": 5836 + }, + { + "epoch": 37.9025974025974, + "grad_norm": 0.16305571794509888, + "learning_rate": 2.8974153846153848e-05, + "loss": 0.0023, + "step": 5837 + }, + { + "epoch": 37.90909090909091, + "grad_norm": 0.30576035380363464, + "learning_rate": 2.8973846153846153e-05, + "loss": 0.0053, + "step": 5838 + }, + { + "epoch": 37.91558441558441, + "grad_norm": 0.4666111171245575, + "learning_rate": 2.897353846153846e-05, + "loss": 0.0143, + "step": 5839 + }, + { + "epoch": 37.922077922077925, + "grad_norm": 0.7408274412155151, + "learning_rate": 2.897323076923077e-05, + "loss": 0.0139, + "step": 5840 + }, + { + "epoch": 37.92857142857143, + "grad_norm": 0.7977734804153442, + "learning_rate": 2.8972923076923078e-05, + "loss": 0.0059, + "step": 5841 + }, + { + "epoch": 37.935064935064936, + "grad_norm": 0.6239389181137085, + "learning_rate": 2.8972615384615387e-05, + "loss": 0.0089, + "step": 5842 + }, + { + "epoch": 37.94155844155844, + "grad_norm": 0.18126250803470612, + "learning_rate": 2.8972307692307695e-05, + "loss": 0.002, + "step": 5843 + }, + { + "epoch": 37.94805194805195, + "grad_norm": 1.3525784015655518, + "learning_rate": 2.8972e-05, + "loss": 0.0071, + "step": 5844 + }, + { + "epoch": 37.95454545454545, + "grad_norm": 0.4028265178203583, + "learning_rate": 2.8971692307692308e-05, + "loss": 0.0032, + "step": 5845 + }, + { + "epoch": 37.96103896103896, + "grad_norm": 0.37903255224227905, + "learning_rate": 2.8971384615384616e-05, + "loss": 0.0084, + "step": 5846 + }, + { + "epoch": 37.967532467532465, + "grad_norm": 0.39638563990592957, + "learning_rate": 2.897107692307692e-05, + "loss": 0.0048, + "step": 5847 + }, + { + "epoch": 37.97402597402598, + "grad_norm": 0.5073239803314209, + "learning_rate": 2.8970769230769233e-05, + "loss": 0.0039, + "step": 5848 + }, + { + "epoch": 37.98051948051948, + "grad_norm": 1.5499805212020874, + "learning_rate": 2.897046153846154e-05, + "loss": 0.0081, + "step": 5849 + }, + { + "epoch": 37.98701298701299, + "grad_norm": 8.752338409423828, + "learning_rate": 2.8970153846153846e-05, + "loss": 0.1089, + "step": 5850 + }, + { + "epoch": 37.993506493506494, + "grad_norm": 0.5049633383750916, + "learning_rate": 2.8969846153846155e-05, + "loss": 0.0226, + "step": 5851 + }, + { + "epoch": 38.0, + "grad_norm": 0.4144134521484375, + "learning_rate": 2.8969538461538463e-05, + "loss": 0.0047, + "step": 5852 + }, + { + "epoch": 38.006493506493506, + "grad_norm": 0.8269168138504028, + "learning_rate": 2.8969230769230768e-05, + "loss": 0.0579, + "step": 5853 + }, + { + "epoch": 38.01298701298701, + "grad_norm": 0.5803773403167725, + "learning_rate": 2.8968923076923076e-05, + "loss": 0.0441, + "step": 5854 + }, + { + "epoch": 38.01948051948052, + "grad_norm": 1.5662708282470703, + "learning_rate": 2.8968615384615388e-05, + "loss": 0.0452, + "step": 5855 + }, + { + "epoch": 38.02597402597402, + "grad_norm": 0.508958637714386, + "learning_rate": 2.8968307692307693e-05, + "loss": 0.0451, + "step": 5856 + }, + { + "epoch": 38.032467532467535, + "grad_norm": 0.5083428621292114, + "learning_rate": 2.8968e-05, + "loss": 0.0309, + "step": 5857 + }, + { + "epoch": 38.03896103896104, + "grad_norm": 0.3908602297306061, + "learning_rate": 2.896769230769231e-05, + "loss": 0.0213, + "step": 5858 + }, + { + "epoch": 38.04545454545455, + "grad_norm": 0.3186749815940857, + "learning_rate": 2.8967384615384614e-05, + "loss": 0.0257, + "step": 5859 + }, + { + "epoch": 38.05194805194805, + "grad_norm": 0.33088552951812744, + "learning_rate": 2.8967076923076923e-05, + "loss": 0.0191, + "step": 5860 + }, + { + "epoch": 38.05844155844156, + "grad_norm": 0.3540251553058624, + "learning_rate": 2.896676923076923e-05, + "loss": 0.014, + "step": 5861 + }, + { + "epoch": 38.064935064935064, + "grad_norm": 0.48857712745666504, + "learning_rate": 2.896646153846154e-05, + "loss": 0.0151, + "step": 5862 + }, + { + "epoch": 38.07142857142857, + "grad_norm": 0.4713970124721527, + "learning_rate": 2.8966153846153848e-05, + "loss": 0.0128, + "step": 5863 + }, + { + "epoch": 38.077922077922075, + "grad_norm": 0.31919345259666443, + "learning_rate": 2.8965846153846156e-05, + "loss": 0.0119, + "step": 5864 + }, + { + "epoch": 38.08441558441559, + "grad_norm": 0.7923030257225037, + "learning_rate": 2.896553846153846e-05, + "loss": 0.0165, + "step": 5865 + }, + { + "epoch": 38.09090909090909, + "grad_norm": 0.25103074312210083, + "learning_rate": 2.896523076923077e-05, + "loss": 0.0076, + "step": 5866 + }, + { + "epoch": 38.0974025974026, + "grad_norm": 0.9443399906158447, + "learning_rate": 2.8964923076923078e-05, + "loss": 0.0208, + "step": 5867 + }, + { + "epoch": 38.103896103896105, + "grad_norm": 0.7961001396179199, + "learning_rate": 2.8964615384615383e-05, + "loss": 0.0303, + "step": 5868 + }, + { + "epoch": 38.11038961038961, + "grad_norm": 0.8084138035774231, + "learning_rate": 2.8964307692307694e-05, + "loss": 0.0133, + "step": 5869 + }, + { + "epoch": 38.116883116883116, + "grad_norm": 0.5023177266120911, + "learning_rate": 2.8964000000000003e-05, + "loss": 0.009, + "step": 5870 + }, + { + "epoch": 38.12337662337662, + "grad_norm": 1.263300895690918, + "learning_rate": 2.8963692307692307e-05, + "loss": 0.01, + "step": 5871 + }, + { + "epoch": 38.12987012987013, + "grad_norm": 0.15346093475818634, + "learning_rate": 2.8963384615384616e-05, + "loss": 0.0029, + "step": 5872 + }, + { + "epoch": 38.13636363636363, + "grad_norm": 0.6068188548088074, + "learning_rate": 2.8963076923076924e-05, + "loss": 0.0121, + "step": 5873 + }, + { + "epoch": 38.142857142857146, + "grad_norm": 0.6815487146377563, + "learning_rate": 2.896276923076923e-05, + "loss": 0.0122, + "step": 5874 + }, + { + "epoch": 38.14935064935065, + "grad_norm": 0.6805928349494934, + "learning_rate": 2.8962461538461537e-05, + "loss": 0.0098, + "step": 5875 + }, + { + "epoch": 38.15584415584416, + "grad_norm": 0.5679097175598145, + "learning_rate": 2.896215384615385e-05, + "loss": 0.0156, + "step": 5876 + }, + { + "epoch": 38.16233766233766, + "grad_norm": 0.40975064039230347, + "learning_rate": 2.8961846153846154e-05, + "loss": 0.0045, + "step": 5877 + }, + { + "epoch": 38.16883116883117, + "grad_norm": 0.8027766346931458, + "learning_rate": 2.8961538461538462e-05, + "loss": 0.0105, + "step": 5878 + }, + { + "epoch": 38.175324675324674, + "grad_norm": 0.30105969309806824, + "learning_rate": 2.896123076923077e-05, + "loss": 0.0049, + "step": 5879 + }, + { + "epoch": 38.18181818181818, + "grad_norm": 0.4990985095500946, + "learning_rate": 2.8960923076923076e-05, + "loss": 0.0321, + "step": 5880 + }, + { + "epoch": 38.188311688311686, + "grad_norm": 0.8950565457344055, + "learning_rate": 2.8960615384615384e-05, + "loss": 0.0137, + "step": 5881 + }, + { + "epoch": 38.1948051948052, + "grad_norm": 0.21951325237751007, + "learning_rate": 2.8960307692307692e-05, + "loss": 0.003, + "step": 5882 + }, + { + "epoch": 38.201298701298704, + "grad_norm": 0.8957061171531677, + "learning_rate": 2.896e-05, + "loss": 0.0346, + "step": 5883 + }, + { + "epoch": 38.20779220779221, + "grad_norm": 0.23368746042251587, + "learning_rate": 2.895969230769231e-05, + "loss": 0.0039, + "step": 5884 + }, + { + "epoch": 38.214285714285715, + "grad_norm": 0.3076799213886261, + "learning_rate": 2.8959384615384617e-05, + "loss": 0.0086, + "step": 5885 + }, + { + "epoch": 38.22077922077922, + "grad_norm": 0.6283580660820007, + "learning_rate": 2.8959076923076922e-05, + "loss": 0.0077, + "step": 5886 + }, + { + "epoch": 38.22727272727273, + "grad_norm": 0.48088017106056213, + "learning_rate": 2.895876923076923e-05, + "loss": 0.0068, + "step": 5887 + }, + { + "epoch": 38.23376623376623, + "grad_norm": 1.0112690925598145, + "learning_rate": 2.895846153846154e-05, + "loss": 0.0069, + "step": 5888 + }, + { + "epoch": 38.24025974025974, + "grad_norm": 3.1249289512634277, + "learning_rate": 2.8958153846153844e-05, + "loss": 0.0456, + "step": 5889 + }, + { + "epoch": 38.246753246753244, + "grad_norm": 1.8980683088302612, + "learning_rate": 2.8957846153846155e-05, + "loss": 0.049, + "step": 5890 + }, + { + "epoch": 38.253246753246756, + "grad_norm": 0.6966969966888428, + "learning_rate": 2.8957538461538464e-05, + "loss": 0.079, + "step": 5891 + }, + { + "epoch": 38.25974025974026, + "grad_norm": 0.50810307264328, + "learning_rate": 2.895723076923077e-05, + "loss": 0.0532, + "step": 5892 + }, + { + "epoch": 38.26623376623377, + "grad_norm": 0.6730650067329407, + "learning_rate": 2.8956923076923077e-05, + "loss": 0.0406, + "step": 5893 + }, + { + "epoch": 38.27272727272727, + "grad_norm": 0.8003307580947876, + "learning_rate": 2.8956615384615385e-05, + "loss": 0.0473, + "step": 5894 + }, + { + "epoch": 38.27922077922078, + "grad_norm": 0.36403682827949524, + "learning_rate": 2.895630769230769e-05, + "loss": 0.0263, + "step": 5895 + }, + { + "epoch": 38.285714285714285, + "grad_norm": 0.359247088432312, + "learning_rate": 2.8956e-05, + "loss": 0.0172, + "step": 5896 + }, + { + "epoch": 38.29220779220779, + "grad_norm": 0.26991915702819824, + "learning_rate": 2.895569230769231e-05, + "loss": 0.0153, + "step": 5897 + }, + { + "epoch": 38.298701298701296, + "grad_norm": 0.4487290680408478, + "learning_rate": 2.895538461538462e-05, + "loss": 0.02, + "step": 5898 + }, + { + "epoch": 38.3051948051948, + "grad_norm": 0.41255083680152893, + "learning_rate": 2.8955076923076924e-05, + "loss": 0.0127, + "step": 5899 + }, + { + "epoch": 38.311688311688314, + "grad_norm": 0.4111977815628052, + "learning_rate": 2.8954769230769232e-05, + "loss": 0.0173, + "step": 5900 + }, + { + "epoch": 38.31818181818182, + "grad_norm": 0.32981088757514954, + "learning_rate": 2.895446153846154e-05, + "loss": 0.0126, + "step": 5901 + }, + { + "epoch": 38.324675324675326, + "grad_norm": 0.37807318568229675, + "learning_rate": 2.8954153846153845e-05, + "loss": 0.0152, + "step": 5902 + }, + { + "epoch": 38.33116883116883, + "grad_norm": 0.43366217613220215, + "learning_rate": 2.8953846153846153e-05, + "loss": 0.0146, + "step": 5903 + }, + { + "epoch": 38.33766233766234, + "grad_norm": 0.3417205214500427, + "learning_rate": 2.8953538461538465e-05, + "loss": 0.0093, + "step": 5904 + }, + { + "epoch": 38.34415584415584, + "grad_norm": 0.5234576463699341, + "learning_rate": 2.895323076923077e-05, + "loss": 0.0083, + "step": 5905 + }, + { + "epoch": 38.35064935064935, + "grad_norm": 0.2839077115058899, + "learning_rate": 2.895292307692308e-05, + "loss": 0.0076, + "step": 5906 + }, + { + "epoch": 38.357142857142854, + "grad_norm": 0.35945460200309753, + "learning_rate": 2.8952615384615387e-05, + "loss": 0.0085, + "step": 5907 + }, + { + "epoch": 38.36363636363637, + "grad_norm": 0.5638504028320312, + "learning_rate": 2.895230769230769e-05, + "loss": 0.0093, + "step": 5908 + }, + { + "epoch": 38.37012987012987, + "grad_norm": 1.1708959341049194, + "learning_rate": 2.8952e-05, + "loss": 0.0206, + "step": 5909 + }, + { + "epoch": 38.37662337662338, + "grad_norm": 0.12267431616783142, + "learning_rate": 2.8951692307692308e-05, + "loss": 0.0016, + "step": 5910 + }, + { + "epoch": 38.383116883116884, + "grad_norm": 0.5171460509300232, + "learning_rate": 2.8951384615384617e-05, + "loss": 0.0113, + "step": 5911 + }, + { + "epoch": 38.38961038961039, + "grad_norm": 0.5932318568229675, + "learning_rate": 2.8951076923076925e-05, + "loss": 0.0059, + "step": 5912 + }, + { + "epoch": 38.396103896103895, + "grad_norm": 0.5508298873901367, + "learning_rate": 2.8950769230769233e-05, + "loss": 0.0046, + "step": 5913 + }, + { + "epoch": 38.4025974025974, + "grad_norm": 0.5931794047355652, + "learning_rate": 2.8950461538461538e-05, + "loss": 0.0061, + "step": 5914 + }, + { + "epoch": 38.40909090909091, + "grad_norm": 1.0745666027069092, + "learning_rate": 2.8950153846153846e-05, + "loss": 0.0184, + "step": 5915 + }, + { + "epoch": 38.41558441558441, + "grad_norm": 1.1727707386016846, + "learning_rate": 2.8949846153846155e-05, + "loss": 0.0113, + "step": 5916 + }, + { + "epoch": 38.422077922077925, + "grad_norm": 1.9281693696975708, + "learning_rate": 2.894953846153846e-05, + "loss": 0.028, + "step": 5917 + }, + { + "epoch": 38.42857142857143, + "grad_norm": 0.5420227646827698, + "learning_rate": 2.894923076923077e-05, + "loss": 0.0056, + "step": 5918 + }, + { + "epoch": 38.435064935064936, + "grad_norm": 0.42538145184516907, + "learning_rate": 2.894892307692308e-05, + "loss": 0.0079, + "step": 5919 + }, + { + "epoch": 38.44155844155844, + "grad_norm": 1.2814815044403076, + "learning_rate": 2.8948615384615385e-05, + "loss": 0.0142, + "step": 5920 + }, + { + "epoch": 38.44805194805195, + "grad_norm": 0.6370809078216553, + "learning_rate": 2.8948307692307693e-05, + "loss": 0.0208, + "step": 5921 + }, + { + "epoch": 38.45454545454545, + "grad_norm": 1.0014452934265137, + "learning_rate": 2.8948e-05, + "loss": 0.0525, + "step": 5922 + }, + { + "epoch": 38.46103896103896, + "grad_norm": 0.9764885306358337, + "learning_rate": 2.8947692307692306e-05, + "loss": 0.0139, + "step": 5923 + }, + { + "epoch": 38.467532467532465, + "grad_norm": 1.2797648906707764, + "learning_rate": 2.8947384615384618e-05, + "loss": 0.0118, + "step": 5924 + }, + { + "epoch": 38.47402597402598, + "grad_norm": 0.826373279094696, + "learning_rate": 2.8947076923076926e-05, + "loss": 0.0089, + "step": 5925 + }, + { + "epoch": 38.48051948051948, + "grad_norm": 1.1263399124145508, + "learning_rate": 2.894676923076923e-05, + "loss": 0.0097, + "step": 5926 + }, + { + "epoch": 38.48701298701299, + "grad_norm": 1.4181472063064575, + "learning_rate": 2.894646153846154e-05, + "loss": 0.0233, + "step": 5927 + }, + { + "epoch": 38.493506493506494, + "grad_norm": 0.6018449664115906, + "learning_rate": 2.8946153846153848e-05, + "loss": 0.0187, + "step": 5928 + }, + { + "epoch": 38.5, + "grad_norm": 0.7165797352790833, + "learning_rate": 2.8945846153846153e-05, + "loss": 0.0856, + "step": 5929 + }, + { + "epoch": 38.506493506493506, + "grad_norm": 0.7134907841682434, + "learning_rate": 2.894553846153846e-05, + "loss": 0.0648, + "step": 5930 + }, + { + "epoch": 38.51298701298701, + "grad_norm": 0.4745461642742157, + "learning_rate": 2.8945230769230773e-05, + "loss": 0.0426, + "step": 5931 + }, + { + "epoch": 38.51948051948052, + "grad_norm": 0.40484580397605896, + "learning_rate": 2.8944923076923078e-05, + "loss": 0.0441, + "step": 5932 + }, + { + "epoch": 38.52597402597402, + "grad_norm": 0.3975403308868408, + "learning_rate": 2.8944615384615386e-05, + "loss": 0.0317, + "step": 5933 + }, + { + "epoch": 38.532467532467535, + "grad_norm": 0.4806075692176819, + "learning_rate": 2.8944307692307694e-05, + "loss": 0.029, + "step": 5934 + }, + { + "epoch": 38.53896103896104, + "grad_norm": 0.4230036437511444, + "learning_rate": 2.8944e-05, + "loss": 0.0254, + "step": 5935 + }, + { + "epoch": 38.54545454545455, + "grad_norm": 0.40213966369628906, + "learning_rate": 2.8943692307692308e-05, + "loss": 0.0261, + "step": 5936 + }, + { + "epoch": 38.55194805194805, + "grad_norm": 0.3931240737438202, + "learning_rate": 2.8943384615384616e-05, + "loss": 0.0212, + "step": 5937 + }, + { + "epoch": 38.55844155844156, + "grad_norm": 0.3137282431125641, + "learning_rate": 2.8943076923076924e-05, + "loss": 0.0155, + "step": 5938 + }, + { + "epoch": 38.564935064935064, + "grad_norm": 0.43438827991485596, + "learning_rate": 2.8942769230769233e-05, + "loss": 0.0154, + "step": 5939 + }, + { + "epoch": 38.57142857142857, + "grad_norm": 0.4714788794517517, + "learning_rate": 2.894246153846154e-05, + "loss": 0.0216, + "step": 5940 + }, + { + "epoch": 38.577922077922075, + "grad_norm": 0.4681532084941864, + "learning_rate": 2.8942153846153846e-05, + "loss": 0.0188, + "step": 5941 + }, + { + "epoch": 38.58441558441559, + "grad_norm": 0.4101855754852295, + "learning_rate": 2.8941846153846154e-05, + "loss": 0.0316, + "step": 5942 + }, + { + "epoch": 38.59090909090909, + "grad_norm": 0.8212214112281799, + "learning_rate": 2.8941538461538463e-05, + "loss": 0.0098, + "step": 5943 + }, + { + "epoch": 38.5974025974026, + "grad_norm": 0.3889029622077942, + "learning_rate": 2.8941230769230767e-05, + "loss": 0.0125, + "step": 5944 + }, + { + "epoch": 38.603896103896105, + "grad_norm": 0.31291624903678894, + "learning_rate": 2.894092307692308e-05, + "loss": 0.0087, + "step": 5945 + }, + { + "epoch": 38.61038961038961, + "grad_norm": 0.4163176119327545, + "learning_rate": 2.8940615384615387e-05, + "loss": 0.0151, + "step": 5946 + }, + { + "epoch": 38.616883116883116, + "grad_norm": 0.3109338879585266, + "learning_rate": 2.8940307692307692e-05, + "loss": 0.005, + "step": 5947 + }, + { + "epoch": 38.62337662337662, + "grad_norm": 0.4024008810520172, + "learning_rate": 2.894e-05, + "loss": 0.0078, + "step": 5948 + }, + { + "epoch": 38.62987012987013, + "grad_norm": 0.2557741105556488, + "learning_rate": 2.893969230769231e-05, + "loss": 0.0047, + "step": 5949 + }, + { + "epoch": 38.63636363636363, + "grad_norm": 0.5303022861480713, + "learning_rate": 2.8939384615384614e-05, + "loss": 0.0155, + "step": 5950 + }, + { + "epoch": 38.642857142857146, + "grad_norm": 0.8672530651092529, + "learning_rate": 2.8939076923076922e-05, + "loss": 0.0201, + "step": 5951 + }, + { + "epoch": 38.64935064935065, + "grad_norm": 0.8533628582954407, + "learning_rate": 2.8938769230769234e-05, + "loss": 0.0081, + "step": 5952 + }, + { + "epoch": 38.65584415584416, + "grad_norm": 0.48627012968063354, + "learning_rate": 2.893846153846154e-05, + "loss": 0.0142, + "step": 5953 + }, + { + "epoch": 38.66233766233766, + "grad_norm": 0.8721800446510315, + "learning_rate": 2.8938153846153847e-05, + "loss": 0.0129, + "step": 5954 + }, + { + "epoch": 38.66883116883117, + "grad_norm": 0.5573851466178894, + "learning_rate": 2.8937846153846156e-05, + "loss": 0.009, + "step": 5955 + }, + { + "epoch": 38.675324675324674, + "grad_norm": 0.35611802339553833, + "learning_rate": 2.893753846153846e-05, + "loss": 0.0108, + "step": 5956 + }, + { + "epoch": 38.68181818181818, + "grad_norm": 0.7225804328918457, + "learning_rate": 2.893723076923077e-05, + "loss": 0.0091, + "step": 5957 + }, + { + "epoch": 38.688311688311686, + "grad_norm": 1.0647177696228027, + "learning_rate": 2.8936923076923077e-05, + "loss": 0.008, + "step": 5958 + }, + { + "epoch": 38.6948051948052, + "grad_norm": 0.27271005511283875, + "learning_rate": 2.8936615384615385e-05, + "loss": 0.0041, + "step": 5959 + }, + { + "epoch": 38.701298701298704, + "grad_norm": 0.8564441800117493, + "learning_rate": 2.8936307692307694e-05, + "loss": 0.0104, + "step": 5960 + }, + { + "epoch": 38.70779220779221, + "grad_norm": 0.8845938444137573, + "learning_rate": 2.8936000000000002e-05, + "loss": 0.0077, + "step": 5961 + }, + { + "epoch": 38.714285714285715, + "grad_norm": 0.567851185798645, + "learning_rate": 2.8935692307692307e-05, + "loss": 0.0068, + "step": 5962 + }, + { + "epoch": 38.72077922077922, + "grad_norm": 0.7509989738464355, + "learning_rate": 2.8935384615384615e-05, + "loss": 0.0043, + "step": 5963 + }, + { + "epoch": 38.72727272727273, + "grad_norm": 0.5271767973899841, + "learning_rate": 2.8935076923076924e-05, + "loss": 0.005, + "step": 5964 + }, + { + "epoch": 38.73376623376623, + "grad_norm": 0.5564625263214111, + "learning_rate": 2.893476923076923e-05, + "loss": 0.0171, + "step": 5965 + }, + { + "epoch": 38.74025974025974, + "grad_norm": 0.4576219618320465, + "learning_rate": 2.893446153846154e-05, + "loss": 0.0045, + "step": 5966 + }, + { + "epoch": 38.746753246753244, + "grad_norm": 0.6022686958312988, + "learning_rate": 2.893415384615385e-05, + "loss": 0.0684, + "step": 5967 + }, + { + "epoch": 38.753246753246756, + "grad_norm": 0.532085657119751, + "learning_rate": 2.8933846153846154e-05, + "loss": 0.0574, + "step": 5968 + }, + { + "epoch": 38.75974025974026, + "grad_norm": 0.5232948064804077, + "learning_rate": 2.8933538461538462e-05, + "loss": 0.0385, + "step": 5969 + }, + { + "epoch": 38.76623376623377, + "grad_norm": 0.40600380301475525, + "learning_rate": 2.893323076923077e-05, + "loss": 0.0252, + "step": 5970 + }, + { + "epoch": 38.77272727272727, + "grad_norm": 0.4164242744445801, + "learning_rate": 2.8932923076923075e-05, + "loss": 0.03, + "step": 5971 + }, + { + "epoch": 38.77922077922078, + "grad_norm": 0.3534366488456726, + "learning_rate": 2.8932615384615384e-05, + "loss": 0.0163, + "step": 5972 + }, + { + "epoch": 38.785714285714285, + "grad_norm": 0.32166191935539246, + "learning_rate": 2.8932307692307695e-05, + "loss": 0.0223, + "step": 5973 + }, + { + "epoch": 38.79220779220779, + "grad_norm": 0.39083895087242126, + "learning_rate": 2.8932e-05, + "loss": 0.0165, + "step": 5974 + }, + { + "epoch": 38.798701298701296, + "grad_norm": 0.3537064790725708, + "learning_rate": 2.893169230769231e-05, + "loss": 0.0325, + "step": 5975 + }, + { + "epoch": 38.8051948051948, + "grad_norm": 0.5228418707847595, + "learning_rate": 2.8931384615384617e-05, + "loss": 0.0201, + "step": 5976 + }, + { + "epoch": 38.811688311688314, + "grad_norm": 0.2531750500202179, + "learning_rate": 2.8931076923076925e-05, + "loss": 0.0098, + "step": 5977 + }, + { + "epoch": 38.81818181818182, + "grad_norm": 0.3136075437068939, + "learning_rate": 2.893076923076923e-05, + "loss": 0.0091, + "step": 5978 + }, + { + "epoch": 38.824675324675326, + "grad_norm": 0.31537479162216187, + "learning_rate": 2.893046153846154e-05, + "loss": 0.0096, + "step": 5979 + }, + { + "epoch": 38.83116883116883, + "grad_norm": 0.41998496651649475, + "learning_rate": 2.893015384615385e-05, + "loss": 0.0138, + "step": 5980 + }, + { + "epoch": 38.83766233766234, + "grad_norm": 0.25479060411453247, + "learning_rate": 2.8929846153846155e-05, + "loss": 0.0073, + "step": 5981 + }, + { + "epoch": 38.84415584415584, + "grad_norm": 0.382602721452713, + "learning_rate": 2.8929538461538463e-05, + "loss": 0.011, + "step": 5982 + }, + { + "epoch": 38.85064935064935, + "grad_norm": 0.6440215706825256, + "learning_rate": 2.892923076923077e-05, + "loss": 0.0103, + "step": 5983 + }, + { + "epoch": 38.857142857142854, + "grad_norm": 0.31958869099617004, + "learning_rate": 2.8928923076923077e-05, + "loss": 0.0103, + "step": 5984 + }, + { + "epoch": 38.86363636363637, + "grad_norm": 0.28973206877708435, + "learning_rate": 2.8928615384615385e-05, + "loss": 0.0086, + "step": 5985 + }, + { + "epoch": 38.87012987012987, + "grad_norm": 0.2433740496635437, + "learning_rate": 2.8928307692307693e-05, + "loss": 0.007, + "step": 5986 + }, + { + "epoch": 38.87662337662338, + "grad_norm": 0.3104799687862396, + "learning_rate": 2.8928e-05, + "loss": 0.0064, + "step": 5987 + }, + { + "epoch": 38.883116883116884, + "grad_norm": 0.2099333554506302, + "learning_rate": 2.892769230769231e-05, + "loss": 0.0038, + "step": 5988 + }, + { + "epoch": 38.88961038961039, + "grad_norm": 0.2746211588382721, + "learning_rate": 2.8927384615384618e-05, + "loss": 0.0037, + "step": 5989 + }, + { + "epoch": 38.896103896103895, + "grad_norm": 0.4701051414012909, + "learning_rate": 2.8927076923076923e-05, + "loss": 0.0192, + "step": 5990 + }, + { + "epoch": 38.9025974025974, + "grad_norm": 0.8103092312812805, + "learning_rate": 2.892676923076923e-05, + "loss": 0.0038, + "step": 5991 + }, + { + "epoch": 38.90909090909091, + "grad_norm": 0.2507099509239197, + "learning_rate": 2.892646153846154e-05, + "loss": 0.0024, + "step": 5992 + }, + { + "epoch": 38.91558441558441, + "grad_norm": 0.6929746270179749, + "learning_rate": 2.8926153846153845e-05, + "loss": 0.0083, + "step": 5993 + }, + { + "epoch": 38.922077922077925, + "grad_norm": 1.030951976776123, + "learning_rate": 2.8925846153846156e-05, + "loss": 0.0186, + "step": 5994 + }, + { + "epoch": 38.92857142857143, + "grad_norm": 0.7033496499061584, + "learning_rate": 2.8925538461538465e-05, + "loss": 0.0207, + "step": 5995 + }, + { + "epoch": 38.935064935064936, + "grad_norm": 0.38966307044029236, + "learning_rate": 2.892523076923077e-05, + "loss": 0.0044, + "step": 5996 + }, + { + "epoch": 38.94155844155844, + "grad_norm": 0.5722068548202515, + "learning_rate": 2.8924923076923078e-05, + "loss": 0.0061, + "step": 5997 + }, + { + "epoch": 38.94805194805195, + "grad_norm": 0.504027247428894, + "learning_rate": 2.8924615384615386e-05, + "loss": 0.0072, + "step": 5998 + }, + { + "epoch": 38.95454545454545, + "grad_norm": 1.0058695077896118, + "learning_rate": 2.892430769230769e-05, + "loss": 0.0085, + "step": 5999 + }, + { + "epoch": 38.96103896103896, + "grad_norm": 7.794951438903809, + "learning_rate": 2.8924e-05, + "loss": 0.0138, + "step": 6000 + }, + { + "epoch": 38.96103896103896, + "eval_cer": 0.08826600273070435, + "eval_loss": 0.4513417184352875, + "eval_runtime": 8.2069, + "eval_samples_per_second": 12.307, + "eval_steps_per_second": 0.487, + "eval_wer": 0.28174603174603174, + "step": 6000 } ], "logging_steps": 1.0, @@ -35071,7 +42081,7 @@ "early_stopping_threshold": 0.0 }, "attributes": { - "early_stopping_patience_counter": 3 + "early_stopping_patience_counter": 4 } }, "TrainerControl": { @@ -35085,7 +42095,7 @@ "attributes": {} } }, - "total_flos": 6.805472800258521e+19, + "total_flos": 8.165773678594713e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null