{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994910941475827,
"eval_steps": 500,
"global_step": 491,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002035623409669211,
"grad_norm": 2244783.3021161165,
"learning_rate": 1.0000000000000001e-07,
"loss": 13.8236,
"step": 1
},
{
"epoch": 0.004071246819338422,
"grad_norm": 4218192.586319844,
"learning_rate": 2.0000000000000002e-07,
"loss": 13.7798,
"step": 2
},
{
"epoch": 0.0061068702290076335,
"grad_norm": 4594205.112133389,
"learning_rate": 3.0000000000000004e-07,
"loss": 13.844,
"step": 3
},
{
"epoch": 0.008142493638676845,
"grad_norm": 11263009.553503217,
"learning_rate": 4.0000000000000003e-07,
"loss": 13.8135,
"step": 4
},
{
"epoch": 0.010178117048346057,
"grad_norm": 8938655.40470094,
"learning_rate": 5.000000000000001e-07,
"loss": 13.8361,
"step": 5
},
{
"epoch": 0.012213740458015267,
"grad_norm": 3666969.2787727225,
"learning_rate": 6.000000000000001e-07,
"loss": 13.7754,
"step": 6
},
{
"epoch": 0.014249363867684479,
"grad_norm": 2213140.2581639104,
"learning_rate": 7.000000000000001e-07,
"loss": 13.816,
"step": 7
},
{
"epoch": 0.01628498727735369,
"grad_norm": 5204945.168357011,
"learning_rate": 8.000000000000001e-07,
"loss": 13.7919,
"step": 8
},
{
"epoch": 0.0183206106870229,
"grad_norm": 3161273.4979725075,
"learning_rate": 9.000000000000001e-07,
"loss": 13.7921,
"step": 9
},
{
"epoch": 0.020356234096692113,
"grad_norm": 4721972.526161844,
"learning_rate": 1.0000000000000002e-06,
"loss": 13.8129,
"step": 10
},
{
"epoch": 0.022391857506361322,
"grad_norm": 1993836.4706857507,
"learning_rate": 1.1e-06,
"loss": 13.7899,
"step": 11
},
{
"epoch": 0.024427480916030534,
"grad_norm": 2841663.624406631,
"learning_rate": 1.2000000000000002e-06,
"loss": 13.8222,
"step": 12
},
{
"epoch": 0.026463104325699746,
"grad_norm": 1851625.2694659713,
"learning_rate": 1.3e-06,
"loss": 13.8051,
"step": 13
},
{
"epoch": 0.028498727735368958,
"grad_norm": 1799567.0732118108,
"learning_rate": 1.4000000000000001e-06,
"loss": 13.8232,
"step": 14
},
{
"epoch": 0.030534351145038167,
"grad_norm": 1739244.434987474,
"learning_rate": 1.5e-06,
"loss": 13.787,
"step": 15
},
{
"epoch": 0.03256997455470738,
"grad_norm": 2049274.0550949178,
"learning_rate": 1.6000000000000001e-06,
"loss": 13.8272,
"step": 16
},
{
"epoch": 0.03460559796437659,
"grad_norm": 2196819.6130670137,
"learning_rate": 1.7000000000000002e-06,
"loss": 13.7819,
"step": 17
},
{
"epoch": 0.0366412213740458,
"grad_norm": 3122206.233837503,
"learning_rate": 1.8000000000000001e-06,
"loss": 13.7931,
"step": 18
},
{
"epoch": 0.03867684478371501,
"grad_norm": 6250105.385689335,
"learning_rate": 1.9000000000000002e-06,
"loss": 13.8169,
"step": 19
},
{
"epoch": 0.04071246819338423,
"grad_norm": 3333302.4953560205,
"learning_rate": 2.0000000000000003e-06,
"loss": 13.7969,
"step": 20
},
{
"epoch": 0.042748091603053436,
"grad_norm": 1842864.15838755,
"learning_rate": 2.1000000000000002e-06,
"loss": 13.7789,
"step": 21
},
{
"epoch": 0.044783715012722644,
"grad_norm": 2814803.976988681,
"learning_rate": 2.2e-06,
"loss": 13.8063,
"step": 22
},
{
"epoch": 0.04681933842239186,
"grad_norm": 5480609.064869341,
"learning_rate": 2.3000000000000004e-06,
"loss": 13.8059,
"step": 23
},
{
"epoch": 0.04885496183206107,
"grad_norm": 2330308.564996049,
"learning_rate": 2.4000000000000003e-06,
"loss": 13.7774,
"step": 24
},
{
"epoch": 0.05089058524173028,
"grad_norm": 1954680.474250264,
"learning_rate": 2.5e-06,
"loss": 13.8188,
"step": 25
},
{
"epoch": 0.05292620865139949,
"grad_norm": 2348178.4882796686,
"learning_rate": 2.6e-06,
"loss": 13.7916,
"step": 26
},
{
"epoch": 0.0549618320610687,
"grad_norm": 2063424.030791769,
"learning_rate": 2.7000000000000004e-06,
"loss": 13.8352,
"step": 27
},
{
"epoch": 0.056997455470737916,
"grad_norm": 3096470.2787967124,
"learning_rate": 2.8000000000000003e-06,
"loss": 13.814,
"step": 28
},
{
"epoch": 0.059033078880407125,
"grad_norm": 2313554.624060415,
"learning_rate": 2.9e-06,
"loss": 13.8149,
"step": 29
},
{
"epoch": 0.061068702290076333,
"grad_norm": 2759287.606120093,
"learning_rate": 3e-06,
"loss": 13.7901,
"step": 30
},
{
"epoch": 0.06310432569974554,
"grad_norm": 20688012.381375346,
"learning_rate": 3.1000000000000004e-06,
"loss": 13.8016,
"step": 31
},
{
"epoch": 0.06513994910941476,
"grad_norm": 2818955.21640328,
"learning_rate": 3.2000000000000003e-06,
"loss": 13.8174,
"step": 32
},
{
"epoch": 0.06717557251908397,
"grad_norm": 3642298.4739112426,
"learning_rate": 3.3000000000000006e-06,
"loss": 13.8439,
"step": 33
},
{
"epoch": 0.06921119592875317,
"grad_norm": 4530146.859043687,
"learning_rate": 3.4000000000000005e-06,
"loss": 13.798,
"step": 34
},
{
"epoch": 0.07124681933842239,
"grad_norm": 3901948.6988777495,
"learning_rate": 3.5e-06,
"loss": 13.7871,
"step": 35
},
{
"epoch": 0.0732824427480916,
"grad_norm": 2763745.5952281994,
"learning_rate": 3.6000000000000003e-06,
"loss": 13.7899,
"step": 36
},
{
"epoch": 0.07531806615776081,
"grad_norm": 6149669.662012621,
"learning_rate": 3.7e-06,
"loss": 13.7854,
"step": 37
},
{
"epoch": 0.07735368956743002,
"grad_norm": 2493897.7698470936,
"learning_rate": 3.8000000000000005e-06,
"loss": 13.7955,
"step": 38
},
{
"epoch": 0.07938931297709924,
"grad_norm": 1961343.832258401,
"learning_rate": 3.900000000000001e-06,
"loss": 13.8098,
"step": 39
},
{
"epoch": 0.08142493638676845,
"grad_norm": 2547368.4155277675,
"learning_rate": 4.000000000000001e-06,
"loss": 13.8144,
"step": 40
},
{
"epoch": 0.08346055979643766,
"grad_norm": 12326034.534851212,
"learning_rate": 4.1e-06,
"loss": 13.7863,
"step": 41
},
{
"epoch": 0.08549618320610687,
"grad_norm": 3960181.1514259093,
"learning_rate": 4.2000000000000004e-06,
"loss": 13.766,
"step": 42
},
{
"epoch": 0.08753180661577609,
"grad_norm": 10011098.88152794,
"learning_rate": 4.3e-06,
"loss": 13.8034,
"step": 43
},
{
"epoch": 0.08956743002544529,
"grad_norm": 3789423.3303024317,
"learning_rate": 4.4e-06,
"loss": 13.791,
"step": 44
},
{
"epoch": 0.0916030534351145,
"grad_norm": 3039428.004186018,
"learning_rate": 4.5e-06,
"loss": 13.7903,
"step": 45
},
{
"epoch": 0.09363867684478372,
"grad_norm": 3356582.7961585973,
"learning_rate": 4.600000000000001e-06,
"loss": 13.8235,
"step": 46
},
{
"epoch": 0.09567430025445292,
"grad_norm": 2820493.914527673,
"learning_rate": 4.7e-06,
"loss": 13.8022,
"step": 47
},
{
"epoch": 0.09770992366412214,
"grad_norm": 3791222.908966115,
"learning_rate": 4.800000000000001e-06,
"loss": 13.7717,
"step": 48
},
{
"epoch": 0.09974554707379135,
"grad_norm": 3283270.2287080307,
"learning_rate": 4.9000000000000005e-06,
"loss": 13.787,
"step": 49
},
{
"epoch": 0.10178117048346055,
"grad_norm": 2485970.4909266913,
"learning_rate": 5e-06,
"loss": 13.7976,
"step": 50
},
{
"epoch": 0.10381679389312977,
"grad_norm": 4547104.520705372,
"learning_rate": 5.1e-06,
"loss": 13.7639,
"step": 51
},
{
"epoch": 0.10585241730279898,
"grad_norm": 2824615.9607255333,
"learning_rate": 5.2e-06,
"loss": 13.7782,
"step": 52
},
{
"epoch": 0.1078880407124682,
"grad_norm": 2095536.7289427207,
"learning_rate": 5.300000000000001e-06,
"loss": 13.8148,
"step": 53
},
{
"epoch": 0.1099236641221374,
"grad_norm": 2351584.5173169677,
"learning_rate": 5.400000000000001e-06,
"loss": 13.7998,
"step": 54
},
{
"epoch": 0.11195928753180662,
"grad_norm": 3228695.507820654,
"learning_rate": 5.500000000000001e-06,
"loss": 13.8394,
"step": 55
},
{
"epoch": 0.11399491094147583,
"grad_norm": 9313699.37550104,
"learning_rate": 5.600000000000001e-06,
"loss": 13.7991,
"step": 56
},
{
"epoch": 0.11603053435114503,
"grad_norm": 3293386.5704521113,
"learning_rate": 5.7e-06,
"loss": 13.7839,
"step": 57
},
{
"epoch": 0.11806615776081425,
"grad_norm": 3718314.8927475032,
"learning_rate": 5.8e-06,
"loss": 13.811,
"step": 58
},
{
"epoch": 0.12010178117048347,
"grad_norm": 3124256.4145811866,
"learning_rate": 5.9e-06,
"loss": 13.7796,
"step": 59
},
{
"epoch": 0.12213740458015267,
"grad_norm": 3177179.019257336,
"learning_rate": 6e-06,
"loss": 13.7927,
"step": 60
},
{
"epoch": 0.12417302798982188,
"grad_norm": 3023297.2648282214,
"learning_rate": 6.1e-06,
"loss": 13.7867,
"step": 61
},
{
"epoch": 0.12620865139949108,
"grad_norm": 3214697.401730724,
"learning_rate": 6.200000000000001e-06,
"loss": 13.7771,
"step": 62
},
{
"epoch": 0.1282442748091603,
"grad_norm": 5100755.95841697,
"learning_rate": 6.300000000000001e-06,
"loss": 13.8624,
"step": 63
},
{
"epoch": 0.13027989821882952,
"grad_norm": 2723163.1725419424,
"learning_rate": 6.4000000000000006e-06,
"loss": 13.8343,
"step": 64
},
{
"epoch": 0.13231552162849872,
"grad_norm": 3190220.582667358,
"learning_rate": 6.5000000000000004e-06,
"loss": 13.8278,
"step": 65
},
{
"epoch": 0.13435114503816795,
"grad_norm": 4004364.9481327008,
"learning_rate": 6.600000000000001e-06,
"loss": 13.8181,
"step": 66
},
{
"epoch": 0.13638676844783715,
"grad_norm": 3042780.596978967,
"learning_rate": 6.700000000000001e-06,
"loss": 13.8136,
"step": 67
},
{
"epoch": 0.13842239185750635,
"grad_norm": 3848690.0005479343,
"learning_rate": 6.800000000000001e-06,
"loss": 13.8067,
"step": 68
},
{
"epoch": 0.14045801526717558,
"grad_norm": 3081170.591628097,
"learning_rate": 6.9e-06,
"loss": 13.802,
"step": 69
},
{
"epoch": 0.14249363867684478,
"grad_norm": 3465407.339307021,
"learning_rate": 7e-06,
"loss": 13.7681,
"step": 70
},
{
"epoch": 0.14452926208651398,
"grad_norm": 2647645.8680279553,
"learning_rate": 7.100000000000001e-06,
"loss": 13.8244,
"step": 71
},
{
"epoch": 0.1465648854961832,
"grad_norm": 2676701.9454676,
"learning_rate": 7.2000000000000005e-06,
"loss": 13.8011,
"step": 72
},
{
"epoch": 0.1486005089058524,
"grad_norm": 2103570.9894912024,
"learning_rate": 7.3e-06,
"loss": 13.7988,
"step": 73
},
{
"epoch": 0.15063613231552161,
"grad_norm": 2809043.7511407104,
"learning_rate": 7.4e-06,
"loss": 13.8103,
"step": 74
},
{
"epoch": 0.15267175572519084,
"grad_norm": 4000235.7083044164,
"learning_rate": 7.500000000000001e-06,
"loss": 13.8168,
"step": 75
},
{
"epoch": 0.15470737913486005,
"grad_norm": 8459155.529969739,
"learning_rate": 7.600000000000001e-06,
"loss": 13.774,
"step": 76
},
{
"epoch": 0.15674300254452928,
"grad_norm": 16399233.534973303,
"learning_rate": 7.7e-06,
"loss": 13.8219,
"step": 77
},
{
"epoch": 0.15877862595419848,
"grad_norm": 2696806.469277922,
"learning_rate": 7.800000000000002e-06,
"loss": 13.7919,
"step": 78
},
{
"epoch": 0.16081424936386768,
"grad_norm": 6832348.586697958,
"learning_rate": 7.9e-06,
"loss": 13.8085,
"step": 79
},
{
"epoch": 0.1628498727735369,
"grad_norm": 2802377.0728139468,
"learning_rate": 8.000000000000001e-06,
"loss": 13.7758,
"step": 80
},
{
"epoch": 0.1648854961832061,
"grad_norm": 3693570.1419803873,
"learning_rate": 8.1e-06,
"loss": 13.8108,
"step": 81
},
{
"epoch": 0.1669211195928753,
"grad_norm": 1899967.9199083971,
"learning_rate": 8.2e-06,
"loss": 13.7788,
"step": 82
},
{
"epoch": 0.16895674300254454,
"grad_norm": 2632120.314827873,
"learning_rate": 8.3e-06,
"loss": 13.7979,
"step": 83
},
{
"epoch": 0.17099236641221374,
"grad_norm": 2240799.146098359,
"learning_rate": 8.400000000000001e-06,
"loss": 13.7656,
"step": 84
},
{
"epoch": 0.17302798982188294,
"grad_norm": 2881127.214705138,
"learning_rate": 8.5e-06,
"loss": 13.8209,
"step": 85
},
{
"epoch": 0.17506361323155217,
"grad_norm": 4313944.882142538,
"learning_rate": 8.6e-06,
"loss": 13.8141,
"step": 86
},
{
"epoch": 0.17709923664122137,
"grad_norm": 4205463.004393998,
"learning_rate": 8.700000000000001e-06,
"loss": 13.7967,
"step": 87
},
{
"epoch": 0.17913486005089058,
"grad_norm": 2478941.649437644,
"learning_rate": 8.8e-06,
"loss": 13.7916,
"step": 88
},
{
"epoch": 0.1811704834605598,
"grad_norm": 3528590.3048747736,
"learning_rate": 8.900000000000001e-06,
"loss": 13.7464,
"step": 89
},
{
"epoch": 0.183206106870229,
"grad_norm": 2318484.135702536,
"learning_rate": 9e-06,
"loss": 13.8293,
"step": 90
},
{
"epoch": 0.1852417302798982,
"grad_norm": 2158526.8137466703,
"learning_rate": 9.100000000000001e-06,
"loss": 13.7882,
"step": 91
},
{
"epoch": 0.18727735368956744,
"grad_norm": 3658620.230343455,
"learning_rate": 9.200000000000002e-06,
"loss": 13.7981,
"step": 92
},
{
"epoch": 0.18931297709923664,
"grad_norm": 2456882.2321177353,
"learning_rate": 9.3e-06,
"loss": 13.8297,
"step": 93
},
{
"epoch": 0.19134860050890584,
"grad_norm": 2990599.371813722,
"learning_rate": 9.4e-06,
"loss": 13.7921,
"step": 94
},
{
"epoch": 0.19338422391857507,
"grad_norm": 5560006.043017588,
"learning_rate": 9.5e-06,
"loss": 13.8165,
"step": 95
},
{
"epoch": 0.19541984732824427,
"grad_norm": 1762528.334237519,
"learning_rate": 9.600000000000001e-06,
"loss": 13.794,
"step": 96
},
{
"epoch": 0.19745547073791347,
"grad_norm": 2238736.0246347915,
"learning_rate": 9.7e-06,
"loss": 13.8312,
"step": 97
},
{
"epoch": 0.1994910941475827,
"grad_norm": 1991545.6391396692,
"learning_rate": 9.800000000000001e-06,
"loss": 13.7997,
"step": 98
},
{
"epoch": 0.2015267175572519,
"grad_norm": 2812661.8889751355,
"learning_rate": 9.9e-06,
"loss": 13.7873,
"step": 99
},
{
"epoch": 0.2035623409669211,
"grad_norm": 2569208.410955407,
"learning_rate": 1e-05,
"loss": 13.8016,
"step": 100
},
{
"epoch": 0.20559796437659034,
"grad_norm": 5078922.075783424,
"learning_rate": 9.999838607294157e-06,
"loss": 13.8041,
"step": 101
},
{
"epoch": 0.20763358778625954,
"grad_norm": 2652997.6261025066,
"learning_rate": 9.999354439595668e-06,
"loss": 13.7995,
"step": 102
},
{
"epoch": 0.20966921119592874,
"grad_norm": 7720533.0329790395,
"learning_rate": 9.998547528160987e-06,
"loss": 13.7946,
"step": 103
},
{
"epoch": 0.21170483460559797,
"grad_norm": 4996759.547611104,
"learning_rate": 9.997417925081963e-06,
"loss": 13.8414,
"step": 104
},
{
"epoch": 0.21374045801526717,
"grad_norm": 2594583.6722655715,
"learning_rate": 9.995965703282472e-06,
"loss": 13.8074,
"step": 105
},
{
"epoch": 0.2157760814249364,
"grad_norm": 2448594.3176417607,
"learning_rate": 9.99419095651372e-06,
"loss": 13.7975,
"step": 106
},
{
"epoch": 0.2178117048346056,
"grad_norm": 2714056.8451403086,
"learning_rate": 9.992093799348182e-06,
"loss": 13.8033,
"step": 107
},
{
"epoch": 0.2198473282442748,
"grad_norm": 2961036.7491148347,
"learning_rate": 9.9896743671722e-06,
"loss": 13.8288,
"step": 108
},
{
"epoch": 0.22188295165394403,
"grad_norm": 4655525.14394145,
"learning_rate": 9.986932816177258e-06,
"loss": 13.809,
"step": 109
},
{
"epoch": 0.22391857506361323,
"grad_norm": 4477853.58417314,
"learning_rate": 9.98386932334989e-06,
"loss": 13.7901,
"step": 110
},
{
"epoch": 0.22595419847328244,
"grad_norm": 7786381.525914281,
"learning_rate": 9.980484086460258e-06,
"loss": 13.7985,
"step": 111
},
{
"epoch": 0.22798982188295167,
"grad_norm": 2476772.377738302,
"learning_rate": 9.976777324049374e-06,
"loss": 13.8245,
"step": 112
},
{
"epoch": 0.23002544529262087,
"grad_norm": 2424723.3375852606,
"learning_rate": 9.972749275415005e-06,
"loss": 13.8106,
"step": 113
},
{
"epoch": 0.23206106870229007,
"grad_norm": 3908900.533498587,
"learning_rate": 9.96840020059622e-06,
"loss": 13.8147,
"step": 114
},
{
"epoch": 0.2340966921119593,
"grad_norm": 6536739.236436032,
"learning_rate": 9.963730380356599e-06,
"loss": 13.7825,
"step": 115
},
{
"epoch": 0.2361323155216285,
"grad_norm": 2895977.217837065,
"learning_rate": 9.958740116166113e-06,
"loss": 13.8305,
"step": 116
},
{
"epoch": 0.2381679389312977,
"grad_norm": 3560241.3442100273,
"learning_rate": 9.953429730181653e-06,
"loss": 13.8031,
"step": 117
},
{
"epoch": 0.24020356234096693,
"grad_norm": 1955074.0369023099,
"learning_rate": 9.947799565226253e-06,
"loss": 13.8003,
"step": 118
},
{
"epoch": 0.24223918575063613,
"grad_norm": 3497318.257096594,
"learning_rate": 9.94184998476693e-06,
"loss": 13.8214,
"step": 119
},
{
"epoch": 0.24427480916030533,
"grad_norm": 3346632.116332331,
"learning_rate": 9.93558137289124e-06,
"loss": 13.7768,
"step": 120
},
{
"epoch": 0.24631043256997456,
"grad_norm": 2439047.5049070125,
"learning_rate": 9.928994134282477e-06,
"loss": 13.8308,
"step": 121
},
{
"epoch": 0.24834605597964376,
"grad_norm": 2664458.243837202,
"learning_rate": 9.922088694193546e-06,
"loss": 13.8132,
"step": 122
},
{
"epoch": 0.250381679389313,
"grad_norm": 2619630.1649600505,
"learning_rate": 9.91486549841951e-06,
"loss": 13.8046,
"step": 123
},
{
"epoch": 0.25241730279898217,
"grad_norm": 6268665.590956273,
"learning_rate": 9.907325013268816e-06,
"loss": 13.8003,
"step": 124
},
{
"epoch": 0.2544529262086514,
"grad_norm": 3719517.4601419703,
"learning_rate": 9.899467725533181e-06,
"loss": 13.8048,
"step": 125
},
{
"epoch": 0.2564885496183206,
"grad_norm": 3607937.2215440352,
"learning_rate": 9.89129414245618e-06,
"loss": 13.7788,
"step": 126
},
{
"epoch": 0.2585241730279898,
"grad_norm": 4184303.763179342,
"learning_rate": 9.882804791700488e-06,
"loss": 13.7775,
"step": 127
},
{
"epoch": 0.26055979643765903,
"grad_norm": 1838438.1959072966,
"learning_rate": 9.87400022131382e-06,
"loss": 13.8457,
"step": 128
},
{
"epoch": 0.26259541984732826,
"grad_norm": 2268135.323406917,
"learning_rate": 9.864880999693551e-06,
"loss": 13.7905,
"step": 129
},
{
"epoch": 0.26463104325699743,
"grad_norm": 6234862.720815243,
"learning_rate": 9.855447715550024e-06,
"loss": 13.8322,
"step": 130
},
{
"epoch": 0.26666666666666666,
"grad_norm": 2161673.2272078265,
"learning_rate": 9.845700977868536e-06,
"loss": 13.7802,
"step": 131
},
{
"epoch": 0.2687022900763359,
"grad_norm": 2040946.090751483,
"learning_rate": 9.835641415870038e-06,
"loss": 13.8012,
"step": 132
},
{
"epoch": 0.27073791348600507,
"grad_norm": 1690704.8881864555,
"learning_rate": 9.825269678970502e-06,
"loss": 13.8144,
"step": 133
},
{
"epoch": 0.2727735368956743,
"grad_norm": 3302151.7787588844,
"learning_rate": 9.814586436738998e-06,
"loss": 13.7961,
"step": 134
},
{
"epoch": 0.2748091603053435,
"grad_norm": 2678767.7092372375,
"learning_rate": 9.803592378854476e-06,
"loss": 13.7924,
"step": 135
},
{
"epoch": 0.2768447837150127,
"grad_norm": 1882847.5546937664,
"learning_rate": 9.792288215061237e-06,
"loss": 13.7571,
"step": 136
},
{
"epoch": 0.27888040712468193,
"grad_norm": 2535286.1758342357,
"learning_rate": 9.780674675123113e-06,
"loss": 13.8159,
"step": 137
},
{
"epoch": 0.28091603053435116,
"grad_norm": 2564638.3112862715,
"learning_rate": 9.768752508776358e-06,
"loss": 13.8283,
"step": 138
},
{
"epoch": 0.28295165394402033,
"grad_norm": 4141816.881587341,
"learning_rate": 9.756522485681247e-06,
"loss": 13.8205,
"step": 139
},
{
"epoch": 0.28498727735368956,
"grad_norm": 2736144.4377185158,
"learning_rate": 9.743985395372387e-06,
"loss": 13.8191,
"step": 140
},
{
"epoch": 0.2870229007633588,
"grad_norm": 2915473.6501582544,
"learning_rate": 9.73114204720775e-06,
"loss": 13.7825,
"step": 141
},
{
"epoch": 0.28905852417302796,
"grad_norm": 4206163.240001201,
"learning_rate": 9.717993270316421e-06,
"loss": 13.8325,
"step": 142
},
{
"epoch": 0.2910941475826972,
"grad_norm": 5230479.261609765,
"learning_rate": 9.704539913545073e-06,
"loss": 13.8205,
"step": 143
},
{
"epoch": 0.2931297709923664,
"grad_norm": 1844426.584030888,
"learning_rate": 9.690782845403164e-06,
"loss": 13.7856,
"step": 144
},
{
"epoch": 0.2951653944020356,
"grad_norm": 2494575.837554648,
"learning_rate": 9.676722954006878e-06,
"loss": 13.7562,
"step": 145
},
{
"epoch": 0.2972010178117048,
"grad_norm": 4691983.503821501,
"learning_rate": 9.66236114702178e-06,
"loss": 13.8174,
"step": 146
},
{
"epoch": 0.29923664122137406,
"grad_norm": 11448273.842583835,
"learning_rate": 9.647698351604227e-06,
"loss": 13.811,
"step": 147
},
{
"epoch": 0.30127226463104323,
"grad_norm": 2421950.649919781,
"learning_rate": 9.632735514341508e-06,
"loss": 13.798,
"step": 148
},
{
"epoch": 0.30330788804071246,
"grad_norm": 3423539.343520856,
"learning_rate": 9.617473601190743e-06,
"loss": 13.7918,
"step": 149
},
{
"epoch": 0.3053435114503817,
"grad_norm": 3264738.7097397717,
"learning_rate": 9.601913597416513e-06,
"loss": 13.8008,
"step": 150
},
{
"epoch": 0.3073791348600509,
"grad_norm": 1943889.953196763,
"learning_rate": 9.586056507527266e-06,
"loss": 13.7854,
"step": 151
},
{
"epoch": 0.3094147582697201,
"grad_norm": 3137121.9149822216,
"learning_rate": 9.569903355210457e-06,
"loss": 13.8321,
"step": 152
},
{
"epoch": 0.3114503816793893,
"grad_norm": 2080163.2784456573,
"learning_rate": 9.55345518326647e-06,
"loss": 13.7755,
"step": 153
},
{
"epoch": 0.31348600508905855,
"grad_norm": 1730160.4186251867,
"learning_rate": 9.5367130535413e-06,
"loss": 13.8155,
"step": 154
},
{
"epoch": 0.3155216284987277,
"grad_norm": 4116918.3545234683,
"learning_rate": 9.519678046857987e-06,
"loss": 13.8182,
"step": 155
},
{
"epoch": 0.31755725190839695,
"grad_norm": 3736775.455979484,
"learning_rate": 9.502351262946865e-06,
"loss": 13.8155,
"step": 156
},
{
"epoch": 0.3195928753180662,
"grad_norm": 4078409.8308530655,
"learning_rate": 9.48473382037455e-06,
"loss": 13.8245,
"step": 157
},
{
"epoch": 0.32162849872773536,
"grad_norm": 102844859.60289915,
"learning_rate": 9.466826856471728e-06,
"loss": 13.7664,
"step": 158
},
{
"epoch": 0.3236641221374046,
"grad_norm": 2697351.488139713,
"learning_rate": 9.448631527259749e-06,
"loss": 13.824,
"step": 159
},
{
"epoch": 0.3256997455470738,
"grad_norm": 2650130.4884775463,
"learning_rate": 9.430149007375974e-06,
"loss": 13.82,
"step": 160
},
{
"epoch": 0.327735368956743,
"grad_norm": 2848577.7461152556,
"learning_rate": 9.411380489997962e-06,
"loss": 13.7988,
"step": 161
},
{
"epoch": 0.3297709923664122,
"grad_norm": 2491523.6147146528,
"learning_rate": 9.392327186766434e-06,
"loss": 13.7801,
"step": 162
},
{
"epoch": 0.33180661577608145,
"grad_norm": 2225342.8572617928,
"learning_rate": 9.372990327707057e-06,
"loss": 13.8134,
"step": 163
},
{
"epoch": 0.3338422391857506,
"grad_norm": 3593367.5642024544,
"learning_rate": 9.353371161151032e-06,
"loss": 13.7934,
"step": 164
},
{
"epoch": 0.33587786259541985,
"grad_norm": 3596895.2611031746,
"learning_rate": 9.333470953654513e-06,
"loss": 13.8181,
"step": 165
},
{
"epoch": 0.3379134860050891,
"grad_norm": 1983403.4312289366,
"learning_rate": 9.31329098991683e-06,
"loss": 13.8016,
"step": 166
},
{
"epoch": 0.33994910941475825,
"grad_norm": 2840359.066969552,
"learning_rate": 9.292832572697566e-06,
"loss": 13.7929,
"step": 167
},
{
"epoch": 0.3419847328244275,
"grad_norm": 2440280.3409614386,
"learning_rate": 9.272097022732444e-06,
"loss": 13.7855,
"step": 168
},
{
"epoch": 0.3440203562340967,
"grad_norm": 3564583.9519162746,
"learning_rate": 9.251085678648072e-06,
"loss": 13.7962,
"step": 169
},
{
"epoch": 0.3460559796437659,
"grad_norm": 3017839.1906693154,
"learning_rate": 9.22979989687552e-06,
"loss": 13.7882,
"step": 170
},
{
"epoch": 0.3480916030534351,
"grad_norm": 1926367.5770608934,
"learning_rate": 9.208241051562753e-06,
"loss": 13.829,
"step": 171
},
{
"epoch": 0.35012722646310435,
"grad_norm": 2347553.991884138,
"learning_rate": 9.186410534485924e-06,
"loss": 13.7865,
"step": 172
},
{
"epoch": 0.3521628498727735,
"grad_norm": 3954302.4427076643,
"learning_rate": 9.164309754959523e-06,
"loss": 13.8042,
"step": 173
},
{
"epoch": 0.35419847328244275,
"grad_norm": 3440382.5762445726,
"learning_rate": 9.14194013974539e-06,
"loss": 13.8065,
"step": 174
},
{
"epoch": 0.356234096692112,
"grad_norm": 3100692.355061626,
"learning_rate": 9.11930313296062e-06,
"loss": 13.8002,
"step": 175
},
{
"epoch": 0.35826972010178115,
"grad_norm": 3335035.757879338,
"learning_rate": 9.096400195984322e-06,
"loss": 13.8002,
"step": 176
},
{
"epoch": 0.3603053435114504,
"grad_norm": 2934238.222131686,
"learning_rate": 9.073232807363283e-06,
"loss": 13.7968,
"step": 177
},
{
"epoch": 0.3623409669211196,
"grad_norm": 3863277.6264942884,
"learning_rate": 9.049802462716521e-06,
"loss": 13.8059,
"step": 178
},
{
"epoch": 0.3643765903307888,
"grad_norm": 1979824.2277182264,
"learning_rate": 9.026110674638722e-06,
"loss": 13.7686,
"step": 179
},
{
"epoch": 0.366412213740458,
"grad_norm": 3941584.712432807,
"learning_rate": 9.002158972602599e-06,
"loss": 13.7605,
"step": 180
},
{
"epoch": 0.36844783715012724,
"grad_norm": 2814992.789501057,
"learning_rate": 8.977948902860154e-06,
"loss": 13.8302,
"step": 181
},
{
"epoch": 0.3704834605597964,
"grad_norm": 2652186.9612782886,
"learning_rate": 8.953482028342853e-06,
"loss": 13.7782,
"step": 182
},
{
"epoch": 0.37251908396946565,
"grad_norm": 3385003.9687919975,
"learning_rate": 8.92875992856073e-06,
"loss": 13.8049,
"step": 183
},
{
"epoch": 0.3745547073791349,
"grad_norm": 5173853.448674343,
"learning_rate": 8.903784199500412e-06,
"loss": 13.7785,
"step": 184
},
{
"epoch": 0.37659033078880405,
"grad_norm": 4211561.315304709,
"learning_rate": 8.8785564535221e-06,
"loss": 13.8245,
"step": 185
},
{
"epoch": 0.3786259541984733,
"grad_norm": 2082428.2757180594,
"learning_rate": 8.853078319255466e-06,
"loss": 13.8218,
"step": 186
},
{
"epoch": 0.3806615776081425,
"grad_norm": 2180497.2165646055,
"learning_rate": 8.827351441494525e-06,
"loss": 13.7686,
"step": 187
},
{
"epoch": 0.3826972010178117,
"grad_norm": 2403683.8496971005,
"learning_rate": 8.80137748109144e-06,
"loss": 13.8504,
"step": 188
},
{
"epoch": 0.3847328244274809,
"grad_norm": 2088585.2805628132,
"learning_rate": 8.77515811484931e-06,
"loss": 13.8108,
"step": 189
},
{
"epoch": 0.38676844783715014,
"grad_norm": 2694457.991947191,
"learning_rate": 8.748695035413925e-06,
"loss": 13.8309,
"step": 190
},
{
"epoch": 0.3888040712468193,
"grad_norm": 3397579.358486345,
"learning_rate": 8.72198995116448e-06,
"loss": 13.7726,
"step": 191
},
{
"epoch": 0.39083969465648855,
"grad_norm": 1991789.4895514157,
"learning_rate": 8.695044586103297e-06,
"loss": 13.7618,
"step": 192
},
{
"epoch": 0.3928753180661578,
"grad_norm": 5556327.041255268,
"learning_rate": 8.667860679744529e-06,
"loss": 13.7844,
"step": 193
},
{
"epoch": 0.39491094147582695,
"grad_norm": 2177637.385711558,
"learning_rate": 8.640439987001855e-06,
"loss": 13.7805,
"step": 194
},
{
"epoch": 0.3969465648854962,
"grad_norm": 5370184.851617085,
"learning_rate": 8.612784278075195e-06,
"loss": 13.7892,
"step": 195
},
{
"epoch": 0.3989821882951654,
"grad_norm": 2473463.532725396,
"learning_rate": 8.58489533833643e-06,
"loss": 13.7826,
"step": 196
},
{
"epoch": 0.4010178117048346,
"grad_norm": 3439577.606571749,
"learning_rate": 8.556774968214134e-06,
"loss": 13.8133,
"step": 197
},
{
"epoch": 0.4030534351145038,
"grad_norm": 2463771.55347351,
"learning_rate": 8.52842498307736e-06,
"loss": 13.8139,
"step": 198
},
{
"epoch": 0.40508905852417304,
"grad_norm": 2610726.9405806856,
"learning_rate": 8.499847213118431e-06,
"loss": 13.7792,
"step": 199
},
{
"epoch": 0.4071246819338422,
"grad_norm": 5126921.560500939,
"learning_rate": 8.471043503234796e-06,
"loss": 13.7862,
"step": 200
},
{
"epoch": 0.40916030534351144,
"grad_norm": 2337790.258284094,
"learning_rate": 8.442015712909926e-06,
"loss": 13.8018,
"step": 201
},
{
"epoch": 0.4111959287531807,
"grad_norm": 2269326.88734255,
"learning_rate": 8.412765716093273e-06,
"loss": 13.8081,
"step": 202
},
{
"epoch": 0.41323155216284985,
"grad_norm": 2482439.54650581,
"learning_rate": 8.383295401079284e-06,
"loss": 13.8155,
"step": 203
},
{
"epoch": 0.4152671755725191,
"grad_norm": 2331248.12998744,
"learning_rate": 8.353606670385514e-06,
"loss": 13.7731,
"step": 204
},
{
"epoch": 0.4173027989821883,
"grad_norm": 2419939.8754564477,
"learning_rate": 8.3237014406298e-06,
"loss": 13.8313,
"step": 205
},
{
"epoch": 0.4193384223918575,
"grad_norm": 2294086.8182832007,
"learning_rate": 8.293581642406517e-06,
"loss": 13.836,
"step": 206
},
{
"epoch": 0.4213740458015267,
"grad_norm": 3514387.9792135926,
"learning_rate": 8.263249220161957e-06,
"loss": 13.8349,
"step": 207
},
{
"epoch": 0.42340966921119594,
"grad_norm": 2691961.5452496265,
"learning_rate": 8.232706132068806e-06,
"loss": 13.8321,
"step": 208
},
{
"epoch": 0.42544529262086517,
"grad_norm": 2645633.6227109493,
"learning_rate": 8.201954349899712e-06,
"loss": 13.8001,
"step": 209
},
{
"epoch": 0.42748091603053434,
"grad_norm": 3335230.860278415,
"learning_rate": 8.17099585890001e-06,
"loss": 13.8217,
"step": 210
},
{
"epoch": 0.42951653944020357,
"grad_norm": 2566670.9506267244,
"learning_rate": 8.139832657659557e-06,
"loss": 13.7714,
"step": 211
},
{
"epoch": 0.4315521628498728,
"grad_norm": 2370635.7525829547,
"learning_rate": 8.108466757983695e-06,
"loss": 13.7885,
"step": 212
},
{
"epoch": 0.433587786259542,
"grad_norm": 4441547.469959615,
"learning_rate": 8.076900184763394e-06,
"loss": 13.7823,
"step": 213
},
{
"epoch": 0.4356234096692112,
"grad_norm": 24430772.164419036,
"learning_rate": 8.04513497584452e-06,
"loss": 13.846,
"step": 214
},
{
"epoch": 0.43765903307888043,
"grad_norm": 1935048.054024762,
"learning_rate": 8.013173181896283e-06,
"loss": 13.7878,
"step": 215
},
{
"epoch": 0.4396946564885496,
"grad_norm": 1873118.933736968,
"learning_rate": 7.981016866278843e-06,
"loss": 13.7934,
"step": 216
},
{
"epoch": 0.44173027989821884,
"grad_norm": 2169804.42270086,
"learning_rate": 7.94866810491012e-06,
"loss": 13.7965,
"step": 217
},
{
"epoch": 0.44376590330788807,
"grad_norm": 3286778.9186343704,
"learning_rate": 7.916128986131761e-06,
"loss": 13.8332,
"step": 218
},
{
"epoch": 0.44580152671755724,
"grad_norm": 2593694.969662653,
"learning_rate": 7.883401610574338e-06,
"loss": 13.7957,
"step": 219
},
{
"epoch": 0.44783715012722647,
"grad_norm": 4975950.65536953,
"learning_rate": 7.850488091021726e-06,
"loss": 13.8212,
"step": 220
},
{
"epoch": 0.4498727735368957,
"grad_norm": 2795061.731221561,
"learning_rate": 7.817390552274721e-06,
"loss": 13.7835,
"step": 221
},
{
"epoch": 0.45190839694656487,
"grad_norm": 2386068.803973628,
"learning_rate": 7.784111131013858e-06,
"loss": 13.8274,
"step": 222
},
{
"epoch": 0.4539440203562341,
"grad_norm": 4657398.876885557,
"learning_rate": 7.750651975661471e-06,
"loss": 13.7771,
"step": 223
},
{
"epoch": 0.45597964376590333,
"grad_norm": 3655891.188475978,
"learning_rate": 7.717015246243012e-06,
"loss": 13.8151,
"step": 224
},
{
"epoch": 0.4580152671755725,
"grad_norm": 3667526.462351342,
"learning_rate": 7.683203114247587e-06,
"loss": 13.7915,
"step": 225
},
{
"epoch": 0.46005089058524173,
"grad_norm": 3809298.1813372867,
"learning_rate": 7.649217762487786e-06,
"loss": 13.8205,
"step": 226
},
{
"epoch": 0.46208651399491096,
"grad_norm": 3288055.728671065,
"learning_rate": 7.615061384958764e-06,
"loss": 13.8062,
"step": 227
},
{
"epoch": 0.46412213740458014,
"grad_norm": 5662460.544065125,
"learning_rate": 7.580736186696593e-06,
"loss": 13.8049,
"step": 228
},
{
"epoch": 0.46615776081424937,
"grad_norm": 4421743.27796514,
"learning_rate": 7.546244383635929e-06,
"loss": 13.7686,
"step": 229
},
{
"epoch": 0.4681933842239186,
"grad_norm": 2313331.4271743125,
"learning_rate": 7.5115882024669375e-06,
"loss": 13.8003,
"step": 230
},
{
"epoch": 0.47022900763358777,
"grad_norm": 6343328.905525712,
"learning_rate": 7.476769880491561e-06,
"loss": 13.7806,
"step": 231
},
{
"epoch": 0.472264631043257,
"grad_norm": 2640558.638662984,
"learning_rate": 7.44179166547908e-06,
"loss": 13.7999,
"step": 232
},
{
"epoch": 0.47430025445292623,
"grad_norm": 4310404.449472289,
"learning_rate": 7.406655815520998e-06,
"loss": 13.8118,
"step": 233
},
{
"epoch": 0.4763358778625954,
"grad_norm": 4518592.431753858,
"learning_rate": 7.371364598885276e-06,
"loss": 13.7864,
"step": 234
},
{
"epoch": 0.47837150127226463,
"grad_norm": 2098819.871080459,
"learning_rate": 7.335920293869891e-06,
"loss": 13.8016,
"step": 235
},
{
"epoch": 0.48040712468193386,
"grad_norm": 2463516.9812008953,
"learning_rate": 7.300325188655762e-06,
"loss": 13.7913,
"step": 236
},
{
"epoch": 0.48244274809160304,
"grad_norm": 3102860.0228757737,
"learning_rate": 7.264581581159024e-06,
"loss": 13.7706,
"step": 237
},
{
"epoch": 0.48447837150127226,
"grad_norm": 1914868.4483156833,
"learning_rate": 7.2286917788826926e-06,
"loss": 13.8093,
"step": 238
},
{
"epoch": 0.4865139949109415,
"grad_norm": 2539581.4122914425,
"learning_rate": 7.192658098767686e-06,
"loss": 13.7854,
"step": 239
},
{
"epoch": 0.48854961832061067,
"grad_norm": 3707976.67600207,
"learning_rate": 7.1564828670432595e-06,
"loss": 13.8176,
"step": 240
},
{
"epoch": 0.4905852417302799,
"grad_norm": 3593935.7875790736,
"learning_rate": 7.120168419076825e-06,
"loss": 13.767,
"step": 241
},
{
"epoch": 0.4926208651399491,
"grad_norm": 19371415.336839173,
"learning_rate": 7.083717099223192e-06,
"loss": 13.7774,
"step": 242
},
{
"epoch": 0.4946564885496183,
"grad_norm": 2053853.5565662459,
"learning_rate": 7.047131260673214e-06,
"loss": 13.8225,
"step": 243
},
{
"epoch": 0.49669211195928753,
"grad_norm": 4077586.3030170733,
"learning_rate": 7.010413265301888e-06,
"loss": 13.7911,
"step": 244
},
{
"epoch": 0.49872773536895676,
"grad_norm": 2640760.978026206,
"learning_rate": 6.97356548351586e-06,
"loss": 13.8096,
"step": 245
},
{
"epoch": 0.500763358778626,
"grad_norm": 3012267.667522946,
"learning_rate": 6.936590294100414e-06,
"loss": 13.7508,
"step": 246
},
{
"epoch": 0.5027989821882952,
"grad_norm": 2736053.147326933,
"learning_rate": 6.899490084065897e-06,
"loss": 13.8161,
"step": 247
},
{
"epoch": 0.5048346055979643,
"grad_norm": 3879858.467910511,
"learning_rate": 6.862267248493624e-06,
"loss": 13.8145,
"step": 248
},
{
"epoch": 0.5068702290076336,
"grad_norm": 2410669.072572543,
"learning_rate": 6.824924190381257e-06,
"loss": 13.7883,
"step": 249
},
{
"epoch": 0.5089058524173028,
"grad_norm": 2792170.4070734098,
"learning_rate": 6.7874633204876705e-06,
"loss": 13.83,
"step": 250
},
{
"epoch": 0.510941475826972,
"grad_norm": 3054954.4878360187,
"learning_rate": 6.7498870571773275e-06,
"loss": 13.8138,
"step": 251
},
{
"epoch": 0.5129770992366413,
"grad_norm": 13118018.692801312,
"learning_rate": 6.712197826264154e-06,
"loss": 13.7877,
"step": 252
},
{
"epoch": 0.5150127226463105,
"grad_norm": 2274862.944795723,
"learning_rate": 6.674398060854931e-06,
"loss": 13.788,
"step": 253
},
{
"epoch": 0.5170483460559796,
"grad_norm": 6493219.085795618,
"learning_rate": 6.636490201192229e-06,
"loss": 13.8056,
"step": 254
},
{
"epoch": 0.5190839694656488,
"grad_norm": 2210871.447412882,
"learning_rate": 6.5984766944968636e-06,
"loss": 13.7964,
"step": 255
},
{
"epoch": 0.5211195928753181,
"grad_norm": 2535457.1929967045,
"learning_rate": 6.560359994809916e-06,
"loss": 13.8025,
"step": 256
},
{
"epoch": 0.5231552162849873,
"grad_norm": 3487433.3872100264,
"learning_rate": 6.522142562834307e-06,
"loss": 13.7752,
"step": 257
},
{
"epoch": 0.5251908396946565,
"grad_norm": 2350043.7437156746,
"learning_rate": 6.483826865775941e-06,
"loss": 13.7891,
"step": 258
},
{
"epoch": 0.5272264631043257,
"grad_norm": 3776676.8394483137,
"learning_rate": 6.445415377184427e-06,
"loss": 13.8172,
"step": 259
},
{
"epoch": 0.5292620865139949,
"grad_norm": 3888987.2901650295,
"learning_rate": 6.4069105767933944e-06,
"loss": 13.7623,
"step": 260
},
{
"epoch": 0.5312977099236641,
"grad_norm": 3340457.7579122144,
"learning_rate": 6.368314950360416e-06,
"loss": 13.8065,
"step": 261
},
{
"epoch": 0.5333333333333333,
"grad_norm": 5982292.313514162,
"learning_rate": 6.3296309895065215e-06,
"loss": 13.7812,
"step": 262
},
{
"epoch": 0.5353689567430026,
"grad_norm": 3219916.0058936477,
"learning_rate": 6.290861191555359e-06,
"loss": 13.745,
"step": 263
},
{
"epoch": 0.5374045801526718,
"grad_norm": 11720301.310935492,
"learning_rate": 6.252008059371968e-06,
"loss": 13.8253,
"step": 264
},
{
"epoch": 0.539440203562341,
"grad_norm": 3118271.2300748057,
"learning_rate": 6.213074101201202e-06,
"loss": 13.7763,
"step": 265
},
{
"epoch": 0.5414758269720101,
"grad_norm": 1697945.554145853,
"learning_rate": 6.174061830505801e-06,
"loss": 13.7883,
"step": 266
},
{
"epoch": 0.5435114503816794,
"grad_norm": 2857758.0379691618,
"learning_rate": 6.1349737658041385e-06,
"loss": 13.7939,
"step": 267
},
{
"epoch": 0.5455470737913486,
"grad_norm": 1961975.7441584785,
"learning_rate": 6.095812430507627e-06,
"loss": 13.7989,
"step": 268
},
{
"epoch": 0.5475826972010178,
"grad_norm": 4193880.83293653,
"learning_rate": 6.056580352757813e-06,
"loss": 13.8241,
"step": 269
},
{
"epoch": 0.549618320610687,
"grad_norm": 3312744.2202664735,
"learning_rate": 6.0172800652631706e-06,
"loss": 13.7733,
"step": 270
},
{
"epoch": 0.5516539440203563,
"grad_norm": 7858171.4628072195,
"learning_rate": 5.977914105135594e-06,
"loss": 13.8072,
"step": 271
},
{
"epoch": 0.5536895674300254,
"grad_norm": 3941663.415453141,
"learning_rate": 5.938485013726612e-06,
"loss": 13.8103,
"step": 272
},
{
"epoch": 0.5557251908396946,
"grad_norm": 3610433.10118229,
"learning_rate": 5.898995336463326e-06,
"loss": 13.807,
"step": 273
},
{
"epoch": 0.5577608142493639,
"grad_norm": 4164601.1328435126,
"learning_rate": 5.859447622684084e-06,
"loss": 13.815,
"step": 274
},
{
"epoch": 0.5597964376590331,
"grad_norm": 2548966.8285937863,
"learning_rate": 5.819844425473899e-06,
"loss": 13.7778,
"step": 275
},
{
"epoch": 0.5618320610687023,
"grad_norm": 4078882.101176789,
"learning_rate": 5.780188301499636e-06,
"loss": 13.8193,
"step": 276
},
{
"epoch": 0.5638676844783715,
"grad_norm": 2517308.8894206337,
"learning_rate": 5.740481810844952e-06,
"loss": 13.8063,
"step": 277
},
{
"epoch": 0.5659033078880407,
"grad_norm": 2769966.422515691,
"learning_rate": 5.700727516845038e-06,
"loss": 13.8094,
"step": 278
},
{
"epoch": 0.5679389312977099,
"grad_norm": 3505593.62357941,
"learning_rate": 5.660927985921122e-06,
"loss": 13.8098,
"step": 279
},
{
"epoch": 0.5699745547073791,
"grad_norm": 2349437.1104983217,
"learning_rate": 5.621085787414799e-06,
"loss": 13.799,
"step": 280
},
{
"epoch": 0.5720101781170484,
"grad_norm": 3132576.359530331,
"learning_rate": 5.581203493422161e-06,
"loss": 13.7891,
"step": 281
},
{
"epoch": 0.5740458015267176,
"grad_norm": 4404000.913374095,
"learning_rate": 5.541283678627742e-06,
"loss": 13.7851,
"step": 282
},
{
"epoch": 0.5760814249363868,
"grad_norm": 3276477.3200832414,
"learning_rate": 5.501328920138314e-06,
"loss": 13.8194,
"step": 283
},
{
"epoch": 0.5781170483460559,
"grad_norm": 3546838.8556743674,
"learning_rate": 5.46134179731651e-06,
"loss": 13.8107,
"step": 284
},
{
"epoch": 0.5801526717557252,
"grad_norm": 2115264.4674158324,
"learning_rate": 5.421324891614312e-06,
"loss": 13.7984,
"step": 285
},
{
"epoch": 0.5821882951653944,
"grad_norm": 2235578.204459906,
"learning_rate": 5.3812807864063946e-06,
"loss": 13.8009,
"step": 286
},
{
"epoch": 0.5842239185750636,
"grad_norm": 4052228.371342962,
"learning_rate": 5.341212066823356e-06,
"loss": 13.7885,
"step": 287
},
{
"epoch": 0.5862595419847328,
"grad_norm": 2755225.8421591343,
"learning_rate": 5.3011213195848245e-06,
"loss": 13.7845,
"step": 288
},
{
"epoch": 0.5882951653944021,
"grad_norm": 3206915.4484688323,
"learning_rate": 5.26101113283247e-06,
"loss": 13.8352,
"step": 289
},
{
"epoch": 0.5903307888040712,
"grad_norm": 8291421.864476618,
"learning_rate": 5.220884095962924e-06,
"loss": 13.7882,
"step": 290
},
{
"epoch": 0.5923664122137404,
"grad_norm": 4520092.054349328,
"learning_rate": 5.1807427994606065e-06,
"loss": 13.7911,
"step": 291
},
{
"epoch": 0.5944020356234097,
"grad_norm": 1988928.2728027685,
"learning_rate": 5.140589834730503e-06,
"loss": 13.8379,
"step": 292
},
{
"epoch": 0.5964376590330789,
"grad_norm": 4465132.927934143,
"learning_rate": 5.100427793930862e-06,
"loss": 13.7948,
"step": 293
},
{
"epoch": 0.5984732824427481,
"grad_norm": 2286780.168218436,
"learning_rate": 5.06025926980586e-06,
"loss": 13.8048,
"step": 294
},
{
"epoch": 0.6005089058524173,
"grad_norm": 2724184.449634601,
"learning_rate": 5.0200868555182155e-06,
"loss": 13.7999,
"step": 295
},
{
"epoch": 0.6025445292620865,
"grad_norm": 2576691.827404387,
"learning_rate": 4.979913144481785e-06,
"loss": 13.8064,
"step": 296
},
{
"epoch": 0.6045801526717557,
"grad_norm": 2744547.316751667,
"learning_rate": 4.939740730194141e-06,
"loss": 13.7882,
"step": 297
},
{
"epoch": 0.6066157760814249,
"grad_norm": 2335607.7276287796,
"learning_rate": 4.899572206069138e-06,
"loss": 13.8027,
"step": 298
},
{
"epoch": 0.6086513994910941,
"grad_norm": 2599599.7384826983,
"learning_rate": 4.8594101652694996e-06,
"loss": 13.8156,
"step": 299
},
{
"epoch": 0.6106870229007634,
"grad_norm": 3536926.667490122,
"learning_rate": 4.819257200539394e-06,
"loss": 13.7784,
"step": 300
},
{
"epoch": 0.6127226463104326,
"grad_norm": 3302834.606287667,
"learning_rate": 4.779115904037079e-06,
"loss": 13.8263,
"step": 301
},
{
"epoch": 0.6147582697201018,
"grad_norm": 3042014.762667294,
"learning_rate": 4.738988867167531e-06,
"loss": 13.7901,
"step": 302
},
{
"epoch": 0.616793893129771,
"grad_norm": 2828614.249275787,
"learning_rate": 4.698878680415176e-06,
"loss": 13.8186,
"step": 303
},
{
"epoch": 0.6188295165394402,
"grad_norm": 3470485.7180142673,
"learning_rate": 4.6587879331766465e-06,
"loss": 13.8252,
"step": 304
},
{
"epoch": 0.6208651399491094,
"grad_norm": 2142989.454887267,
"learning_rate": 4.618719213593605e-06,
"loss": 13.8266,
"step": 305
},
{
"epoch": 0.6229007633587786,
"grad_norm": 3599418.075481026,
"learning_rate": 4.5786751083856895e-06,
"loss": 13.7994,
"step": 306
},
{
"epoch": 0.6249363867684479,
"grad_norm": 2468894.215654323,
"learning_rate": 4.53865820268349e-06,
"loss": 13.7975,
"step": 307
},
{
"epoch": 0.6269720101781171,
"grad_norm": 3514891.7530831015,
"learning_rate": 4.498671079861686e-06,
"loss": 13.8089,
"step": 308
},
{
"epoch": 0.6290076335877862,
"grad_norm": 2860617.0108131613,
"learning_rate": 4.4587163213722595e-06,
"loss": 13.8118,
"step": 309
},
{
"epoch": 0.6310432569974554,
"grad_norm": 3631959.0225475803,
"learning_rate": 4.41879650657784e-06,
"loss": 13.7789,
"step": 310
},
{
"epoch": 0.6330788804071247,
"grad_norm": 16505260.927290315,
"learning_rate": 4.3789142125852015e-06,
"loss": 13.7988,
"step": 311
},
{
"epoch": 0.6351145038167939,
"grad_norm": 2464199.3044430655,
"learning_rate": 4.339072014078879e-06,
"loss": 13.8382,
"step": 312
},
{
"epoch": 0.6371501272264631,
"grad_norm": 2003547.7230704648,
"learning_rate": 4.299272483154963e-06,
"loss": 13.8229,
"step": 313
},
{
"epoch": 0.6391857506361324,
"grad_norm": 2815342.948838401,
"learning_rate": 4.259518189155049e-06,
"loss": 13.8061,
"step": 314
},
{
"epoch": 0.6412213740458015,
"grad_norm": 3015740.318214435,
"learning_rate": 4.219811698500365e-06,
"loss": 13.8036,
"step": 315
},
{
"epoch": 0.6432569974554707,
"grad_norm": 3876924.4779627738,
"learning_rate": 4.1801555745261025e-06,
"loss": 13.7914,
"step": 316
},
{
"epoch": 0.6452926208651399,
"grad_norm": 3172609.6206479096,
"learning_rate": 4.140552377315918e-06,
"loss": 13.8449,
"step": 317
},
{
"epoch": 0.6473282442748092,
"grad_norm": 4902114.509930776,
"learning_rate": 4.101004663536675e-06,
"loss": 13.7976,
"step": 318
},
{
"epoch": 0.6493638676844784,
"grad_norm": 2691295.2268303274,
"learning_rate": 4.061514986273391e-06,
"loss": 13.8136,
"step": 319
},
{
"epoch": 0.6513994910941476,
"grad_norm": 3460007.077345259,
"learning_rate": 4.022085894864408e-06,
"loss": 13.7403,
"step": 320
},
{
"epoch": 0.6534351145038167,
"grad_norm": 2367194.347019021,
"learning_rate": 3.982719934736832e-06,
"loss": 13.7971,
"step": 321
},
{
"epoch": 0.655470737913486,
"grad_norm": 15501881.45680894,
"learning_rate": 3.943419647242189e-06,
"loss": 13.8015,
"step": 322
},
{
"epoch": 0.6575063613231552,
"grad_norm": 2840321.3616006514,
"learning_rate": 3.904187569492373e-06,
"loss": 13.8104,
"step": 323
},
{
"epoch": 0.6595419847328244,
"grad_norm": 2030054.0765901625,
"learning_rate": 3.865026234195863e-06,
"loss": 13.7948,
"step": 324
},
{
"epoch": 0.6615776081424937,
"grad_norm": 2852479.1215876606,
"learning_rate": 3.8259381694942e-06,
"loss": 13.7901,
"step": 325
},
{
"epoch": 0.6636132315521629,
"grad_norm": 3456741.004275556,
"learning_rate": 3.786925898798801e-06,
"loss": 13.7857,
"step": 326
},
{
"epoch": 0.665648854961832,
"grad_norm": 2066561.8194525177,
"learning_rate": 3.7479919406280334e-06,
"loss": 13.8063,
"step": 327
},
{
"epoch": 0.6676844783715012,
"grad_norm": 3433587.7234547967,
"learning_rate": 3.709138808444641e-06,
"loss": 13.8119,
"step": 328
},
{
"epoch": 0.6697201017811705,
"grad_norm": 3578864.1748913922,
"learning_rate": 3.6703690104934806e-06,
"loss": 13.7908,
"step": 329
},
{
"epoch": 0.6717557251908397,
"grad_norm": 4610701.366359627,
"learning_rate": 3.6316850496395863e-06,
"loss": 13.7847,
"step": 330
},
{
"epoch": 0.6737913486005089,
"grad_norm": 2286947.6993929525,
"learning_rate": 3.5930894232066072e-06,
"loss": 13.8097,
"step": 331
},
{
"epoch": 0.6758269720101782,
"grad_norm": 2505249.1545348107,
"learning_rate": 3.5545846228155743e-06,
"loss": 13.7801,
"step": 332
},
{
"epoch": 0.6778625954198473,
"grad_norm": 5630898.8033584,
"learning_rate": 3.516173134224059e-06,
"loss": 13.8135,
"step": 333
},
{
"epoch": 0.6798982188295165,
"grad_norm": 5219141.786739386,
"learning_rate": 3.477857437165694e-06,
"loss": 13.8255,
"step": 334
},
{
"epoch": 0.6819338422391857,
"grad_norm": 2124438.8087317753,
"learning_rate": 3.4396400051900846e-06,
"loss": 13.7879,
"step": 335
},
{
"epoch": 0.683969465648855,
"grad_norm": 2105149.4046267755,
"learning_rate": 3.401523305503139e-06,
"loss": 13.808,
"step": 336
},
{
"epoch": 0.6860050890585242,
"grad_norm": 2809609.693441461,
"learning_rate": 3.3635097988077724e-06,
"loss": 13.8112,
"step": 337
},
{
"epoch": 0.6880407124681934,
"grad_norm": 2457213.1931723696,
"learning_rate": 3.3256019391450696e-06,
"loss": 13.8489,
"step": 338
},
{
"epoch": 0.6900763358778625,
"grad_norm": 12558459.530425403,
"learning_rate": 3.287802173735848e-06,
"loss": 13.8277,
"step": 339
},
{
"epoch": 0.6921119592875318,
"grad_norm": 6967431.723878883,
"learning_rate": 3.250112942822673e-06,
"loss": 13.8185,
"step": 340
},
{
"epoch": 0.694147582697201,
"grad_norm": 3411907.5422053095,
"learning_rate": 3.212536679512332e-06,
"loss": 13.787,
"step": 341
},
{
"epoch": 0.6961832061068702,
"grad_norm": 2429502.145664427,
"learning_rate": 3.1750758096187446e-06,
"loss": 13.8105,
"step": 342
},
{
"epoch": 0.6982188295165395,
"grad_norm": 2327547.2445986923,
"learning_rate": 3.137732751506376e-06,
"loss": 13.7772,
"step": 343
},
{
"epoch": 0.7002544529262087,
"grad_norm": 3569398.8894632743,
"learning_rate": 3.1005099159341044e-06,
"loss": 13.7938,
"step": 344
},
{
"epoch": 0.7022900763358778,
"grad_norm": 3211149.457873932,
"learning_rate": 3.0634097058995877e-06,
"loss": 13.7915,
"step": 345
},
{
"epoch": 0.704325699745547,
"grad_norm": 3830655.2384712566,
"learning_rate": 3.0264345164841426e-06,
"loss": 13.8191,
"step": 346
},
{
"epoch": 0.7063613231552163,
"grad_norm": 3823712.125917996,
"learning_rate": 2.989586734698113e-06,
"loss": 13.8023,
"step": 347
},
{
"epoch": 0.7083969465648855,
"grad_norm": 32196514.721627507,
"learning_rate": 2.9528687393267865e-06,
"loss": 13.7764,
"step": 348
},
{
"epoch": 0.7104325699745547,
"grad_norm": 1882821.5554029653,
"learning_rate": 2.9162829007768103e-06,
"loss": 13.8351,
"step": 349
},
{
"epoch": 0.712468193384224,
"grad_norm": 3350119.682457626,
"learning_rate": 2.879831580923176e-06,
"loss": 13.7957,
"step": 350
},
{
"epoch": 0.7145038167938931,
"grad_norm": 3937737.379258324,
"learning_rate": 2.843517132956742e-06,
"loss": 13.8002,
"step": 351
},
{
"epoch": 0.7165394402035623,
"grad_norm": 3146762.068649268,
"learning_rate": 2.8073419012323154e-06,
"loss": 13.7632,
"step": 352
},
{
"epoch": 0.7185750636132315,
"grad_norm": 2767069.333419986,
"learning_rate": 2.771308221117309e-06,
"loss": 13.7933,
"step": 353
},
{
"epoch": 0.7206106870229008,
"grad_norm": 4373321.192999916,
"learning_rate": 2.7354184188409773e-06,
"loss": 13.798,
"step": 354
},
{
"epoch": 0.72264631043257,
"grad_norm": 3450348.839131381,
"learning_rate": 2.6996748113442397e-06,
"loss": 13.8177,
"step": 355
},
{
"epoch": 0.7246819338422392,
"grad_norm": 5591120.72637471,
"learning_rate": 2.66407970613011e-06,
"loss": 13.7984,
"step": 356
},
{
"epoch": 0.7267175572519083,
"grad_norm": 1703861.5304970315,
"learning_rate": 2.6286354011147252e-06,
"loss": 13.8147,
"step": 357
},
{
"epoch": 0.7287531806615776,
"grad_norm": 3465138.9565325286,
"learning_rate": 2.593344184479003e-06,
"loss": 13.8088,
"step": 358
},
{
"epoch": 0.7307888040712468,
"grad_norm": 4545416.100109938,
"learning_rate": 2.5582083345209217e-06,
"loss": 13.8249,
"step": 359
},
{
"epoch": 0.732824427480916,
"grad_norm": 3516332.5584949586,
"learning_rate": 2.5232301195084395e-06,
"loss": 13.8055,
"step": 360
},
{
"epoch": 0.7348600508905853,
"grad_norm": 2312331.751163826,
"learning_rate": 2.488411797533064e-06,
"loss": 13.8223,
"step": 361
},
{
"epoch": 0.7368956743002545,
"grad_norm": 2685597.665166646,
"learning_rate": 2.4537556163640726e-06,
"loss": 13.8293,
"step": 362
},
{
"epoch": 0.7389312977099237,
"grad_norm": 2522967.966999191,
"learning_rate": 2.4192638133034074e-06,
"loss": 13.7902,
"step": 363
},
{
"epoch": 0.7409669211195928,
"grad_norm": 2195261.840643841,
"learning_rate": 2.384938615041238e-06,
"loss": 13.782,
"step": 364
},
{
"epoch": 0.7430025445292621,
"grad_norm": 6598560.135727499,
"learning_rate": 2.350782237512215e-06,
"loss": 13.8163,
"step": 365
},
{
"epoch": 0.7450381679389313,
"grad_norm": 2786370.46056762,
"learning_rate": 2.316796885752415e-06,
"loss": 13.7821,
"step": 366
},
{
"epoch": 0.7470737913486005,
"grad_norm": 2943271.811966078,
"learning_rate": 2.2829847537569904e-06,
"loss": 13.7909,
"step": 367
},
{
"epoch": 0.7491094147582698,
"grad_norm": 2462160.442139116,
"learning_rate": 2.2493480243385298e-06,
"loss": 13.814,
"step": 368
},
{
"epoch": 0.751145038167939,
"grad_norm": 2225343.862719593,
"learning_rate": 2.2158888689861434e-06,
"loss": 13.8084,
"step": 369
},
{
"epoch": 0.7531806615776081,
"grad_norm": 2960178.644234803,
"learning_rate": 2.182609447725279e-06,
"loss": 13.8266,
"step": 370
},
{
"epoch": 0.7552162849872773,
"grad_norm": 2914387.1052595368,
"learning_rate": 2.149511908978275e-06,
"loss": 13.8007,
"step": 371
},
{
"epoch": 0.7572519083969466,
"grad_norm": 3028649.837352563,
"learning_rate": 2.1165983894256647e-06,
"loss": 13.7907,
"step": 372
},
{
"epoch": 0.7592875318066158,
"grad_norm": 2648884.5001803297,
"learning_rate": 2.0838710138682412e-06,
"loss": 13.8052,
"step": 373
},
{
"epoch": 0.761323155216285,
"grad_norm": 3498649.6090777554,
"learning_rate": 2.051331895089882e-06,
"loss": 13.8098,
"step": 374
},
{
"epoch": 0.7633587786259542,
"grad_norm": 23098352.355775494,
"learning_rate": 2.0189831337211573e-06,
"loss": 13.8393,
"step": 375
},
{
"epoch": 0.7653944020356234,
"grad_norm": 2461084.0552417845,
"learning_rate": 1.9868268181037186e-06,
"loss": 13.7867,
"step": 376
},
{
"epoch": 0.7674300254452926,
"grad_norm": 3735300.3296117457,
"learning_rate": 1.9548650241554812e-06,
"loss": 13.804,
"step": 377
},
{
"epoch": 0.7694656488549618,
"grad_norm": 3008756.34059785,
"learning_rate": 1.923099815236608e-06,
"loss": 13.7861,
"step": 378
},
{
"epoch": 0.771501272264631,
"grad_norm": 7710036.601809266,
"learning_rate": 1.8915332420163074e-06,
"loss": 13.7991,
"step": 379
},
{
"epoch": 0.7735368956743003,
"grad_norm": 3540790.6907704584,
"learning_rate": 1.8601673423404449e-06,
"loss": 13.7907,
"step": 380
},
{
"epoch": 0.7755725190839695,
"grad_norm": 2333150.5964701963,
"learning_rate": 1.8290041410999893e-06,
"loss": 13.8226,
"step": 381
},
{
"epoch": 0.7776081424936386,
"grad_norm": 3965290.2964736833,
"learning_rate": 1.798045650100289e-06,
"loss": 13.7884,
"step": 382
},
{
"epoch": 0.7796437659033079,
"grad_norm": 2968777.429909849,
"learning_rate": 1.7672938679311957e-06,
"loss": 13.7859,
"step": 383
},
{
"epoch": 0.7816793893129771,
"grad_norm": 5317126.450461407,
"learning_rate": 1.736750779838044e-06,
"loss": 13.8526,
"step": 384
},
{
"epoch": 0.7837150127226463,
"grad_norm": 3055487.063269149,
"learning_rate": 1.7064183575934856e-06,
"loss": 13.8009,
"step": 385
},
{
"epoch": 0.7857506361323155,
"grad_norm": 16905669.825724535,
"learning_rate": 1.676298559370202e-06,
"loss": 13.7807,
"step": 386
},
{
"epoch": 0.7877862595419848,
"grad_norm": 5137867.698262086,
"learning_rate": 1.6463933296144863e-06,
"loss": 13.8198,
"step": 387
},
{
"epoch": 0.7898218829516539,
"grad_norm": 3398913.1351794973,
"learning_rate": 1.6167045989207185e-06,
"loss": 13.7712,
"step": 388
},
{
"epoch": 0.7918575063613231,
"grad_norm": 3925212.2891858686,
"learning_rate": 1.5872342839067305e-06,
"loss": 13.807,
"step": 389
},
{
"epoch": 0.7938931297709924,
"grad_norm": 2924712.3948877575,
"learning_rate": 1.5579842870900746e-06,
"loss": 13.8241,
"step": 390
},
{
"epoch": 0.7959287531806616,
"grad_norm": 5390343.102311131,
"learning_rate": 1.5289564967652033e-06,
"loss": 13.8485,
"step": 391
},
{
"epoch": 0.7979643765903308,
"grad_norm": 3074687.848582143,
"learning_rate": 1.5001527868815702e-06,
"loss": 13.8125,
"step": 392
},
{
"epoch": 0.8,
"grad_norm": 3184498.416088956,
"learning_rate": 1.4715750169226417e-06,
"loss": 13.766,
"step": 393
},
{
"epoch": 0.8020356234096692,
"grad_norm": 2441964.563817395,
"learning_rate": 1.4432250317858675e-06,
"loss": 13.7805,
"step": 394
},
{
"epoch": 0.8040712468193384,
"grad_norm": 2091387.9994893048,
"learning_rate": 1.4151046616635727e-06,
"loss": 13.7952,
"step": 395
},
{
"epoch": 0.8061068702290076,
"grad_norm": 3508560.4814663967,
"learning_rate": 1.3872157219248045e-06,
"loss": 13.7889,
"step": 396
},
{
"epoch": 0.8081424936386769,
"grad_norm": 3962771.0203186534,
"learning_rate": 1.3595600129981469e-06,
"loss": 13.7979,
"step": 397
},
{
"epoch": 0.8101781170483461,
"grad_norm": 2731620.334807845,
"learning_rate": 1.3321393202554739e-06,
"loss": 13.7976,
"step": 398
},
{
"epoch": 0.8122137404580153,
"grad_norm": 3197496.0139092538,
"learning_rate": 1.3049554138967052e-06,
"loss": 13.7901,
"step": 399
},
{
"epoch": 0.8142493638676844,
"grad_norm": 7014943.992062835,
"learning_rate": 1.278010048835523e-06,
"loss": 13.7998,
"step": 400
},
{
"epoch": 0.8162849872773537,
"grad_norm": 3316247.1212317836,
"learning_rate": 1.2513049645860759e-06,
"loss": 13.765,
"step": 401
},
{
"epoch": 0.8183206106870229,
"grad_norm": 2783094.2613453907,
"learning_rate": 1.224841885150691e-06,
"loss": 13.8445,
"step": 402
},
{
"epoch": 0.8203562340966921,
"grad_norm": 3355566.32780499,
"learning_rate": 1.1986225189085627e-06,
"loss": 13.8081,
"step": 403
},
{
"epoch": 0.8223918575063613,
"grad_norm": 2212096.023617605,
"learning_rate": 1.172648558505477e-06,
"loss": 13.7965,
"step": 404
},
{
"epoch": 0.8244274809160306,
"grad_norm": 1744005.9134531072,
"learning_rate": 1.1469216807445348e-06,
"loss": 13.827,
"step": 405
},
{
"epoch": 0.8264631043256997,
"grad_norm": 5478757.4739052905,
"learning_rate": 1.1214435464779006e-06,
"loss": 13.8068,
"step": 406
},
{
"epoch": 0.8284987277353689,
"grad_norm": 10845370.007264948,
"learning_rate": 1.0962158004995893e-06,
"loss": 13.8192,
"step": 407
},
{
"epoch": 0.8305343511450382,
"grad_norm": 6268218.580873969,
"learning_rate": 1.0712400714392723e-06,
"loss": 13.7939,
"step": 408
},
{
"epoch": 0.8325699745547074,
"grad_norm": 4049672.137206165,
"learning_rate": 1.0465179716571467e-06,
"loss": 13.7897,
"step": 409
},
{
"epoch": 0.8346055979643766,
"grad_norm": 5672727.971393141,
"learning_rate": 1.0220510971398473e-06,
"loss": 13.8234,
"step": 410
},
{
"epoch": 0.8366412213740458,
"grad_norm": 2826950.9583422337,
"learning_rate": 9.978410273974015e-07,
"loss": 13.7928,
"step": 411
},
{
"epoch": 0.838676844783715,
"grad_norm": 2758814.4659312656,
"learning_rate": 9.738893253612808e-07,
"loss": 13.802,
"step": 412
},
{
"epoch": 0.8407124681933842,
"grad_norm": 2475383.7033035085,
"learning_rate": 9.50197537283481e-07,
"loss": 13.7978,
"step": 413
},
{
"epoch": 0.8427480916030534,
"grad_norm": 3247115.9631803534,
"learning_rate": 9.267671926367166e-07,
"loss": 13.826,
"step": 414
},
{
"epoch": 0.8447837150127226,
"grad_norm": 2633292.367379474,
"learning_rate": 9.035998040156801e-07,
"loss": 13.7931,
"step": 415
},
{
"epoch": 0.8468193384223919,
"grad_norm": 2515821.989602333,
"learning_rate": 8.806968670393801e-07,
"loss": 13.7718,
"step": 416
},
{
"epoch": 0.8488549618320611,
"grad_norm": 2652660.850368756,
"learning_rate": 8.580598602546109e-07,
"loss": 13.8055,
"step": 417
},
{
"epoch": 0.8508905852417303,
"grad_norm": 2999838.4744108086,
"learning_rate": 8.356902450404792e-07,
"loss": 13.817,
"step": 418
},
{
"epoch": 0.8529262086513995,
"grad_norm": 2359848.6979157086,
"learning_rate": 8.135894655140758e-07,
"loss": 13.79,
"step": 419
},
{
"epoch": 0.8549618320610687,
"grad_norm": 3156996.0591459703,
"learning_rate": 7.91758948437249e-07,
"loss": 13.7984,
"step": 420
},
{
"epoch": 0.8569974554707379,
"grad_norm": 3762598.8987431657,
"learning_rate": 7.702001031244816e-07,
"loss": 13.8018,
"step": 421
},
{
"epoch": 0.8590330788804071,
"grad_norm": 1976772.2357703648,
"learning_rate": 7.489143213519301e-07,
"loss": 13.8201,
"step": 422
},
{
"epoch": 0.8610687022900764,
"grad_norm": 2785725.350606155,
"learning_rate": 7.279029772675572e-07,
"loss": 13.7565,
"step": 423
},
{
"epoch": 0.8631043256997456,
"grad_norm": 2620438.7904132027,
"learning_rate": 7.071674273024353e-07,
"loss": 13.793,
"step": 424
},
{
"epoch": 0.8651399491094147,
"grad_norm": 2646276.309564974,
"learning_rate": 6.86709010083172e-07,
"loss": 13.8229,
"step": 425
},
{
"epoch": 0.867175572519084,
"grad_norm": 2628696.346363584,
"learning_rate": 6.665290463454882e-07,
"loss": 13.7809,
"step": 426
},
{
"epoch": 0.8692111959287532,
"grad_norm": 3600670.0350286104,
"learning_rate": 6.466288388489689e-07,
"loss": 13.8044,
"step": 427
},
{
"epoch": 0.8712468193384224,
"grad_norm": 2947142.074939093,
"learning_rate": 6.270096722929442e-07,
"loss": 13.8006,
"step": 428
},
{
"epoch": 0.8732824427480916,
"grad_norm": 2029930.0771920187,
"learning_rate": 6.076728132335669e-07,
"loss": 13.796,
"step": 429
},
{
"epoch": 0.8753180661577609,
"grad_norm": 2143832.4235748462,
"learning_rate": 5.886195100020408e-07,
"loss": 13.7854,
"step": 430
},
{
"epoch": 0.87735368956743,
"grad_norm": 13943126.109787703,
"learning_rate": 5.698509926240275e-07,
"loss": 13.7955,
"step": 431
},
{
"epoch": 0.8793893129770992,
"grad_norm": 3462678.627784003,
"learning_rate": 5.513684727402529e-07,
"loss": 13.8238,
"step": 432
},
{
"epoch": 0.8814249363867684,
"grad_norm": 2958565.3339085556,
"learning_rate": 5.331731435282705e-07,
"loss": 13.8227,
"step": 433
},
{
"epoch": 0.8834605597964377,
"grad_norm": 2315016.094193514,
"learning_rate": 5.152661796254505e-07,
"loss": 13.7815,
"step": 434
},
{
"epoch": 0.8854961832061069,
"grad_norm": 3124561.491445178,
"learning_rate": 4.976487370531352e-07,
"loss": 13.7972,
"step": 435
},
{
"epoch": 0.8875318066157761,
"grad_norm": 2569217.587769061,
"learning_rate": 4.803219531420128e-07,
"loss": 13.7759,
"step": 436
},
{
"epoch": 0.8895674300254452,
"grad_norm": 5211670.405327593,
"learning_rate": 4.6328694645870254e-07,
"loss": 13.7982,
"step": 437
},
{
"epoch": 0.8916030534351145,
"grad_norm": 2303871.1225886643,
"learning_rate": 4.46544816733529e-07,
"loss": 13.7919,
"step": 438
},
{
"epoch": 0.8936386768447837,
"grad_norm": 3417955.106120249,
"learning_rate": 4.3009664478954384e-07,
"loss": 13.8388,
"step": 439
},
{
"epoch": 0.8956743002544529,
"grad_norm": 3491337.261636375,
"learning_rate": 4.139434924727359e-07,
"loss": 13.7903,
"step": 440
},
{
"epoch": 0.8977099236641222,
"grad_norm": 2598696.463432398,
"learning_rate": 3.9808640258348686e-07,
"loss": 13.7935,
"step": 441
},
{
"epoch": 0.8997455470737914,
"grad_norm": 2406731.2012268724,
"learning_rate": 3.825263988092587e-07,
"loss": 13.8001,
"step": 442
},
{
"epoch": 0.9017811704834605,
"grad_norm": 1850343.550069675,
"learning_rate": 3.672644856584928e-07,
"loss": 13.8345,
"step": 443
},
{
"epoch": 0.9038167938931297,
"grad_norm": 2877460.3830578206,
"learning_rate": 3.523016483957742e-07,
"loss": 13.8395,
"step": 444
},
{
"epoch": 0.905852417302799,
"grad_norm": 3386724.5492421445,
"learning_rate": 3.3763885297822153e-07,
"loss": 13.8028,
"step": 445
},
{
"epoch": 0.9078880407124682,
"grad_norm": 2849485.569450586,
"learning_rate": 3.2327704599312283e-07,
"loss": 13.8038,
"step": 446
},
{
"epoch": 0.9099236641221374,
"grad_norm": 3175459.533705238,
"learning_rate": 3.0921715459683753e-07,
"loss": 13.8303,
"step": 447
},
{
"epoch": 0.9119592875318067,
"grad_norm": 22724636.64948581,
"learning_rate": 2.95460086454929e-07,
"loss": 13.772,
"step": 448
},
{
"epoch": 0.9139949109414758,
"grad_norm": 2467166.6966894856,
"learning_rate": 2.820067296835799e-07,
"loss": 13.7719,
"step": 449
},
{
"epoch": 0.916030534351145,
"grad_norm": 3412055.676564019,
"learning_rate": 2.688579527922514e-07,
"loss": 13.7956,
"step": 450
},
{
"epoch": 0.9180661577608142,
"grad_norm": 2620791.7542605917,
"learning_rate": 2.560146046276135e-07,
"loss": 13.7717,
"step": 451
},
{
"epoch": 0.9201017811704835,
"grad_norm": 2077929.3418459287,
"learning_rate": 2.4347751431875453e-07,
"loss": 13.7763,
"step": 452
},
{
"epoch": 0.9221374045801527,
"grad_norm": 2370550.2619939703,
"learning_rate": 2.3124749122364286e-07,
"loss": 13.7964,
"step": 453
},
{
"epoch": 0.9241730279898219,
"grad_norm": 5750923.606982284,
"learning_rate": 2.1932532487688784e-07,
"loss": 13.791,
"step": 454
},
{
"epoch": 0.926208651399491,
"grad_norm": 3460175.4976370293,
"learning_rate": 2.0771178493876387e-07,
"loss": 13.79,
"step": 455
},
{
"epoch": 0.9282442748091603,
"grad_norm": 5901311.706134819,
"learning_rate": 1.964076211455246e-07,
"loss": 13.7797,
"step": 456
},
{
"epoch": 0.9302798982188295,
"grad_norm": 5105079.708966371,
"learning_rate": 1.8541356326100436e-07,
"loss": 13.7849,
"step": 457
},
{
"epoch": 0.9323155216284987,
"grad_norm": 4198563.584774799,
"learning_rate": 1.7473032102949983e-07,
"loss": 13.8415,
"step": 458
},
{
"epoch": 0.934351145038168,
"grad_norm": 3844225.7229981595,
"learning_rate": 1.6435858412996275e-07,
"loss": 13.766,
"step": 459
},
{
"epoch": 0.9363867684478372,
"grad_norm": 3854801.945640343,
"learning_rate": 1.542990221314644e-07,
"loss": 13.7996,
"step": 460
},
{
"epoch": 0.9384223918575063,
"grad_norm": 4666453.104767151,
"learning_rate": 1.445522844499775e-07,
"loss": 13.8,
"step": 461
},
{
"epoch": 0.9404580152671755,
"grad_norm": 5472437.376052735,
"learning_rate": 1.3511900030644954e-07,
"loss": 13.7599,
"step": 462
},
{
"epoch": 0.9424936386768448,
"grad_norm": 4682435.578567764,
"learning_rate": 1.2599977868618052e-07,
"loss": 13.7934,
"step": 463
},
{
"epoch": 0.944529262086514,
"grad_norm": 9272421.276272465,
"learning_rate": 1.1719520829951203e-07,
"loss": 13.8146,
"step": 464
},
{
"epoch": 0.9465648854961832,
"grad_norm": 4195416.415355079,
"learning_rate": 1.087058575438199e-07,
"loss": 13.8253,
"step": 465
},
{
"epoch": 0.9486005089058525,
"grad_norm": 3012392.88106662,
"learning_rate": 1.0053227446681912e-07,
"loss": 13.8254,
"step": 466
},
{
"epoch": 0.9506361323155216,
"grad_norm": 4716265.36093074,
"learning_rate": 9.267498673118547e-08,
"loss": 13.8287,
"step": 467
},
{
"epoch": 0.9526717557251908,
"grad_norm": 3046833.346747879,
"learning_rate": 8.513450158049109e-08,
"loss": 13.7954,
"step": 468
},
{
"epoch": 0.95470737913486,
"grad_norm": 3919156.6955382572,
"learning_rate": 7.791130580645623e-08,
"loss": 13.7727,
"step": 469
},
{
"epoch": 0.9567430025445293,
"grad_norm": 2642703.3691420523,
"learning_rate": 7.100586571752444e-08,
"loss": 13.8211,
"step": 470
},
{
"epoch": 0.9587786259541985,
"grad_norm": 2860021.0276084486,
"learning_rate": 6.441862710876102e-08,
"loss": 13.7861,
"step": 471
},
{
"epoch": 0.9608142493638677,
"grad_norm": 2389117.180384712,
"learning_rate": 5.815001523307162e-08,
"loss": 13.7998,
"step": 472
},
{
"epoch": 0.9628498727735368,
"grad_norm": 4343380.201960487,
"learning_rate": 5.220043477374759e-08,
"loss": 13.7891,
"step": 473
},
{
"epoch": 0.9648854961832061,
"grad_norm": 5212705.489252775,
"learning_rate": 4.657026981834623e-08,
"loss": 13.8108,
"step": 474
},
{
"epoch": 0.9669211195928753,
"grad_norm": 2520458.4461040264,
"learning_rate": 4.125988383388957e-08,
"loss": 13.8067,
"step": 475
},
{
"epoch": 0.9689567430025445,
"grad_norm": 4182021.7452560714,
"learning_rate": 3.626961964340203e-08,
"loss": 13.7897,
"step": 476
},
{
"epoch": 0.9709923664122138,
"grad_norm": 10159149.969018588,
"learning_rate": 3.159979940378088e-08,
"loss": 13.8046,
"step": 477
},
{
"epoch": 0.973027989821883,
"grad_norm": 2510949.593741673,
"learning_rate": 2.725072458499567e-08,
"loss": 13.8275,
"step": 478
},
{
"epoch": 0.9750636132315522,
"grad_norm": 2900958.891211133,
"learning_rate": 2.3222675950627106e-08,
"loss": 13.7972,
"step": 479
},
{
"epoch": 0.9770992366412213,
"grad_norm": 3768468.1026937226,
"learning_rate": 1.9515913539743247e-08,
"loss": 13.8208,
"step": 480
},
{
"epoch": 0.9791348600508906,
"grad_norm": 2563277.62892743,
"learning_rate": 1.613067665010959e-08,
"loss": 13.8265,
"step": 481
},
{
"epoch": 0.9811704834605598,
"grad_norm": 23583570.860715404,
"learning_rate": 1.3067183822742525e-08,
"loss": 13.8262,
"step": 482
},
{
"epoch": 0.983206106870229,
"grad_norm": 8371515.456262343,
"learning_rate": 1.0325632827801745e-08,
"loss": 13.8356,
"step": 483
},
{
"epoch": 0.9852417302798983,
"grad_norm": 4260314.378446997,
"learning_rate": 7.906200651819907e-09,
"loss": 13.8054,
"step": 484
},
{
"epoch": 0.9872773536895675,
"grad_norm": 2235986.274852917,
"learning_rate": 5.809043486279531e-09,
"loss": 13.7764,
"step": 485
},
{
"epoch": 0.9893129770992366,
"grad_norm": 2261687.826321509,
"learning_rate": 4.034296717527752e-09,
"loss": 13.8129,
"step": 486
},
{
"epoch": 0.9913486005089058,
"grad_norm": 5923746.692619356,
"learning_rate": 2.5820749180388573e-09,
"loss": 13.843,
"step": 487
},
{
"epoch": 0.9933842239185751,
"grad_norm": 2729193.3221547497,
"learning_rate": 1.4524718390140913e-09,
"loss": 13.8395,
"step": 488
},
{
"epoch": 0.9954198473282443,
"grad_norm": 2703798.169864237,
"learning_rate": 6.455604043331676e-10,
"loss": 13.8165,
"step": 489
},
{
"epoch": 0.9974554707379135,
"grad_norm": 3350867.6861675973,
"learning_rate": 1.6139270584358823e-10,
"loss": 13.8103,
"step": 490
},
{
"epoch": 0.9994910941475827,
"grad_norm": 1851326.2112600075,
"learning_rate": 0.0,
"loss": 13.8163,
"step": 491
},
{
"epoch": 0.9994910941475827,
"eval_loss": 13.805337905883789,
"eval_runtime": 382.78,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.572,
"step": 491
}
],
"logging_steps": 1,
"max_steps": 491,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 100344791040000.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}