{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.69545389767707,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005700441784238278,
      "grad_norm": 27.39983367919922,
      "learning_rate": 4.9999995544380894e-05,
      "loss": 16.8508,
      "num_input_tokens_seen": 97712,
      "step": 5
    },
    {
      "epoch": 0.011400883568476556,
      "grad_norm": 35.96725082397461,
      "learning_rate": 4.999998217752515e-05,
      "loss": 11.7177,
      "num_input_tokens_seen": 195504,
      "step": 10
    },
    {
      "epoch": 0.017101325352714837,
      "grad_norm": 13.93742847442627,
      "learning_rate": 4.999995989943754e-05,
      "loss": 6.3848,
      "num_input_tokens_seen": 293200,
      "step": 15
    },
    {
      "epoch": 0.022801767136953113,
      "grad_norm": 9.999053955078125,
      "learning_rate": 4.9999928710126e-05,
      "loss": 4.4249,
      "num_input_tokens_seen": 390960,
      "step": 20
    },
    {
      "epoch": 0.02850220892119139,
      "grad_norm": 11.072565078735352,
      "learning_rate": 4.999988860960165e-05,
      "loss": 3.7022,
      "num_input_tokens_seen": 488752,
      "step": 25
    },
    {
      "epoch": 0.034202650705429674,
      "grad_norm": 6.960489273071289,
      "learning_rate": 4.9999839597878784e-05,
      "loss": 2.5291,
      "num_input_tokens_seen": 586496,
      "step": 30
    },
    {
      "epoch": 0.039903092489667946,
      "grad_norm": 10.353778839111328,
      "learning_rate": 4.999978167497488e-05,
      "loss": 1.9498,
      "num_input_tokens_seen": 684240,
      "step": 35
    },
    {
      "epoch": 0.045603534273906225,
      "grad_norm": 12.493133544921875,
      "learning_rate": 4.999971484091057e-05,
      "loss": 1.551,
      "num_input_tokens_seen": 781936,
      "step": 40
    },
    {
      "epoch": 0.051303976058144504,
      "grad_norm": 16.28146743774414,
      "learning_rate": 4.999963909570968e-05,
      "loss": 1.4018,
      "num_input_tokens_seen": 879680,
      "step": 45
    },
    {
      "epoch": 0.05700441784238278,
      "grad_norm": 9.161210060119629,
      "learning_rate": 4.999955443939922e-05,
      "loss": 1.2093,
      "num_input_tokens_seen": 977440,
      "step": 50
    },
    {
      "epoch": 0.06270485962662106,
      "grad_norm": 13.297211647033691,
      "learning_rate": 4.9999460872009366e-05,
      "loss": 1.1716,
      "num_input_tokens_seen": 1075200,
      "step": 55
    },
    {
      "epoch": 0.06840530141085935,
      "grad_norm": 8.317058563232422,
      "learning_rate": 4.9999358393573445e-05,
      "loss": 1.1838,
      "num_input_tokens_seen": 1172880,
      "step": 60
    },
    {
      "epoch": 0.07410574319509762,
      "grad_norm": 7.10370397567749,
      "learning_rate": 4.9999247004128014e-05,
      "loss": 1.0844,
      "num_input_tokens_seen": 1270608,
      "step": 65
    },
    {
      "epoch": 0.07980618497933589,
      "grad_norm": 10.387308120727539,
      "learning_rate": 4.9999126703712775e-05,
      "loss": 1.0746,
      "num_input_tokens_seen": 1368368,
      "step": 70
    },
    {
      "epoch": 0.08550662676357418,
      "grad_norm": 10.262731552124023,
      "learning_rate": 4.999899749237059e-05,
      "loss": 1.0698,
      "num_input_tokens_seen": 1466016,
      "step": 75
    },
    {
      "epoch": 0.09120706854781245,
      "grad_norm": 9.300691604614258,
      "learning_rate": 4.9998859370147524e-05,
      "loss": 1.1167,
      "num_input_tokens_seen": 1563648,
      "step": 80
    },
    {
      "epoch": 0.09690751033205074,
      "grad_norm": 4.99461030960083,
      "learning_rate": 4.999871233709282e-05,
      "loss": 1.105,
      "num_input_tokens_seen": 1661456,
      "step": 85
    },
    {
      "epoch": 0.10260795211628901,
      "grad_norm": 8.910074234008789,
      "learning_rate": 4.9998556393258884e-05,
      "loss": 1.1096,
      "num_input_tokens_seen": 1759184,
      "step": 90
    },
    {
      "epoch": 0.1083083939005273,
      "grad_norm": 7.826836585998535,
      "learning_rate": 4.9998391538701293e-05,
      "loss": 1.1084,
      "num_input_tokens_seen": 1856848,
      "step": 95
    },
    {
      "epoch": 0.11400883568476557,
      "grad_norm": 8.221962928771973,
      "learning_rate": 4.999821777347883e-05,
      "loss": 1.1534,
      "num_input_tokens_seen": 1954544,
      "step": 100
    },
    {
      "epoch": 0.11970927746900385,
      "grad_norm": 8.834911346435547,
      "learning_rate": 4.9998035097653406e-05,
      "loss": 1.075,
      "num_input_tokens_seen": 2052224,
      "step": 105
    },
    {
      "epoch": 0.12540971925324212,
      "grad_norm": 7.220479488372803,
      "learning_rate": 4.9997843511290156e-05,
      "loss": 1.085,
      "num_input_tokens_seen": 2150000,
      "step": 110
    },
    {
      "epoch": 0.1311101610374804,
      "grad_norm": 7.887118816375732,
      "learning_rate": 4.999764301445736e-05,
      "loss": 1.0384,
      "num_input_tokens_seen": 2247808,
      "step": 115
    },
    {
      "epoch": 0.1368106028217187,
      "grad_norm": 9.087337493896484,
      "learning_rate": 4.9997433607226495e-05,
      "loss": 1.1907,
      "num_input_tokens_seen": 2345584,
      "step": 120
    },
    {
      "epoch": 0.14251104460595695,
      "grad_norm": 7.334683418273926,
      "learning_rate": 4.9997215289672194e-05,
      "loss": 1.0865,
      "num_input_tokens_seen": 2443360,
      "step": 125
    },
    {
      "epoch": 0.14821148639019524,
      "grad_norm": 6.93471097946167,
      "learning_rate": 4.9996988061872284e-05,
      "loss": 1.0469,
      "num_input_tokens_seen": 2541120,
      "step": 130
    },
    {
      "epoch": 0.15391192817443353,
      "grad_norm": 8.000816345214844,
      "learning_rate": 4.999675192390776e-05,
      "loss": 1.0966,
      "num_input_tokens_seen": 2638912,
      "step": 135
    },
    {
      "epoch": 0.15961236995867178,
      "grad_norm": 6.232958793640137,
      "learning_rate": 4.999650687586278e-05,
      "loss": 1.0418,
      "num_input_tokens_seen": 2736624,
      "step": 140
    },
    {
      "epoch": 0.16531281174291007,
      "grad_norm": 12.828268051147461,
      "learning_rate": 4.999625291782471e-05,
      "loss": 1.0684,
      "num_input_tokens_seen": 2834384,
      "step": 145
    },
    {
      "epoch": 0.17101325352714836,
      "grad_norm": 6.147856712341309,
      "learning_rate": 4.999599004988406e-05,
      "loss": 0.9802,
      "num_input_tokens_seen": 2932160,
      "step": 150
    },
    {
      "epoch": 0.17671369531138664,
      "grad_norm": 7.085546016693115,
      "learning_rate": 4.999571827213454e-05,
      "loss": 1.1506,
      "num_input_tokens_seen": 3029904,
      "step": 155
    },
    {
      "epoch": 0.1824141370956249,
      "grad_norm": 8.019725799560547,
      "learning_rate": 4.999543758467301e-05,
      "loss": 1.0185,
      "num_input_tokens_seen": 3127648,
      "step": 160
    },
    {
      "epoch": 0.1881145788798632,
      "grad_norm": 5.743609428405762,
      "learning_rate": 4.9995147987599536e-05,
      "loss": 1.0001,
      "num_input_tokens_seen": 3225360,
      "step": 165
    },
    {
      "epoch": 0.19381502066410147,
      "grad_norm": 6.749576568603516,
      "learning_rate": 4.999484948101734e-05,
      "loss": 1.0848,
      "num_input_tokens_seen": 3323152,
      "step": 170
    },
    {
      "epoch": 0.19951546244833976,
      "grad_norm": 7.745048522949219,
      "learning_rate": 4.9994542065032823e-05,
      "loss": 1.0074,
      "num_input_tokens_seen": 3420912,
      "step": 175
    },
    {
      "epoch": 0.20521590423257802,
      "grad_norm": 6.615988254547119,
      "learning_rate": 4.9994225739755565e-05,
      "loss": 1.0756,
      "num_input_tokens_seen": 3518752,
      "step": 180
    },
    {
      "epoch": 0.2109163460168163,
      "grad_norm": 6.993303298950195,
      "learning_rate": 4.999390050529831e-05,
      "loss": 1.0371,
      "num_input_tokens_seen": 3616560,
      "step": 185
    },
    {
      "epoch": 0.2166167878010546,
      "grad_norm": 6.100363731384277,
      "learning_rate": 4.9993566361777e-05,
      "loss": 0.9687,
      "num_input_tokens_seen": 3714320,
      "step": 190
    },
    {
      "epoch": 0.22231722958529285,
      "grad_norm": 5.574942111968994,
      "learning_rate": 4.999322330931074e-05,
      "loss": 1.0173,
      "num_input_tokens_seen": 3812144,
      "step": 195
    },
    {
      "epoch": 0.22801767136953113,
      "grad_norm": 7.2291975021362305,
      "learning_rate": 4.9992871348021804e-05,
      "loss": 1.0322,
      "num_input_tokens_seen": 3909824,
      "step": 200
    },
    {
      "epoch": 0.23371811315376942,
      "grad_norm": 6.874391078948975,
      "learning_rate": 4.999251047803565e-05,
      "loss": 1.0096,
      "num_input_tokens_seen": 4007600,
      "step": 205
    },
    {
      "epoch": 0.2394185549380077,
      "grad_norm": 9.4487886428833,
      "learning_rate": 4.9992140699480914e-05,
      "loss": 0.9313,
      "num_input_tokens_seen": 4105360,
      "step": 210
    },
    {
      "epoch": 0.24511899672224596,
      "grad_norm": 8.49326229095459,
      "learning_rate": 4.99917620124894e-05,
      "loss": 1.008,
      "num_input_tokens_seen": 4203072,
      "step": 215
    },
    {
      "epoch": 0.25081943850648425,
      "grad_norm": 8.347270965576172,
      "learning_rate": 4.999137441719609e-05,
      "loss": 0.9588,
      "num_input_tokens_seen": 4300784,
      "step": 220
    },
    {
      "epoch": 0.25651988029072254,
      "grad_norm": 7.798429489135742,
      "learning_rate": 4.999097791373915e-05,
      "loss": 1.0412,
      "num_input_tokens_seen": 4398448,
      "step": 225
    },
    {
      "epoch": 0.2622203220749608,
      "grad_norm": 7.584600448608398,
      "learning_rate": 4.99905725022599e-05,
      "loss": 0.9457,
      "num_input_tokens_seen": 4496256,
      "step": 230
    },
    {
      "epoch": 0.2679207638591991,
      "grad_norm": 5.460471153259277,
      "learning_rate": 4.9990158182902866e-05,
      "loss": 0.8931,
      "num_input_tokens_seen": 4594032,
      "step": 235
    },
    {
      "epoch": 0.2736212056434374,
      "grad_norm": 6.889909267425537,
      "learning_rate": 4.9989734955815715e-05,
      "loss": 0.846,
      "num_input_tokens_seen": 4691824,
      "step": 240
    },
    {
      "epoch": 0.2793216474276756,
      "grad_norm": 8.376734733581543,
      "learning_rate": 4.998930282114932e-05,
      "loss": 0.9712,
      "num_input_tokens_seen": 4789568,
      "step": 245
    },
    {
      "epoch": 0.2850220892119139,
      "grad_norm": 6.110357284545898,
      "learning_rate": 4.99888617790577e-05,
      "loss": 0.9427,
      "num_input_tokens_seen": 4887296,
      "step": 250
    },
    {
      "epoch": 0.2907225309961522,
      "grad_norm": 7.7024102210998535,
      "learning_rate": 4.998841182969808e-05,
      "loss": 0.8296,
      "num_input_tokens_seen": 4984976,
      "step": 255
    },
    {
      "epoch": 0.2964229727803905,
      "grad_norm": 6.920788288116455,
      "learning_rate": 4.998795297323083e-05,
      "loss": 1.0276,
      "num_input_tokens_seen": 5082688,
      "step": 260
    },
    {
      "epoch": 0.30212341456462877,
      "grad_norm": 7.553328514099121,
      "learning_rate": 4.9987485209819515e-05,
      "loss": 1.0488,
      "num_input_tokens_seen": 5180400,
      "step": 265
    },
    {
      "epoch": 0.30782385634886705,
      "grad_norm": 6.883415699005127,
      "learning_rate": 4.998700853963088e-05,
      "loss": 0.9426,
      "num_input_tokens_seen": 5278208,
      "step": 270
    },
    {
      "epoch": 0.31352429813310534,
      "grad_norm": 11.664554595947266,
      "learning_rate": 4.998652296283481e-05,
      "loss": 0.9294,
      "num_input_tokens_seen": 5375968,
      "step": 275
    },
    {
      "epoch": 0.31922473991734357,
      "grad_norm": 11.442777633666992,
      "learning_rate": 4.9986028479604416e-05,
      "loss": 1.0263,
      "num_input_tokens_seen": 5473760,
      "step": 280
    },
    {
      "epoch": 0.32492518170158186,
      "grad_norm": 7.503568649291992,
      "learning_rate": 4.9985525090115936e-05,
      "loss": 0.8616,
      "num_input_tokens_seen": 5571472,
      "step": 285
    },
    {
      "epoch": 0.33062562348582014,
      "grad_norm": 4.156918048858643,
      "learning_rate": 4.998501279454881e-05,
      "loss": 0.867,
      "num_input_tokens_seen": 5669136,
      "step": 290
    },
    {
      "epoch": 0.3363260652700584,
      "grad_norm": 8.972600936889648,
      "learning_rate": 4.998449159308565e-05,
      "loss": 0.9869,
      "num_input_tokens_seen": 5766816,
      "step": 295
    },
    {
      "epoch": 0.3420265070542967,
      "grad_norm": 6.943081855773926,
      "learning_rate": 4.9983961485912235e-05,
      "loss": 0.8677,
      "num_input_tokens_seen": 5864576,
      "step": 300
    },
    {
      "epoch": 0.347726948838535,
      "grad_norm": 5.271353244781494,
      "learning_rate": 4.9983422473217514e-05,
      "loss": 0.929,
      "num_input_tokens_seen": 5962384,
      "step": 305
    },
    {
      "epoch": 0.3534273906227733,
      "grad_norm": 6.874100685119629,
      "learning_rate": 4.998287455519363e-05,
      "loss": 0.8697,
      "num_input_tokens_seen": 6060160,
      "step": 310
    },
    {
      "epoch": 0.3591278324070115,
      "grad_norm": 6.316469192504883,
      "learning_rate": 4.998231773203587e-05,
      "loss": 0.8826,
      "num_input_tokens_seen": 6157920,
      "step": 315
    },
    {
      "epoch": 0.3648282741912498,
      "grad_norm": 6.3930816650390625,
      "learning_rate": 4.9981752003942734e-05,
      "loss": 0.9108,
      "num_input_tokens_seen": 6255600,
      "step": 320
    },
    {
      "epoch": 0.3705287159754881,
      "grad_norm": 7.396681785583496,
      "learning_rate": 4.998117737111587e-05,
      "loss": 0.9613,
      "num_input_tokens_seen": 6353424,
      "step": 325
    },
    {
      "epoch": 0.3762291577597264,
      "grad_norm": 9.793058395385742,
      "learning_rate": 4.998059383376009e-05,
      "loss": 0.8664,
      "num_input_tokens_seen": 6451184,
      "step": 330
    },
    {
      "epoch": 0.38192959954396466,
      "grad_norm": 4.0863423347473145,
      "learning_rate": 4.998000139208342e-05,
      "loss": 0.8693,
      "num_input_tokens_seen": 6549040,
      "step": 335
    },
    {
      "epoch": 0.38763004132820295,
      "grad_norm": 4.3018317222595215,
      "learning_rate": 4.997940004629702e-05,
      "loss": 0.9368,
      "num_input_tokens_seen": 6646752,
      "step": 340
    },
    {
      "epoch": 0.39333048311244123,
      "grad_norm": 16.874574661254883,
      "learning_rate": 4.9978789796615235e-05,
      "loss": 1.0444,
      "num_input_tokens_seen": 6744544,
      "step": 345
    },
    {
      "epoch": 0.3990309248966795,
      "grad_norm": 6.2149658203125,
      "learning_rate": 4.9978170643255604e-05,
      "loss": 0.9418,
      "num_input_tokens_seen": 6842256,
      "step": 350
    },
    {
      "epoch": 0.40473136668091775,
      "grad_norm": 6.908440113067627,
      "learning_rate": 4.997754258643882e-05,
      "loss": 0.8389,
      "num_input_tokens_seen": 6939984,
      "step": 355
    },
    {
      "epoch": 0.41043180846515603,
      "grad_norm": 9.332175254821777,
      "learning_rate": 4.997690562638874e-05,
      "loss": 0.9898,
      "num_input_tokens_seen": 7037776,
      "step": 360
    },
    {
      "epoch": 0.4161322502493943,
      "grad_norm": 7.081879138946533,
      "learning_rate": 4.9976259763332423e-05,
      "loss": 0.8761,
      "num_input_tokens_seen": 7135552,
      "step": 365
    },
    {
      "epoch": 0.4218326920336326,
      "grad_norm": 5.079131603240967,
      "learning_rate": 4.9975604997500084e-05,
      "loss": 0.8808,
      "num_input_tokens_seen": 7233248,
      "step": 370
    },
    {
      "epoch": 0.4275331338178709,
      "grad_norm": 7.381295680999756,
      "learning_rate": 4.99749413291251e-05,
      "loss": 0.9706,
      "num_input_tokens_seen": 7330976,
      "step": 375
    },
    {
      "epoch": 0.4332335756021092,
      "grad_norm": 4.044100284576416,
      "learning_rate": 4.9974268758444054e-05,
      "loss": 0.8972,
      "num_input_tokens_seen": 7428704,
      "step": 380
    },
    {
      "epoch": 0.43893401738634746,
      "grad_norm": 6.039126396179199,
      "learning_rate": 4.9973587285696674e-05,
      "loss": 0.7717,
      "num_input_tokens_seen": 7526480,
      "step": 385
    },
    {
      "epoch": 0.4446344591705857,
      "grad_norm": 5.874084949493408,
      "learning_rate": 4.997289691112588e-05,
      "loss": 0.9446,
      "num_input_tokens_seen": 7624320,
      "step": 390
    },
    {
      "epoch": 0.450334900954824,
      "grad_norm": 7.415895462036133,
      "learning_rate": 4.997219763497774e-05,
      "loss": 0.7123,
      "num_input_tokens_seen": 7722064,
      "step": 395
    },
    {
      "epoch": 0.45603534273906227,
      "grad_norm": 7.707664966583252,
      "learning_rate": 4.997148945750153e-05,
      "loss": 0.7859,
      "num_input_tokens_seen": 7819808,
      "step": 400
    },
    {
      "epoch": 0.46173578452330055,
      "grad_norm": 5.500309467315674,
      "learning_rate": 4.9970772378949655e-05,
      "loss": 0.826,
      "num_input_tokens_seen": 7917488,
      "step": 405
    },
    {
      "epoch": 0.46743622630753884,
      "grad_norm": 7.652528285980225,
      "learning_rate": 4.9970046399577734e-05,
      "loss": 0.8709,
      "num_input_tokens_seen": 8015264,
      "step": 410
    },
    {
      "epoch": 0.4731366680917771,
      "grad_norm": 6.417993545532227,
      "learning_rate": 4.996931151964455e-05,
      "loss": 0.9764,
      "num_input_tokens_seen": 8113024,
      "step": 415
    },
    {
      "epoch": 0.4788371098760154,
      "grad_norm": 5.648680210113525,
      "learning_rate": 4.996856773941202e-05,
      "loss": 0.8233,
      "num_input_tokens_seen": 8210784,
      "step": 420
    },
    {
      "epoch": 0.4845375516602537,
      "grad_norm": 8.321767807006836,
      "learning_rate": 4.9967815059145296e-05,
      "loss": 0.8556,
      "num_input_tokens_seen": 8308512,
      "step": 425
    },
    {
      "epoch": 0.4902379934444919,
      "grad_norm": 6.381886005401611,
      "learning_rate": 4.9967053479112656e-05,
      "loss": 0.7687,
      "num_input_tokens_seen": 8406208,
      "step": 430
    },
    {
      "epoch": 0.4959384352287302,
      "grad_norm": 7.855834007263184,
      "learning_rate": 4.996628299958557e-05,
      "loss": 0.7965,
      "num_input_tokens_seen": 8503952,
      "step": 435
    },
    {
      "epoch": 0.5016388770129685,
      "grad_norm": 8.358772277832031,
      "learning_rate": 4.996550362083866e-05,
      "loss": 0.7877,
      "num_input_tokens_seen": 8601616,
      "step": 440
    },
    {
      "epoch": 0.5073393187972067,
      "grad_norm": 8.553559303283691,
      "learning_rate": 4.996471534314976e-05,
      "loss": 0.76,
      "num_input_tokens_seen": 8699424,
      "step": 445
    },
    {
      "epoch": 0.5130397605814451,
      "grad_norm": 8.631624221801758,
      "learning_rate": 4.9963918166799836e-05,
      "loss": 0.8425,
      "num_input_tokens_seen": 8797088,
      "step": 450
    },
    {
      "epoch": 0.5187402023656833,
      "grad_norm": 11.236102104187012,
      "learning_rate": 4.9963112092073046e-05,
      "loss": 0.8332,
      "num_input_tokens_seen": 8894848,
      "step": 455
    },
    {
      "epoch": 0.5244406441499216,
      "grad_norm": 6.356544494628906,
      "learning_rate": 4.996229711925671e-05,
      "loss": 0.8231,
      "num_input_tokens_seen": 8992576,
      "step": 460
    },
    {
      "epoch": 0.5301410859341599,
      "grad_norm": 4.418157577514648,
      "learning_rate": 4.996147324864132e-05,
      "loss": 0.7168,
      "num_input_tokens_seen": 9090272,
      "step": 465
    },
    {
      "epoch": 0.5358415277183982,
      "grad_norm": 8.712305068969727,
      "learning_rate": 4.996064048052056e-05,
      "loss": 0.7672,
      "num_input_tokens_seen": 9188080,
      "step": 470
    },
    {
      "epoch": 0.5415419695026364,
      "grad_norm": 8.759718894958496,
      "learning_rate": 4.995979881519126e-05,
      "loss": 0.7601,
      "num_input_tokens_seen": 9285872,
      "step": 475
    },
    {
      "epoch": 0.5472424112868748,
      "grad_norm": 7.049539089202881,
      "learning_rate": 4.995894825295343e-05,
      "loss": 0.802,
      "num_input_tokens_seen": 9383584,
      "step": 480
    },
    {
      "epoch": 0.552942853071113,
      "grad_norm": 7.416094779968262,
      "learning_rate": 4.995808879411026e-05,
      "loss": 0.7645,
      "num_input_tokens_seen": 9481200,
      "step": 485
    },
    {
      "epoch": 0.5586432948553512,
      "grad_norm": 6.9029693603515625,
      "learning_rate": 4.995722043896809e-05,
      "loss": 0.6875,
      "num_input_tokens_seen": 9578944,
      "step": 490
    },
    {
      "epoch": 0.5643437366395896,
      "grad_norm": 7.398702621459961,
      "learning_rate": 4.995634318783646e-05,
      "loss": 0.7829,
      "num_input_tokens_seen": 9676688,
      "step": 495
    },
    {
      "epoch": 0.5700441784238278,
      "grad_norm": 7.631560802459717,
      "learning_rate": 4.9955457041028055e-05,
      "loss": 0.7324,
      "num_input_tokens_seen": 9774464,
      "step": 500
    },
    {
      "epoch": 0.5757446202080662,
      "grad_norm": 9.913789749145508,
      "learning_rate": 4.995456199885875e-05,
      "loss": 0.7578,
      "num_input_tokens_seen": 9872160,
      "step": 505
    },
    {
      "epoch": 0.5814450619923044,
      "grad_norm": 9.40986442565918,
      "learning_rate": 4.995365806164758e-05,
      "loss": 0.951,
      "num_input_tokens_seen": 9969904,
      "step": 510
    },
    {
      "epoch": 0.5871455037765427,
      "grad_norm": 5.404745578765869,
      "learning_rate": 4.995274522971675e-05,
      "loss": 0.7427,
      "num_input_tokens_seen": 10067648,
      "step": 515
    },
    {
      "epoch": 0.592845945560781,
      "grad_norm": 6.450439929962158,
      "learning_rate": 4.9951823503391634e-05,
      "loss": 0.75,
      "num_input_tokens_seen": 10165456,
      "step": 520
    },
    {
      "epoch": 0.5985463873450192,
      "grad_norm": 7.56156587600708,
      "learning_rate": 4.9950892883000786e-05,
      "loss": 0.7311,
      "num_input_tokens_seen": 10263152,
      "step": 525
    },
    {
      "epoch": 0.6042468291292575,
      "grad_norm": 5.007820129394531,
      "learning_rate": 4.994995336887593e-05,
      "loss": 0.7088,
      "num_input_tokens_seen": 10360848,
      "step": 530
    },
    {
      "epoch": 0.6099472709134958,
      "grad_norm": 6.651803016662598,
      "learning_rate": 4.994900496135195e-05,
      "loss": 0.7473,
      "num_input_tokens_seen": 10458496,
      "step": 535
    },
    {
      "epoch": 0.6156477126977341,
      "grad_norm": 4.845729351043701,
      "learning_rate": 4.9948047660766904e-05,
      "loss": 0.6939,
      "num_input_tokens_seen": 10556304,
      "step": 540
    },
    {
      "epoch": 0.6213481544819723,
      "grad_norm": 7.277071475982666,
      "learning_rate": 4.994708146746203e-05,
      "loss": 0.7219,
      "num_input_tokens_seen": 10654048,
      "step": 545
    },
    {
      "epoch": 0.6270485962662107,
      "grad_norm": 7.703381061553955,
      "learning_rate": 4.994610638178172e-05,
      "loss": 0.7795,
      "num_input_tokens_seen": 10751776,
      "step": 550
    },
    {
      "epoch": 0.6327490380504489,
      "grad_norm": 8.279520988464355,
      "learning_rate": 4.994512240407354e-05,
      "loss": 0.7027,
      "num_input_tokens_seen": 10849584,
      "step": 555
    },
    {
      "epoch": 0.6384494798346871,
      "grad_norm": 10.189576148986816,
      "learning_rate": 4.9944129534688234e-05,
      "loss": 0.6917,
      "num_input_tokens_seen": 10947264,
      "step": 560
    },
    {
      "epoch": 0.6441499216189255,
      "grad_norm": 6.311273574829102,
      "learning_rate": 4.994312777397972e-05,
      "loss": 0.7335,
      "num_input_tokens_seen": 11045120,
      "step": 565
    },
    {
      "epoch": 0.6498503634031637,
      "grad_norm": 9.937539100646973,
      "learning_rate": 4.994211712230504e-05,
      "loss": 0.6367,
      "num_input_tokens_seen": 11142864,
      "step": 570
    },
    {
      "epoch": 0.655550805187402,
      "grad_norm": 9.992775917053223,
      "learning_rate": 4.994109758002447e-05,
      "loss": 0.7662,
      "num_input_tokens_seen": 11240560,
      "step": 575
    },
    {
      "epoch": 0.6612512469716403,
      "grad_norm": 6.363308429718018,
      "learning_rate": 4.994006914750143e-05,
      "loss": 0.7291,
      "num_input_tokens_seen": 11338320,
      "step": 580
    },
    {
      "epoch": 0.6669516887558786,
      "grad_norm": 6.920602321624756,
      "learning_rate": 4.993903182510249e-05,
      "loss": 0.6525,
      "num_input_tokens_seen": 11436032,
      "step": 585
    },
    {
      "epoch": 0.6726521305401169,
      "grad_norm": 6.734442234039307,
      "learning_rate": 4.99379856131974e-05,
      "loss": 0.6581,
      "num_input_tokens_seen": 11533680,
      "step": 590
    },
    {
      "epoch": 0.6783525723243551,
      "grad_norm": 6.08076810836792,
      "learning_rate": 4.99369305121591e-05,
      "loss": 0.6868,
      "num_input_tokens_seen": 11631344,
      "step": 595
    },
    {
      "epoch": 0.6840530141085934,
      "grad_norm": 5.305174827575684,
      "learning_rate": 4.9935866522363665e-05,
      "loss": 0.7231,
      "num_input_tokens_seen": 11729104,
      "step": 600
    },
    {
      "epoch": 0.6897534558928317,
      "grad_norm": 5.337072849273682,
      "learning_rate": 4.9934793644190345e-05,
      "loss": 0.7082,
      "num_input_tokens_seen": 11826880,
      "step": 605
    },
    {
      "epoch": 0.69545389767707,
      "grad_norm": 6.563253879547119,
      "learning_rate": 4.993371187802159e-05,
      "loss": 0.7412,
      "num_input_tokens_seen": 11924592,
      "step": 610
    },
    {
      "epoch": 0.7011543394613082,
      "grad_norm": 6.92053747177124,
      "learning_rate": 4.993262122424298e-05,
      "loss": 0.6752,
      "num_input_tokens_seen": 12022256,
      "step": 615
    },
    {
      "epoch": 0.7068547812455466,
      "grad_norm": 10.413783073425293,
      "learning_rate": 4.9931521683243276e-05,
      "loss": 0.6955,
      "num_input_tokens_seen": 12120000,
      "step": 620
    },
    {
      "epoch": 0.7125552230297848,
      "grad_norm": 6.970921039581299,
      "learning_rate": 4.993041325541442e-05,
      "loss": 0.6883,
      "num_input_tokens_seen": 12217808,
      "step": 625
    },
    {
      "epoch": 0.718255664814023,
      "grad_norm": 5.135336875915527,
      "learning_rate": 4.992929594115151e-05,
      "loss": 0.6039,
      "num_input_tokens_seen": 12315616,
      "step": 630
    },
    {
      "epoch": 0.7239561065982614,
      "grad_norm": 7.350869655609131,
      "learning_rate": 4.99281697408528e-05,
      "loss": 0.7195,
      "num_input_tokens_seen": 12413376,
      "step": 635
    },
    {
      "epoch": 0.7296565483824996,
      "grad_norm": 6.427408218383789,
      "learning_rate": 4.992703465491974e-05,
      "loss": 0.5395,
      "num_input_tokens_seen": 12510960,
      "step": 640
    },
    {
      "epoch": 0.735356990166738,
      "grad_norm": 7.422171592712402,
      "learning_rate": 4.992589068375691e-05,
      "loss": 0.5605,
      "num_input_tokens_seen": 12608752,
      "step": 645
    },
    {
      "epoch": 0.7410574319509762,
      "grad_norm": 10.039104461669922,
      "learning_rate": 4.9924737827772104e-05,
      "loss": 0.6171,
      "num_input_tokens_seen": 12706448,
      "step": 650
    },
    {
      "epoch": 0.7467578737352145,
      "grad_norm": 6.769627094268799,
      "learning_rate": 4.992357608737623e-05,
      "loss": 0.6656,
      "num_input_tokens_seen": 12804144,
      "step": 655
    },
    {
      "epoch": 0.7524583155194527,
      "grad_norm": 6.161548614501953,
      "learning_rate": 4.992240546298341e-05,
      "loss": 0.6412,
      "num_input_tokens_seen": 12902000,
      "step": 660
    },
    {
      "epoch": 0.7581587573036911,
      "grad_norm": 9.02010440826416,
      "learning_rate": 4.9921225955010906e-05,
      "loss": 0.6899,
      "num_input_tokens_seen": 12999648,
      "step": 665
    },
    {
      "epoch": 0.7638591990879293,
      "grad_norm": 5.683040618896484,
      "learning_rate": 4.9920037563879155e-05,
      "loss": 0.5788,
      "num_input_tokens_seen": 13097424,
      "step": 670
    },
    {
      "epoch": 0.7695596408721675,
      "grad_norm": 5.576777935028076,
      "learning_rate": 4.9918840290011745e-05,
      "loss": 0.6354,
      "num_input_tokens_seen": 13195136,
      "step": 675
    },
    {
      "epoch": 0.7752600826564059,
      "grad_norm": 6.484269142150879,
      "learning_rate": 4.9917634133835466e-05,
      "loss": 0.6004,
      "num_input_tokens_seen": 13292912,
      "step": 680
    },
    {
      "epoch": 0.7809605244406441,
      "grad_norm": 5.845834732055664,
      "learning_rate": 4.991641909578023e-05,
      "loss": 0.6065,
      "num_input_tokens_seen": 13390560,
      "step": 685
    },
    {
      "epoch": 0.7866609662248825,
      "grad_norm": 7.066195011138916,
      "learning_rate": 4.9915195176279156e-05,
      "loss": 0.703,
      "num_input_tokens_seen": 13488304,
      "step": 690
    },
    {
      "epoch": 0.7923614080091207,
      "grad_norm": 7.687030792236328,
      "learning_rate": 4.9913962375768494e-05,
      "loss": 0.5684,
      "num_input_tokens_seen": 13586032,
      "step": 695
    },
    {
      "epoch": 0.798061849793359,
      "grad_norm": 5.923397064208984,
      "learning_rate": 4.9912720694687684e-05,
      "loss": 0.7124,
      "num_input_tokens_seen": 13683792,
      "step": 700
    },
    {
      "epoch": 0.8037622915775973,
      "grad_norm": 7.307689666748047,
      "learning_rate": 4.9911470133479324e-05,
      "loss": 0.585,
      "num_input_tokens_seen": 13781488,
      "step": 705
    },
    {
      "epoch": 0.8094627333618355,
      "grad_norm": 5.22707462310791,
      "learning_rate": 4.9910210692589164e-05,
      "loss": 0.6301,
      "num_input_tokens_seen": 13879264,
      "step": 710
    },
    {
      "epoch": 0.8151631751460738,
      "grad_norm": 6.6996870040893555,
      "learning_rate": 4.990894237246615e-05,
      "loss": 0.6073,
      "num_input_tokens_seen": 13976976,
      "step": 715
    },
    {
      "epoch": 0.8208636169303121,
      "grad_norm": 9.039154052734375,
      "learning_rate": 4.990766517356236e-05,
      "loss": 0.6611,
      "num_input_tokens_seen": 14074688,
      "step": 720
    },
    {
      "epoch": 0.8265640587145504,
      "grad_norm": 3.3328487873077393,
      "learning_rate": 4.9906379096333047e-05,
      "loss": 0.5829,
      "num_input_tokens_seen": 14172432,
      "step": 725
    },
    {
      "epoch": 0.8322645004987886,
      "grad_norm": 9.260608673095703,
      "learning_rate": 4.9905084141236646e-05,
      "loss": 0.7311,
      "num_input_tokens_seen": 14270112,
      "step": 730
    },
    {
      "epoch": 0.837964942283027,
      "grad_norm": 7.124883651733398,
      "learning_rate": 4.990378030873474e-05,
      "loss": 0.6354,
      "num_input_tokens_seen": 14367792,
      "step": 735
    },
    {
      "epoch": 0.8436653840672652,
      "grad_norm": 5.522550106048584,
      "learning_rate": 4.990246759929207e-05,
      "loss": 0.5578,
      "num_input_tokens_seen": 14465584,
      "step": 740
    },
    {
      "epoch": 0.8493658258515034,
      "grad_norm": 8.491950035095215,
      "learning_rate": 4.9901146013376556e-05,
      "loss": 0.6489,
      "num_input_tokens_seen": 14563344,
      "step": 745
    },
    {
      "epoch": 0.8550662676357418,
      "grad_norm": 6.821796417236328,
      "learning_rate": 4.989981555145928e-05,
      "loss": 0.451,
      "num_input_tokens_seen": 14661024,
      "step": 750
    },
    {
      "epoch": 0.86076670941998,
      "grad_norm": 8.655888557434082,
      "learning_rate": 4.9898476214014486e-05,
      "loss": 0.6291,
      "num_input_tokens_seen": 14758800,
      "step": 755
    },
    {
      "epoch": 0.8664671512042184,
      "grad_norm": 8.26075267791748,
      "learning_rate": 4.989712800151958e-05,
      "loss": 0.7259,
      "num_input_tokens_seen": 14856592,
      "step": 760
    },
    {
      "epoch": 0.8721675929884566,
      "grad_norm": 6.850794315338135,
      "learning_rate": 4.989577091445512e-05,
      "loss": 0.5639,
      "num_input_tokens_seen": 14954304,
      "step": 765
    },
    {
      "epoch": 0.8778680347726949,
      "grad_norm": 9.18870735168457,
      "learning_rate": 4.989440495330485e-05,
      "loss": 0.616,
      "num_input_tokens_seen": 15052016,
      "step": 770
    },
    {
      "epoch": 0.8835684765569332,
      "grad_norm": 9.08046817779541,
      "learning_rate": 4.989303011855567e-05,
      "loss": 0.5797,
      "num_input_tokens_seen": 15149664,
      "step": 775
    },
    {
      "epoch": 0.8892689183411714,
      "grad_norm": 5.607428073883057,
      "learning_rate": 4.989164641069763e-05,
      "loss": 0.5893,
      "num_input_tokens_seen": 15247360,
      "step": 780
    },
    {
      "epoch": 0.8949693601254097,
      "grad_norm": 6.935970783233643,
      "learning_rate": 4.9890253830223955e-05,
      "loss": 0.6095,
      "num_input_tokens_seen": 15345056,
      "step": 785
    },
    {
      "epoch": 0.900669801909648,
      "grad_norm": 6.799474239349365,
      "learning_rate": 4.988885237763102e-05,
      "loss": 0.5044,
      "num_input_tokens_seen": 15442752,
      "step": 790
    },
    {
      "epoch": 0.9063702436938863,
      "grad_norm": 6.294219017028809,
      "learning_rate": 4.98874420534184e-05,
      "loss": 0.5584,
      "num_input_tokens_seen": 15540464,
      "step": 795
    },
    {
      "epoch": 0.9120706854781245,
      "grad_norm": 5.488597869873047,
      "learning_rate": 4.988602285808877e-05,
      "loss": 0.4862,
      "num_input_tokens_seen": 15638128,
      "step": 800
    },
    {
      "epoch": 0.9177711272623629,
      "grad_norm": 8.307422637939453,
      "learning_rate": 4.988459479214802e-05,
      "loss": 0.5815,
      "num_input_tokens_seen": 15735872,
      "step": 805
    },
    {
      "epoch": 0.9234715690466011,
      "grad_norm": 10.344627380371094,
      "learning_rate": 4.988315785610519e-05,
      "loss": 0.5963,
      "num_input_tokens_seen": 15833680,
      "step": 810
    },
    {
      "epoch": 0.9291720108308394,
      "grad_norm": 10.354679107666016,
      "learning_rate": 4.9881712050472464e-05,
      "loss": 0.6225,
      "num_input_tokens_seen": 15931472,
      "step": 815
    },
    {
      "epoch": 0.9348724526150777,
      "grad_norm": 7.605050086975098,
      "learning_rate": 4.9880257375765194e-05,
      "loss": 0.645,
      "num_input_tokens_seen": 16029120,
      "step": 820
    },
    {
      "epoch": 0.9405728943993159,
      "grad_norm": 5.717419624328613,
      "learning_rate": 4.987879383250191e-05,
      "loss": 0.5142,
      "num_input_tokens_seen": 16126896,
      "step": 825
    },
    {
      "epoch": 0.9462733361835542,
      "grad_norm": 7.159694194793701,
      "learning_rate": 4.987732142120428e-05,
      "loss": 0.6613,
      "num_input_tokens_seen": 16224592,
      "step": 830
    },
    {
      "epoch": 0.9519737779677925,
      "grad_norm": 7.166426658630371,
      "learning_rate": 4.987584014239716e-05,
      "loss": 0.6094,
      "num_input_tokens_seen": 16322208,
      "step": 835
    },
    {
      "epoch": 0.9576742197520308,
      "grad_norm": 7.844811916351318,
      "learning_rate": 4.9874349996608536e-05,
      "loss": 0.5613,
      "num_input_tokens_seen": 16419904,
      "step": 840
    },
    {
      "epoch": 0.963374661536269,
      "grad_norm": 5.295498371124268,
      "learning_rate": 4.987285098436958e-05,
      "loss": 0.4958,
      "num_input_tokens_seen": 16517600,
      "step": 845
    },
    {
      "epoch": 0.9690751033205074,
      "grad_norm": 5.007256984710693,
      "learning_rate": 4.987134310621461e-05,
      "loss": 0.5119,
      "num_input_tokens_seen": 16615216,
      "step": 850
    },
    {
      "epoch": 0.9747755451047456,
      "grad_norm": 7.532383918762207,
      "learning_rate": 4.9869826362681096e-05,
      "loss": 0.4567,
      "num_input_tokens_seen": 16713040,
      "step": 855
    },
    {
      "epoch": 0.9804759868889839,
      "grad_norm": 6.256499767303467,
      "learning_rate": 4.9868300754309706e-05,
      "loss": 0.5088,
      "num_input_tokens_seen": 16810768,
      "step": 860
    },
    {
      "epoch": 0.9861764286732222,
      "grad_norm": 6.756839752197266,
      "learning_rate": 4.986676628164423e-05,
      "loss": 0.4097,
      "num_input_tokens_seen": 16908512,
      "step": 865
    },
    {
      "epoch": 0.9918768704574604,
      "grad_norm": 6.562160491943359,
      "learning_rate": 4.986522294523162e-05,
      "loss": 0.3819,
      "num_input_tokens_seen": 17006240,
      "step": 870
    },
    {
      "epoch": 0.9975773122416988,
      "grad_norm": 7.496212959289551,
      "learning_rate": 4.9863670745622015e-05,
      "loss": 0.4956,
      "num_input_tokens_seen": 17104000,
      "step": 875
    },
    {
      "epoch": 1.0022801767136953,
      "grad_norm": 9.028531074523926,
      "learning_rate": 4.986210968336868e-05,
      "loss": 0.5872,
      "num_input_tokens_seen": 17184592,
      "step": 880
    },
    {
      "epoch": 1.0079806184979336,
      "grad_norm": 6.04398775100708,
      "learning_rate": 4.986053975902807e-05,
      "loss": 0.48,
      "num_input_tokens_seen": 17282304,
      "step": 885
    },
    {
      "epoch": 1.013681060282172,
      "grad_norm": 9.602685928344727,
      "learning_rate": 4.985896097315977e-05,
      "loss": 0.5309,
      "num_input_tokens_seen": 17380080,
      "step": 890
    },
    {
      "epoch": 1.01938150206641,
      "grad_norm": 6.881324291229248,
      "learning_rate": 4.9857373326326545e-05,
      "loss": 0.5103,
      "num_input_tokens_seen": 17477760,
      "step": 895
    },
    {
      "epoch": 1.0250819438506484,
      "grad_norm": 8.762344360351562,
      "learning_rate": 4.985577681909431e-05,
      "loss": 0.5336,
      "num_input_tokens_seen": 17575456,
      "step": 900
    },
    {
      "epoch": 1.0307823856348868,
      "grad_norm": 7.028131484985352,
      "learning_rate": 4.985417145203214e-05,
      "loss": 0.4887,
      "num_input_tokens_seen": 17673184,
      "step": 905
    },
    {
      "epoch": 1.036482827419125,
      "grad_norm": 6.512467861175537,
      "learning_rate": 4.985255722571227e-05,
      "loss": 0.4787,
      "num_input_tokens_seen": 17770944,
      "step": 910
    },
    {
      "epoch": 1.0421832692033632,
      "grad_norm": 6.597855567932129,
      "learning_rate": 4.985093414071008e-05,
      "loss": 0.5185,
      "num_input_tokens_seen": 17868768,
      "step": 915
    },
    {
      "epoch": 1.0478837109876016,
      "grad_norm": 7.660828590393066,
      "learning_rate": 4.984930219760413e-05,
      "loss": 0.5056,
      "num_input_tokens_seen": 17966480,
      "step": 920
    },
    {
      "epoch": 1.05358415277184,
      "grad_norm": 6.880988121032715,
      "learning_rate": 4.984766139697611e-05,
      "loss": 0.5371,
      "num_input_tokens_seen": 18064336,
      "step": 925
    },
    {
      "epoch": 1.059284594556078,
      "grad_norm": 5.854323863983154,
      "learning_rate": 4.98460117394109e-05,
      "loss": 0.5041,
      "num_input_tokens_seen": 18162112,
      "step": 930
    },
    {
      "epoch": 1.0649850363403164,
      "grad_norm": 5.215938568115234,
      "learning_rate": 4.984435322549651e-05,
      "loss": 0.4857,
      "num_input_tokens_seen": 18259904,
      "step": 935
    },
    {
      "epoch": 1.0706854781245547,
      "grad_norm": 5.902091026306152,
      "learning_rate": 4.984268585582412e-05,
      "loss": 0.5047,
      "num_input_tokens_seen": 18357616,
      "step": 940
    },
    {
      "epoch": 1.0763859199087928,
      "grad_norm": 7.6616411209106445,
      "learning_rate": 4.9841009630988064e-05,
      "loss": 0.4147,
      "num_input_tokens_seen": 18455392,
      "step": 945
    },
    {
      "epoch": 1.0820863616930312,
      "grad_norm": 7.779905796051025,
      "learning_rate": 4.983932455158583e-05,
      "loss": 0.4762,
      "num_input_tokens_seen": 18553120,
      "step": 950
    },
    {
      "epoch": 1.0877868034772695,
      "grad_norm": 6.425886154174805,
      "learning_rate": 4.9837630618218056e-05,
      "loss": 0.4129,
      "num_input_tokens_seen": 18650864,
      "step": 955
    },
    {
      "epoch": 1.0934872452615079,
      "grad_norm": 8.044005393981934,
      "learning_rate": 4.983592783148856e-05,
      "loss": 0.4027,
      "num_input_tokens_seen": 18748624,
      "step": 960
    },
    {
      "epoch": 1.099187687045746,
      "grad_norm": 5.91091251373291,
      "learning_rate": 4.983421619200428e-05,
      "loss": 0.4064,
      "num_input_tokens_seen": 18846320,
      "step": 965
    },
    {
      "epoch": 1.1048881288299843,
      "grad_norm": 5.88447380065918,
      "learning_rate": 4.9832495700375346e-05,
      "loss": 0.4599,
      "num_input_tokens_seen": 18944144,
      "step": 970
    },
    {
      "epoch": 1.1105885706142227,
      "grad_norm": 7.686187744140625,
      "learning_rate": 4.983076635721502e-05,
      "loss": 0.4764,
      "num_input_tokens_seen": 19041904,
      "step": 975
    },
    {
      "epoch": 1.1162890123984608,
      "grad_norm": 6.358628273010254,
      "learning_rate": 4.982902816313972e-05,
      "loss": 0.4844,
      "num_input_tokens_seen": 19139664,
      "step": 980
    },
    {
      "epoch": 1.1219894541826991,
      "grad_norm": 6.775269508361816,
      "learning_rate": 4.982728111876903e-05,
      "loss": 0.4292,
      "num_input_tokens_seen": 19237488,
      "step": 985
    },
    {
      "epoch": 1.1276898959669375,
      "grad_norm": 7.491086483001709,
      "learning_rate": 4.982552522472569e-05,
      "loss": 0.4423,
      "num_input_tokens_seen": 19335152,
      "step": 990
    },
    {
      "epoch": 1.1333903377511758,
      "grad_norm": 7.664567947387695,
      "learning_rate": 4.982376048163557e-05,
      "loss": 0.4983,
      "num_input_tokens_seen": 19432976,
      "step": 995
    },
    {
      "epoch": 1.139090779535414,
      "grad_norm": 4.956116676330566,
      "learning_rate": 4.9821986890127734e-05,
      "loss": 0.4027,
      "num_input_tokens_seen": 19530704,
      "step": 1000
    },
    {
      "epoch": 1.1447912213196523,
      "grad_norm": 7.820266246795654,
      "learning_rate": 4.982020445083436e-05,
      "loss": 0.4131,
      "num_input_tokens_seen": 19628448,
      "step": 1005
    },
    {
      "epoch": 1.1504916631038906,
      "grad_norm": 5.402007102966309,
      "learning_rate": 4.981841316439081e-05,
      "loss": 0.4946,
      "num_input_tokens_seen": 19726176,
      "step": 1010
    },
    {
      "epoch": 1.1561921048881287,
      "grad_norm": 6.406668663024902,
      "learning_rate": 4.981661303143557e-05,
      "loss": 0.4701,
      "num_input_tokens_seen": 19823856,
      "step": 1015
    },
    {
      "epoch": 1.161892546672367,
      "grad_norm": 3.809847116470337,
      "learning_rate": 4.981480405261032e-05,
      "loss": 0.4063,
      "num_input_tokens_seen": 19921552,
      "step": 1020
    },
    {
      "epoch": 1.1675929884566054,
      "grad_norm": 8.321266174316406,
      "learning_rate": 4.981298622855984e-05,
      "loss": 0.38,
      "num_input_tokens_seen": 20019248,
      "step": 1025
    },
    {
      "epoch": 1.1732934302408438,
      "grad_norm": 4.611199855804443,
      "learning_rate": 4.981115955993213e-05,
      "loss": 0.3435,
      "num_input_tokens_seen": 20116992,
      "step": 1030
    },
    {
      "epoch": 1.1789938720250819,
      "grad_norm": 6.748137950897217,
      "learning_rate": 4.980932404737827e-05,
      "loss": 0.4443,
      "num_input_tokens_seen": 20214848,
      "step": 1035
    },
    {
      "epoch": 1.1846943138093202,
      "grad_norm": 7.327335834503174,
      "learning_rate": 4.980747969155255e-05,
      "loss": 0.5365,
      "num_input_tokens_seen": 20312608,
      "step": 1040
    },
    {
      "epoch": 1.1903947555935586,
      "grad_norm": 9.424795150756836,
      "learning_rate": 4.980562649311238e-05,
      "loss": 0.404,
      "num_input_tokens_seen": 20410288,
      "step": 1045
    },
    {
      "epoch": 1.196095197377797,
      "grad_norm": 6.2012152671813965,
      "learning_rate": 4.9803764452718335e-05,
      "loss": 0.4176,
      "num_input_tokens_seen": 20508080,
      "step": 1050
    },
    {
      "epoch": 1.201795639162035,
      "grad_norm": 6.23061990737915,
      "learning_rate": 4.980189357103414e-05,
      "loss": 0.3945,
      "num_input_tokens_seen": 20605856,
      "step": 1055
    },
    {
      "epoch": 1.2074960809462734,
      "grad_norm": 8.05282974243164,
      "learning_rate": 4.980001384872666e-05,
      "loss": 0.5353,
      "num_input_tokens_seen": 20703584,
      "step": 1060
    },
    {
      "epoch": 1.2131965227305117,
      "grad_norm": 7.0456156730651855,
      "learning_rate": 4.9798125286465935e-05,
      "loss": 0.4638,
      "num_input_tokens_seen": 20801376,
      "step": 1065
    },
    {
      "epoch": 1.2188969645147498,
      "grad_norm": 5.285283088684082,
      "learning_rate": 4.979622788492513e-05,
      "loss": 0.5492,
      "num_input_tokens_seen": 20899200,
      "step": 1070
    },
    {
      "epoch": 1.2245974062989882,
      "grad_norm": 7.358059883117676,
      "learning_rate": 4.9794321644780585e-05,
      "loss": 0.4979,
      "num_input_tokens_seen": 20996928,
      "step": 1075
    },
    {
      "epoch": 1.2302978480832265,
      "grad_norm": 6.339309215545654,
      "learning_rate": 4.979240656671177e-05,
      "loss": 0.3867,
      "num_input_tokens_seen": 21094752,
      "step": 1080
    },
    {
      "epoch": 1.2359982898674646,
      "grad_norm": 6.887006759643555,
      "learning_rate": 4.979048265140132e-05,
      "loss": 0.338,
      "num_input_tokens_seen": 21192480,
      "step": 1085
    },
    {
      "epoch": 1.241698731651703,
      "grad_norm": 7.377925395965576,
      "learning_rate": 4.9788549899535e-05,
      "loss": 0.3946,
      "num_input_tokens_seen": 21290144,
      "step": 1090
    },
    {
      "epoch": 1.2473991734359413,
      "grad_norm": 7.47123384475708,
      "learning_rate": 4.978660831180175e-05,
      "loss": 0.4831,
      "num_input_tokens_seen": 21387888,
      "step": 1095
    },
    {
      "epoch": 1.2530996152201794,
      "grad_norm": 9.348438262939453,
      "learning_rate": 4.978465788889365e-05,
      "loss": 0.4933,
      "num_input_tokens_seen": 21485536,
      "step": 1100
    },
    {
      "epoch": 1.2588000570044178,
      "grad_norm": 8.355619430541992,
      "learning_rate": 4.978269863150592e-05,
      "loss": 0.4139,
      "num_input_tokens_seen": 21583264,
      "step": 1105
    },
    {
      "epoch": 1.2645004987886561,
      "grad_norm": 5.923043727874756,
      "learning_rate": 4.978073054033694e-05,
      "loss": 0.3656,
      "num_input_tokens_seen": 21681040,
      "step": 1110
    },
    {
      "epoch": 1.2702009405728945,
      "grad_norm": 7.194669246673584,
      "learning_rate": 4.977875361608823e-05,
      "loss": 0.3487,
      "num_input_tokens_seen": 21778720,
      "step": 1115
    },
    {
      "epoch": 1.2759013823571328,
      "grad_norm": 7.351987838745117,
      "learning_rate": 4.9776767859464474e-05,
      "loss": 0.4004,
      "num_input_tokens_seen": 21876496,
      "step": 1120
    },
    {
      "epoch": 1.281601824141371,
      "grad_norm": 6.509387493133545,
      "learning_rate": 4.9774773271173494e-05,
      "loss": 0.3702,
      "num_input_tokens_seen": 21974256,
      "step": 1125
    },
    {
      "epoch": 1.2873022659256093,
      "grad_norm": 10.99763298034668,
      "learning_rate": 4.977276985192624e-05,
      "loss": 0.3921,
      "num_input_tokens_seen": 22071952,
      "step": 1130
    },
    {
      "epoch": 1.2930027077098476,
      "grad_norm": 5.566549777984619,
      "learning_rate": 4.977075760243686e-05,
      "loss": 0.4117,
      "num_input_tokens_seen": 22169696,
      "step": 1135
    },
    {
      "epoch": 1.2987031494940857,
      "grad_norm": 8.485737800598145,
      "learning_rate": 4.976873652342259e-05,
      "loss": 0.394,
      "num_input_tokens_seen": 22267456,
      "step": 1140
    },
    {
      "epoch": 1.304403591278324,
      "grad_norm": 7.3099284172058105,
      "learning_rate": 4.976670661560386e-05,
      "loss": 0.2883,
      "num_input_tokens_seen": 22365120,
      "step": 1145
    },
    {
      "epoch": 1.3101040330625624,
      "grad_norm": 6.294934272766113,
      "learning_rate": 4.976466787970423e-05,
      "loss": 0.3503,
      "num_input_tokens_seen": 22462880,
      "step": 1150
    },
    {
      "epoch": 1.3158044748468005,
      "grad_norm": 5.884027004241943,
      "learning_rate": 4.97626203164504e-05,
      "loss": 0.3098,
      "num_input_tokens_seen": 22560640,
      "step": 1155
    },
    {
      "epoch": 1.3215049166310389,
      "grad_norm": 7.804978847503662,
      "learning_rate": 4.9760563926572226e-05,
      "loss": 0.3423,
      "num_input_tokens_seen": 22658368,
      "step": 1160
    },
    {
      "epoch": 1.3272053584152772,
      "grad_norm": 7.155725002288818,
      "learning_rate": 4.97584987108027e-05,
      "loss": 0.3006,
      "num_input_tokens_seen": 22756176,
      "step": 1165
    },
    {
      "epoch": 1.3329058001995155,
      "grad_norm": 6.071112632751465,
      "learning_rate": 4.975642466987799e-05,
      "loss": 0.3357,
      "num_input_tokens_seen": 22853920,
      "step": 1170
    },
    {
      "epoch": 1.3386062419837537,
      "grad_norm": 5.568732738494873,
      "learning_rate": 4.9754341804537356e-05,
      "loss": 0.3445,
      "num_input_tokens_seen": 22951664,
      "step": 1175
    },
    {
      "epoch": 1.344306683767992,
      "grad_norm": 9.902073860168457,
      "learning_rate": 4.975225011552326e-05,
      "loss": 0.3621,
      "num_input_tokens_seen": 23049520,
      "step": 1180
    },
    {
      "epoch": 1.3500071255522303,
      "grad_norm": 5.503910064697266,
      "learning_rate": 4.975014960358126e-05,
      "loss": 0.3229,
      "num_input_tokens_seen": 23147280,
      "step": 1185
    },
    {
      "epoch": 1.3557075673364687,
      "grad_norm": 7.572802543640137,
      "learning_rate": 4.974804026946011e-05,
      "loss": 0.5356,
      "num_input_tokens_seen": 23245008,
      "step": 1190
    },
    {
      "epoch": 1.3614080091207068,
      "grad_norm": 7.335933208465576,
      "learning_rate": 4.9745922113911655e-05,
      "loss": 0.364,
      "num_input_tokens_seen": 23342768,
      "step": 1195
    },
    {
      "epoch": 1.3671084509049451,
      "grad_norm": 10.062085151672363,
      "learning_rate": 4.974379513769093e-05,
      "loss": 0.384,
      "num_input_tokens_seen": 23440480,
      "step": 1200
    },
    {
      "epoch": 1.3728088926891835,
      "grad_norm": 10.50871467590332,
      "learning_rate": 4.974165934155608e-05,
      "loss": 0.357,
      "num_input_tokens_seen": 23538192,
      "step": 1205
    },
    {
      "epoch": 1.3785093344734216,
      "grad_norm": 6.676635265350342,
      "learning_rate": 4.9739514726268416e-05,
      "loss": 0.316,
      "num_input_tokens_seen": 23635984,
      "step": 1210
    },
    {
      "epoch": 1.38420977625766,
      "grad_norm": 9.456487655639648,
      "learning_rate": 4.973736129259239e-05,
      "loss": 0.3407,
      "num_input_tokens_seen": 23733744,
      "step": 1215
    },
    {
      "epoch": 1.3899102180418983,
      "grad_norm": 7.790709495544434,
      "learning_rate": 4.9735199041295575e-05,
      "loss": 0.422,
      "num_input_tokens_seen": 23831440,
      "step": 1220
    },
    {
      "epoch": 1.3956106598261364,
      "grad_norm": 9.099756240844727,
      "learning_rate": 4.9733027973148727e-05,
      "loss": 0.4655,
      "num_input_tokens_seen": 23929184,
      "step": 1225
    },
    {
      "epoch": 1.4013111016103748,
      "grad_norm": 6.994203567504883,
      "learning_rate": 4.9730848088925706e-05,
      "loss": 0.388,
      "num_input_tokens_seen": 24026928,
      "step": 1230
    },
    {
      "epoch": 1.407011543394613,
      "grad_norm": 7.203540325164795,
      "learning_rate": 4.9728659389403535e-05,
      "loss": 0.4004,
      "num_input_tokens_seen": 24124688,
      "step": 1235
    },
    {
      "epoch": 1.4127119851788514,
      "grad_norm": 7.220198631286621,
      "learning_rate": 4.9726461875362377e-05,
      "loss": 0.3321,
      "num_input_tokens_seen": 24222416,
      "step": 1240
    },
    {
      "epoch": 1.4184124269630896,
      "grad_norm": 6.651162147521973,
      "learning_rate": 4.9724255547585534e-05,
      "loss": 0.2864,
      "num_input_tokens_seen": 24320096,
      "step": 1245
    },
    {
      "epoch": 1.424112868747328,
      "grad_norm": 7.986251354217529,
      "learning_rate": 4.9722040406859454e-05,
      "loss": 0.3401,
      "num_input_tokens_seen": 24417712,
      "step": 1250
    },
    {
      "epoch": 1.4298133105315662,
      "grad_norm": 6.927532196044922,
      "learning_rate": 4.971981645397371e-05,
      "loss": 0.344,
      "num_input_tokens_seen": 24515456,
      "step": 1255
    },
    {
      "epoch": 1.4355137523158046,
      "grad_norm": 8.963294982910156,
      "learning_rate": 4.9717583689721046e-05,
      "loss": 0.3394,
      "num_input_tokens_seen": 24613232,
      "step": 1260
    },
    {
      "epoch": 1.4412141941000427,
      "grad_norm": 9.106192588806152,
      "learning_rate": 4.9715342114897325e-05,
      "loss": 0.4323,
      "num_input_tokens_seen": 24710960,
      "step": 1265
    },
    {
      "epoch": 1.446914635884281,
      "grad_norm": 8.095370292663574,
      "learning_rate": 4.971309173030154e-05,
      "loss": 0.3961,
      "num_input_tokens_seen": 24808560,
      "step": 1270
    },
    {
      "epoch": 1.4526150776685194,
      "grad_norm": 7.318641662597656,
      "learning_rate": 4.9710832536735864e-05,
      "loss": 0.2917,
      "num_input_tokens_seen": 24906320,
      "step": 1275
    },
    {
      "epoch": 1.4583155194527575,
      "grad_norm": 7.140157699584961,
      "learning_rate": 4.970856453500557e-05,
      "loss": 0.3622,
      "num_input_tokens_seen": 25004016,
      "step": 1280
    },
    {
      "epoch": 1.4640159612369958,
      "grad_norm": 8.635784149169922,
      "learning_rate": 4.970628772591909e-05,
      "loss": 0.4472,
      "num_input_tokens_seen": 25101808,
      "step": 1285
    },
    {
      "epoch": 1.4697164030212342,
      "grad_norm": 9.65007495880127,
      "learning_rate": 4.970400211028798e-05,
      "loss": 0.3185,
      "num_input_tokens_seen": 25199568,
      "step": 1290
    },
    {
      "epoch": 1.4754168448054723,
      "grad_norm": 6.725943088531494,
      "learning_rate": 4.970170768892697e-05,
      "loss": 0.4232,
      "num_input_tokens_seen": 25297296,
      "step": 1295
    },
    {
      "epoch": 1.4811172865897106,
      "grad_norm": 7.719542980194092,
      "learning_rate": 4.9699404462653887e-05,
      "loss": 0.3133,
      "num_input_tokens_seen": 25395056,
      "step": 1300
    },
    {
      "epoch": 1.486817728373949,
      "grad_norm": 7.277915954589844,
      "learning_rate": 4.969709243228972e-05,
      "loss": 0.3103,
      "num_input_tokens_seen": 25492784,
      "step": 1305
    },
    {
      "epoch": 1.4925181701581873,
      "grad_norm": 6.372420310974121,
      "learning_rate": 4.96947715986586e-05,
      "loss": 0.3191,
      "num_input_tokens_seen": 25590528,
      "step": 1310
    },
    {
      "epoch": 1.4982186119424257,
      "grad_norm": 5.824290752410889,
      "learning_rate": 4.969244196258777e-05,
      "loss": 0.2663,
      "num_input_tokens_seen": 25688304,
      "step": 1315
    },
    {
      "epoch": 1.5039190537266638,
      "grad_norm": 5.841516971588135,
      "learning_rate": 4.969010352490764e-05,
      "loss": 0.3178,
      "num_input_tokens_seen": 25786096,
      "step": 1320
    },
    {
      "epoch": 1.5096194955109021,
      "grad_norm": 7.10455846786499,
      "learning_rate": 4.968775628645174e-05,
      "loss": 0.4365,
      "num_input_tokens_seen": 25883776,
      "step": 1325
    },
    {
      "epoch": 1.5153199372951405,
      "grad_norm": 6.748999118804932,
      "learning_rate": 4.9685400248056747e-05,
      "loss": 0.2147,
      "num_input_tokens_seen": 25981552,
      "step": 1330
    },
    {
      "epoch": 1.5210203790793786,
      "grad_norm": 4.732318878173828,
      "learning_rate": 4.968303541056246e-05,
      "loss": 0.3367,
      "num_input_tokens_seen": 26079312,
      "step": 1335
    },
    {
      "epoch": 1.526720820863617,
      "grad_norm": 4.386244297027588,
      "learning_rate": 4.9680661774811835e-05,
      "loss": 0.3207,
      "num_input_tokens_seen": 26177136,
      "step": 1340
    },
    {
      "epoch": 1.5324212626478553,
      "grad_norm": 9.144301414489746,
      "learning_rate": 4.967827934165095e-05,
      "loss": 0.2718,
      "num_input_tokens_seen": 26274944,
      "step": 1345
    },
    {
      "epoch": 1.5381217044320934,
      "grad_norm": 10.099781036376953,
      "learning_rate": 4.967588811192902e-05,
      "loss": 0.3752,
      "num_input_tokens_seen": 26372768,
      "step": 1350
    },
    {
      "epoch": 1.5438221462163317,
      "grad_norm": 7.383282661437988,
      "learning_rate": 4.96734880864984e-05,
      "loss": 0.2743,
      "num_input_tokens_seen": 26470608,
      "step": 1355
    },
    {
      "epoch": 1.54952258800057,
      "grad_norm": 8.287309646606445,
      "learning_rate": 4.967107926621457e-05,
      "loss": 0.2853,
      "num_input_tokens_seen": 26568368,
      "step": 1360
    },
    {
      "epoch": 1.5552230297848082,
      "grad_norm": 9.22373104095459,
      "learning_rate": 4.966866165193617e-05,
      "loss": 0.2913,
      "num_input_tokens_seen": 26666080,
      "step": 1365
    },
    {
      "epoch": 1.5609234715690468,
      "grad_norm": 6.172590255737305,
      "learning_rate": 4.966623524452494e-05,
      "loss": 0.2775,
      "num_input_tokens_seen": 26763792,
      "step": 1370
    },
    {
      "epoch": 1.5666239133532849,
      "grad_norm": 7.464315891265869,
      "learning_rate": 4.9663800044845784e-05,
      "loss": 0.3685,
      "num_input_tokens_seen": 26861488,
      "step": 1375
    },
    {
      "epoch": 1.572324355137523,
      "grad_norm": 7.123498916625977,
      "learning_rate": 4.9661356053766716e-05,
      "loss": 0.3636,
      "num_input_tokens_seen": 26959232,
      "step": 1380
    },
    {
      "epoch": 1.5780247969217616,
      "grad_norm": 7.7689619064331055,
      "learning_rate": 4.965890327215891e-05,
      "loss": 0.3052,
      "num_input_tokens_seen": 27057040,
      "step": 1385
    },
    {
      "epoch": 1.5837252387059997,
      "grad_norm": 5.988576412200928,
      "learning_rate": 4.965644170089665e-05,
      "loss": 0.3355,
      "num_input_tokens_seen": 27154768,
      "step": 1390
    },
    {
      "epoch": 1.589425680490238,
      "grad_norm": 9.67150592803955,
      "learning_rate": 4.965397134085735e-05,
      "loss": 0.3239,
      "num_input_tokens_seen": 27252480,
      "step": 1395
    },
    {
      "epoch": 1.5951261222744764,
      "grad_norm": 9.156477928161621,
      "learning_rate": 4.96514921929216e-05,
      "loss": 0.3421,
      "num_input_tokens_seen": 27350320,
      "step": 1400
    },
    {
      "epoch": 1.6008265640587145,
      "grad_norm": 8.114056587219238,
      "learning_rate": 4.964900425797306e-05,
      "loss": 0.405,
      "num_input_tokens_seen": 27448128,
      "step": 1405
    },
    {
      "epoch": 1.6065270058429528,
      "grad_norm": 8.292765617370605,
      "learning_rate": 4.9646507536898575e-05,
      "loss": 0.2936,
      "num_input_tokens_seen": 27545808,
      "step": 1410
    },
    {
      "epoch": 1.6122274476271912,
      "grad_norm": 6.832367420196533,
      "learning_rate": 4.964400203058809e-05,
      "loss": 0.2365,
      "num_input_tokens_seen": 27643456,
      "step": 1415
    },
    {
      "epoch": 1.6179278894114293,
      "grad_norm": 8.706232070922852,
      "learning_rate": 4.9641487739934684e-05,
      "loss": 0.3065,
      "num_input_tokens_seen": 27741168,
      "step": 1420
    },
    {
      "epoch": 1.6236283311956676,
      "grad_norm": 7.025177478790283,
      "learning_rate": 4.963896466583459e-05,
      "loss": 0.2376,
      "num_input_tokens_seen": 27838912,
      "step": 1425
    },
    {
      "epoch": 1.629328772979906,
      "grad_norm": 9.140198707580566,
      "learning_rate": 4.963643280918714e-05,
      "loss": 0.2518,
      "num_input_tokens_seen": 27936592,
      "step": 1430
    },
    {
      "epoch": 1.635029214764144,
      "grad_norm": 8.371499061584473,
      "learning_rate": 4.963389217089484e-05,
      "loss": 0.3488,
      "num_input_tokens_seen": 28034304,
      "step": 1435
    },
    {
      "epoch": 1.6407296565483827,
      "grad_norm": 4.390028476715088,
      "learning_rate": 4.963134275186327e-05,
      "loss": 0.2444,
      "num_input_tokens_seen": 28131984,
      "step": 1440
    },
    {
      "epoch": 1.6464300983326208,
      "grad_norm": 10.252416610717773,
      "learning_rate": 4.9628784553001185e-05,
      "loss": 0.3859,
      "num_input_tokens_seen": 28229680,
      "step": 1445
    },
    {
      "epoch": 1.652130540116859,
      "grad_norm": 8.285685539245605,
      "learning_rate": 4.962621757522044e-05,
      "loss": 0.3006,
      "num_input_tokens_seen": 28327440,
      "step": 1450
    },
    {
      "epoch": 1.6578309819010975,
      "grad_norm": 8.214794158935547,
      "learning_rate": 4.962364181943606e-05,
      "loss": 0.2718,
      "num_input_tokens_seen": 28425216,
      "step": 1455
    },
    {
      "epoch": 1.6635314236853356,
      "grad_norm": 12.243093490600586,
      "learning_rate": 4.9621057286566155e-05,
      "loss": 0.3569,
      "num_input_tokens_seen": 28522992,
      "step": 1460
    },
    {
      "epoch": 1.669231865469574,
      "grad_norm": 5.961976528167725,
      "learning_rate": 4.961846397753197e-05,
      "loss": 0.2414,
      "num_input_tokens_seen": 28620720,
      "step": 1465
    },
    {
      "epoch": 1.6749323072538123,
      "grad_norm": 7.293554782867432,
      "learning_rate": 4.961586189325791e-05,
      "loss": 0.2259,
      "num_input_tokens_seen": 28718464,
      "step": 1470
    },
    {
      "epoch": 1.6806327490380504,
      "grad_norm": 7.203713893890381,
      "learning_rate": 4.9613251034671465e-05,
      "loss": 0.2356,
      "num_input_tokens_seen": 28816368,
      "step": 1475
    },
    {
      "epoch": 1.6863331908222887,
      "grad_norm": 6.28717565536499,
      "learning_rate": 4.961063140270329e-05,
      "loss": 0.3129,
      "num_input_tokens_seen": 28914080,
      "step": 1480
    },
    {
      "epoch": 1.692033632606527,
      "grad_norm": 5.234409809112549,
      "learning_rate": 4.960800299828715e-05,
      "loss": 0.2614,
      "num_input_tokens_seen": 29011808,
      "step": 1485
    },
    {
      "epoch": 1.6977340743907652,
      "grad_norm": 7.488391399383545,
      "learning_rate": 4.960536582235993e-05,
      "loss": 0.2573,
      "num_input_tokens_seen": 29109488,
      "step": 1490
    },
    {
      "epoch": 1.7034345161750035,
      "grad_norm": 7.9980292320251465,
      "learning_rate": 4.960271987586166e-05,
      "loss": 0.2409,
      "num_input_tokens_seen": 29207232,
      "step": 1495
    },
    {
      "epoch": 1.7091349579592419,
|
|
"grad_norm": 4.908660888671875,
|
|
"learning_rate": 4.960006515973548e-05,
|
|
"loss": 0.2969,
|
|
"num_input_tokens_seen": 29304960,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.71483539974348,
|
|
"grad_norm": 7.019242763519287,
|
|
"learning_rate": 4.959740167492767e-05,
|
|
"loss": 0.2576,
|
|
"num_input_tokens_seen": 29402720,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 1.7205358415277185,
|
|
"grad_norm": 5.726184844970703,
|
|
"learning_rate": 4.959472942238762e-05,
|
|
"loss": 0.2731,
|
|
"num_input_tokens_seen": 29500480,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 1.7262362833119567,
|
|
"grad_norm": 7.595127105712891,
|
|
"learning_rate": 4.9592048403067845e-05,
|
|
"loss": 0.3502,
|
|
"num_input_tokens_seen": 29598240,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 1.731936725096195,
|
|
"grad_norm": 5.473161697387695,
|
|
"learning_rate": 4.958935861792402e-05,
|
|
"loss": 0.3446,
|
|
"num_input_tokens_seen": 29695952,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.7376371668804333,
|
|
"grad_norm": 5.171034812927246,
|
|
"learning_rate": 4.958666006791489e-05,
|
|
"loss": 0.328,
|
|
"num_input_tokens_seen": 29793696,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 1.7433376086646715,
|
|
"grad_norm": 7.3467559814453125,
|
|
"learning_rate": 4.958395275400237e-05,
|
|
"loss": 0.2313,
|
|
"num_input_tokens_seen": 29891456,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 1.7490380504489098,
|
|
"grad_norm": 7.6113481521606445,
|
|
"learning_rate": 4.958123667715147e-05,
|
|
"loss": 0.3182,
|
|
"num_input_tokens_seen": 29989280,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 1.7547384922331482,
|
|
"grad_norm": 4.429864883422852,
|
|
"learning_rate": 4.957851183833034e-05,
|
|
"loss": 0.2573,
|
|
"num_input_tokens_seen": 30087104,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.7604389340173863,
|
|
"grad_norm": 7.2398200035095215,
|
|
"learning_rate": 4.957577823851024e-05,
|
|
"loss": 0.3694,
|
|
"num_input_tokens_seen": 30184768,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 1.7661393758016246,
|
|
"grad_norm": 5.151268005371094,
|
|
"learning_rate": 4.957303587866557e-05,
|
|
"loss": 0.1916,
|
|
"num_input_tokens_seen": 30282496,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 1.771839817585863,
|
|
"grad_norm": 3.622302293777466,
|
|
"learning_rate": 4.957028475977384e-05,
|
|
"loss": 0.2405,
|
|
"num_input_tokens_seen": 30380288,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 1.777540259370101,
|
|
"grad_norm": 6.740144729614258,
|
|
"learning_rate": 4.9567524882815686e-05,
|
|
"loss": 0.2632,
|
|
"num_input_tokens_seen": 30478048,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.7832407011543394,
|
|
"grad_norm": 8.049473762512207,
|
|
"learning_rate": 4.956475624877486e-05,
|
|
"loss": 0.4007,
|
|
"num_input_tokens_seen": 30575728,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 1.7889411429385778,
|
|
"grad_norm": 6.096712589263916,
|
|
"learning_rate": 4.9561978858638245e-05,
|
|
"loss": 0.3395,
|
|
"num_input_tokens_seen": 30673488,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 1.7946415847228159,
|
|
"grad_norm": 5.549604415893555,
|
|
"learning_rate": 4.955919271339584e-05,
|
|
"loss": 0.2917,
|
|
"num_input_tokens_seen": 30771120,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 1.8003420265070544,
|
|
"grad_norm": 6.270097732543945,
|
|
"learning_rate": 4.9556397814040754e-05,
|
|
"loss": 0.1805,
|
|
"num_input_tokens_seen": 30868848,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.8060424682912926,
|
|
"grad_norm": 5.557713985443115,
|
|
"learning_rate": 4.955359416156925e-05,
|
|
"loss": 0.2391,
|
|
"num_input_tokens_seen": 30966576,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 1.811742910075531,
|
|
"grad_norm": 5.296855926513672,
|
|
"learning_rate": 4.955078175698067e-05,
|
|
"loss": 0.3259,
|
|
"num_input_tokens_seen": 31064320,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 1.8174433518597692,
|
|
"grad_norm": 5.627151966094971,
|
|
"learning_rate": 4.9547960601277496e-05,
|
|
"loss": 0.2576,
|
|
"num_input_tokens_seen": 31162048,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 1.8231437936440074,
|
|
"grad_norm": 6.599062919616699,
|
|
"learning_rate": 4.9545130695465336e-05,
|
|
"loss": 0.2859,
|
|
"num_input_tokens_seen": 31259840,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.8288442354282457,
|
|
"grad_norm": 8.176803588867188,
|
|
"learning_rate": 4.954229204055291e-05,
|
|
"loss": 0.1917,
|
|
"num_input_tokens_seen": 31357568,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 1.834544677212484,
|
|
"grad_norm": 9.62484073638916,
|
|
"learning_rate": 4.953944463755204e-05,
|
|
"loss": 0.3755,
|
|
"num_input_tokens_seen": 31455344,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 1.8402451189967222,
|
|
"grad_norm": 6.320862293243408,
|
|
"learning_rate": 4.9536588487477697e-05,
|
|
"loss": 0.2781,
|
|
"num_input_tokens_seen": 31553024,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 1.8459455607809605,
|
|
"grad_norm": 4.023454189300537,
|
|
"learning_rate": 4.953372359134795e-05,
|
|
"loss": 0.2669,
|
|
"num_input_tokens_seen": 31650848,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.8516460025651988,
|
|
"grad_norm": 5.63836145401001,
|
|
"learning_rate": 4.953084995018398e-05,
|
|
"loss": 0.2577,
|
|
"num_input_tokens_seen": 31748560,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 1.857346444349437,
|
|
"grad_norm": 8.508479118347168,
|
|
"learning_rate": 4.95279675650101e-05,
|
|
"loss": 0.277,
|
|
"num_input_tokens_seen": 31846224,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 1.8630468861336753,
|
|
"grad_norm": 7.421855926513672,
|
|
"learning_rate": 4.952507643685375e-05,
|
|
"loss": 0.2915,
|
|
"num_input_tokens_seen": 31944016,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 1.8687473279179136,
|
|
"grad_norm": 8.737668991088867,
|
|
"learning_rate": 4.952217656674546e-05,
|
|
"loss": 0.2798,
|
|
"num_input_tokens_seen": 32041680,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 1.8744477697021518,
|
|
"grad_norm": 6.379103183746338,
|
|
"learning_rate": 4.951926795571888e-05,
|
|
"loss": 0.2403,
|
|
"num_input_tokens_seen": 32139392,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 1.8801482114863903,
|
|
"grad_norm": 3.9837377071380615,
|
|
"learning_rate": 4.9516350604810793e-05,
|
|
"loss": 0.1932,
|
|
"num_input_tokens_seen": 32237184,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 1.8858486532706284,
|
|
"grad_norm": 6.174622535705566,
|
|
"learning_rate": 4.951342451506108e-05,
|
|
"loss": 0.2904,
|
|
"num_input_tokens_seen": 32334816,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 1.8915490950548668,
|
|
"grad_norm": 5.5978899002075195,
|
|
"learning_rate": 4.951048968751275e-05,
|
|
"loss": 0.2017,
|
|
"num_input_tokens_seen": 32432528,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 1.8972495368391051,
|
|
"grad_norm": 6.84478759765625,
|
|
"learning_rate": 4.9507546123211926e-05,
|
|
"loss": 0.2464,
|
|
"num_input_tokens_seen": 32530320,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 1.9029499786233433,
|
|
"grad_norm": 4.2474799156188965,
|
|
"learning_rate": 4.950459382320782e-05,
|
|
"loss": 0.1859,
|
|
"num_input_tokens_seen": 32628016,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 1.9086504204075816,
|
|
"grad_norm": 7.542076587677002,
|
|
"learning_rate": 4.9501632788552805e-05,
|
|
"loss": 0.2051,
|
|
"num_input_tokens_seen": 32725744,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 1.91435086219182,
|
|
"grad_norm": 8.932976722717285,
|
|
"learning_rate": 4.949866302030232e-05,
|
|
"loss": 0.3001,
|
|
"num_input_tokens_seen": 32823424,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 1.920051303976058,
|
|
"grad_norm": 8.958706855773926,
|
|
"learning_rate": 4.949568451951495e-05,
|
|
"loss": 0.4515,
|
|
"num_input_tokens_seen": 32921120,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 1.9257517457602964,
|
|
"grad_norm": 8.6969633102417,
|
|
"learning_rate": 4.9492697287252365e-05,
|
|
"loss": 0.2328,
|
|
"num_input_tokens_seen": 33018880,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 1.9314521875445347,
|
|
"grad_norm": 5.649287223815918,
|
|
"learning_rate": 4.948970132457938e-05,
|
|
"loss": 0.2487,
|
|
"num_input_tokens_seen": 33116656,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 1.9371526293287729,
|
|
"grad_norm": 7.401125431060791,
|
|
"learning_rate": 4.94866966325639e-05,
|
|
"loss": 0.2839,
|
|
"num_input_tokens_seen": 33214416,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 1.9428530711130114,
|
|
"grad_norm": 8.189993858337402,
|
|
"learning_rate": 4.9483683212276935e-05,
|
|
"loss": 0.1811,
|
|
"num_input_tokens_seen": 33312096,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 1.9485535128972495,
|
|
"grad_norm": 8.221793174743652,
|
|
"learning_rate": 4.948066106479262e-05,
|
|
"loss": 0.2459,
|
|
"num_input_tokens_seen": 33409792,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 1.9542539546814877,
|
|
"grad_norm": 5.853153705596924,
|
|
"learning_rate": 4.947763019118821e-05,
|
|
"loss": 0.3363,
|
|
"num_input_tokens_seen": 33507504,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 1.9599543964657262,
|
|
"grad_norm": 7.756158351898193,
|
|
"learning_rate": 4.947459059254405e-05,
|
|
"loss": 0.2134,
|
|
"num_input_tokens_seen": 33605136,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 1.9656548382499643,
|
|
"grad_norm": 6.2845025062561035,
|
|
"learning_rate": 4.9471542269943604e-05,
|
|
"loss": 0.2498,
|
|
"num_input_tokens_seen": 33702928,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 1.9713552800342027,
|
|
"grad_norm": 2.1698145866394043,
|
|
"learning_rate": 4.946848522447345e-05,
|
|
"loss": 0.1366,
|
|
"num_input_tokens_seen": 33800656,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 1.977055721818441,
|
|
"grad_norm": 8.690340995788574,
|
|
"learning_rate": 4.946541945722326e-05,
|
|
"loss": 0.3645,
|
|
"num_input_tokens_seen": 33898336,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 1.9827561636026791,
|
|
"grad_norm": 7.308428764343262,
|
|
"learning_rate": 4.946234496928583e-05,
|
|
"loss": 0.1994,
|
|
"num_input_tokens_seen": 33996096,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 1.9884566053869175,
|
|
"grad_norm": 5.161618709564209,
|
|
"learning_rate": 4.945926176175707e-05,
|
|
"loss": 0.226,
|
|
"num_input_tokens_seen": 34093792,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 1.9941570471711558,
|
|
"grad_norm": 9.512948989868164,
|
|
"learning_rate": 4.945616983573598e-05,
|
|
"loss": 0.2135,
|
|
"num_input_tokens_seen": 34191552,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 1.999857488955394,
|
|
"grad_norm": 8.386415481567383,
|
|
"learning_rate": 4.945306919232467e-05,
|
|
"loss": 0.236,
|
|
"num_input_tokens_seen": 34289248,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 2.0045603534273906,
|
|
"grad_norm": 7.716324329376221,
|
|
"learning_rate": 4.944995983262837e-05,
|
|
"loss": 0.3453,
|
|
"num_input_tokens_seen": 34369840,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 2.0102607952116287,
|
|
"grad_norm": 3.110274076461792,
|
|
"learning_rate": 4.9446841757755405e-05,
|
|
"loss": 0.1964,
|
|
"num_input_tokens_seen": 34467568,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 2.0159612369958673,
|
|
"grad_norm": 6.07765007019043,
|
|
"learning_rate": 4.944371496881721e-05,
|
|
"loss": 0.2358,
|
|
"num_input_tokens_seen": 34565248,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 2.0216616787801054,
|
|
"grad_norm": 5.835047721862793,
|
|
"learning_rate": 4.944057946692834e-05,
|
|
"loss": 0.1317,
|
|
"num_input_tokens_seen": 34662896,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 2.027362120564344,
|
|
"grad_norm": 6.831048488616943,
|
|
"learning_rate": 4.943743525320643e-05,
|
|
"loss": 0.2355,
|
|
"num_input_tokens_seen": 34760624,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 2.033062562348582,
|
|
"grad_norm": 11.599326133728027,
|
|
"learning_rate": 4.943428232877224e-05,
|
|
"loss": 0.1869,
|
|
"num_input_tokens_seen": 34858288,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 2.03876300413282,
|
|
"grad_norm": 6.280679702758789,
|
|
"learning_rate": 4.943112069474963e-05,
|
|
"loss": 0.2707,
|
|
"num_input_tokens_seen": 34955968,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 2.0444634459170588,
|
|
"grad_norm": 7.326327800750732,
|
|
"learning_rate": 4.942795035226555e-05,
|
|
"loss": 0.2077,
|
|
"num_input_tokens_seen": 35053744,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 2.050163887701297,
|
|
"grad_norm": 4.219114303588867,
|
|
"learning_rate": 4.9424771302450084e-05,
|
|
"loss": 0.1575,
|
|
"num_input_tokens_seen": 35151408,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 2.055864329485535,
|
|
"grad_norm": 4.75279426574707,
|
|
"learning_rate": 4.942158354643639e-05,
|
|
"loss": 0.1663,
|
|
"num_input_tokens_seen": 35249168,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 2.0615647712697736,
|
|
"grad_norm": 7.397295951843262,
|
|
"learning_rate": 4.9418387085360754e-05,
|
|
"loss": 0.1872,
|
|
"num_input_tokens_seen": 35346880,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 2.0672652130540117,
|
|
"grad_norm": 7.649684906005859,
|
|
"learning_rate": 4.941518192036254e-05,
|
|
"loss": 0.2212,
|
|
"num_input_tokens_seen": 35444688,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 2.07296565483825,
|
|
"grad_norm": 3.966686725616455,
|
|
"learning_rate": 4.941196805258423e-05,
|
|
"loss": 0.1185,
|
|
"num_input_tokens_seen": 35542416,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 2.0786660966224884,
|
|
"grad_norm": 10.044607162475586,
|
|
"learning_rate": 4.940874548317143e-05,
|
|
"loss": 0.2099,
|
|
"num_input_tokens_seen": 35640128,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 2.0843665384067265,
|
|
"grad_norm": 5.567621231079102,
|
|
"learning_rate": 4.9405514213272784e-05,
|
|
"loss": 0.172,
|
|
"num_input_tokens_seen": 35737872,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 2.0900669801909646,
|
|
"grad_norm": 6.253763675689697,
|
|
"learning_rate": 4.94022742440401e-05,
|
|
"loss": 0.1485,
|
|
"num_input_tokens_seen": 35835568,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 2.095767421975203,
|
|
"grad_norm": 5.565789222717285,
|
|
"learning_rate": 4.939902557662826e-05,
|
|
"loss": 0.2586,
|
|
"num_input_tokens_seen": 35933312,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 2.1014678637594413,
|
|
"grad_norm": 4.763193607330322,
|
|
"learning_rate": 4.939576821219525e-05,
|
|
"loss": 0.2357,
|
|
"num_input_tokens_seen": 36030944,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 2.10716830554368,
|
|
"grad_norm": 11.802780151367188,
|
|
"learning_rate": 4.9392502151902156e-05,
|
|
"loss": 0.2471,
|
|
"num_input_tokens_seen": 36128688,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 2.112868747327918,
|
|
"grad_norm": 6.20152473449707,
|
|
"learning_rate": 4.938922739691316e-05,
|
|
"loss": 0.1398,
|
|
"num_input_tokens_seen": 36226368,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 2.118569189112156,
|
|
"grad_norm": 6.946401119232178,
|
|
"learning_rate": 4.938594394839555e-05,
|
|
"loss": 0.1601,
|
|
"num_input_tokens_seen": 36324096,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 2.1242696308963946,
|
|
"grad_norm": 4.598988056182861,
|
|
"learning_rate": 4.938265180751971e-05,
|
|
"loss": 0.1461,
|
|
"num_input_tokens_seen": 36421840,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 2.1299700726806328,
|
|
"grad_norm": 8.171868324279785,
|
|
"learning_rate": 4.937935097545912e-05,
|
|
"loss": 0.2531,
|
|
"num_input_tokens_seen": 36519552,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 2.135670514464871,
|
|
"grad_norm": 3.664236068725586,
|
|
"learning_rate": 4.9376041453390365e-05,
|
|
"loss": 0.1934,
|
|
"num_input_tokens_seen": 36617280,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 2.1413709562491094,
|
|
"grad_norm": 8.51440143585205,
|
|
"learning_rate": 4.937272324249312e-05,
|
|
"loss": 0.2024,
|
|
"num_input_tokens_seen": 36714992,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.1470713980333476,
|
|
"grad_norm": 6.9583001136779785,
|
|
"learning_rate": 4.9369396343950154e-05,
|
|
"loss": 0.2121,
|
|
"num_input_tokens_seen": 36812784,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 2.1527718398175857,
|
|
"grad_norm": 10.295254707336426,
|
|
"learning_rate": 4.936606075894734e-05,
|
|
"loss": 0.172,
|
|
"num_input_tokens_seen": 36910688,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 2.1584722816018242,
|
|
"grad_norm": 3.664754629135132,
|
|
"learning_rate": 4.9362716488673654e-05,
|
|
"loss": 0.163,
|
|
"num_input_tokens_seen": 37008464,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 2.1641727233860624,
|
|
"grad_norm": 9.738370895385742,
|
|
"learning_rate": 4.9359363534321156e-05,
|
|
"loss": 0.1591,
|
|
"num_input_tokens_seen": 37106272,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.1698731651703005,
|
|
"grad_norm": 6.579972743988037,
|
|
"learning_rate": 4.9356001897085e-05,
|
|
"loss": 0.1816,
|
|
"num_input_tokens_seen": 37204048,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 2.175573606954539,
|
|
"grad_norm": 6.322074890136719,
|
|
"learning_rate": 4.935263157816345e-05,
|
|
"loss": 0.183,
|
|
"num_input_tokens_seen": 37301824,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 2.181274048738777,
|
|
"grad_norm": 5.997386455535889,
|
|
"learning_rate": 4.934925257875784e-05,
|
|
"loss": 0.1722,
|
|
"num_input_tokens_seen": 37399632,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 2.1869744905230157,
|
|
"grad_norm": 7.246592998504639,
|
|
"learning_rate": 4.9345864900072625e-05,
|
|
"loss": 0.1017,
|
|
"num_input_tokens_seen": 37497296,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.192674932307254,
|
|
"grad_norm": 7.348723888397217,
|
|
"learning_rate": 4.934246854331534e-05,
|
|
"loss": 0.1756,
|
|
"num_input_tokens_seen": 37595168,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 2.198375374091492,
|
|
"grad_norm": 8.612208366394043,
|
|
"learning_rate": 4.933906350969661e-05,
|
|
"loss": 0.1674,
|
|
"num_input_tokens_seen": 37692832,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 2.2040758158757305,
|
|
"grad_norm": 16.743350982666016,
|
|
"learning_rate": 4.933564980043015e-05,
|
|
"loss": 0.2679,
|
|
"num_input_tokens_seen": 37790512,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 2.2097762576599687,
|
|
"grad_norm": 2.924848794937134,
|
|
"learning_rate": 4.93322274167328e-05,
|
|
"loss": 0.0981,
|
|
"num_input_tokens_seen": 37888304,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.2154766994442068,
|
|
"grad_norm": 8.15044116973877,
|
|
"learning_rate": 4.9328796359824445e-05,
|
|
"loss": 0.1621,
|
|
"num_input_tokens_seen": 37986032,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 2.2211771412284453,
|
|
"grad_norm": 9.112587928771973,
|
|
"learning_rate": 4.932535663092809e-05,
|
|
"loss": 0.2655,
|
|
"num_input_tokens_seen": 38083776,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 2.2268775830126835,
|
|
"grad_norm": 4.596200942993164,
|
|
"learning_rate": 4.932190823126982e-05,
|
|
"loss": 0.1608,
|
|
"num_input_tokens_seen": 38181488,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 2.2325780247969216,
|
|
"grad_norm": 5.918450355529785,
|
|
"learning_rate": 4.9318451162078824e-05,
|
|
"loss": 0.1119,
|
|
"num_input_tokens_seen": 38279248,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.23827846658116,
|
|
"grad_norm": 8.088936805725098,
|
|
"learning_rate": 4.931498542458738e-05,
|
|
"loss": 0.2202,
|
|
"num_input_tokens_seen": 38377024,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 2.2439789083653983,
|
|
"grad_norm": 6.410324573516846,
|
|
"learning_rate": 4.931151102003082e-05,
|
|
"loss": 0.1136,
|
|
"num_input_tokens_seen": 38474768,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 2.249679350149637,
|
|
"grad_norm": 8.893948554992676,
|
|
"learning_rate": 4.930802794964763e-05,
|
|
"loss": 0.1233,
|
|
"num_input_tokens_seen": 38572432,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 2.255379791933875,
|
|
"grad_norm": 2.6239705085754395,
|
|
"learning_rate": 4.9304536214679315e-05,
|
|
"loss": 0.1409,
|
|
"num_input_tokens_seen": 38670112,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.261080233718113,
|
|
"grad_norm": 8.951133728027344,
|
|
"learning_rate": 4.930103581637052e-05,
|
|
"loss": 0.159,
|
|
"num_input_tokens_seen": 38767872,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 2.2667806755023516,
|
|
"grad_norm": 8.211560249328613,
|
|
"learning_rate": 4.929752675596896e-05,
|
|
"loss": 0.1761,
|
|
"num_input_tokens_seen": 38865584,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 2.2724811172865897,
|
|
"grad_norm": 4.264492034912109,
|
|
"learning_rate": 4.929400903472544e-05,
|
|
"loss": 0.1206,
|
|
"num_input_tokens_seen": 38963264,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 2.278181559070828,
|
|
"grad_norm": 10.552608489990234,
|
|
"learning_rate": 4.9290482653893846e-05,
|
|
"loss": 0.1895,
|
|
"num_input_tokens_seen": 39060944,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.2838820008550664,
|
|
"grad_norm": 8.71176528930664,
|
|
"learning_rate": 4.928694761473115e-05,
|
|
"loss": 0.1604,
|
|
"num_input_tokens_seen": 39158640,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 2.2895824426393045,
|
|
"grad_norm": 6.852836608886719,
|
|
"learning_rate": 4.928340391849742e-05,
|
|
"loss": 0.2317,
|
|
"num_input_tokens_seen": 39256352,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 2.2952828844235427,
|
|
"grad_norm": 6.0051493644714355,
|
|
"learning_rate": 4.9279851566455806e-05,
|
|
"loss": 0.1945,
|
|
"num_input_tokens_seen": 39354112,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 2.3009833262077812,
|
|
"grad_norm": 8.092498779296875,
|
|
"learning_rate": 4.927629055987254e-05,
|
|
"loss": 0.1393,
|
|
"num_input_tokens_seen": 39451824,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.3066837679920194,
|
|
"grad_norm": 11.641897201538086,
|
|
"learning_rate": 4.927272090001695e-05,
|
|
"loss": 0.1692,
|
|
"num_input_tokens_seen": 39549600,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 2.3123842097762575,
|
|
"grad_norm": 6.343209266662598,
|
|
"learning_rate": 4.9269142588161424e-05,
|
|
"loss": 0.1058,
|
|
"num_input_tokens_seen": 39647280,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 2.318084651560496,
|
|
"grad_norm": 9.047554969787598,
|
|
"learning_rate": 4.9265555625581464e-05,
|
|
"loss": 0.1835,
|
|
"num_input_tokens_seen": 39745040,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 2.323785093344734,
|
|
"grad_norm": 7.915929317474365,
|
|
"learning_rate": 4.9261960013555625e-05,
|
|
"loss": 0.2291,
|
|
"num_input_tokens_seen": 39842816,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.3294855351289723,
|
|
"grad_norm": 5.318630695343018,
|
|
"learning_rate": 4.925835575336557e-05,
|
|
"loss": 0.1533,
|
|
"num_input_tokens_seen": 39940576,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 2.335185976913211,
|
|
"grad_norm": 7.653360366821289,
|
|
"learning_rate": 4.9254742846296045e-05,
|
|
"loss": 0.1978,
|
|
"num_input_tokens_seen": 40038368,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 2.340886418697449,
|
|
"grad_norm": 7.852567672729492,
|
|
"learning_rate": 4.925112129363486e-05,
|
|
"loss": 0.1531,
|
|
"num_input_tokens_seen": 40136144,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 2.3465868604816875,
|
|
"grad_norm": 6.86559534072876,
|
|
"learning_rate": 4.92474910966729e-05,
|
|
"loss": 0.0854,
|
|
"num_input_tokens_seen": 40233952,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.3522873022659256,
|
|
"grad_norm": 10.074113845825195,
|
|
"learning_rate": 4.9243852256704183e-05,
|
|
"loss": 0.1915,
|
|
"num_input_tokens_seen": 40331696,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 2.3579877440501638,
|
|
"grad_norm": 7.896622180938721,
|
|
"learning_rate": 4.924020477502574e-05,
|
|
"loss": 0.1495,
|
|
"num_input_tokens_seen": 40429360,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 2.3636881858344023,
|
|
"grad_norm": 8.397123336791992,
|
|
"learning_rate": 4.923654865293773e-05,
|
|
"loss": 0.1392,
|
|
"num_input_tokens_seen": 40527136,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 2.3693886276186404,
|
|
"grad_norm": 6.305740833282471,
|
|
"learning_rate": 4.923288389174337e-05,
|
|
"loss": 0.0875,
|
|
"num_input_tokens_seen": 40624912,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.3750890694028786,
|
|
"grad_norm": 8.387357711791992,
|
|
"learning_rate": 4.9229210492748976e-05,
|
|
"loss": 0.2358,
|
|
"num_input_tokens_seen": 40722720,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 2.380789511187117,
|
|
"grad_norm": 8.446219444274902,
|
|
"learning_rate": 4.92255284572639e-05,
|
|
"loss": 0.1482,
|
|
"num_input_tokens_seen": 40820448,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 2.3864899529713552,
|
|
"grad_norm": 6.30406379699707,
|
|
"learning_rate": 4.9221837786600634e-05,
|
|
"loss": 0.1603,
|
|
"num_input_tokens_seen": 40918256,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 2.392190394755594,
|
|
"grad_norm": 7.828269004821777,
|
|
"learning_rate": 4.921813848207469e-05,
|
|
"loss": 0.1764,
|
|
"num_input_tokens_seen": 41015920,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.397890836539832,
|
|
"grad_norm": 7.984679698944092,
|
|
"learning_rate": 4.921443054500471e-05,
|
|
"loss": 0.1809,
|
|
"num_input_tokens_seen": 41113632,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 2.40359127832407,
|
|
"grad_norm": 10.998138427734375,
|
|
"learning_rate": 4.921071397671235e-05,
|
|
"loss": 0.185,
|
|
"num_input_tokens_seen": 41211344,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 2.4092917201083086,
|
|
"grad_norm": 7.500617980957031,
|
|
"learning_rate": 4.9206988778522414e-05,
|
|
"loss": 0.116,
|
|
"num_input_tokens_seen": 41308992,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 2.4149921618925467,
|
|
"grad_norm": 7.988670349121094,
|
|
"learning_rate": 4.9203254951762735e-05,
|
|
"loss": 0.1457,
|
|
"num_input_tokens_seen": 41406752,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.420692603676785,
|
|
"grad_norm": 10.234942436218262,
|
|
"learning_rate": 4.9199512497764226e-05,
|
|
"loss": 0.2256,
|
|
"num_input_tokens_seen": 41504464,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 2.4263930454610234,
|
|
"grad_norm": 11.052645683288574,
|
|
"learning_rate": 4.919576141786089e-05,
|
|
"loss": 0.1721,
|
|
"num_input_tokens_seen": 41602272,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 2.4320934872452615,
|
|
"grad_norm": 3.908461332321167,
|
|
"learning_rate": 4.91920017133898e-05,
|
|
"loss": 0.1676,
|
|
"num_input_tokens_seen": 41700000,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 2.4377939290294997,
|
|
"grad_norm": 3.8486247062683105,
|
|
"learning_rate": 4.9188233385691094e-05,
|
|
"loss": 0.1458,
|
|
"num_input_tokens_seen": 41797696,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.443494370813738,
|
|
"grad_norm": 7.926904678344727,
|
|
"learning_rate": 4.9184456436107994e-05,
|
|
"loss": 0.202,
|
|
"num_input_tokens_seen": 41895392,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 2.4491948125979763,
|
|
"grad_norm": 8.783378601074219,
|
|
"learning_rate": 4.91806708659868e-05,
|
|
"loss": 0.1838,
|
|
"num_input_tokens_seen": 41993120,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 2.4548952543822145,
|
|
"grad_norm": 4.2609357833862305,
|
|
"learning_rate": 4.917687667667686e-05,
|
|
"loss": 0.1037,
|
|
"num_input_tokens_seen": 42090880,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 2.460595696166453,
|
|
"grad_norm": 10.791552543640137,
|
|
"learning_rate": 4.917307386953062e-05,
|
|
"loss": 0.1791,
|
|
"num_input_tokens_seen": 42188576,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.466296137950691,
|
|
"grad_norm": 8.570240020751953,
|
|
"learning_rate": 4.9169262445903595e-05,
|
|
"loss": 0.1608,
|
|
"num_input_tokens_seen": 42286272,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 2.4719965797349293,
|
|
"grad_norm": 5.907083034515381,
|
|
"learning_rate": 4.9165442407154355e-05,
|
|
"loss": 0.1657,
|
|
"num_input_tokens_seen": 42384048,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 2.477697021519168,
|
|
"grad_norm": 9.303824424743652,
|
|
"learning_rate": 4.916161375464455e-05,
|
|
"loss": 0.1839,
|
|
"num_input_tokens_seen": 42481888,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 2.483397463303406,
|
|
"grad_norm": 9.526128768920898,
|
|
"learning_rate": 4.915777648973892e-05,
|
|
"loss": 0.1084,
|
|
"num_input_tokens_seen": 42579600,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.489097905087644,
|
|
"grad_norm": 8.502360343933105,
|
|
"learning_rate": 4.915393061380523e-05,
|
|
"loss": 0.1205,
|
|
"num_input_tokens_seen": 42677360,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 2.4947983468718826,
|
|
"grad_norm": 8.056527137756348,
|
|
"learning_rate": 4.9150076128214364e-05,
|
|
"loss": 0.1244,
|
|
"num_input_tokens_seen": 42775072,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 2.5004987886561207,
|
|
"grad_norm": 3.5900754928588867,
|
|
"learning_rate": 4.914621303434023e-05,
|
|
"loss": 0.1198,
|
|
"num_input_tokens_seen": 42872832,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 2.506199230440359,
|
|
"grad_norm": 4.139026641845703,
|
|
"learning_rate": 4.914234133355984e-05,
|
|
"loss": 0.1016,
|
|
"num_input_tokens_seen": 42970592,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.5118996722245974,
|
|
"grad_norm": 8.35708999633789,
|
|
"learning_rate": 4.9138461027253255e-05,
|
|
"loss": 0.1066,
|
|
"num_input_tokens_seen": 43068384,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 2.5176001140088355,
|
|
"grad_norm": 2.149785041809082,
|
|
"learning_rate": 4.913457211680361e-05,
|
|
"loss": 0.0866,
|
|
"num_input_tokens_seen": 43166240,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 2.523300555793074,
|
|
"grad_norm": 3.6671578884124756,
|
|
"learning_rate": 4.913067460359711e-05,
|
|
"loss": 0.1831,
|
|
"num_input_tokens_seen": 43264000,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 2.5290009975773122,
|
|
"grad_norm": 7.476966857910156,
|
|
"learning_rate": 4.912676848902301e-05,
|
|
"loss": 0.1276,
|
|
"num_input_tokens_seen": 43361712,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.534701439361551,
|
|
"grad_norm": 14.653030395507812,
|
|
"learning_rate": 4.912285377447366e-05,
|
|
"loss": 0.1622,
|
|
"num_input_tokens_seen": 43459392,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 2.540401881145789,
|
|
"grad_norm": 7.570925712585449,
|
|
"learning_rate": 4.9118930461344433e-05,
|
|
"loss": 0.1279,
|
|
"num_input_tokens_seen": 43557104,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 2.546102322930027,
|
|
"grad_norm": 7.563347816467285,
|
|
"learning_rate": 4.911499855103382e-05,
|
|
"loss": 0.1015,
|
|
"num_input_tokens_seen": 43654928,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 2.5518027647142656,
|
|
"grad_norm": 5.281002521514893,
|
|
"learning_rate": 4.9111058044943334e-05,
|
|
"loss": 0.1255,
|
|
"num_input_tokens_seen": 43752672,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.5575032064985037,
|
|
"grad_norm": 5.972769737243652,
|
|
"learning_rate": 4.910710894447757e-05,
|
|
"loss": 0.0551,
|
|
"num_input_tokens_seen": 43850512,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 2.563203648282742,
|
|
"grad_norm": 1.96940279006958,
|
|
"learning_rate": 4.9103151251044174e-05,
|
|
"loss": 0.0708,
|
|
"num_input_tokens_seen": 43948336,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 2.5689040900669804,
|
|
"grad_norm": 10.9530668258667,
|
|
"learning_rate": 4.909918496605387e-05,
|
|
"loss": 0.1775,
|
|
"num_input_tokens_seen": 44046080,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 2.5746045318512185,
|
|
"grad_norm": 8.717411994934082,
|
|
"learning_rate": 4.909521009092045e-05,
|
|
"loss": 0.0874,
|
|
"num_input_tokens_seen": 44143808,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.5803049736354566,
|
|
"grad_norm": 11.679935455322266,
|
|
"learning_rate": 4.909122662706074e-05,
|
|
"loss": 0.2068,
|
|
"num_input_tokens_seen": 44241600,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 2.586005415419695,
|
|
"grad_norm": 5.130126953125,
|
|
"learning_rate": 4.9087234575894644e-05,
|
|
"loss": 0.0785,
|
|
"num_input_tokens_seen": 44339312,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 2.5917058572039333,
|
|
"grad_norm": 5.87738037109375,
|
|
"learning_rate": 4.908323393884514e-05,
|
|
"loss": 0.0893,
|
|
"num_input_tokens_seen": 44437136,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 2.5974062989881714,
|
|
"grad_norm": 9.397594451904297,
|
|
"learning_rate": 4.9079224717338246e-05,
|
|
"loss": 0.142,
|
|
"num_input_tokens_seen": 44534912,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.60310674077241,
|
|
"grad_norm": 3.5476927757263184,
|
|
"learning_rate": 4.907520691280304e-05,
|
|
"loss": 0.0855,
|
|
"num_input_tokens_seen": 44632720,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 2.608807182556648,
|
|
"grad_norm": 10.59860610961914,
|
|
"learning_rate": 4.907118052667168e-05,
|
|
"loss": 0.1536,
|
|
"num_input_tokens_seen": 44730480,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 2.6145076243408862,
|
|
"grad_norm": 7.346194744110107,
|
|
"learning_rate": 4.906714556037936e-05,
|
|
"loss": 0.1219,
|
|
"num_input_tokens_seen": 44828112,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 2.620208066125125,
|
|
"grad_norm": 4.199995517730713,
|
|
"learning_rate": 4.9063102015364344e-05,
|
|
"loss": 0.0867,
|
|
"num_input_tokens_seen": 44925888,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.625908507909363,
|
|
"grad_norm": 12.029003143310547,
|
|
"learning_rate": 4.9059049893067954e-05,
|
|
"loss": 0.1819,
|
|
"num_input_tokens_seen": 45023728,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 2.631608949693601,
|
|
"grad_norm": 9.11473560333252,
|
|
"learning_rate": 4.9054989194934564e-05,
|
|
"loss": 0.1298,
|
|
"num_input_tokens_seen": 45121424,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 2.6373093914778396,
|
|
"grad_norm": 17.91231918334961,
|
|
"learning_rate": 4.905091992241161e-05,
|
|
"loss": 0.1854,
|
|
"num_input_tokens_seen": 45219200,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 2.6430098332620777,
|
|
"grad_norm": 10.454273223876953,
|
|
"learning_rate": 4.9046842076949576e-05,
|
|
"loss": 0.2016,
|
|
"num_input_tokens_seen": 45316944,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.648710275046316,
|
|
"grad_norm": 11.895720481872559,
|
|
"learning_rate": 4.904275566000202e-05,
|
|
"loss": 0.2173,
|
|
"num_input_tokens_seen": 45414688,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 2.6544107168305544,
|
|
"grad_norm": 7.68233585357666,
|
|
"learning_rate": 4.903866067302554e-05,
|
|
"loss": 0.1429,
|
|
"num_input_tokens_seen": 45512400,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 2.6601111586147925,
|
|
"grad_norm": 6.063493251800537,
|
|
"learning_rate": 4.9034557117479786e-05,
|
|
"loss": 0.1397,
|
|
"num_input_tokens_seen": 45610128,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 2.665811600399031,
|
|
"grad_norm": 1.1613508462905884,
|
|
"learning_rate": 4.903044499482747e-05,
|
|
"loss": 0.0946,
|
|
"num_input_tokens_seen": 45707920,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.671512042183269,
|
|
"grad_norm": 10.7675142288208,
|
|
"learning_rate": 4.902632430653435e-05,
|
|
"loss": 0.1761,
|
|
"num_input_tokens_seen": 45805744,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 2.6772124839675073,
|
|
"grad_norm": 7.67887020111084,
|
|
"learning_rate": 4.902219505406926e-05,
|
|
"loss": 0.1615,
|
|
"num_input_tokens_seen": 45903456,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 2.682912925751746,
|
|
"grad_norm": 2.045400381088257,
|
|
"learning_rate": 4.901805723890407e-05,
|
|
"loss": 0.173,
|
|
"num_input_tokens_seen": 46001264,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 2.688613367535984,
|
|
"grad_norm": 8.454596519470215,
|
|
"learning_rate": 4.9013910862513676e-05,
|
|
"loss": 0.1894,
|
|
"num_input_tokens_seen": 46098976,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.6943138093202226,
|
|
"grad_norm": 7.203195095062256,
|
|
"learning_rate": 4.9009755926376085e-05,
|
|
"loss": 0.1496,
|
|
"num_input_tokens_seen": 46196816,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 2.7000142511044607,
|
|
"grad_norm": 10.533610343933105,
|
|
"learning_rate": 4.9005592431972304e-05,
|
|
"loss": 0.0768,
|
|
"num_input_tokens_seen": 46294480,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 2.705714692888699,
|
|
"grad_norm": 7.490808486938477,
|
|
"learning_rate": 4.90014203807864e-05,
|
|
"loss": 0.1114,
|
|
"num_input_tokens_seen": 46392096,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 2.7114151346729374,
|
|
"grad_norm": 7.0142083168029785,
|
|
"learning_rate": 4.899723977430552e-05,
|
|
"loss": 0.0883,
|
|
"num_input_tokens_seen": 46489936,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.7171155764571755,
|
|
"grad_norm": 8.30893611907959,
|
|
"learning_rate": 4.899305061401983e-05,
|
|
"loss": 0.1146,
|
|
"num_input_tokens_seen": 46587648,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 2.7228160182414136,
|
|
"grad_norm": 5.555058479309082,
|
|
"learning_rate": 4.898885290142254e-05,
|
|
"loss": 0.1212,
|
|
"num_input_tokens_seen": 46685360,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 2.728516460025652,
|
|
"grad_norm": 3.8162529468536377,
|
|
"learning_rate": 4.898464663800995e-05,
|
|
"loss": 0.1327,
|
|
"num_input_tokens_seen": 46783072,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 2.7342169018098903,
|
|
"grad_norm": 7.592154026031494,
|
|
"learning_rate": 4.898043182528136e-05,
|
|
"loss": 0.0871,
|
|
"num_input_tokens_seen": 46880832,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 2.7399173435941284,
|
|
"grad_norm": 4.684682846069336,
|
|
"learning_rate": 4.897620846473915e-05,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 46978576,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 2.745617785378367,
|
|
"grad_norm": 1.159037709236145,
|
|
"learning_rate": 4.897197655788872e-05,
|
|
"loss": 0.1116,
|
|
"num_input_tokens_seen": 47076304,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 2.751318227162605,
|
|
"grad_norm": 9.983619689941406,
|
|
"learning_rate": 4.8967736106238546e-05,
|
|
"loss": 0.1072,
|
|
"num_input_tokens_seen": 47174000,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 2.757018668946843,
|
|
"grad_norm": 5.6760029792785645,
|
|
"learning_rate": 4.8963487111300133e-05,
|
|
"loss": 0.0847,
|
|
"num_input_tokens_seen": 47271760,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 2.762719110731082,
|
|
"grad_norm": 9.46414566040039,
|
|
"learning_rate": 4.895922957458803e-05,
|
|
"loss": 0.0821,
|
|
"num_input_tokens_seen": 47369504,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 2.76841955251532,
|
|
"grad_norm": 11.030294418334961,
|
|
"learning_rate": 4.8954963497619836e-05,
|
|
"loss": 0.1595,
|
|
"num_input_tokens_seen": 47467312,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 2.774119994299558,
|
|
"grad_norm": 4.053030967712402,
|
|
"learning_rate": 4.895068888191618e-05,
|
|
"loss": 0.0967,
|
|
"num_input_tokens_seen": 47565024,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 2.7798204360837966,
|
|
"grad_norm": 8.997540473937988,
|
|
"learning_rate": 4.894640572900076e-05,
|
|
"loss": 0.1222,
|
|
"num_input_tokens_seen": 47662768,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 2.7855208778680347,
|
|
"grad_norm": 9.141730308532715,
|
|
"learning_rate": 4.89421140404003e-05,
|
|
"loss": 0.1496,
|
|
"num_input_tokens_seen": 47760640,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 2.791221319652273,
|
|
"grad_norm": 6.096067905426025,
|
|
"learning_rate": 4.8937813817644577e-05,
|
|
"loss": 0.0965,
|
|
"num_input_tokens_seen": 47858400,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 2.7969217614365114,
|
|
"grad_norm": 7.94855260848999,
|
|
"learning_rate": 4.89335050622664e-05,
|
|
"loss": 0.1043,
|
|
"num_input_tokens_seen": 47956112,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 2.8026222032207495,
|
|
"grad_norm": 5.408116340637207,
|
|
"learning_rate": 4.892918777580161e-05,
|
|
"loss": 0.0953,
|
|
"num_input_tokens_seen": 48053888,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 2.8083226450049876,
|
|
"grad_norm": 10.836856842041016,
|
|
"learning_rate": 4.8924861959789116e-05,
|
|
"loss": 0.0829,
|
|
"num_input_tokens_seen": 48151648,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 2.814023086789226,
|
|
"grad_norm": 4.917642593383789,
|
|
"learning_rate": 4.892052761577084e-05,
|
|
"loss": 0.1339,
|
|
"num_input_tokens_seen": 48249344,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 2.8197235285734643,
|
|
"grad_norm": 4.280270099639893,
|
|
"learning_rate": 4.891618474529178e-05,
|
|
"loss": 0.0867,
|
|
"num_input_tokens_seen": 48347088,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 2.825423970357703,
|
|
"grad_norm": 10.1702241897583,
|
|
"learning_rate": 4.8911833349899924e-05,
|
|
"loss": 0.0944,
|
|
"num_input_tokens_seen": 48444848,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 2.831124412141941,
|
|
"grad_norm": 7.9395341873168945,
|
|
"learning_rate": 4.890747343114634e-05,
|
|
"loss": 0.1103,
|
|
"num_input_tokens_seen": 48542528,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 2.836824853926179,
|
|
"grad_norm": 7.740533828735352,
|
|
"learning_rate": 4.8903104990585124e-05,
|
|
"loss": 0.0763,
|
|
"num_input_tokens_seen": 48640240,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 2.8425252957104177,
|
|
"grad_norm": 2.646793842315674,
|
|
"learning_rate": 4.8898728029773394e-05,
|
|
"loss": 0.0821,
|
|
"num_input_tokens_seen": 48737888,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 2.848225737494656,
|
|
"grad_norm": 6.403926849365234,
|
|
"learning_rate": 4.8894342550271314e-05,
|
|
"loss": 0.0962,
|
|
"num_input_tokens_seen": 48835600,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 2.8539261792788944,
|
|
"grad_norm": 7.7822957038879395,
|
|
"learning_rate": 4.888994855364209e-05,
|
|
"loss": 0.0832,
|
|
"num_input_tokens_seen": 48933312,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 2.8596266210631325,
|
|
"grad_norm": 5.186079025268555,
|
|
"learning_rate": 4.888554604145196e-05,
|
|
"loss": 0.125,
|
|
"num_input_tokens_seen": 49030960,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 2.8653270628473706,
|
|
"grad_norm": 7.859062671661377,
|
|
"learning_rate": 4.8881135015270206e-05,
|
|
"loss": 0.0941,
|
|
"num_input_tokens_seen": 49128672,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 2.871027504631609,
|
|
"grad_norm": 7.483722686767578,
|
|
"learning_rate": 4.887671547666912e-05,
|
|
"loss": 0.1318,
|
|
"num_input_tokens_seen": 49226416,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 2.8767279464158473,
|
|
"grad_norm": 8.643847465515137,
|
|
"learning_rate": 4.887228742722405e-05,
|
|
"loss": 0.1856,
|
|
"num_input_tokens_seen": 49324112,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 2.8824283882000854,
|
|
"grad_norm": 8.750890731811523,
|
|
"learning_rate": 4.8867850868513374e-05,
|
|
"loss": 0.1006,
|
|
"num_input_tokens_seen": 49421776,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 2.888128829984324,
|
|
"grad_norm": 6.101781845092773,
|
|
"learning_rate": 4.8863405802118514e-05,
|
|
"loss": 0.1324,
|
|
"num_input_tokens_seen": 49519568,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 2.893829271768562,
|
|
"grad_norm": 7.980799198150635,
|
|
"learning_rate": 4.8858952229623886e-05,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 49617360,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 2.8995297135528,
|
|
"grad_norm": 3.3241348266601562,
|
|
"learning_rate": 4.8854490152616984e-05,
|
|
"loss": 0.1104,
|
|
"num_input_tokens_seen": 49715056,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 2.9052301553370388,
|
|
"grad_norm": 10.823814392089844,
|
|
"learning_rate": 4.88500195726883e-05,
|
|
"loss": 0.1766,
|
|
"num_input_tokens_seen": 49812848,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 2.910930597121277,
|
|
"grad_norm": 6.91947078704834,
|
|
"learning_rate": 4.884554049143139e-05,
|
|
"loss": 0.1128,
|
|
"num_input_tokens_seen": 49910496,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 2.916631038905515,
|
|
"grad_norm": 4.440322399139404,
|
|
"learning_rate": 4.884105291044279e-05,
|
|
"loss": 0.0796,
|
|
"num_input_tokens_seen": 50008224,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 2.9223314806897536,
|
|
"grad_norm": 5.996119976043701,
|
|
"learning_rate": 4.8836556831322125e-05,
|
|
"loss": 0.1648,
|
|
"num_input_tokens_seen": 50105952,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 2.9280319224739917,
|
|
"grad_norm": 8.666937828063965,
|
|
"learning_rate": 4.8832052255672e-05,
|
|
"loss": 0.1488,
|
|
"num_input_tokens_seen": 50203680,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 2.93373236425823,
|
|
"grad_norm": 5.516872882843018,
|
|
"learning_rate": 4.8827539185098085e-05,
|
|
"loss": 0.1598,
|
|
"num_input_tokens_seen": 50301504,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 2.9394328060424684,
|
|
"grad_norm": 6.207433223724365,
|
|
"learning_rate": 4.882301762120905e-05,
|
|
"loss": 0.1003,
|
|
"num_input_tokens_seen": 50399152,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 2.9451332478267065,
|
|
"grad_norm": 5.3155598640441895,
|
|
"learning_rate": 4.88184875656166e-05,
|
|
"loss": 0.0675,
|
|
"num_input_tokens_seen": 50496880,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 2.9508336896109446,
|
|
"grad_norm": 8.118722915649414,
|
|
"learning_rate": 4.881394901993549e-05,
|
|
"loss": 0.0834,
|
|
"num_input_tokens_seen": 50594656,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 2.956534131395183,
|
|
"grad_norm": 8.280570983886719,
|
|
"learning_rate": 4.880940198578347e-05,
|
|
"loss": 0.1212,
|
|
"num_input_tokens_seen": 50692496,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 2.9622345731794213,
|
|
"grad_norm": 6.043283939361572,
|
|
"learning_rate": 4.8804846464781334e-05,
|
|
"loss": 0.1096,
|
|
"num_input_tokens_seen": 50790272,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 2.9679350149636594,
|
|
"grad_norm": 3.722360134124756,
|
|
"learning_rate": 4.8800282458552885e-05,
|
|
"loss": 0.155,
|
|
"num_input_tokens_seen": 50888032,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 2.973635456747898,
|
|
"grad_norm": 6.298059940338135,
|
|
"learning_rate": 4.8795709968724974e-05,
|
|
"loss": 0.072,
|
|
"num_input_tokens_seen": 50985776,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 2.979335898532136,
|
|
"grad_norm": 8.660189628601074,
|
|
"learning_rate": 4.879112899692745e-05,
|
|
"loss": 0.1247,
|
|
"num_input_tokens_seen": 51083440,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 2.9850363403163747,
|
|
"grad_norm": 14.756346702575684,
|
|
"learning_rate": 4.8786539544793206e-05,
|
|
"loss": 0.1067,
|
|
"num_input_tokens_seen": 51181152,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 2.990736782100613,
|
|
"grad_norm": 2.7973508834838867,
|
|
"learning_rate": 4.878194161395816e-05,
|
|
"loss": 0.0766,
|
|
"num_input_tokens_seen": 51278912,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 2.9964372238848513,
|
|
"grad_norm": 7.684298515319824,
|
|
"learning_rate": 4.8777335206061216e-05,
|
|
"loss": 0.0668,
|
|
"num_input_tokens_seen": 51376640,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 3.0011400883568475,
|
|
"grad_norm": 3.268608570098877,
|
|
"learning_rate": 4.877272032274435e-05,
|
|
"loss": 0.0698,
|
|
"num_input_tokens_seen": 51457280,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 3.006840530141086,
|
|
"grad_norm": 2.5024783611297607,
|
|
"learning_rate": 4.876809696565252e-05,
|
|
"loss": 0.0681,
|
|
"num_input_tokens_seen": 51555088,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 3.012540971925324,
|
|
"grad_norm": 2.1016647815704346,
|
|
"learning_rate": 4.876346513643373e-05,
|
|
"loss": 0.051,
|
|
"num_input_tokens_seen": 51652864,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 3.0182414137095623,
|
|
"grad_norm": 8.176024436950684,
|
|
"learning_rate": 4.875882483673898e-05,
|
|
"loss": 0.0712,
|
|
"num_input_tokens_seen": 51750560,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 3.023941855493801,
|
|
"grad_norm": 4.242624759674072,
|
|
"learning_rate": 4.875417606822232e-05,
|
|
"loss": 0.0761,
|
|
"num_input_tokens_seen": 51848288,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 3.029642297278039,
|
|
"grad_norm": 11.779088973999023,
|
|
"learning_rate": 4.874951883254078e-05,
|
|
"loss": 0.0485,
|
|
"num_input_tokens_seen": 51946016,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 3.035342739062277,
|
|
"grad_norm": 3.6110494136810303,
|
|
"learning_rate": 4.874485313135446e-05,
|
|
"loss": 0.0747,
|
|
"num_input_tokens_seen": 52043776,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 3.0410431808465157,
|
|
"grad_norm": 8.334362030029297,
|
|
"learning_rate": 4.874017896632642e-05,
|
|
"loss": 0.0614,
|
|
"num_input_tokens_seen": 52141520,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 3.046743622630754,
|
|
"grad_norm": 5.685539722442627,
|
|
"learning_rate": 4.8735496339122776e-05,
|
|
"loss": 0.0604,
|
|
"num_input_tokens_seen": 52239200,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 3.052444064414992,
|
|
"grad_norm": 4.587195873260498,
|
|
"learning_rate": 4.8730805251412645e-05,
|
|
"loss": 0.1134,
|
|
"num_input_tokens_seen": 52336848,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 3.0581445061992305,
|
|
"grad_norm": 2.6720361709594727,
|
|
"learning_rate": 4.872610570486816e-05,
|
|
"loss": 0.0946,
|
|
"num_input_tokens_seen": 52434640,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 3.0638449479834686,
|
|
"grad_norm": 5.53890323638916,
|
|
"learning_rate": 4.872139770116447e-05,
|
|
"loss": 0.0566,
|
|
"num_input_tokens_seen": 52532400,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 3.069545389767707,
|
|
"grad_norm": 0.7934585809707642,
|
|
"learning_rate": 4.871668124197976e-05,
|
|
"loss": 0.0163,
|
|
"num_input_tokens_seen": 52630112,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 3.0752458315519453,
|
|
"grad_norm": 9.233660697937012,
|
|
"learning_rate": 4.871195632899518e-05,
|
|
"loss": 0.0552,
|
|
"num_input_tokens_seen": 52727840,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 3.0809462733361834,
|
|
"grad_norm": 5.545113563537598,
|
|
"learning_rate": 4.870722296389495e-05,
|
|
"loss": 0.0711,
|
|
"num_input_tokens_seen": 52825600,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 3.086646715120422,
|
|
"grad_norm": 8.869247436523438,
|
|
"learning_rate": 4.870248114836626e-05,
|
|
"loss": 0.1192,
|
|
"num_input_tokens_seen": 52923312,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 3.09234715690466,
|
|
"grad_norm": 2.2247767448425293,
|
|
"learning_rate": 4.8697730884099334e-05,
|
|
"loss": 0.0258,
|
|
"num_input_tokens_seen": 53020928,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 3.0980475986888982,
|
|
"grad_norm": 0.8696161508560181,
|
|
"learning_rate": 4.8692972172787396e-05,
|
|
"loss": 0.0649,
|
|
"num_input_tokens_seen": 53118720,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 3.103748040473137,
|
|
"grad_norm": 6.550947189331055,
|
|
"learning_rate": 4.86882050161267e-05,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 53216512,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 3.109448482257375,
|
|
"grad_norm": 5.182213306427002,
|
|
"learning_rate": 4.8683429415816485e-05,
|
|
"loss": 0.0933,
|
|
"num_input_tokens_seen": 53314224,
|
|
"step": 2730
|
|
},
{
"epoch": 3.115148924041613,
"grad_norm": 1.667688012123108,
"learning_rate": 4.867864537355901e-05,
"loss": 0.0777,
"num_input_tokens_seen": 53411936,
"step": 2735
},
{
"epoch": 3.1208493658258516,
"grad_norm": 10.697488784790039,
"learning_rate": 4.867385289105955e-05,
"loss": 0.1207,
"num_input_tokens_seen": 53509664,
"step": 2740
},
{
"epoch": 3.1265498076100897,
"grad_norm": 5.032103061676025,
"learning_rate": 4.866905197002637e-05,
"loss": 0.064,
"num_input_tokens_seen": 53607408,
"step": 2745
},
{
"epoch": 3.1322502493943283,
"grad_norm": 6.669914722442627,
"learning_rate": 4.866424261217078e-05,
"loss": 0.0425,
"num_input_tokens_seen": 53705216,
"step": 2750
},
{
"epoch": 3.1379506911785664,
"grad_norm": 3.654059886932373,
"learning_rate": 4.865942481920706e-05,
"loss": 0.0541,
"num_input_tokens_seen": 53802960,
"step": 2755
},
{
"epoch": 3.1436511329628045,
"grad_norm": 3.9376277923583984,
"learning_rate": 4.865459859285251e-05,
"loss": 0.0352,
"num_input_tokens_seen": 53900720,
"step": 2760
},
{
"epoch": 3.149351574747043,
"grad_norm": 3.594050168991089,
"learning_rate": 4.864976393482743e-05,
"loss": 0.0372,
"num_input_tokens_seen": 53998384,
"step": 2765
},
{
"epoch": 3.155052016531281,
"grad_norm": 5.50773811340332,
"learning_rate": 4.864492084685514e-05,
"loss": 0.0612,
"num_input_tokens_seen": 54096144,
"step": 2770
},
{
"epoch": 3.1607524583155193,
"grad_norm": 11.46947193145752,
"learning_rate": 4.864006933066196e-05,
"loss": 0.0896,
"num_input_tokens_seen": 54193840,
"step": 2775
},
{
"epoch": 3.166452900099758,
"grad_norm": 9.24869155883789,
"learning_rate": 4.8635209387977197e-05,
"loss": 0.0575,
"num_input_tokens_seen": 54291568,
"step": 2780
},
{
"epoch": 3.172153341883996,
"grad_norm": 5.757988929748535,
"learning_rate": 4.8630341020533196e-05,
"loss": 0.0832,
"num_input_tokens_seen": 54389248,
"step": 2785
},
{
"epoch": 3.177853783668234,
"grad_norm": 6.639283657073975,
"learning_rate": 4.862546423006527e-05,
"loss": 0.0882,
"num_input_tokens_seen": 54486944,
"step": 2790
},
{
"epoch": 3.1835542254524727,
"grad_norm": 5.969208240509033,
"learning_rate": 4.8620579018311744e-05,
"loss": 0.0486,
"num_input_tokens_seen": 54584624,
"step": 2795
},
{
"epoch": 3.189254667236711,
"grad_norm": 11.107736587524414,
"learning_rate": 4.8615685387013956e-05,
"loss": 0.0754,
"num_input_tokens_seen": 54682384,
"step": 2800
},
{
"epoch": 3.194955109020949,
"grad_norm": 9.802680969238281,
"learning_rate": 4.861078333791624e-05,
"loss": 0.0721,
"num_input_tokens_seen": 54780160,
"step": 2805
},
{
"epoch": 3.2006555508051875,
"grad_norm": 2.6495935916900635,
"learning_rate": 4.860587287276592e-05,
"loss": 0.0538,
"num_input_tokens_seen": 54877872,
"step": 2810
},
{
"epoch": 3.2063559925894256,
"grad_norm": 5.3208818435668945,
"learning_rate": 4.8600953993313344e-05,
"loss": 0.0571,
"num_input_tokens_seen": 54975632,
"step": 2815
},
{
"epoch": 3.2120564343736637,
"grad_norm": 5.696016311645508,
"learning_rate": 4.859602670131185e-05,
"loss": 0.0616,
"num_input_tokens_seen": 55073408,
"step": 2820
},
{
"epoch": 3.2177568761579023,
"grad_norm": 9.000017166137695,
"learning_rate": 4.859109099851774e-05,
"loss": 0.1114,
"num_input_tokens_seen": 55171152,
"step": 2825
},
{
"epoch": 3.2234573179421404,
"grad_norm": 6.167779922485352,
"learning_rate": 4.8586146886690364e-05,
"loss": 0.0335,
"num_input_tokens_seen": 55268896,
"step": 2830
},
{
"epoch": 3.229157759726379,
"grad_norm": 0.7552993893623352,
"learning_rate": 4.8581194367592043e-05,
"loss": 0.0157,
"num_input_tokens_seen": 55366688,
"step": 2835
},
{
"epoch": 3.234858201510617,
"grad_norm": 6.548010349273682,
"learning_rate": 4.8576233442988095e-05,
"loss": 0.0572,
"num_input_tokens_seen": 55464368,
"step": 2840
},
{
"epoch": 3.240558643294855,
"grad_norm": 0.6461604237556458,
"learning_rate": 4.857126411464685e-05,
"loss": 0.0241,
"num_input_tokens_seen": 55562128,
"step": 2845
},
{
"epoch": 3.2462590850790938,
"grad_norm": 7.866938591003418,
"learning_rate": 4.856628638433962e-05,
"loss": 0.0597,
"num_input_tokens_seen": 55659792,
"step": 2850
},
{
"epoch": 3.251959526863332,
"grad_norm": 5.226189136505127,
"learning_rate": 4.85613002538407e-05,
"loss": 0.0267,
"num_input_tokens_seen": 55757504,
"step": 2855
},
{
"epoch": 3.25765996864757,
"grad_norm": 6.863353252410889,
"learning_rate": 4.855630572492742e-05,
"loss": 0.0537,
"num_input_tokens_seen": 55855344,
"step": 2860
},
{
"epoch": 3.2633604104318086,
"grad_norm": 1.295962929725647,
"learning_rate": 4.8551302799380055e-05,
"loss": 0.0304,
"num_input_tokens_seen": 55953072,
"step": 2865
},
{
"epoch": 3.2690608522160467,
"grad_norm": 5.298805236816406,
"learning_rate": 4.854629147898191e-05,
"loss": 0.0321,
"num_input_tokens_seen": 56050752,
"step": 2870
},
{
"epoch": 3.2747612940002853,
"grad_norm": 12.122303009033203,
"learning_rate": 4.854127176551925e-05,
"loss": 0.1434,
"num_input_tokens_seen": 56148560,
"step": 2875
},
{
"epoch": 3.2804617357845234,
"grad_norm": 1.2280305624008179,
"learning_rate": 4.8536243660781375e-05,
"loss": 0.0707,
"num_input_tokens_seen": 56246272,
"step": 2880
},
{
"epoch": 3.2861621775687615,
"grad_norm": 5.140838623046875,
"learning_rate": 4.8531207166560524e-05,
"loss": 0.0457,
"num_input_tokens_seen": 56343984,
"step": 2885
},
{
"epoch": 3.291862619353,
"grad_norm": 2.1565487384796143,
"learning_rate": 4.8526162284651974e-05,
"loss": 0.0177,
"num_input_tokens_seen": 56441792,
"step": 2890
},
{
"epoch": 3.297563061137238,
"grad_norm": 2.832627534866333,
"learning_rate": 4.852110901685396e-05,
"loss": 0.0283,
"num_input_tokens_seen": 56539600,
"step": 2895
},
{
"epoch": 3.3032635029214763,
"grad_norm": 10.345179557800293,
"learning_rate": 4.851604736496772e-05,
"loss": 0.0475,
"num_input_tokens_seen": 56637280,
"step": 2900
},
{
"epoch": 3.308963944705715,
"grad_norm": 0.8519467711448669,
"learning_rate": 4.8510977330797476e-05,
"loss": 0.0266,
"num_input_tokens_seen": 56735056,
"step": 2905
},
{
"epoch": 3.314664386489953,
"grad_norm": 6.070542335510254,
"learning_rate": 4.8505898916150436e-05,
"loss": 0.0536,
"num_input_tokens_seen": 56832864,
"step": 2910
},
{
"epoch": 3.320364828274191,
"grad_norm": 3.4217629432678223,
"learning_rate": 4.85008121228368e-05,
"loss": 0.0251,
"num_input_tokens_seen": 56930608,
"step": 2915
},
{
"epoch": 3.3260652700584297,
"grad_norm": 5.251518726348877,
"learning_rate": 4.849571695266977e-05,
"loss": 0.0676,
"num_input_tokens_seen": 57028336,
"step": 2920
},
{
"epoch": 3.331765711842668,
"grad_norm": 9.286744117736816,
"learning_rate": 4.849061340746549e-05,
"loss": 0.1008,
"num_input_tokens_seen": 57126128,
"step": 2925
},
{
"epoch": 3.337466153626906,
"grad_norm": 5.496871471405029,
"learning_rate": 4.848550148904314e-05,
"loss": 0.1098,
"num_input_tokens_seen": 57223840,
"step": 2930
},
{
"epoch": 3.3431665954111445,
"grad_norm": 6.865820407867432,
"learning_rate": 4.848038119922483e-05,
"loss": 0.0545,
"num_input_tokens_seen": 57321568,
"step": 2935
},
{
"epoch": 3.3488670371953826,
"grad_norm": 4.949888229370117,
"learning_rate": 4.847525253983572e-05,
"loss": 0.1271,
"num_input_tokens_seen": 57419328,
"step": 2940
},
{
"epoch": 3.3545674789796207,
"grad_norm": 0.7395240068435669,
"learning_rate": 4.847011551270391e-05,
"loss": 0.0262,
"num_input_tokens_seen": 57517008,
"step": 2945
},
{
"epoch": 3.3602679207638593,
"grad_norm": 10.39633560180664,
"learning_rate": 4.846497011966047e-05,
"loss": 0.0333,
"num_input_tokens_seen": 57614816,
"step": 2950
},
{
"epoch": 3.3659683625480974,
"grad_norm": 5.424074649810791,
"learning_rate": 4.845981636253949e-05,
"loss": 0.066,
"num_input_tokens_seen": 57712528,
"step": 2955
},
{
"epoch": 3.3716688043323355,
"grad_norm": 5.526705265045166,
"learning_rate": 4.845465424317802e-05,
"loss": 0.0246,
"num_input_tokens_seen": 57810208,
"step": 2960
},
{
"epoch": 3.377369246116574,
"grad_norm": 3.3858978748321533,
"learning_rate": 4.8449483763416095e-05,
"loss": 0.0585,
"num_input_tokens_seen": 57907968,
"step": 2965
},
{
"epoch": 3.383069687900812,
"grad_norm": 4.47909688949585,
"learning_rate": 4.844430492509674e-05,
"loss": 0.0799,
"num_input_tokens_seen": 58005744,
"step": 2970
},
{
"epoch": 3.3887701296850508,
"grad_norm": 8.794025421142578,
"learning_rate": 4.843911773006593e-05,
"loss": 0.0286,
"num_input_tokens_seen": 58103504,
"step": 2975
},
{
"epoch": 3.394470571469289,
"grad_norm": 5.554230690002441,
"learning_rate": 4.8433922180172653e-05,
"loss": 0.0499,
"num_input_tokens_seen": 58201232,
"step": 2980
},
{
"epoch": 3.400171013253527,
"grad_norm": 4.487776279449463,
"learning_rate": 4.842871827726886e-05,
"loss": 0.0402,
"num_input_tokens_seen": 58299024,
"step": 2985
},
{
"epoch": 3.4058714550377656,
"grad_norm": 2.8140945434570312,
"learning_rate": 4.8423506023209466e-05,
"loss": 0.0566,
"num_input_tokens_seen": 58396816,
"step": 2990
},
{
"epoch": 3.4115718968220037,
"grad_norm": 3.601980686187744,
"learning_rate": 4.8418285419852395e-05,
"loss": 0.0412,
"num_input_tokens_seen": 58494544,
"step": 2995
},
{
"epoch": 3.417272338606242,
"grad_norm": 2.136195182800293,
"learning_rate": 4.841305646905851e-05,
"loss": 0.0304,
"num_input_tokens_seen": 58592352,
"step": 3000
},
{
"epoch": 3.4229727803904804,
"grad_norm": 5.654057502746582,
"learning_rate": 4.8407819172691694e-05,
"loss": 0.0304,
"num_input_tokens_seen": 58690128,
"step": 3005
},
{
"epoch": 3.4286732221747185,
"grad_norm": 2.4083597660064697,
"learning_rate": 4.840257353261875e-05,
"loss": 0.0383,
"num_input_tokens_seen": 58787904,
"step": 3010
},
{
"epoch": 3.434373663958957,
"grad_norm": 5.336053371429443,
"learning_rate": 4.83973195507095e-05,
"loss": 0.0915,
"num_input_tokens_seen": 58885632,
"step": 3015
},
{
"epoch": 3.440074105743195,
"grad_norm": 4.11752986907959,
"learning_rate": 4.839205722883672e-05,
"loss": 0.0503,
"num_input_tokens_seen": 58983312,
"step": 3020
},
{
"epoch": 3.4457745475274333,
"grad_norm": 13.777847290039062,
"learning_rate": 4.838678656887616e-05,
"loss": 0.1445,
"num_input_tokens_seen": 59081072,
"step": 3025
},
{
"epoch": 3.451474989311672,
"grad_norm": 9.075989723205566,
"learning_rate": 4.838150757270655e-05,
"loss": 0.0777,
"num_input_tokens_seen": 59178896,
"step": 3030
},
{
"epoch": 3.45717543109591,
"grad_norm": 7.0720014572143555,
"learning_rate": 4.837622024220959e-05,
"loss": 0.0592,
"num_input_tokens_seen": 59276560,
"step": 3035
},
{
"epoch": 3.462875872880148,
"grad_norm": 4.558810710906982,
"learning_rate": 4.837092457926993e-05,
"loss": 0.0274,
"num_input_tokens_seen": 59374368,
"step": 3040
},
{
"epoch": 3.4685763146643867,
"grad_norm": 14.200141906738281,
"learning_rate": 4.8365620585775214e-05,
"loss": 0.0558,
"num_input_tokens_seen": 59472048,
"step": 3045
},
{
"epoch": 3.4742767564486248,
"grad_norm": 5.859817028045654,
"learning_rate": 4.836030826361605e-05,
"loss": 0.0277,
"num_input_tokens_seen": 59569840,
"step": 3050
},
{
"epoch": 3.479977198232863,
"grad_norm": 8.385420799255371,
"learning_rate": 4.835498761468601e-05,
"loss": 0.0667,
"num_input_tokens_seen": 59667584,
"step": 3055
},
{
"epoch": 3.4856776400171015,
"grad_norm": 1.2888391017913818,
"learning_rate": 4.834965864088164e-05,
"loss": 0.0207,
"num_input_tokens_seen": 59765392,
"step": 3060
},
{
"epoch": 3.4913780818013396,
"grad_norm": 1.1023948192596436,
"learning_rate": 4.834432134410245e-05,
"loss": 0.0207,
"num_input_tokens_seen": 59863152,
"step": 3065
},
{
"epoch": 3.4970785235855777,
"grad_norm": 3.3756585121154785,
"learning_rate": 4.8338975726250925e-05,
"loss": 0.0416,
"num_input_tokens_seen": 59960928,
"step": 3070
},
{
"epoch": 3.5027789653698163,
"grad_norm": 0.9311105608940125,
"learning_rate": 4.833362178923249e-05,
"loss": 0.0316,
"num_input_tokens_seen": 60058656,
"step": 3075
},
{
"epoch": 3.5084794071540544,
"grad_norm": 10.324899673461914,
"learning_rate": 4.8328259534955554e-05,
"loss": 0.0793,
"num_input_tokens_seen": 60156448,
"step": 3080
},
{
"epoch": 3.5141798489382925,
"grad_norm": 4.7765703201293945,
"learning_rate": 4.832288896533151e-05,
"loss": 0.0476,
"num_input_tokens_seen": 60254192,
"step": 3085
},
{
"epoch": 3.519880290722531,
"grad_norm": 1.959538459777832,
"learning_rate": 4.831751008227468e-05,
"loss": 0.0346,
"num_input_tokens_seen": 60351920,
"step": 3090
},
{
"epoch": 3.525580732506769,
"grad_norm": 9.76518440246582,
"learning_rate": 4.831212288770237e-05,
"loss": 0.046,
"num_input_tokens_seen": 60449696,
"step": 3095
},
{
"epoch": 3.5312811742910073,
"grad_norm": 1.2289072275161743,
"learning_rate": 4.8306727383534835e-05,
"loss": 0.0225,
"num_input_tokens_seen": 60547440,
"step": 3100
},
{
"epoch": 3.536981616075246,
"grad_norm": 7.658115863800049,
"learning_rate": 4.8301323571695314e-05,
"loss": 0.0281,
"num_input_tokens_seen": 60645200,
"step": 3105
},
{
"epoch": 3.542682057859484,
"grad_norm": 4.308380126953125,
"learning_rate": 4.829591145410997e-05,
"loss": 0.0265,
"num_input_tokens_seen": 60742880,
"step": 3110
},
{
"epoch": 3.5483824996437225,
"grad_norm": 4.51566743850708,
"learning_rate": 4.829049103270798e-05,
"loss": 0.0473,
"num_input_tokens_seen": 60840640,
"step": 3115
},
{
"epoch": 3.5540829414279607,
"grad_norm": 4.3482255935668945,
"learning_rate": 4.8285062309421426e-05,
"loss": 0.0468,
"num_input_tokens_seen": 60938400,
"step": 3120
},
{
"epoch": 3.559783383212199,
"grad_norm": 7.6800994873046875,
"learning_rate": 4.827962528618538e-05,
"loss": 0.0282,
"num_input_tokens_seen": 61036128,
"step": 3125
},
{
"epoch": 3.5654838249964373,
"grad_norm": 8.757813453674316,
"learning_rate": 4.8274179964937875e-05,
"loss": 0.0225,
"num_input_tokens_seen": 61133872,
"step": 3130
},
{
"epoch": 3.5711842667806755,
"grad_norm": 1.4490429162979126,
"learning_rate": 4.826872634761989e-05,
"loss": 0.0375,
"num_input_tokens_seen": 61231600,
"step": 3135
},
{
"epoch": 3.576884708564914,
"grad_norm": 5.913198471069336,
"learning_rate": 4.826326443617536e-05,
"loss": 0.0422,
"num_input_tokens_seen": 61329360,
"step": 3140
},
{
"epoch": 3.582585150349152,
"grad_norm": 8.622357368469238,
"learning_rate": 4.825779423255118e-05,
"loss": 0.0399,
"num_input_tokens_seen": 61427104,
"step": 3145
},
{
"epoch": 3.5882855921333903,
"grad_norm": 6.383512496948242,
"learning_rate": 4.825231573869721e-05,
"loss": 0.0356,
"num_input_tokens_seen": 61524848,
"step": 3150
},
{
"epoch": 3.593986033917629,
"grad_norm": 15.792478561401367,
"learning_rate": 4.824682895656624e-05,
"loss": 0.0613,
"num_input_tokens_seen": 61622512,
"step": 3155
},
{
"epoch": 3.599686475701867,
"grad_norm": 1.2860291004180908,
"learning_rate": 4.824133388811405e-05,
"loss": 0.0439,
"num_input_tokens_seen": 61720192,
"step": 3160
},
{
"epoch": 3.605386917486105,
"grad_norm": 6.301830768585205,
"learning_rate": 4.823583053529934e-05,
"loss": 0.0353,
"num_input_tokens_seen": 61817936,
"step": 3165
},
{
"epoch": 3.6110873592703436,
"grad_norm": 4.263798236846924,
"learning_rate": 4.823031890008379e-05,
"loss": 0.0338,
"num_input_tokens_seen": 61915664,
"step": 3170
},
{
"epoch": 3.6167878010545818,
"grad_norm": 7.392456531524658,
"learning_rate": 4.8224798984432005e-05,
"loss": 0.0399,
"num_input_tokens_seen": 62013456,
"step": 3175
},
{
"epoch": 3.62248824283882,
"grad_norm": 2.850409746170044,
"learning_rate": 4.8219270790311575e-05,
"loss": 0.0422,
"num_input_tokens_seen": 62111248,
"step": 3180
},
{
"epoch": 3.6281886846230584,
"grad_norm": 3.5166022777557373,
"learning_rate": 4.8213734319693004e-05,
"loss": 0.0193,
"num_input_tokens_seen": 62208960,
"step": 3185
},
{
"epoch": 3.6338891264072966,
"grad_norm": 7.699153423309326,
"learning_rate": 4.820818957454978e-05,
"loss": 0.0698,
"num_input_tokens_seen": 62306592,
"step": 3190
},
{
"epoch": 3.6395895681915347,
"grad_norm": 0.7717591524124146,
"learning_rate": 4.820263655685831e-05,
"loss": 0.0257,
"num_input_tokens_seen": 62404400,
"step": 3195
},
{
"epoch": 3.6452900099757732,
"grad_norm": 6.028016567230225,
"learning_rate": 4.819707526859797e-05,
"loss": 0.0352,
"num_input_tokens_seen": 62502160,
"step": 3200
},
{
"epoch": 3.6509904517600114,
"grad_norm": 2.3986012935638428,
"learning_rate": 4.819150571175108e-05,
"loss": 0.043,
"num_input_tokens_seen": 62599920,
"step": 3205
},
{
"epoch": 3.6566908935442495,
"grad_norm": 3.4287400245666504,
"learning_rate": 4.818592788830291e-05,
"loss": 0.0289,
"num_input_tokens_seen": 62697680,
"step": 3210
},
{
"epoch": 3.662391335328488,
"grad_norm": 5.921146869659424,
"learning_rate": 4.818034180024167e-05,
"loss": 0.0331,
"num_input_tokens_seen": 62795472,
"step": 3215
},
{
"epoch": 3.668091777112726,
"grad_norm": 4.856356620788574,
"learning_rate": 4.8174747449558515e-05,
"loss": 0.0131,
"num_input_tokens_seen": 62893136,
"step": 3220
},
{
"epoch": 3.6737922188969643,
"grad_norm": 6.656949996948242,
"learning_rate": 4.816914483824755e-05,
"loss": 0.0426,
"num_input_tokens_seen": 62990816,
"step": 3225
},
{
"epoch": 3.679492660681203,
"grad_norm": 1.0884100198745728,
"learning_rate": 4.816353396830583e-05,
"loss": 0.032,
"num_input_tokens_seen": 63088560,
"step": 3230
},
{
"epoch": 3.685193102465441,
"grad_norm": 0.37009307742118835,
"learning_rate": 4.815791484173333e-05,
"loss": 0.0322,
"num_input_tokens_seen": 63186272,
"step": 3235
},
{
"epoch": 3.690893544249679,
"grad_norm": 2.093526840209961,
"learning_rate": 4.815228746053301e-05,
"loss": 0.0225,
"num_input_tokens_seen": 63284016,
"step": 3240
},
{
"epoch": 3.6965939860339176,
"grad_norm": 9.629427909851074,
"learning_rate": 4.814665182671072e-05,
"loss": 0.0321,
"num_input_tokens_seen": 63381776,
"step": 3245
},
{
"epoch": 3.7022944278181558,
"grad_norm": 7.924525260925293,
"learning_rate": 4.8141007942275295e-05,
"loss": 0.0641,
"num_input_tokens_seen": 63479536,
"step": 3250
},
{
"epoch": 3.7079948696023943,
"grad_norm": 3.5611679553985596,
"learning_rate": 4.813535580923849e-05,
"loss": 0.0731,
"num_input_tokens_seen": 63577152,
"step": 3255
},
{
"epoch": 3.7136953113866324,
"grad_norm": 0.575011670589447,
"learning_rate": 4.812969542961502e-05,
"loss": 0.0453,
"num_input_tokens_seen": 63674928,
"step": 3260
},
{
"epoch": 3.719395753170871,
"grad_norm": 5.894010066986084,
"learning_rate": 4.8124026805422494e-05,
"loss": 0.0257,
"num_input_tokens_seen": 63772640,
"step": 3265
},
{
"epoch": 3.725096194955109,
"grad_norm": 3.0350735187530518,
"learning_rate": 4.811834993868152e-05,
"loss": 0.0338,
"num_input_tokens_seen": 63870336,
"step": 3270
},
{
"epoch": 3.7307966367393472,
"grad_norm": 8.058395385742188,
"learning_rate": 4.81126648314156e-05,
"loss": 0.0421,
"num_input_tokens_seen": 63968160,
"step": 3275
},
{
"epoch": 3.736497078523586,
"grad_norm": 9.93237590789795,
"learning_rate": 4.81069714856512e-05,
"loss": 0.0448,
"num_input_tokens_seen": 64065904,
"step": 3280
},
{
"epoch": 3.742197520307824,
"grad_norm": 11.603642463684082,
"learning_rate": 4.810126990341769e-05,
"loss": 0.0901,
"num_input_tokens_seen": 64163616,
"step": 3285
},
{
"epoch": 3.747897962092062,
"grad_norm": 3.8158483505249023,
"learning_rate": 4.809556008674741e-05,
"loss": 0.0154,
"num_input_tokens_seen": 64261376,
"step": 3290
},
{
"epoch": 3.7535984038763006,
"grad_norm": 0.4274216890335083,
"learning_rate": 4.8089842037675615e-05,
"loss": 0.0094,
"num_input_tokens_seen": 64359072,
"step": 3295
},
{
"epoch": 3.7592988456605387,
"grad_norm": 4.152562618255615,
"learning_rate": 4.808411575824051e-05,
"loss": 0.0443,
"num_input_tokens_seen": 64456816,
"step": 3300
},
{
"epoch": 3.764999287444777,
"grad_norm": 4.328752040863037,
"learning_rate": 4.807838125048322e-05,
"loss": 0.0393,
"num_input_tokens_seen": 64554464,
"step": 3305
},
{
"epoch": 3.7706997292290154,
"grad_norm": 4.978052616119385,
"learning_rate": 4.80726385164478e-05,
"loss": 0.0324,
"num_input_tokens_seen": 64652272,
"step": 3310
},
{
"epoch": 3.7764001710132535,
"grad_norm": 6.3277082443237305,
"learning_rate": 4.8066887558181265e-05,
"loss": 0.0203,
"num_input_tokens_seen": 64750016,
"step": 3315
},
{
"epoch": 3.7821006127974917,
"grad_norm": 0.5800598859786987,
"learning_rate": 4.806112837773351e-05,
"loss": 0.015,
"num_input_tokens_seen": 64847760,
"step": 3320
},
{
"epoch": 3.78780105458173,
"grad_norm": 17.387359619140625,
"learning_rate": 4.8055360977157426e-05,
"loss": 0.0503,
"num_input_tokens_seen": 64945504,
"step": 3325
},
{
"epoch": 3.7935014963659683,
"grad_norm": 6.007382392883301,
"learning_rate": 4.8049585358508776e-05,
"loss": 0.0294,
"num_input_tokens_seen": 65043232,
"step": 3330
},
{
"epoch": 3.7992019381502065,
"grad_norm": 8.47810173034668,
"learning_rate": 4.804380152384629e-05,
"loss": 0.044,
"num_input_tokens_seen": 65141024,
"step": 3335
},
{
"epoch": 3.804902379934445,
"grad_norm": 9.82911491394043,
"learning_rate": 4.8038009475231604e-05,
"loss": 0.0369,
"num_input_tokens_seen": 65238752,
"step": 3340
},
{
"epoch": 3.810602821718683,
"grad_norm": 13.116619110107422,
"learning_rate": 4.80322092147293e-05,
"loss": 0.0289,
"num_input_tokens_seen": 65336528,
"step": 3345
},
{
"epoch": 3.8163032635029213,
"grad_norm": 1.19611656665802,
"learning_rate": 4.802640074440686e-05,
"loss": 0.0214,
"num_input_tokens_seen": 65434272,
"step": 3350
},
{
"epoch": 3.82200370528716,
"grad_norm": 0.3276759386062622,
"learning_rate": 4.802058406633474e-05,
"loss": 0.0193,
"num_input_tokens_seen": 65532064,
"step": 3355
},
{
"epoch": 3.827704147071398,
"grad_norm": 6.492347240447998,
"learning_rate": 4.8014759182586274e-05,
"loss": 0.0542,
"num_input_tokens_seen": 65629792,
"step": 3360
},
{
"epoch": 3.833404588855636,
"grad_norm": 3.1319868564605713,
"learning_rate": 4.800892609523774e-05,
"loss": 0.0361,
"num_input_tokens_seen": 65727536,
"step": 3365
},
{
"epoch": 3.8391050306398746,
"grad_norm": 0.28512752056121826,
"learning_rate": 4.8003084806368336e-05,
"loss": 0.0299,
"num_input_tokens_seen": 65825200,
"step": 3370
},
{
"epoch": 3.8448054724241127,
"grad_norm": 1.0629769563674927,
"learning_rate": 4.7997235318060185e-05,
"loss": 0.0643,
"num_input_tokens_seen": 65922976,
"step": 3375
},
{
"epoch": 3.8505059142083513,
"grad_norm": 9.550495147705078,
"learning_rate": 4.799137763239835e-05,
"loss": 0.024,
"num_input_tokens_seen": 66020656,
"step": 3380
},
{
"epoch": 3.8562063559925894,
"grad_norm": 5.962581157684326,
"learning_rate": 4.798551175147079e-05,
"loss": 0.0279,
"num_input_tokens_seen": 66118384,
"step": 3385
},
{
"epoch": 3.8619067977768275,
"grad_norm": 2.609731435775757,
"learning_rate": 4.79796376773684e-05,
"loss": 0.0399,
"num_input_tokens_seen": 66216176,
"step": 3390
},
{
"epoch": 3.867607239561066,
"grad_norm": 5.378483772277832,
"learning_rate": 4.797375541218498e-05,
"loss": 0.0118,
"num_input_tokens_seen": 66313872,
"step": 3395
},
{
"epoch": 3.8733076813453042,
"grad_norm": 7.734043598175049,
"learning_rate": 4.796786495801727e-05,
"loss": 0.0262,
"num_input_tokens_seen": 66411664,
"step": 3400
},
{
"epoch": 3.879008123129543,
"grad_norm": 7.185009479522705,
"learning_rate": 4.796196631696491e-05,
"loss": 0.0313,
"num_input_tokens_seen": 66509440,
"step": 3405
},
{
"epoch": 3.884708564913781,
"grad_norm": 4.7586164474487305,
"learning_rate": 4.795605949113049e-05,
"loss": 0.0137,
"num_input_tokens_seen": 66607152,
"step": 3410
},
{
"epoch": 3.890409006698019,
"grad_norm": 0.9074054956436157,
"learning_rate": 4.795014448261947e-05,
"loss": 0.0263,
"num_input_tokens_seen": 66704880,
"step": 3415
},
{
"epoch": 3.8961094484822576,
"grad_norm": 2.6224796772003174,
"learning_rate": 4.794422129354026e-05,
"loss": 0.0146,
"num_input_tokens_seen": 66802656,
"step": 3420
},
{
"epoch": 3.9018098902664957,
"grad_norm": 0.855692982673645,
"learning_rate": 4.7938289926004185e-05,
"loss": 0.0078,
"num_input_tokens_seen": 66900480,
"step": 3425
},
{
"epoch": 3.907510332050734,
"grad_norm": 1.3807679414749146,
"learning_rate": 4.793235038212548e-05,
"loss": 0.0188,
"num_input_tokens_seen": 66998304,
"step": 3430
},
{
"epoch": 3.9132107738349724,
"grad_norm": 0.8240529298782349,
"learning_rate": 4.7926402664021275e-05,
"loss": 0.0576,
"num_input_tokens_seen": 67096000,
"step": 3435
},
{
"epoch": 3.9189112156192105,
"grad_norm": 7.201174736022949,
"learning_rate": 4.792044677381165e-05,
"loss": 0.0205,
"num_input_tokens_seen": 67193680,
"step": 3440
},
{
"epoch": 3.9246116574034486,
"grad_norm": 10.291589736938477,
"learning_rate": 4.791448271361957e-05,
"loss": 0.0524,
"num_input_tokens_seen": 67291472,
"step": 3445
},
{
"epoch": 3.930312099187687,
"grad_norm": 3.4942891597747803,
"learning_rate": 4.7908510485570925e-05,
"loss": 0.0652,
"num_input_tokens_seen": 67389216,
"step": 3450
},
{
"epoch": 3.9360125409719253,
"grad_norm": 2.04127836227417,
"learning_rate": 4.7902530091794505e-05,
"loss": 0.0356,
"num_input_tokens_seen": 67486912,
"step": 3455
},
{
"epoch": 3.9417129827561634,
"grad_norm": 4.031794548034668,
"learning_rate": 4.789654153442203e-05,
"loss": 0.0419,
"num_input_tokens_seen": 67584624,
"step": 3460
},
{
"epoch": 3.947413424540402,
"grad_norm": 8.670853614807129,
"learning_rate": 4.7890544815588115e-05,
"loss": 0.0192,
"num_input_tokens_seen": 67682320,
"step": 3465
},
{
"epoch": 3.95311386632464,
"grad_norm": 7.351383686065674,
"learning_rate": 4.788453993743028e-05,
"loss": 0.0361,
"num_input_tokens_seen": 67780064,
"step": 3470
},
{
"epoch": 3.9588143081088782,
"grad_norm": 8.677574157714844,
"learning_rate": 4.787852690208897e-05,
"loss": 0.0235,
"num_input_tokens_seen": 67877792,
"step": 3475
},
{
"epoch": 3.964514749893117,
"grad_norm": 9.356419563293457,
"learning_rate": 4.787250571170752e-05,
"loss": 0.0572,
"num_input_tokens_seen": 67975472,
"step": 3480
},
{
"epoch": 3.970215191677355,
"grad_norm": 6.9926300048828125,
"learning_rate": 4.786647636843219e-05,
"loss": 0.0837,
"num_input_tokens_seen": 68073200,
"step": 3485
},
{
"epoch": 3.975915633461593,
"grad_norm": 4.828823089599609,
"learning_rate": 4.786043887441213e-05,
"loss": 0.0422,
"num_input_tokens_seen": 68170976,
"step": 3490
},
{
"epoch": 3.9816160752458316,
"grad_norm": 11.057583808898926,
"learning_rate": 4.785439323179941e-05,
"loss": 0.0326,
"num_input_tokens_seen": 68268672,
"step": 3495
},
{
"epoch": 3.9873165170300697,
"grad_norm": 0.949934184551239,
"learning_rate": 4.784833944274899e-05,
"loss": 0.0236,
"num_input_tokens_seen": 68366432,
"step": 3500
},
{
"epoch": 3.993016958814308,
"grad_norm": 6.844513416290283,
"learning_rate": 4.784227750941873e-05,
"loss": 0.0188,
"num_input_tokens_seen": 68464128,
"step": 3505
},
{
"epoch": 3.9987174005985464,
"grad_norm": 2.295914649963379,
"learning_rate": 4.783620743396943e-05,
"loss": 0.0186,
"num_input_tokens_seen": 68561936,
"step": 3510
},
{
"epoch": 4.003420265070543,
"grad_norm": 2.7240405082702637,
"learning_rate": 4.783012921856474e-05,
"loss": 0.0217,
"num_input_tokens_seen": 68642496,
"step": 3515
},
{
"epoch": 4.009120706854781,
"grad_norm": 9.799927711486816,
"learning_rate": 4.782404286537124e-05,
"loss": 0.0442,
"num_input_tokens_seen": 68740256,
"step": 3520
},
{
"epoch": 4.01482114863902,
"grad_norm": 4.823245525360107,
"learning_rate": 4.781794837655843e-05,
"loss": 0.0601,
"num_input_tokens_seen": 68837968,
"step": 3525
},
{
"epoch": 4.020521590423257,
"grad_norm": 6.3089213371276855,
"learning_rate": 4.781184575429867e-05,
"loss": 0.0181,
"num_input_tokens_seen": 68935680,
"step": 3530
},
{
"epoch": 4.026222032207496,
"grad_norm": 0.7296550869941711,
"learning_rate": 4.780573500076723e-05,
"loss": 0.0089,
"num_input_tokens_seen": 69033408,
"step": 3535
},
{
"epoch": 4.0319224739917345,
"grad_norm": 7.376620292663574,
"learning_rate": 4.77996161181423e-05,
"loss": 0.0136,
"num_input_tokens_seen": 69131152,
"step": 3540
},
{
"epoch": 4.037622915775972,
"grad_norm": 3.193028211593628,
"learning_rate": 4.779348910860494e-05,
"loss": 0.0251,
"num_input_tokens_seen": 69228800,
"step": 3545
},
{
"epoch": 4.043323357560211,
"grad_norm": 0.5682366490364075,
"learning_rate": 4.7787353974339134e-05,
"loss": 0.0037,
"num_input_tokens_seen": 69326608,
"step": 3550
},
{
"epoch": 4.049023799344449,
"grad_norm": 2.3201677799224854,
"learning_rate": 4.778121071753174e-05,
"loss": 0.0114,
"num_input_tokens_seen": 69424368,
"step": 3555
},
{
"epoch": 4.054724241128688,
"grad_norm": 2.5661370754241943,
"learning_rate": 4.7775059340372516e-05,
"loss": 0.0177,
"num_input_tokens_seen": 69522032,
"step": 3560
},
{
"epoch": 4.060424682912926,
"grad_norm": 0.4603801667690277,
"learning_rate": 4.776889984505413e-05,
"loss": 0.0249,
"num_input_tokens_seen": 69619728,
"step": 3565
},
{
"epoch": 4.066125124697164,
"grad_norm": 3.390105962753296,
"learning_rate": 4.776273223377211e-05,
"loss": 0.0172,
"num_input_tokens_seen": 69717424,
"step": 3570
},
{
"epoch": 4.071825566481403,
"grad_norm": 0.19697071611881256,
"learning_rate": 4.7756556508724914e-05,
"loss": 0.0153,
"num_input_tokens_seen": 69815152,
"step": 3575
},
{
"epoch": 4.07752600826564,
"grad_norm": 8.34150218963623,
"learning_rate": 4.7750372672113874e-05,
"loss": 0.0209,
"num_input_tokens_seen": 69912960,
"step": 3580
},
{
"epoch": 4.083226450049879,
"grad_norm": 0.32752522826194763,
"learning_rate": 4.774418072614322e-05,
"loss": 0.0138,
"num_input_tokens_seen": 70010672,
"step": 3585
},
{
"epoch": 4.0889268918341175,
"grad_norm": 3.4794821739196777,
"learning_rate": 4.773798067302005e-05,
"loss": 0.0562,
"num_input_tokens_seen": 70108448,
"step": 3590
},
{
"epoch": 4.094627333618355,
"grad_norm": 7.853202819824219,
"learning_rate": 4.7731772514954384e-05,
"loss": 0.0245,
"num_input_tokens_seen": 70206144,
"step": 3595
},
{
"epoch": 4.100327775402594,
"grad_norm": 0.33840203285217285,
"learning_rate": 4.772555625415912e-05,
"loss": 0.0092,
"num_input_tokens_seen": 70303872,
"step": 3600
},
{
"epoch": 4.106028217186832,
"grad_norm": 6.381319999694824,
"learning_rate": 4.771933189285004e-05,
"loss": 0.0101,
"num_input_tokens_seen": 70401664,
"step": 3605
},
{
"epoch": 4.11172865897107,
"grad_norm": 1.308600902557373,
"learning_rate": 4.771309943324581e-05,
"loss": 0.021,
"num_input_tokens_seen": 70499408,
"step": 3610
},
{
"epoch": 4.1174291007553085,
"grad_norm": 1.1248642206192017,
"learning_rate": 4.7706858877567984e-05,
"loss": 0.009,
"num_input_tokens_seen": 70597200,
"step": 3615
},
{
"epoch": 4.123129542539547,
"grad_norm": 0.797878623008728,
"learning_rate": 4.770061022804102e-05,
"loss": 0.0084,
"num_input_tokens_seen": 70695008,
"step": 3620
},
{
"epoch": 4.128829984323785,
"grad_norm": 5.1671905517578125,
"learning_rate": 4.7694353486892224e-05,
"loss": 0.0086,
"num_input_tokens_seen": 70792784,
"step": 3625
},
{
"epoch": 4.134530426108023,
"grad_norm": 1.4259310960769653,
"learning_rate": 4.7688088656351827e-05,
"loss": 0.0137,
"num_input_tokens_seen": 70890576,
"step": 3630
},
{
"epoch": 4.140230867892262,
"grad_norm": 0.6012780070304871,
"learning_rate": 4.7681815738652916e-05,
"loss": 0.0331,
"num_input_tokens_seen": 70988352,
"step": 3635
},
{
"epoch": 4.1459313096765,
"grad_norm": 0.923217236995697,
"learning_rate": 4.767553473603147e-05,
"loss": 0.0235,
"num_input_tokens_seen": 71086128,
"step": 3640
},
{
"epoch": 4.151631751460738,
"grad_norm": 0.6401808261871338,
"learning_rate": 4.766924565072635e-05,
"loss": 0.0056,
"num_input_tokens_seen": 71183888,
"step": 3645
},
{
"epoch": 4.157332193244977,
"grad_norm": 4.980163097381592,
"learning_rate": 4.7662948484979304e-05,
"loss": 0.0124,
"num_input_tokens_seen": 71281648,
"step": 3650
},
{
"epoch": 4.163032635029214,
"grad_norm": 0.19848279654979706,
"learning_rate": 4.7656643241034946e-05,
"loss": 0.0377,
"num_input_tokens_seen": 71379440,
"step": 3655
},
{
"epoch": 4.168733076813453,
"grad_norm": 0.8732094168663025,
"learning_rate": 4.765032992114078e-05,
"loss": 0.0071,
"num_input_tokens_seen": 71477216,
"step": 3660
},
{
"epoch": 4.1744335185976915,
"grad_norm": 1.789801001548767,
"learning_rate": 4.7644008527547185e-05,
"loss": 0.025,
"num_input_tokens_seen": 71574992,
"step": 3665
},
{
"epoch": 4.180133960381929,
"grad_norm": 2.9071710109710693,
"learning_rate": 4.763767906250742e-05,
"loss": 0.0172,
"num_input_tokens_seen": 71672800,
"step": 3670
},
{
"epoch": 4.185834402166168,
"grad_norm": 1.144612431526184,
"learning_rate": 4.7631341528277615e-05,
"loss": 0.0092,
"num_input_tokens_seen": 71770512,
"step": 3675
},
{
"epoch": 4.191534843950406,
"grad_norm": 2.124117374420166,
"learning_rate": 4.7624995927116794e-05,
"loss": 0.0214,
"num_input_tokens_seen": 71868240,
"step": 3680
},
{
"epoch": 4.197235285734644,
"grad_norm": 1.037842035293579,
"learning_rate": 4.761864226128683e-05,
"loss": 0.0173,
"num_input_tokens_seen": 71965952,
"step": 3685
},
{
"epoch": 4.202935727518883,
"grad_norm": 16.831375122070312,
"learning_rate": 4.761228053305249e-05,
"loss": 0.0419,
"num_input_tokens_seen": 72063680,
"step": 3690
},
{
"epoch": 4.208636169303121,
"grad_norm": 0.9212270975112915,
"learning_rate": 4.76059107446814e-05,
"loss": 0.0311,
"num_input_tokens_seen": 72161472,
"step": 3695
},
{
"epoch": 4.21433661108736,
"grad_norm": 7.812607288360596,
"learning_rate": 4.759953289844409e-05,
"loss": 0.0197,
"num_input_tokens_seen": 72259120,
"step": 3700
},
{
"epoch": 4.220037052871597,
"grad_norm": 2.24538516998291,
"learning_rate": 4.759314699661392e-05,
"loss": 0.0068,
"num_input_tokens_seen": 72356848,
"step": 3705
},
{
"epoch": 4.225737494655836,
"grad_norm": 1.110796570777893,
"learning_rate": 4.758675304146715e-05,
"loss": 0.0309,
"num_input_tokens_seen": 72454608,
"step": 3710
},
{
"epoch": 4.2314379364400745,
"grad_norm": 0.3017835021018982,
"learning_rate": 4.75803510352829e-05,
"loss": 0.0124,
"num_input_tokens_seen": 72552288,
"step": 3715
},
{
"epoch": 4.237138378224312,
"grad_norm": 3.7277181148529053,
"learning_rate": 4.757394098034316e-05,
"loss": 0.0754,
"num_input_tokens_seen": 72650000,
"step": 3720
},
{
"epoch": 4.242838820008551,
"grad_norm": 8.373753547668457,
"learning_rate": 4.756752287893279e-05,
"loss": 0.01,
"num_input_tokens_seen": 72747856,
"step": 3725
},
{
"epoch": 4.248539261792789,
"grad_norm": 3.710064172744751,
"learning_rate": 4.7561096733339526e-05,
"loss": 0.0109,
"num_input_tokens_seen": 72845600,
"step": 3730
},
{
"epoch": 4.254239703577027,
"grad_norm": 3.204511880874634,
"learning_rate": 4.755466254585397e-05,
"loss": 0.0271,
"num_input_tokens_seen": 72943376,
"step": 3735
},
{
"epoch": 4.2599401453612655,
"grad_norm": 2.9513449668884277,
"learning_rate": 4.754822031876957e-05,
"loss": 0.0119,
"num_input_tokens_seen": 73041168,
"step": 3740
},
{
"epoch": 4.265640587145504,
"grad_norm": 7.3270392417907715,
"learning_rate": 4.754177005438266e-05,
"loss": 0.0168,
"num_input_tokens_seen": 73138832,
"step": 3745
},
{
"epoch": 4.271341028929742,
"grad_norm": 11.671257972717285,
"learning_rate": 4.753531175499243e-05,
"loss": 0.0544,
"num_input_tokens_seen": 73236592,
"step": 3750
},
{
"epoch": 4.27704147071398,
"grad_norm": 2.340949773788452,
"learning_rate": 4.7528845422900946e-05,
"loss": 0.0058,
"num_input_tokens_seen": 73334272,
"step": 3755
},
{
"epoch": 4.282741912498219,
"grad_norm": 6.20223331451416,
"learning_rate": 4.7522371060413126e-05,
"loss": 0.0166,
"num_input_tokens_seen": 73432016,
"step": 3760
},
{
"epoch": 4.288442354282457,
"grad_norm": 2.861288547515869,
"learning_rate": 4.751588866983676e-05,
"loss": 0.0062,
"num_input_tokens_seen": 73529760,
"step": 3765
},
{
"epoch": 4.294142796066695,
"grad_norm": 0.3826698064804077,
"learning_rate": 4.750939825348249e-05,
"loss": 0.0276,
"num_input_tokens_seen": 73627552,
"step": 3770
},
{
"epoch": 4.299843237850934,
"grad_norm": 0.9756613373756409,
"learning_rate": 4.7502899813663806e-05,
"loss": 0.0052,
"num_input_tokens_seen": 73725328,
"step": 3775
},
{
"epoch": 4.305543679635171,
"grad_norm": 0.1972053200006485,
"learning_rate": 4.749639335269709e-05,
"loss": 0.0078,
"num_input_tokens_seen": 73823024,
"step": 3780
},
{
"epoch": 4.31124412141941,
"grad_norm": 3.610668897628784,
"learning_rate": 4.748987887290156e-05,
"loss": 0.0455,
"num_input_tokens_seen": 73920736,
"step": 3785
},
{
"epoch": 4.3169445632036485,
"grad_norm": 2.3730828762054443,
"learning_rate": 4.7483356376599305e-05,
"loss": 0.0169,
"num_input_tokens_seen": 74018448,
"step": 3790
},
{
"epoch": 4.322645004987886,
"grad_norm": 6.0831618309021,
"learning_rate": 4.747682586611526e-05,
"loss": 0.0107,
"num_input_tokens_seen": 74116224,
"step": 3795
},
{
"epoch": 4.328345446772125,
"grad_norm": 8.610361099243164,
"learning_rate": 4.747028734377723e-05,
"loss": 0.0209,
"num_input_tokens_seen": 74214016,
"step": 3800
},
{
"epoch": 4.334045888556363,
"grad_norm": 8.020880699157715,
"learning_rate": 4.7463740811915856e-05,
"loss": 0.0166,
"num_input_tokens_seen": 74311712,
"step": 3805
},
{
"epoch": 4.339746330340601,
"grad_norm": 0.3767450153827667,
"learning_rate": 4.745718627286466e-05,
"loss": 0.009,
"num_input_tokens_seen": 74409504,
"step": 3810
},
{
"epoch": 4.3454467721248395,
"grad_norm": 2.0386180877685547,
"learning_rate": 4.7450623728959996e-05,
"loss": 0.0143,
"num_input_tokens_seen": 74507280,
"step": 3815
},
{
"epoch": 4.351147213909078,
"grad_norm": 8.895707130432129,
"learning_rate": 4.744405318254109e-05,
"loss": 0.0129,
"num_input_tokens_seen": 74604912,
"step": 3820
},
{
"epoch": 4.356847655693317,
"grad_norm": 1.1221814155578613,
"learning_rate": 4.743747463594999e-05,
"loss": 0.0199,
"num_input_tokens_seen": 74702720,
"step": 3825
},
{
"epoch": 4.362548097477554,
"grad_norm": 0.9217889308929443,
"learning_rate": 4.7430888091531635e-05,
"loss": 0.0065,
"num_input_tokens_seen": 74800448,
"step": 3830
},
{
"epoch": 4.368248539261793,
"grad_norm": 1.3824939727783203,
"learning_rate": 4.7424293551633785e-05,
"loss": 0.0055,
"num_input_tokens_seen": 74898160,
"step": 3835
},
{
"epoch": 4.3739489810460315,
"grad_norm": 3.7707972526550293,
"learning_rate": 4.741769101860707e-05,
"loss": 0.0253,
"num_input_tokens_seen": 74995824,
"step": 3840
},
{
"epoch": 4.379649422830269,
"grad_norm": 0.15820211172103882,
"learning_rate": 4.7411080494804944e-05,
"loss": 0.0075,
"num_input_tokens_seen": 75093584,
"step": 3845
},
{
"epoch": 4.385349864614508,
"grad_norm": 13.08034610748291,
"learning_rate": 4.7404461982583735e-05,
"loss": 0.0158,
"num_input_tokens_seen": 75191296,
"step": 3850
},
{
"epoch": 4.391050306398746,
"grad_norm": 0.30530065298080444,
"learning_rate": 4.739783548430262e-05,
"loss": 0.0131,
"num_input_tokens_seen": 75288960,
"step": 3855
},
{
"epoch": 4.396750748182984,
"grad_norm": 0.16144217550754547,
"learning_rate": 4.739120100232359e-05,
"loss": 0.0319,
"num_input_tokens_seen": 75386768,
"step": 3860
},
{
"epoch": 4.4024511899672225,
"grad_norm": 8.027372360229492,
"learning_rate": 4.7384558539011515e-05,
"loss": 0.0352,
"num_input_tokens_seen": 75484464,
"step": 3865
},
{
"epoch": 4.408151631751461,
"grad_norm": 10.079002380371094,
"learning_rate": 4.73779080967341e-05,
"loss": 0.0151,
"num_input_tokens_seen": 75582368,
"step": 3870
},
{
"epoch": 4.413852073535699,
"grad_norm": 1.2833141088485718,
"learning_rate": 4.7371249677861886e-05,
"loss": 0.0081,
"num_input_tokens_seen": 75680112,
"step": 3875
},
{
"epoch": 4.419552515319937,
"grad_norm": 0.9999586343765259,
"learning_rate": 4.736458328476826e-05,
"loss": 0.0034,
"num_input_tokens_seen": 75777840,
"step": 3880
},
{
"epoch": 4.425252957104176,
"grad_norm": 7.5928215980529785,
"learning_rate": 4.7357908919829464e-05,
"loss": 0.012,
"num_input_tokens_seen": 75875648,
"step": 3885
},
{
"epoch": 4.4309533988884136,
"grad_norm": 0.11999927461147308,
"learning_rate": 4.735122658542456e-05,
"loss": 0.0093,
"num_input_tokens_seen": 75973296,
"step": 3890
},
{
"epoch": 4.436653840672652,
"grad_norm": 1.351083755493164,
"learning_rate": 4.734453628393548e-05,
"loss": 0.0051,
"num_input_tokens_seen": 76071088,
"step": 3895
},
{
"epoch": 4.442354282456891,
"grad_norm": 0.8617928624153137,
"learning_rate": 4.733783801774696e-05,
"loss": 0.0033,
"num_input_tokens_seen": 76168848,
"step": 3900
},
{
"epoch": 4.448054724241128,
"grad_norm": 0.1406688243150711,
"learning_rate": 4.7331131789246614e-05,
"loss": 0.0052,
"num_input_tokens_seen": 76266512,
"step": 3905
},
{
"epoch": 4.453755166025367,
"grad_norm": 0.1912948042154312,
"learning_rate": 4.7324417600824854e-05,
"loss": 0.0074,
"num_input_tokens_seen": 76364288,
"step": 3910
},
{
"epoch": 4.4594556078096055,
"grad_norm": 3.995418071746826,
"learning_rate": 4.7317695454874964e-05,
"loss": 0.0096,
"num_input_tokens_seen": 76462016,
"step": 3915
},
{
"epoch": 4.465156049593843,
"grad_norm": 0.7223560214042664,
"learning_rate": 4.7310965353793044e-05,
"loss": 0.003,
"num_input_tokens_seen": 76559792,
"step": 3920
},
{
"epoch": 4.470856491378082,
"grad_norm": 1.0632505416870117,
"learning_rate": 4.730422729997804e-05,
"loss": 0.035,
"num_input_tokens_seen": 76657616,
"step": 3925
},
{
"epoch": 4.47655693316232,
"grad_norm": 1.5412085056304932,
"learning_rate": 4.729748129583171e-05,
"loss": 0.0377,
"num_input_tokens_seen": 76755312,
"step": 3930
},
{
"epoch": 4.482257374946558,
"grad_norm": 1.5897432565689087,
"learning_rate": 4.729072734375869e-05,
"loss": 0.0166,
"num_input_tokens_seen": 76853056,
"step": 3935
},
{
"epoch": 4.4879578167307965,
"grad_norm": 0.3547525405883789,
"learning_rate": 4.728396544616641e-05,
"loss": 0.0201,
"num_input_tokens_seen": 76950784,
"step": 3940
},
{
"epoch": 4.493658258515035,
"grad_norm": 0.5280705094337463,
"learning_rate": 4.727719560546514e-05,
"loss": 0.0173,
"num_input_tokens_seen": 77048592,
"step": 3945
},
{
"epoch": 4.499358700299274,
"grad_norm": 5.78103494644165,
"learning_rate": 4.7270417824068e-05,
"loss": 0.0107,
"num_input_tokens_seen": 77146336,
"step": 3950
},
{
"epoch": 4.505059142083511,
"grad_norm": 5.893618583679199,
"learning_rate": 4.726363210439092e-05,
"loss": 0.0258,
"num_input_tokens_seen": 77244000,
"step": 3955
},
|
|
{
|
|
"epoch": 4.51075958386775,
|
|
"grad_norm": 5.622200965881348,
|
|
"learning_rate": 4.725683844885266e-05,
|
|
"loss": 0.0186,
|
|
"num_input_tokens_seen": 77341856,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 4.516460025651988,
|
|
"grad_norm": 3.430377244949341,
|
|
"learning_rate": 4.725003685987482e-05,
|
|
"loss": 0.0095,
|
|
"num_input_tokens_seen": 77439648,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"epoch": 4.522160467436226,
|
|
"grad_norm": 16.007429122924805,
|
|
"learning_rate": 4.724322733988183e-05,
|
|
"loss": 0.0637,
|
|
"num_input_tokens_seen": 77537440,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 4.527860909220465,
|
|
"grad_norm": 0.6529415845870972,
|
|
"learning_rate": 4.7236409891300934e-05,
|
|
"loss": 0.0133,
|
|
"num_input_tokens_seen": 77635136,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"epoch": 4.533561351004703,
|
|
"grad_norm": 0.014979444444179535,
|
|
"learning_rate": 4.722958451656221e-05,
|
|
"loss": 0.0353,
|
|
"num_input_tokens_seen": 77732848,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 4.539261792788941,
|
|
"grad_norm": 0.7652057409286499,
|
|
"learning_rate": 4.722275121809856e-05,
|
|
"loss": 0.0204,
|
|
"num_input_tokens_seen": 77830576,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"epoch": 4.5449622345731795,
|
|
"grad_norm": 9.353001594543457,
|
|
"learning_rate": 4.721590999834571e-05,
|
|
"loss": 0.0329,
|
|
"num_input_tokens_seen": 77928320,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 4.550662676357418,
|
|
"grad_norm": 0.0680394098162651,
|
|
"learning_rate": 4.720906085974221e-05,
|
|
"loss": 0.0065,
|
|
"num_input_tokens_seen": 78026032,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"epoch": 4.556363118141656,
|
|
"grad_norm": 2.323220729827881,
|
|
"learning_rate": 4.720220380472942e-05,
|
|
"loss": 0.0066,
|
|
"num_input_tokens_seen": 78123696,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 4.562063559925894,
|
|
"grad_norm": 0.5926280617713928,
|
|
"learning_rate": 4.719533883575155e-05,
|
|
"loss": 0.0043,
|
|
"num_input_tokens_seen": 78221376,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"epoch": 4.567764001710133,
|
|
"grad_norm": 0.9745510220527649,
|
|
"learning_rate": 4.7188465955255604e-05,
|
|
"loss": 0.0147,
|
|
"num_input_tokens_seen": 78319104,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 4.5734644434943705,
|
|
"grad_norm": 10.33803653717041,
|
|
"learning_rate": 4.7181585165691437e-05,
|
|
"loss": 0.0112,
|
|
"num_input_tokens_seen": 78416816,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"epoch": 4.579164885278609,
|
|
"grad_norm": 9.621374130249023,
|
|
"learning_rate": 4.7174696469511674e-05,
|
|
"loss": 0.0222,
|
|
"num_input_tokens_seen": 78514656,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 4.584865327062848,
|
|
"grad_norm": 4.345292568206787,
|
|
"learning_rate": 4.716779986917182e-05,
|
|
"loss": 0.0084,
|
|
"num_input_tokens_seen": 78612400,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"epoch": 4.590565768847085,
|
|
"grad_norm": 1.6130069494247437,
|
|
"learning_rate": 4.7160895367130125e-05,
|
|
"loss": 0.0068,
|
|
"num_input_tokens_seen": 78710256,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 4.596266210631324,
|
|
"grad_norm": 1.0490821599960327,
|
|
"learning_rate": 4.715398296584773e-05,
|
|
"loss": 0.0086,
|
|
"num_input_tokens_seen": 78807936,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"epoch": 4.6019666524155625,
|
|
"grad_norm": 10.216976165771484,
|
|
"learning_rate": 4.714706266778854e-05,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 78905744,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 4.6076670941998,
|
|
"grad_norm": 0.1510315239429474,
|
|
"learning_rate": 4.7140134475419304e-05,
|
|
"loss": 0.0195,
|
|
"num_input_tokens_seen": 79003584,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"epoch": 4.613367535984039,
|
|
"grad_norm": 0.32562801241874695,
|
|
"learning_rate": 4.7133198391209566e-05,
|
|
"loss": 0.0103,
|
|
"num_input_tokens_seen": 79101408,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 4.619067977768277,
|
|
"grad_norm": 1.025244116783142,
|
|
"learning_rate": 4.7126254417631686e-05,
|
|
"loss": 0.0022,
|
|
"num_input_tokens_seen": 79199136,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"epoch": 4.624768419552515,
|
|
"grad_norm": 0.8182079195976257,
|
|
"learning_rate": 4.7119302557160844e-05,
|
|
"loss": 0.0032,
|
|
"num_input_tokens_seen": 79296832,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 4.6304688613367535,
|
|
"grad_norm": 0.6801844835281372,
|
|
"learning_rate": 4.7112342812275026e-05,
|
|
"loss": 0.012,
|
|
"num_input_tokens_seen": 79394528,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"epoch": 4.636169303120992,
|
|
"grad_norm": 0.7363019585609436,
|
|
"learning_rate": 4.7105375185455034e-05,
|
|
"loss": 0.0055,
|
|
"num_input_tokens_seen": 79492352,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 4.641869744905231,
|
|
"grad_norm": 4.005695819854736,
|
|
"learning_rate": 4.709839967918447e-05,
|
|
"loss": 0.0195,
|
|
"num_input_tokens_seen": 79590064,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"epoch": 4.647570186689468,
|
|
"grad_norm": 0.4576650857925415,
|
|
"learning_rate": 4.709141629594975e-05,
|
|
"loss": 0.0074,
|
|
"num_input_tokens_seen": 79687856,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 4.653270628473707,
|
|
"grad_norm": 0.05043329671025276,
|
|
"learning_rate": 4.708442503824011e-05,
|
|
"loss": 0.0175,
|
|
"num_input_tokens_seen": 79785600,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"epoch": 4.6589710702579445,
|
|
"grad_norm": 0.19624769687652588,
|
|
"learning_rate": 4.707742590854756e-05,
|
|
"loss": 0.0029,
|
|
"num_input_tokens_seen": 79883424,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 4.664671512042183,
|
|
"grad_norm": 0.46751806139945984,
|
|
"learning_rate": 4.7070418909366954e-05,
|
|
"loss": 0.0192,
|
|
"num_input_tokens_seen": 79981152,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"epoch": 4.670371953826422,
|
|
"grad_norm": 1.4042103290557861,
|
|
"learning_rate": 4.706340404319593e-05,
|
|
"loss": 0.002,
|
|
"num_input_tokens_seen": 80078864,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 4.67607239561066,
|
|
"grad_norm": 0.30372464656829834,
|
|
"learning_rate": 4.705638131253492e-05,
|
|
"loss": 0.0029,
|
|
"num_input_tokens_seen": 80176672,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"epoch": 4.681772837394898,
|
|
"grad_norm": 12.20240306854248,
|
|
"learning_rate": 4.704935071988718e-05,
|
|
"loss": 0.0156,
|
|
"num_input_tokens_seen": 80274272,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 4.6874732791791365,
|
|
"grad_norm": 0.24599520862102509,
|
|
"learning_rate": 4.704231226775877e-05,
|
|
"loss": 0.0106,
|
|
"num_input_tokens_seen": 80372080,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"epoch": 4.693173720963375,
|
|
"grad_norm": 5.936103820800781,
|
|
"learning_rate": 4.7035265958658545e-05,
|
|
"loss": 0.0063,
|
|
"num_input_tokens_seen": 80469824,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 4.698874162747613,
|
|
"grad_norm": 1.1616228818893433,
|
|
"learning_rate": 4.702821179509814e-05,
|
|
"loss": 0.0153,
|
|
"num_input_tokens_seen": 80567536,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"epoch": 4.704574604531851,
|
|
"grad_norm": 11.066569328308105,
|
|
"learning_rate": 4.702114977959203e-05,
|
|
"loss": 0.0302,
|
|
"num_input_tokens_seen": 80665344,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 4.71027504631609,
|
|
"grad_norm": 0.9506548047065735,
|
|
"learning_rate": 4.701407991465745e-05,
|
|
"loss": 0.0058,
|
|
"num_input_tokens_seen": 80763072,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"epoch": 4.7159754881003275,
|
|
"grad_norm": 0.13321082293987274,
|
|
"learning_rate": 4.700700220281446e-05,
|
|
"loss": 0.0023,
|
|
"num_input_tokens_seen": 80860816,
|
|
"step": 4140
|
|
},
|
|
    {
      "epoch": 4.721675929884566,
      "grad_norm": 3.3196537494659424,
      "learning_rate": 4.699991664658591e-05,
      "loss": 0.0058,
      "num_input_tokens_seen": 80958480,
      "step": 4145
    },
    {
      "epoch": 4.727376371668805,
      "grad_norm": 0.7469080686569214,
      "learning_rate": 4.699282324849742e-05,
      "loss": 0.0398,
      "num_input_tokens_seen": 81056144,
      "step": 4150
    },
    {
      "epoch": 4.733076813453042,
      "grad_norm": 6.437920093536377,
      "learning_rate": 4.698572201107746e-05,
      "loss": 0.0205,
      "num_input_tokens_seen": 81153888,
      "step": 4155
    },
    {
      "epoch": 4.738777255237281,
      "grad_norm": 6.570982456207275,
      "learning_rate": 4.697861293685724e-05,
      "loss": 0.0083,
      "num_input_tokens_seen": 81251680,
      "step": 4160
    },
    {
      "epoch": 4.7444776970215194,
      "grad_norm": 2.7099902629852295,
      "learning_rate": 4.69714960283708e-05,
      "loss": 0.0038,
      "num_input_tokens_seen": 81349360,
      "step": 4165
    },
    {
      "epoch": 4.750178138805757,
      "grad_norm": 0.25943759083747864,
      "learning_rate": 4.696437128815494e-05,
      "loss": 0.0249,
      "num_input_tokens_seen": 81447104,
      "step": 4170
    },
    {
      "epoch": 4.755878580589996,
      "grad_norm": 1.0403450727462769,
      "learning_rate": 4.6957238718749295e-05,
      "loss": 0.0079,
      "num_input_tokens_seen": 81544896,
      "step": 4175
    },
    {
      "epoch": 4.761579022374234,
      "grad_norm": 6.538539886474609,
      "learning_rate": 4.6950098322696254e-05,
      "loss": 0.0292,
      "num_input_tokens_seen": 81642576,
      "step": 4180
    },
    {
      "epoch": 4.767279464158472,
      "grad_norm": 0.7306740283966064,
      "learning_rate": 4.6942950102541007e-05,
      "loss": 0.0153,
      "num_input_tokens_seen": 81740384,
      "step": 4185
    },
    {
      "epoch": 4.7729799059427105,
      "grad_norm": 1.1523919105529785,
      "learning_rate": 4.693579406083153e-05,
      "loss": 0.0137,
      "num_input_tokens_seen": 81838112,
      "step": 4190
    },
    {
      "epoch": 4.778680347726949,
      "grad_norm": 1.3222236633300781,
      "learning_rate": 4.69286302001186e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 81935856,
      "step": 4195
    },
    {
      "epoch": 4.784380789511188,
      "grad_norm": 4.96268367767334,
      "learning_rate": 4.692145852295576e-05,
      "loss": 0.0059,
      "num_input_tokens_seen": 82033616,
      "step": 4200
    },
    {
      "epoch": 4.790081231295425,
      "grad_norm": 1.5982474088668823,
      "learning_rate": 4.6914279031899364e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 82131360,
      "step": 4205
    },
    {
      "epoch": 4.795781673079664,
      "grad_norm": 3.684070110321045,
      "learning_rate": 4.690709172950854e-05,
      "loss": 0.0113,
      "num_input_tokens_seen": 82229136,
      "step": 4210
    },
    {
      "epoch": 4.8014821148639015,
      "grad_norm": 12.136567115783691,
      "learning_rate": 4.689989661834518e-05,
      "loss": 0.0284,
      "num_input_tokens_seen": 82326864,
      "step": 4215
    },
    {
      "epoch": 4.80718255664814,
      "grad_norm": 0.33026665449142456,
      "learning_rate": 4.6892693700973994e-05,
      "loss": 0.0104,
      "num_input_tokens_seen": 82424672,
      "step": 4220
    },
    {
      "epoch": 4.812882998432379,
      "grad_norm": 8.315051078796387,
      "learning_rate": 4.688548297996245e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 82522400,
      "step": 4225
    },
    {
      "epoch": 4.818583440216617,
      "grad_norm": 0.24934843182563782,
      "learning_rate": 4.687826445788081e-05,
      "loss": 0.0035,
      "num_input_tokens_seen": 82620208,
      "step": 4230
    },
    {
      "epoch": 4.824283882000855,
      "grad_norm": 0.1164386197924614,
      "learning_rate": 4.687103813730211e-05,
      "loss": 0.0092,
      "num_input_tokens_seen": 82717856,
      "step": 4235
    },
    {
      "epoch": 4.8299843237850935,
      "grad_norm": 11.615945816040039,
      "learning_rate": 4.686380402080218e-05,
      "loss": 0.0131,
      "num_input_tokens_seen": 82815632,
      "step": 4240
    },
    {
      "epoch": 4.835684765569331,
      "grad_norm": 0.6481454968452454,
      "learning_rate": 4.68565621109596e-05,
      "loss": 0.0011,
      "num_input_tokens_seen": 82913296,
      "step": 4245
    },
    {
      "epoch": 4.84138520735357,
      "grad_norm": 0.0655444860458374,
      "learning_rate": 4.6849312410355755e-05,
      "loss": 0.0198,
      "num_input_tokens_seen": 83011072,
      "step": 4250
    },
    {
      "epoch": 4.847085649137808,
      "grad_norm": 6.334054470062256,
      "learning_rate": 4.68420549215748e-05,
      "loss": 0.0048,
      "num_input_tokens_seen": 83108864,
      "step": 4255
    },
    {
      "epoch": 4.852786090922047,
      "grad_norm": 0.20101669430732727,
      "learning_rate": 4.6834789647203656e-05,
      "loss": 0.0048,
      "num_input_tokens_seen": 83206608,
      "step": 4260
    },
    {
      "epoch": 4.8584865327062845,
      "grad_norm": 14.852635383605957,
      "learning_rate": 4.6827516589832025e-05,
      "loss": 0.0461,
      "num_input_tokens_seen": 83304336,
      "step": 4265
    },
    {
      "epoch": 4.864186974490523,
      "grad_norm": 0.06325986981391907,
      "learning_rate": 4.68202357520524e-05,
      "loss": 0.0093,
      "num_input_tokens_seen": 83402064,
      "step": 4270
    },
    {
      "epoch": 4.869887416274762,
      "grad_norm": 1.859485149383545,
      "learning_rate": 4.681294713646002e-05,
      "loss": 0.0104,
      "num_input_tokens_seen": 83499824,
      "step": 4275
    },
    {
      "epoch": 4.875587858058999,
      "grad_norm": 0.47888869047164917,
      "learning_rate": 4.68056507456529e-05,
      "loss": 0.0106,
      "num_input_tokens_seen": 83597536,
      "step": 4280
    },
    {
      "epoch": 4.881288299843238,
      "grad_norm": 1.207236886024475,
      "learning_rate": 4.6798346582231855e-05,
      "loss": 0.0049,
      "num_input_tokens_seen": 83695296,
      "step": 4285
    },
    {
      "epoch": 4.886988741627476,
      "grad_norm": 0.27957767248153687,
      "learning_rate": 4.679103464880044e-05,
      "loss": 0.0017,
      "num_input_tokens_seen": 83793024,
      "step": 4290
    },
    {
      "epoch": 4.892689183411714,
      "grad_norm": 0.03679969534277916,
      "learning_rate": 4.678371494796499e-05,
      "loss": 0.0023,
      "num_input_tokens_seen": 83890752,
      "step": 4295
    },
    {
      "epoch": 4.898389625195953,
      "grad_norm": 1.9648526906967163,
      "learning_rate": 4.677638748233461e-05,
      "loss": 0.0168,
      "num_input_tokens_seen": 83988512,
      "step": 4300
    },
    {
      "epoch": 4.904090066980191,
      "grad_norm": 0.7433627247810364,
      "learning_rate": 4.676905225452117e-05,
      "loss": 0.0128,
      "num_input_tokens_seen": 84086352,
      "step": 4305
    },
    {
      "epoch": 4.909790508764429,
      "grad_norm": 1.4374768733978271,
      "learning_rate": 4.676170926713932e-05,
      "loss": 0.0019,
      "num_input_tokens_seen": 84184032,
      "step": 4310
    },
    {
      "epoch": 4.9154909505486675,
      "grad_norm": 0.46811923384666443,
      "learning_rate": 4.6754358522806454e-05,
      "loss": 0.0019,
      "num_input_tokens_seen": 84281776,
      "step": 4315
    },
    {
      "epoch": 4.921191392332906,
      "grad_norm": 2.098421573638916,
      "learning_rate": 4.6747000024142734e-05,
      "loss": 0.0169,
      "num_input_tokens_seen": 84379472,
      "step": 4320
    },
    {
      "epoch": 4.926891834117144,
      "grad_norm": 3.727424383163452,
      "learning_rate": 4.673963377377111e-05,
      "loss": 0.009,
      "num_input_tokens_seen": 84477232,
      "step": 4325
    },
    {
      "epoch": 4.932592275901382,
      "grad_norm": 9.418045043945312,
      "learning_rate": 4.6732259774317264e-05,
      "loss": 0.0283,
      "num_input_tokens_seen": 84574992,
      "step": 4330
    },
    {
      "epoch": 4.938292717685621,
      "grad_norm": 8.13887882232666,
      "learning_rate": 4.672487802840966e-05,
      "loss": 0.0163,
      "num_input_tokens_seen": 84672800,
      "step": 4335
    },
    {
      "epoch": 4.9439931594698585,
      "grad_norm": 0.15979628264904022,
      "learning_rate": 4.671748853867952e-05,
      "loss": 0.0126,
      "num_input_tokens_seen": 84770416,
      "step": 4340
    },
    {
      "epoch": 4.949693601254097,
      "grad_norm": 10.529417991638184,
      "learning_rate": 4.671009130776083e-05,
      "loss": 0.0189,
      "num_input_tokens_seen": 84868256,
      "step": 4345
    },
    {
      "epoch": 4.955394043038336,
      "grad_norm": 1.08811354637146,
      "learning_rate": 4.670268633829031e-05,
      "loss": 0.0016,
      "num_input_tokens_seen": 84965872,
      "step": 4350
    },
    {
      "epoch": 4.961094484822574,
      "grad_norm": 0.6671218872070312,
      "learning_rate": 4.6695273632907476e-05,
      "loss": 0.0025,
      "num_input_tokens_seen": 85063648,
      "step": 4355
    },
    {
      "epoch": 4.966794926606812,
      "grad_norm": 3.7817630767822266,
      "learning_rate": 4.668785319425458e-05,
      "loss": 0.0207,
      "num_input_tokens_seen": 85161424,
      "step": 4360
    },
    {
      "epoch": 4.97249536839105,
      "grad_norm": 3.2574493885040283,
      "learning_rate": 4.668042502497663e-05,
      "loss": 0.0183,
      "num_input_tokens_seen": 85259088,
      "step": 4365
    },
    {
      "epoch": 4.978195810175288,
      "grad_norm": 3.2037136554718018,
      "learning_rate": 4.66729891277214e-05,
      "loss": 0.0128,
      "num_input_tokens_seen": 85356816,
      "step": 4370
    },
    {
      "epoch": 4.983896251959527,
      "grad_norm": 3.986717462539673,
      "learning_rate": 4.66655455051394e-05,
      "loss": 0.0043,
      "num_input_tokens_seen": 85454656,
      "step": 4375
    },
    {
      "epoch": 4.989596693743765,
      "grad_norm": 1.5552836656570435,
      "learning_rate": 4.6658094159883916e-05,
      "loss": 0.0275,
      "num_input_tokens_seen": 85552432,
      "step": 4380
    },
    {
      "epoch": 4.995297135528004,
      "grad_norm": 0.39177027344703674,
      "learning_rate": 4.665063509461097e-05,
      "loss": 0.0053,
      "num_input_tokens_seen": 85650144,
      "step": 4385
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.13293787837028503,
      "learning_rate": 4.6643168311979345e-05,
      "loss": 0.0034,
      "num_input_tokens_seen": 85730720,
      "step": 4390
    },
    {
      "epoch": 5.005700441784239,
      "grad_norm": 1.8142844438552856,
      "learning_rate": 4.663569381465058e-05,
      "loss": 0.0094,
      "num_input_tokens_seen": 85828432,
      "step": 4395
    },
    {
      "epoch": 5.011400883568476,
      "grad_norm": 0.5450536012649536,
      "learning_rate": 4.662821160528894e-05,
      "loss": 0.0019,
      "num_input_tokens_seen": 85926048,
      "step": 4400
    },
    {
      "epoch": 5.017101325352715,
      "grad_norm": 0.5705410242080688,
      "learning_rate": 4.662072168656146e-05,
      "loss": 0.0311,
      "num_input_tokens_seen": 86023760,
      "step": 4405
    },
    {
      "epoch": 5.022801767136953,
      "grad_norm": 0.47627347707748413,
      "learning_rate": 4.661322406113794e-05,
      "loss": 0.005,
      "num_input_tokens_seen": 86121552,
      "step": 4410
    },
    {
      "epoch": 5.028502208921191,
      "grad_norm": 5.517219066619873,
      "learning_rate": 4.6605718731690874e-05,
      "loss": 0.0048,
      "num_input_tokens_seen": 86219200,
      "step": 4415
    },
    {
      "epoch": 5.03420265070543,
      "grad_norm": 0.165016770362854,
      "learning_rate": 4.659820570089555e-05,
      "loss": 0.0025,
      "num_input_tokens_seen": 86316976,
      "step": 4420
    },
    {
      "epoch": 5.039903092489668,
      "grad_norm": 2.5400843620300293,
      "learning_rate": 4.659068497142998e-05,
      "loss": 0.0026,
      "num_input_tokens_seen": 86414736,
      "step": 4425
    },
    {
      "epoch": 5.045603534273906,
      "grad_norm": 1.7173391580581665,
      "learning_rate": 4.658315654597492e-05,
      "loss": 0.0037,
      "num_input_tokens_seen": 86512528,
      "step": 4430
    },
    {
      "epoch": 5.051303976058144,
      "grad_norm": 0.1867997944355011,
      "learning_rate": 4.657562042721388e-05,
      "loss": 0.001,
      "num_input_tokens_seen": 86610224,
      "step": 4435
    },
    {
      "epoch": 5.057004417842383,
      "grad_norm": 1.1393805742263794,
      "learning_rate": 4.65680766178331e-05,
      "loss": 0.0047,
      "num_input_tokens_seen": 86708000,
      "step": 4440
    },
    {
      "epoch": 5.062704859626621,
      "grad_norm": 4.353109836578369,
      "learning_rate": 4.656052512052158e-05,
      "loss": 0.0031,
      "num_input_tokens_seen": 86805696,
      "step": 4445
    },
    {
      "epoch": 5.068405301410859,
      "grad_norm": 0.10244199633598328,
      "learning_rate": 4.655296593797104e-05,
      "loss": 0.0167,
      "num_input_tokens_seen": 86903504,
      "step": 4450
    },
    {
      "epoch": 5.074105743195098,
      "grad_norm": 3.0064287185668945,
      "learning_rate": 4.654539907287594e-05,
      "loss": 0.0035,
      "num_input_tokens_seen": 87001264,
      "step": 4455
    },
    {
      "epoch": 5.079806184979336,
      "grad_norm": 2.2633399963378906,
      "learning_rate": 4.653782452793349e-05,
      "loss": 0.0022,
      "num_input_tokens_seen": 87099008,
      "step": 4460
    },
    {
      "epoch": 5.085506626763574,
      "grad_norm": 0.3934509754180908,
      "learning_rate": 4.653024230584364e-05,
      "loss": 0.0061,
      "num_input_tokens_seen": 87196672,
      "step": 4465
    },
    {
      "epoch": 5.091207068547813,
      "grad_norm": 0.034104038029909134,
      "learning_rate": 4.6522652409309064e-05,
      "loss": 0.0017,
      "num_input_tokens_seen": 87294416,
      "step": 4470
    },
    {
      "epoch": 5.096907510332051,
      "grad_norm": 2.047616720199585,
      "learning_rate": 4.651505484103518e-05,
      "loss": 0.0136,
      "num_input_tokens_seen": 87392128,
      "step": 4475
    },
    {
      "epoch": 5.102607952116289,
      "grad_norm": 4.767343044281006,
      "learning_rate": 4.6507449603730135e-05,
      "loss": 0.0118,
      "num_input_tokens_seen": 87489840,
      "step": 4480
    },
    {
      "epoch": 5.108308393900527,
      "grad_norm": 0.24816875159740448,
      "learning_rate": 4.6499836700104806e-05,
      "loss": 0.0083,
      "num_input_tokens_seen": 87587568,
      "step": 4485
    },
    {
      "epoch": 5.114008835684766,
      "grad_norm": 0.16580072045326233,
      "learning_rate": 4.6492216132872824e-05,
      "loss": 0.0053,
      "num_input_tokens_seen": 87685264,
      "step": 4490
    },
    {
      "epoch": 5.119709277469004,
      "grad_norm": 0.23322570323944092,
      "learning_rate": 4.648458790475052e-05,
      "loss": 0.0026,
      "num_input_tokens_seen": 87783088,
      "step": 4495
    },
    {
      "epoch": 5.125409719253242,
      "grad_norm": 0.2388758510351181,
      "learning_rate": 4.6476952018456974e-05,
      "loss": 0.0009,
      "num_input_tokens_seen": 87880832,
      "step": 4500
    },
    {
      "epoch": 5.131110161037481,
      "grad_norm": 2.167498826980591,
      "learning_rate": 4.646930847671401e-05,
      "loss": 0.009,
      "num_input_tokens_seen": 87978544,
      "step": 4505
    },
    {
      "epoch": 5.136810602821718,
      "grad_norm": 0.15172941982746124,
      "learning_rate": 4.646165728224616e-05,
      "loss": 0.0029,
      "num_input_tokens_seen": 88076304,
      "step": 4510
    },
    {
      "epoch": 5.142511044605957,
      "grad_norm": 1.221136450767517,
      "learning_rate": 4.645399843778068e-05,
      "loss": 0.0045,
      "num_input_tokens_seen": 88174016,
      "step": 4515
    },
    {
      "epoch": 5.1482114863901955,
      "grad_norm": 0.21661746501922607,
      "learning_rate": 4.644633194604756e-05,
      "loss": 0.013,
      "num_input_tokens_seen": 88271632,
      "step": 4520
    },
    {
      "epoch": 5.153911928174433,
      "grad_norm": 3.4261422157287598,
      "learning_rate": 4.6438657809779526e-05,
      "loss": 0.0069,
      "num_input_tokens_seen": 88369312,
      "step": 4525
    },
    {
      "epoch": 5.159612369958672,
      "grad_norm": 0.3439682126045227,
      "learning_rate": 4.6430976031712017e-05,
      "loss": 0.0014,
      "num_input_tokens_seen": 88467120,
      "step": 4530
    },
    {
      "epoch": 5.16531281174291,
      "grad_norm": 13.96764087677002,
      "learning_rate": 4.6423286614583195e-05,
      "loss": 0.0218,
      "num_input_tokens_seen": 88564848,
      "step": 4535
    },
    {
      "epoch": 5.171013253527148,
      "grad_norm": 0.09054847806692123,
      "learning_rate": 4.641558956113396e-05,
      "loss": 0.0054,
      "num_input_tokens_seen": 88662560,
      "step": 4540
    },
    {
      "epoch": 5.176713695311387,
      "grad_norm": 1.0485111474990845,
      "learning_rate": 4.640788487410791e-05,
      "loss": 0.0044,
      "num_input_tokens_seen": 88760400,
      "step": 4545
    },
    {
      "epoch": 5.182414137095625,
      "grad_norm": 0.10858794301748276,
      "learning_rate": 4.640017255625139e-05,
      "loss": 0.0009,
      "num_input_tokens_seen": 88858096,
      "step": 4550
    },
    {
      "epoch": 5.188114578879863,
      "grad_norm": 0.07652360200881958,
      "learning_rate": 4.639245261031344e-05,
      "loss": 0.0239,
      "num_input_tokens_seen": 88955856,
      "step": 4555
    },
    {
      "epoch": 5.193815020664101,
      "grad_norm": 0.6881747841835022,
      "learning_rate": 4.638472503904583e-05,
      "loss": 0.0009,
      "num_input_tokens_seen": 89053600,
      "step": 4560
    },
    {
      "epoch": 5.19951546244834,
      "grad_norm": 0.08055282384157181,
      "learning_rate": 4.637698984520307e-05,
      "loss": 0.0034,
      "num_input_tokens_seen": 89151296,
      "step": 4565
    },
    {
      "epoch": 5.205215904232578,
      "grad_norm": 0.08773194998502731,
      "learning_rate": 4.636924703154234e-05,
      "loss": 0.0121,
      "num_input_tokens_seen": 89249120,
      "step": 4570
    },
    {
      "epoch": 5.210916346016816,
      "grad_norm": 0.2949371039867401,
      "learning_rate": 4.636149660082358e-05,
      "loss": 0.0049,
      "num_input_tokens_seen": 89346832,
      "step": 4575
    },
    {
      "epoch": 5.216616787801055,
      "grad_norm": 7.335551738739014,
      "learning_rate": 4.635373855580942e-05,
      "loss": 0.0274,
      "num_input_tokens_seen": 89444576,
      "step": 4580
    },
    {
      "epoch": 5.222317229585292,
      "grad_norm": 2.2080814838409424,
      "learning_rate": 4.634597289926521e-05,
      "loss": 0.0128,
      "num_input_tokens_seen": 89542288,
      "step": 4585
    },
    {
      "epoch": 5.228017671369531,
      "grad_norm": 1.00960111618042,
      "learning_rate": 4.6338199633959025e-05,
      "loss": 0.0036,
      "num_input_tokens_seen": 89640096,
      "step": 4590
    },
    {
      "epoch": 5.23371811315377,
      "grad_norm": 0.1926228553056717,
      "learning_rate": 4.6330418762661624e-05,
      "loss": 0.0061,
      "num_input_tokens_seen": 89737872,
      "step": 4595
    },
    {
      "epoch": 5.239418554938008,
      "grad_norm": 0.0730406790971756,
      "learning_rate": 4.632263028814652e-05,
      "loss": 0.0383,
      "num_input_tokens_seen": 89835552,
      "step": 4600
    },
    {
      "epoch": 5.245118996722246,
      "grad_norm": 0.9148241281509399,
      "learning_rate": 4.6314834213189884e-05,
      "loss": 0.0167,
      "num_input_tokens_seen": 89933232,
      "step": 4605
    },
    {
      "epoch": 5.250819438506484,
      "grad_norm": 1.8269907236099243,
      "learning_rate": 4.630703054057063e-05,
      "loss": 0.006,
      "num_input_tokens_seen": 90030960,
      "step": 4610
    },
    {
      "epoch": 5.256519880290723,
      "grad_norm": 0.4568536877632141,
      "learning_rate": 4.6299219273070396e-05,
      "loss": 0.0105,
      "num_input_tokens_seen": 90128784,
      "step": 4615
    },
    {
      "epoch": 5.262220322074961,
      "grad_norm": 0.6757635474205017,
      "learning_rate": 4.629140041347347e-05,
      "loss": 0.0083,
      "num_input_tokens_seen": 90226576,
      "step": 4620
    },
    {
      "epoch": 5.267920763859199,
      "grad_norm": 8.738912582397461,
      "learning_rate": 4.628357396456692e-05,
      "loss": 0.0166,
      "num_input_tokens_seen": 90324304,
      "step": 4625
    },
    {
      "epoch": 5.273621205643438,
      "grad_norm": 1.5971795320510864,
      "learning_rate": 4.627573992914044e-05,
      "loss": 0.0029,
      "num_input_tokens_seen": 90421920,
      "step": 4630
    },
    {
      "epoch": 5.279321647427675,
      "grad_norm": 8.538966178894043,
      "learning_rate": 4.626789830998649e-05,
      "loss": 0.0098,
      "num_input_tokens_seen": 90519728,
      "step": 4635
    },
    {
      "epoch": 5.285022089211914,
      "grad_norm": 0.06448430567979813,
      "learning_rate": 4.626004910990021e-05,
      "loss": 0.0135,
      "num_input_tokens_seen": 90617440,
      "step": 4640
    },
    {
      "epoch": 5.2907225309961525,
      "grad_norm": 7.718270301818848,
      "learning_rate": 4.625219233167944e-05,
      "loss": 0.015,
      "num_input_tokens_seen": 90715248,
      "step": 4645
    },
    {
      "epoch": 5.29642297278039,
      "grad_norm": 0.2514442801475525,
      "learning_rate": 4.6244327978124734e-05,
      "loss": 0.0031,
      "num_input_tokens_seen": 90812960,
      "step": 4650
    },
    {
      "epoch": 5.302123414564629,
      "grad_norm": 0.28784915804862976,
      "learning_rate": 4.623645605203932e-05,
      "loss": 0.0063,
      "num_input_tokens_seen": 90910624,
      "step": 4655
    },
    {
      "epoch": 5.307823856348867,
      "grad_norm": 0.1487150639295578,
      "learning_rate": 4.6228576556229156e-05,
      "loss": 0.0035,
      "num_input_tokens_seen": 91008320,
      "step": 4660
    },
    {
      "epoch": 5.313524298133105,
      "grad_norm": 0.19565777480602264,
      "learning_rate": 4.622068949350289e-05,
      "loss": 0.0022,
      "num_input_tokens_seen": 91106128,
      "step": 4665
    },
    {
      "epoch": 5.319224739917344,
      "grad_norm": 0.2649058401584625,
      "learning_rate": 4.6212794866671836e-05,
      "loss": 0.0156,
      "num_input_tokens_seen": 91203968,
      "step": 4670
    },
    {
      "epoch": 5.324925181701582,
      "grad_norm": 1.254876732826233,
      "learning_rate": 4.620489267855006e-05,
      "loss": 0.0014,
      "num_input_tokens_seen": 91301696,
      "step": 4675
    },
    {
      "epoch": 5.33062562348582,
      "grad_norm": 0.03180227801203728,
      "learning_rate": 4.619698293195427e-05,
      "loss": 0.0046,
      "num_input_tokens_seen": 91399360,
      "step": 4680
    },
    {
      "epoch": 5.336326065270058,
      "grad_norm": 4.16030216217041,
      "learning_rate": 4.618906562970391e-05,
      "loss": 0.0031,
      "num_input_tokens_seen": 91497088,
      "step": 4685
    },
    {
      "epoch": 5.342026507054297,
      "grad_norm": 0.0919002890586853,
      "learning_rate": 4.6181140774621077e-05,
      "loss": 0.0021,
      "num_input_tokens_seen": 91594688,
      "step": 4690
    },
    {
      "epoch": 5.347726948838535,
      "grad_norm": 4.587754249572754,
      "learning_rate": 4.617320836953061e-05,
      "loss": 0.0129,
      "num_input_tokens_seen": 91692448,
      "step": 4695
    },
    {
      "epoch": 5.353427390622773,
      "grad_norm": 0.5592033863067627,
      "learning_rate": 4.6165268417259986e-05,
      "loss": 0.002,
      "num_input_tokens_seen": 91790160,
      "step": 4700
    },
    {
      "epoch": 5.359127832407012,
      "grad_norm": 9.988340377807617,
      "learning_rate": 4.6157320920639406e-05,
      "loss": 0.0083,
      "num_input_tokens_seen": 91887888,
      "step": 4705
    },
    {
      "epoch": 5.364828274191249,
      "grad_norm": 1.1014900207519531,
      "learning_rate": 4.6149365882501754e-05,
      "loss": 0.0049,
      "num_input_tokens_seen": 91985648,
      "step": 4710
    },
    {
      "epoch": 5.370528715975488,
      "grad_norm": 0.13604551553726196,
      "learning_rate": 4.614140330568261e-05,
      "loss": 0.0091,
      "num_input_tokens_seen": 92083408,
      "step": 4715
    },
    {
      "epoch": 5.3762291577597265,
      "grad_norm": 13.926383972167969,
      "learning_rate": 4.6133433193020206e-05,
      "loss": 0.0367,
      "num_input_tokens_seen": 92181072,
      "step": 4720
    },
    {
      "epoch": 5.381929599543964,
      "grad_norm": 0.2026086002588272,
      "learning_rate": 4.61254555473555e-05,
      "loss": 0.0112,
      "num_input_tokens_seen": 92278880,
      "step": 4725
    },
    {
      "epoch": 5.387630041328203,
      "grad_norm": 0.10835447162389755,
      "learning_rate": 4.6117470371532115e-05,
      "loss": 0.0094,
      "num_input_tokens_seen": 92376672,
      "step": 4730
    },
    {
      "epoch": 5.393330483112441,
      "grad_norm": 5.342576026916504,
      "learning_rate": 4.610947766839637e-05,
      "loss": 0.0153,
      "num_input_tokens_seen": 92474448,
      "step": 4735
    },
    {
      "epoch": 5.39903092489668,
      "grad_norm": 1.6363821029663086,
      "learning_rate": 4.610147744079725e-05,
      "loss": 0.0046,
      "num_input_tokens_seen": 92572160,
      "step": 4740
    },
    {
      "epoch": 5.404731366680918,
      "grad_norm": 0.9857316613197327,
      "learning_rate": 4.609346969158645e-05,
      "loss": 0.0092,
      "num_input_tokens_seen": 92669792,
      "step": 4745
    },
    {
      "epoch": 5.410431808465156,
      "grad_norm": 0.055682141333818436,
      "learning_rate": 4.60854544236183e-05,
      "loss": 0.003,
      "num_input_tokens_seen": 92767520,
      "step": 4750
    },
    {
      "epoch": 5.416132250249395,
      "grad_norm": 0.026181025430560112,
      "learning_rate": 4.607743163974987e-05,
      "loss": 0.0009,
      "num_input_tokens_seen": 92865344,
      "step": 4755
    },
    {
      "epoch": 5.421832692033632,
      "grad_norm": 0.0219236072152853,
      "learning_rate": 4.6069401342840854e-05,
      "loss": 0.003,
      "num_input_tokens_seen": 92963104,
      "step": 4760
    },
    {
      "epoch": 5.427533133817871,
      "grad_norm": 0.19581133127212524,
      "learning_rate": 4.606136353575366e-05,
      "loss": 0.0008,
      "num_input_tokens_seen": 93060912,
      "step": 4765
    },
    {
      "epoch": 5.4332335756021095,
      "grad_norm": 0.10174310952425003,
      "learning_rate": 4.6053318221353356e-05,
      "loss": 0.0006,
      "num_input_tokens_seen": 93158768,
      "step": 4770
    },
    {
      "epoch": 5.438934017386347,
      "grad_norm": 14.586435317993164,
      "learning_rate": 4.60452654025077e-05,
      "loss": 0.0157,
      "num_input_tokens_seen": 93256496,
      "step": 4775
    },
    {
      "epoch": 5.444634459170586,
      "grad_norm": 5.632846355438232,
      "learning_rate": 4.6037205082087095e-05,
      "loss": 0.0196,
      "num_input_tokens_seen": 93354208,
      "step": 4780
    },
    {
      "epoch": 5.450334900954824,
      "grad_norm": 0.09631127119064331,
      "learning_rate": 4.602913726296466e-05,
      "loss": 0.0012,
      "num_input_tokens_seen": 93451952,
      "step": 4785
    },
    {
      "epoch": 5.456035342739062,
      "grad_norm": 5.937401294708252,
      "learning_rate": 4.602106194801615e-05,
      "loss": 0.0037,
      "num_input_tokens_seen": 93549744,
      "step": 4790
    },
    {
      "epoch": 5.4617357845233006,
      "grad_norm": 0.3016761839389801,
      "learning_rate": 4.6012979140120016e-05,
      "loss": 0.0026,
      "num_input_tokens_seen": 93647520,
      "step": 4795
    },
    {
      "epoch": 5.467436226307539,
      "grad_norm": 0.0018981621833518147,
      "learning_rate": 4.600488884215737e-05,
      "loss": 0.0114,
      "num_input_tokens_seen": 93745280,
      "step": 4800
    },
    {
      "epoch": 5.473136668091777,
      "grad_norm": 0.2602160573005676,
      "learning_rate": 4.599679105701199e-05,
      "loss": 0.0043,
      "num_input_tokens_seen": 93842992,
      "step": 4805
    },
    {
      "epoch": 5.478837109876015,
      "grad_norm": 2.016597032546997,
      "learning_rate": 4.598868578757033e-05,
      "loss": 0.0043,
      "num_input_tokens_seen": 93940768,
      "step": 4810
    },
    {
      "epoch": 5.484537551660254,
      "grad_norm": 0.06573915481567383,
      "learning_rate": 4.5980573036721505e-05,
      "loss": 0.0025,
      "num_input_tokens_seen": 94038528,
      "step": 4815
    },
    {
      "epoch": 5.490237993444492,
      "grad_norm": 1.3520628213882446,
      "learning_rate": 4.597245280735731e-05,
      "loss": 0.0018,
      "num_input_tokens_seen": 94136224,
      "step": 4820
    },
    {
      "epoch": 5.49593843522873,
      "grad_norm": 14.249088287353516,
      "learning_rate": 4.59643251023722e-05,
      "loss": 0.0273,
      "num_input_tokens_seen": 94233888,
      "step": 4825
    },
    {
      "epoch": 5.501638877012969,
      "grad_norm": 2.345010280609131,
      "learning_rate": 4.595618992466328e-05,
      "loss": 0.0017,
      "num_input_tokens_seen": 94331568,
      "step": 4830
    },
    {
      "epoch": 5.507339318797206,
      "grad_norm": 0.5609773397445679,
      "learning_rate": 4.594804727713033e-05,
      "loss": 0.0045,
      "num_input_tokens_seen": 94429248,
      "step": 4835
    },
    {
      "epoch": 5.513039760581445,
      "grad_norm": 11.708130836486816,
      "learning_rate": 4.5939897162675804e-05,
      "loss": 0.0603,
      "num_input_tokens_seen": 94526912,
      "step": 4840
    },
    {
      "epoch": 5.5187402023656835,
      "grad_norm": 0.511098325252533,
      "learning_rate": 4.59317395842048e-05,
      "loss": 0.0015,
      "num_input_tokens_seen": 94624688,
      "step": 4845
    },
    {
      "epoch": 5.524440644149921,
      "grad_norm": 0.07705602049827576,
      "learning_rate": 4.592357454462508e-05,
      "loss": 0.0008,
      "num_input_tokens_seen": 94722496,
      "step": 4850
    },
    {
      "epoch": 5.53014108593416,
      "grad_norm": 0.49463194608688354,
      "learning_rate": 4.591540204684708e-05,
      "loss": 0.0226,
      "num_input_tokens_seen": 94820176,
      "step": 4855
    },
    {
      "epoch": 5.535841527718398,
      "grad_norm": 0.13507622480392456,
      "learning_rate": 4.590722209378387e-05,
      "loss": 0.0033,
      "num_input_tokens_seen": 94917984,
      "step": 4860
    },
    {
      "epoch": 5.541541969502637,
      "grad_norm": 0.11557400226593018,
      "learning_rate": 4.589903468835119e-05,
      "loss": 0.0048,
      "num_input_tokens_seen": 95015744,
      "step": 4865
    },
    {
      "epoch": 5.547242411286875,
      "grad_norm": 3.4503333568573,
      "learning_rate": 4.5890839833467455e-05,
      "loss": 0.0044,
      "num_input_tokens_seen": 95113504,
      "step": 4870
    },
    {
      "epoch": 5.552942853071113,
      "grad_norm": 0.1271464228630066,
      "learning_rate": 4.58826375320537e-05,
      "loss": 0.0021,
      "num_input_tokens_seen": 95211264,
      "step": 4875
    },
    {
      "epoch": 5.558643294855351,
      "grad_norm": 0.09808290749788284,
      "learning_rate": 4.587442778703362e-05,
      "loss": 0.0011,
      "num_input_tokens_seen": 95309040,
      "step": 4880
    },
    {
      "epoch": 5.564343736639589,
      "grad_norm": 2.261197328567505,
      "learning_rate": 4.586621060133362e-05,
      "loss": 0.0024,
      "num_input_tokens_seen": 95406768,
      "step": 4885
    },
    {
      "epoch": 5.570044178423828,
      "grad_norm": 0.1426500380039215,
      "learning_rate": 4.585798597788266e-05,
      "loss": 0.003,
      "num_input_tokens_seen": 95504512,
      "step": 4890
    },
    {
      "epoch": 5.5757446202080665,
      "grad_norm": 0.056966375559568405,
      "learning_rate": 4.584975391961242e-05,
      "loss": 0.0185,
      "num_input_tokens_seen": 95602240,
      "step": 4895
    },
    {
      "epoch": 5.581445061992304,
      "grad_norm": 13.30504322052002,
      "learning_rate": 4.584151442945725e-05,
      "loss": 0.0217,
      "num_input_tokens_seen": 95699968,
      "step": 4900
    },
    {
      "epoch": 5.587145503776543,
      "grad_norm": 1.4918162822723389,
      "learning_rate": 4.583326751035405e-05,
      "loss": 0.0303,
      "num_input_tokens_seen": 95797696,
      "step": 4905
    },
    {
      "epoch": 5.592845945560781,
      "grad_norm": 1.1220730543136597,
      "learning_rate": 4.582501316524247e-05,
      "loss": 0.0019,
      "num_input_tokens_seen": 95895424,
      "step": 4910
    },
    {
      "epoch": 5.598546387345019,
      "grad_norm": 1.1162631511688232,
      "learning_rate": 4.5816751397064764e-05,
      "loss": 0.0094,
      "num_input_tokens_seen": 95993056,
      "step": 4915
    },
    {
      "epoch": 5.6042468291292575,
      "grad_norm": 0.10550173372030258,
      "learning_rate": 4.5808482208765836e-05,
      "loss": 0.0277,
      "num_input_tokens_seen": 96090832,
      "step": 4920
    },
    {
      "epoch": 5.609947270913496,
      "grad_norm": 1.0097618103027344,
      "learning_rate": 4.580020560329322e-05,
      "loss": 0.0025,
      "num_input_tokens_seen": 96188544,
      "step": 4925
    },
    {
      "epoch": 5.615647712697734,
      "grad_norm": 0.717867374420166,
      "learning_rate": 4.579192158359712e-05,
      "loss": 0.0037,
      "num_input_tokens_seen": 96286368,
      "step": 4930
    },
    {
      "epoch": 5.621348154481972,
      "grad_norm": 0.8814383149147034,
      "learning_rate": 4.5783630152630365e-05,
      "loss": 0.024,
      "num_input_tokens_seen": 96384128,
      "step": 4935
    },
    {
      "epoch": 5.627048596266211,
      "grad_norm": 0.2208772897720337,
      "learning_rate": 4.577533131334844e-05,
      "loss": 0.0187,
      "num_input_tokens_seen": 96481888,
      "step": 4940
    },
    {
      "epoch": 5.632749038050449,
      "grad_norm": 9.015159606933594,
      "learning_rate": 4.5767025068709455e-05,
      "loss": 0.0203,
      "num_input_tokens_seen": 96579680,
      "step": 4945
    },
    {
      "epoch": 5.638449479834687,
      "grad_norm": 0.6479278206825256,
      "learning_rate": 4.5758711421674166e-05,
      "loss": 0.0253,
      "num_input_tokens_seen": 96677488,
      "step": 4950
    },
    {
      "epoch": 5.644149921618926,
      "grad_norm": 0.05944683775305748,
      "learning_rate": 4.575039037520598e-05,
      "loss": 0.001,
      "num_input_tokens_seen": 96775280,
      "step": 4955
    },
    {
      "epoch": 5.649850363403163,
      "grad_norm": 0.376200407743454,
      "learning_rate": 4.5742061932270906e-05,
      "loss": 0.0041,
      "num_input_tokens_seen": 96873072,
      "step": 4960
    },
    {
      "epoch": 5.655550805187402,
      "grad_norm": 0.14665372669696808,
      "learning_rate": 4.5733726095837634e-05,
      "loss": 0.0012,
      "num_input_tokens_seen": 96970912,
      "step": 4965
    },
    {
      "epoch": 5.6612512469716405,
      "grad_norm": 0.11682464182376862,
      "learning_rate": 4.572538286887748e-05,
      "loss": 0.029,
      "num_input_tokens_seen": 97068624,
      "step": 4970
    },
    {
      "epoch": 5.666951688755878,
      "grad_norm": 0.12137410789728165,
      "learning_rate": 4.571703225436435e-05,
      "loss": 0.0007,
      "num_input_tokens_seen": 97166384,
      "step": 4975
    },
    {
      "epoch": 5.672652130540117,
      "grad_norm": 0.09676264226436615,
      "learning_rate": 4.570867425527484e-05,
      "loss": 0.0009,
      "num_input_tokens_seen": 97264112,
      "step": 4980
    },
    {
      "epoch": 5.678352572324355,
      "grad_norm": 0.3440670371055603,
      "learning_rate": 4.570030887458815e-05,
      "loss": 0.0014,
      "num_input_tokens_seen": 97361872,
      "step": 4985
    },
    {
      "epoch": 5.684053014108594,
      "grad_norm": 0.712581992149353,
      "learning_rate": 4.569193611528612e-05,
      "loss": 0.0043,
      "num_input_tokens_seen": 97459616,
      "step": 4990
    },
    {
      "epoch": 5.6897534558928315,
      "grad_norm": 7.326711177825928,
      "learning_rate": 4.5683555980353197e-05,
      "loss": 0.009,
      "num_input_tokens_seen": 97557376,
      "step": 4995
    },
    {
      "epoch": 5.69545389767707,
      "grad_norm": 0.251907616853714,
      "learning_rate": 4.56751684727765e-05,
      "loss": 0.0112,
      "num_input_tokens_seen": 97655040,
      "step": 5000
    }
  ],
  "logging_steps": 5,
  "max_steps": 26310,
  "num_input_tokens_seen": 97655040,
  "num_train_epochs": 30,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.099184256070451e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}