{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 2375,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002105263157894737,
      "grad_norm": NaN,
      "learning_rate": 4.999991251325301e-05,
      "loss": 8.8259,
      "num_input_tokens_seen": 592,
      "step": 5
    },
    {
      "epoch": 0.004210526315789474,
      "grad_norm": 3.9077489376068115,
      "learning_rate": 4.9999212622950984e-05,
      "loss": 6.5033,
      "num_input_tokens_seen": 1296,
      "step": 10
    },
    {
      "epoch": 0.00631578947368421,
      "grad_norm": 17.610599517822266,
      "learning_rate": 4.9997353571051935e-05,
      "loss": 5.5742,
      "num_input_tokens_seen": 1920,
      "step": 15
    },
    {
      "epoch": 0.008421052631578947,
      "grad_norm": 4.236301422119141,
      "learning_rate": 4.999440105392749e-05,
      "loss": 4.7275,
      "num_input_tokens_seen": 2432,
      "step": 20
    },
    {
      "epoch": 0.010526315789473684,
      "grad_norm": 4.381397247314453,
      "learning_rate": 4.999035520073032e-05,
      "loss": 3.5477,
      "num_input_tokens_seen": 3168,
      "step": 25
    },
    {
      "epoch": 0.01263157894736842,
      "grad_norm": 3.554992198944092,
      "learning_rate": 4.998521618843914e-05,
      "loss": 3.6426,
      "num_input_tokens_seen": 3920,
      "step": 30
    },
    {
      "epoch": 0.014736842105263158,
      "grad_norm": 4.44541072845459,
      "learning_rate": 4.9978984241851013e-05,
      "loss": 3.8361,
      "num_input_tokens_seen": 4432,
      "step": 35
    },
    {
      "epoch": 0.016842105263157894,
      "grad_norm": 2.611109495162964,
      "learning_rate": 4.997165963357145e-05,
      "loss": 3.6522,
      "num_input_tokens_seen": 5008,
      "step": 40
    },
    {
      "epoch": 0.018947368421052633,
      "grad_norm": 2.0907833576202393,
      "learning_rate": 4.996324268400256e-05,
      "loss": 3.1776,
      "num_input_tokens_seen": 5936,
      "step": 45
    },
    {
      "epoch": 0.021052631578947368,
      "grad_norm": 3.9562370777130127,
      "learning_rate": 4.995373376132898e-05,
      "loss": 3.9684,
      "num_input_tokens_seen": 6512,
      "step": 50
    },
    {
      "epoch": 0.023157894736842106,
      "grad_norm": 4.664333820343018,
      "learning_rate": 4.9943133281501795e-05,
      "loss": 3.1654,
      "num_input_tokens_seen": 7104,
      "step": 55
    },
    {
      "epoch": 0.02526315789473684,
      "grad_norm": 2.5006332397460938,
      "learning_rate": 4.993144170822032e-05,
      "loss": 3.374,
      "num_input_tokens_seen": 7712,
      "step": 60
    },
    {
      "epoch": 0.02736842105263158,
      "grad_norm": 2.9427053928375244,
      "learning_rate": 4.9918659552911864e-05,
      "loss": 3.2508,
      "num_input_tokens_seen": 8320,
      "step": 65
    },
    {
      "epoch": 0.029473684210526315,
      "grad_norm": 2.8679909706115723,
      "learning_rate": 4.9904787374709305e-05,
      "loss": 3.6347,
      "num_input_tokens_seen": 8848,
      "step": 70
    },
    {
      "epoch": 0.031578947368421054,
      "grad_norm": 2.2228591442108154,
      "learning_rate": 4.988982578042665e-05,
      "loss": 3.6148,
      "num_input_tokens_seen": 9472,
      "step": 75
    },
    {
      "epoch": 0.03368421052631579,
      "grad_norm": 3.230189561843872,
      "learning_rate": 4.987377542453251e-05,
      "loss": 3.3735,
      "num_input_tokens_seen": 10064,
      "step": 80
    },
    {
      "epoch": 0.035789473684210524,
      "grad_norm": 4.476248741149902,
      "learning_rate": 4.9856637009121434e-05,
      "loss": 3.5765,
      "num_input_tokens_seen": 10656,
      "step": 85
    },
    {
      "epoch": 0.037894736842105266,
      "grad_norm": 3.675215244293213,
      "learning_rate": 4.9838411283883245e-05,
      "loss": 3.1841,
      "num_input_tokens_seen": 11168,
      "step": 90
    },
    {
      "epoch": 0.04,
      "grad_norm": 4.318700313568115,
      "learning_rate": 4.9819099046070206e-05,
      "loss": 3.3461,
      "num_input_tokens_seen": 11808,
      "step": 95
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 3.9966771602630615,
      "learning_rate": 4.979870114046217e-05,
      "loss": 3.3828,
      "num_input_tokens_seen": 12336,
      "step": 100
    },
    {
      "epoch": 0.042105263157894736,
      "eval_loss": 3.3995444774627686,
      "eval_runtime": 21.4256,
      "eval_samples_per_second": 23.337,
      "eval_steps_per_second": 11.668,
      "num_input_tokens_seen": 12336,
      "step": 100
    },
    {
      "epoch": 0.04421052631578947,
      "grad_norm": 4.8785295486450195,
      "learning_rate": 4.977721845932959e-05,
      "loss": 3.7097,
      "num_input_tokens_seen": 12992,
      "step": 105
    },
    {
      "epoch": 0.04631578947368421,
      "grad_norm": 3.408017635345459,
      "learning_rate": 4.975465194239454e-05,
      "loss": 2.9564,
      "num_input_tokens_seen": 13744,
      "step": 110
    },
    {
      "epoch": 0.04842105263157895,
      "grad_norm": 3.588205099105835,
      "learning_rate": 4.973100257678958e-05,
      "loss": 3.4605,
      "num_input_tokens_seen": 14304,
      "step": 115
    },
    {
      "epoch": 0.05052631578947368,
      "grad_norm": 3.157773494720459,
      "learning_rate": 4.970627139701458e-05,
      "loss": 4.0948,
      "num_input_tokens_seen": 14832,
      "step": 120
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 1.8902894258499146,
      "learning_rate": 4.9680459484891445e-05,
      "loss": 3.0968,
      "num_input_tokens_seen": 15584,
      "step": 125
    },
    {
      "epoch": 0.05473684210526316,
      "grad_norm": 1.1550252437591553,
      "learning_rate": 4.9653567969516844e-05,
      "loss": 3.2226,
      "num_input_tokens_seen": 16336,
      "step": 130
    },
    {
      "epoch": 0.056842105263157895,
      "grad_norm": 3.8178346157073975,
      "learning_rate": 4.962559802721277e-05,
      "loss": 3.5534,
      "num_input_tokens_seen": 16880,
      "step": 135
    },
    {
      "epoch": 0.05894736842105263,
      "grad_norm": 1.2998478412628174,
      "learning_rate": 4.959655088147511e-05,
      "loss": 2.8113,
      "num_input_tokens_seen": 17728,
      "step": 140
    },
    {
      "epoch": 0.061052631578947365,
      "grad_norm": 4.0586066246032715,
      "learning_rate": 4.956642780292012e-05,
      "loss": 3.3134,
      "num_input_tokens_seen": 18336,
      "step": 145
    },
    {
      "epoch": 0.06315789473684211,
      "grad_norm": 5.839606285095215,
      "learning_rate": 4.9535230109228844e-05,
      "loss": 3.4131,
      "num_input_tokens_seen": 19008,
      "step": 150
    },
    {
      "epoch": 0.06526315789473684,
      "grad_norm": 4.800465106964111,
      "learning_rate": 4.950295916508947e-05,
      "loss": 3.4343,
      "num_input_tokens_seen": 19520,
      "step": 155
    },
    {
      "epoch": 0.06736842105263158,
      "grad_norm": 2.6969552040100098,
      "learning_rate": 4.9469616382137635e-05,
      "loss": 3.0208,
      "num_input_tokens_seen": 20192,
      "step": 160
    },
    {
      "epoch": 0.06947368421052631,
      "grad_norm": 6.1990203857421875,
      "learning_rate": 4.943520321889468e-05,
      "loss": 3.652,
      "num_input_tokens_seen": 20848,
      "step": 165
    },
    {
      "epoch": 0.07157894736842105,
      "grad_norm": 2.14052152633667,
      "learning_rate": 4.939972118070384e-05,
      "loss": 3.0198,
      "num_input_tokens_seen": 21520,
      "step": 170
    },
    {
      "epoch": 0.07368421052631578,
      "grad_norm": 3.615708351135254,
      "learning_rate": 4.9363171819664434e-05,
      "loss": 2.7798,
      "num_input_tokens_seen": 22240,
      "step": 175
    },
    {
      "epoch": 0.07578947368421053,
      "grad_norm": 5.227450370788574,
      "learning_rate": 4.932555673456389e-05,
      "loss": 3.482,
      "num_input_tokens_seen": 22752,
      "step": 180
    },
    {
      "epoch": 0.07789473684210527,
      "grad_norm": 4.943465709686279,
      "learning_rate": 4.9286877570807915e-05,
      "loss": 2.8097,
      "num_input_tokens_seen": 23408,
      "step": 185
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.956165075302124,
      "learning_rate": 4.924713602034842e-05,
      "loss": 3.7867,
      "num_input_tokens_seen": 24048,
      "step": 190
    },
    {
      "epoch": 0.08210526315789474,
      "grad_norm": 4.213566303253174,
      "learning_rate": 4.920633382160955e-05,
      "loss": 3.4609,
      "num_input_tokens_seen": 24624,
      "step": 195
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 4.879347324371338,
      "learning_rate": 4.9164472759411695e-05,
      "loss": 3.7301,
      "num_input_tokens_seen": 25168,
      "step": 200
    },
    {
      "epoch": 0.08421052631578947,
      "eval_loss": 3.3119306564331055,
      "eval_runtime": 22.2022,
      "eval_samples_per_second": 22.52,
      "eval_steps_per_second": 11.26,
      "num_input_tokens_seen": 25168,
      "step": 200
    },
    {
      "epoch": 0.0863157894736842,
      "grad_norm": 2.6285791397094727,
      "learning_rate": 4.91215546648933e-05,
      "loss": 3.1267,
      "num_input_tokens_seen": 25936,
      "step": 205
    },
    {
      "epoch": 0.08842105263157894,
      "grad_norm": 6.523679733276367,
      "learning_rate": 4.907758141543086e-05,
      "loss": 3.3567,
      "num_input_tokens_seen": 26480,
      "step": 210
    },
    {
      "epoch": 0.09052631578947369,
      "grad_norm": 3.7712221145629883,
      "learning_rate": 4.903255493455676e-05,
      "loss": 3.0875,
      "num_input_tokens_seen": 27072,
      "step": 215
    },
    {
      "epoch": 0.09263157894736843,
      "grad_norm": 2.6647517681121826,
      "learning_rate": 4.898647719187515e-05,
      "loss": 2.9579,
      "num_input_tokens_seen": 27760,
      "step": 220
    },
    {
      "epoch": 0.09473684210526316,
      "grad_norm": 5.09914493560791,
      "learning_rate": 4.8939350202975756e-05,
      "loss": 3.0374,
      "num_input_tokens_seen": 28496,
      "step": 225
    },
    {
      "epoch": 0.0968421052631579,
      "grad_norm": 3.115642786026001,
      "learning_rate": 4.889117602934574e-05,
      "loss": 2.9435,
      "num_input_tokens_seen": 29232,
      "step": 230
    },
    {
      "epoch": 0.09894736842105263,
      "grad_norm": 4.625009536743164,
      "learning_rate": 4.884195677827952e-05,
      "loss": 3.8317,
      "num_input_tokens_seen": 29792,
      "step": 235
    },
    {
      "epoch": 0.10105263157894737,
      "grad_norm": 4.469867706298828,
      "learning_rate": 4.879169460278659e-05,
      "loss": 3.4246,
      "num_input_tokens_seen": 30320,
      "step": 240
    },
    {
      "epoch": 0.1031578947368421,
      "grad_norm": 4.335468292236328,
      "learning_rate": 4.874039170149733e-05,
      "loss": 3.2556,
      "num_input_tokens_seen": 30800,
      "step": 245
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 4.776815414428711,
      "learning_rate": 4.868805031856686e-05,
      "loss": 3.6451,
      "num_input_tokens_seen": 31440,
      "step": 250
    },
    {
      "epoch": 0.10736842105263159,
      "grad_norm": 7.101145267486572,
      "learning_rate": 4.863467274357679e-05,
      "loss": 2.991,
      "num_input_tokens_seen": 32032,
      "step": 255
    },
    {
      "epoch": 0.10947368421052632,
      "grad_norm": 5.720299243927002,
      "learning_rate": 4.858026131143522e-05,
      "loss": 3.5287,
      "num_input_tokens_seen": 32544,
      "step": 260
    },
    {
      "epoch": 0.11157894736842106,
      "grad_norm": 1.5953224897384644,
      "learning_rate": 4.8524818402274415e-05,
      "loss": 3.1499,
      "num_input_tokens_seen": 33216,
      "step": 265
    },
    {
      "epoch": 0.11368421052631579,
      "grad_norm": 3.63209867477417,
      "learning_rate": 4.846834644134686e-05,
      "loss": 2.6614,
      "num_input_tokens_seen": 33824,
      "step": 270
    },
    {
      "epoch": 0.11578947368421053,
      "grad_norm": 5.393692493438721,
      "learning_rate": 4.841084789891905e-05,
      "loss": 3.2591,
      "num_input_tokens_seen": 34464,
      "step": 275
    },
    {
      "epoch": 0.11789473684210526,
      "grad_norm": 3.1078500747680664,
      "learning_rate": 4.8352325290163526e-05,
      "loss": 2.8512,
      "num_input_tokens_seen": 35216,
      "step": 280
    },
    {
      "epoch": 0.12,
      "grad_norm": 5.469326972961426,
      "learning_rate": 4.829278117504876e-05,
      "loss": 3.0881,
      "num_input_tokens_seen": 35808,
      "step": 285
    },
    {
      "epoch": 0.12210526315789473,
      "grad_norm": 6.58186674118042,
      "learning_rate": 4.823221815822725e-05,
      "loss": 3.1116,
      "num_input_tokens_seen": 36480,
      "step": 290
    },
    {
      "epoch": 0.12421052631578948,
      "grad_norm": 3.288723945617676,
      "learning_rate": 4.817063888892155e-05,
      "loss": 3.0578,
      "num_input_tokens_seen": 37040,
      "step": 295
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 4.704117298126221,
      "learning_rate": 4.810804606080839e-05,
      "loss": 2.7913,
      "num_input_tokens_seen": 38256,
      "step": 300
    },
    {
      "epoch": 0.12631578947368421,
      "eval_loss": 3.294227123260498,
      "eval_runtime": 21.4039,
      "eval_samples_per_second": 23.36,
      "eval_steps_per_second": 11.68,
      "num_input_tokens_seen": 38256,
      "step": 300
    },
    {
      "epoch": 0.12842105263157894,
      "grad_norm": 5.600541591644287,
      "learning_rate": 4.804444241190084e-05,
      "loss": 3.3033,
      "num_input_tokens_seen": 38832,
      "step": 305
    },
    {
      "epoch": 0.13052631578947368,
      "grad_norm": 5.58181619644165,
      "learning_rate": 4.797983072442855e-05,
      "loss": 2.9879,
      "num_input_tokens_seen": 39632,
      "step": 310
    },
    {
      "epoch": 0.13263157894736843,
      "grad_norm": 4.971944332122803,
      "learning_rate": 4.791421382471605e-05,
      "loss": 3.3603,
      "num_input_tokens_seen": 40320,
      "step": 315
    },
    {
      "epoch": 0.13473684210526315,
      "grad_norm": 4.8193230628967285,
      "learning_rate": 4.78475945830591e-05,
      "loss": 3.2463,
      "num_input_tokens_seen": 40976,
      "step": 320
    },
    {
      "epoch": 0.1368421052631579,
      "grad_norm": 1.9304226636886597,
      "learning_rate": 4.777997591359914e-05,
      "loss": 3.3801,
      "num_input_tokens_seen": 41712,
      "step": 325
    },
    {
      "epoch": 0.13894736842105262,
      "grad_norm": 1.8638708591461182,
      "learning_rate": 4.7711360774195835e-05,
      "loss": 3.3647,
      "num_input_tokens_seen": 42352,
      "step": 330
    },
    {
      "epoch": 0.14105263157894737,
      "grad_norm": 6.234751224517822,
      "learning_rate": 4.764175216629766e-05,
      "loss": 3.4926,
      "num_input_tokens_seen": 43040,
      "step": 335
    },
    {
      "epoch": 0.1431578947368421,
      "grad_norm": 5.5220136642456055,
      "learning_rate": 4.7571153134810634e-05,
      "loss": 2.8625,
      "num_input_tokens_seen": 43680,
      "step": 340
    },
    {
      "epoch": 0.14526315789473684,
      "grad_norm": 3.765749216079712,
      "learning_rate": 4.749956676796507e-05,
      "loss": 3.7596,
      "num_input_tokens_seen": 44256,
      "step": 345
    },
    {
      "epoch": 0.14736842105263157,
      "grad_norm": 4.542320251464844,
      "learning_rate": 4.742699619718061e-05,
      "loss": 2.8841,
      "num_input_tokens_seen": 45104,
      "step": 350
    },
    {
      "epoch": 0.14947368421052631,
      "grad_norm": 6.3029632568359375,
      "learning_rate": 4.735344459692909e-05,
      "loss": 3.1136,
      "num_input_tokens_seen": 45696,
      "step": 355
    },
    {
      "epoch": 0.15157894736842106,
      "grad_norm": 7.161231517791748,
      "learning_rate": 4.7278915184595774e-05,
      "loss": 3.5087,
      "num_input_tokens_seen": 46240,
      "step": 360
    },
    {
      "epoch": 0.15368421052631578,
      "grad_norm": 6.039397239685059,
      "learning_rate": 4.720341122033862e-05,
      "loss": 3.0394,
      "num_input_tokens_seen": 46928,
      "step": 365
    },
    {
      "epoch": 0.15578947368421053,
      "grad_norm": 3.1546213626861572,
      "learning_rate": 4.71269360069456e-05,
      "loss": 3.3537,
      "num_input_tokens_seen": 47552,
      "step": 370
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 2.607238531112671,
      "learning_rate": 4.704949288969031e-05,
      "loss": 3.3383,
      "num_input_tokens_seen": 48176,
      "step": 375
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.171997308731079,
      "learning_rate": 4.697108525618556e-05,
      "loss": 2.6449,
      "num_input_tokens_seen": 48928,
      "step": 380
    },
    {
      "epoch": 0.16210526315789472,
      "grad_norm": 5.732158660888672,
      "learning_rate": 4.6891716536235275e-05,
      "loss": 2.9166,
      "num_input_tokens_seen": 49552,
      "step": 385
    },
    {
      "epoch": 0.16421052631578947,
      "grad_norm": 6.06363582611084,
      "learning_rate": 4.681139020168436e-05,
      "loss": 3.689,
      "num_input_tokens_seen": 50064,
      "step": 390
    },
    {
      "epoch": 0.16631578947368422,
      "grad_norm": 6.7919487953186035,
      "learning_rate": 4.673010976626692e-05,
      "loss": 3.0367,
      "num_input_tokens_seen": 50800,
      "step": 395
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 6.127151012420654,
      "learning_rate": 4.664787878545252e-05,
      "loss": 2.9739,
      "num_input_tokens_seen": 51344,
      "step": 400
    },
    {
      "epoch": 0.16842105263157894,
      "eval_loss": 3.2370829582214355,
      "eval_runtime": 20.9481,
      "eval_samples_per_second": 23.869,
      "eval_steps_per_second": 11.934,
      "num_input_tokens_seen": 51344,
      "step": 400
    },
    {
      "epoch": 0.1705263157894737,
      "grad_norm": 4.9031853675842285,
      "learning_rate": 4.656470085629062e-05,
      "loss": 2.9691,
      "num_input_tokens_seen": 52208,
      "step": 405
    },
    {
      "epoch": 0.1726315789473684,
      "grad_norm": 2.7085535526275635,
      "learning_rate": 4.648057961725334e-05,
      "loss": 3.5849,
      "num_input_tokens_seen": 52784,
      "step": 410
    },
    {
      "epoch": 0.17473684210526316,
      "grad_norm": 2.7885546684265137,
      "learning_rate": 4.639551874807617e-05,
      "loss": 3.0472,
      "num_input_tokens_seen": 53488,
      "step": 415
    },
    {
      "epoch": 0.17684210526315788,
      "grad_norm": 4.653199672698975,
      "learning_rate": 4.630952196959709e-05,
      "loss": 3.3394,
      "num_input_tokens_seen": 54096,
      "step": 420
    },
    {
      "epoch": 0.17894736842105263,
      "grad_norm": 6.162243366241455,
      "learning_rate": 4.622259304359378e-05,
      "loss": 3.1293,
      "num_input_tokens_seen": 54832,
      "step": 425
    },
    {
      "epoch": 0.18105263157894738,
      "grad_norm": 6.41090202331543,
      "learning_rate": 4.613473577261908e-05,
      "loss": 3.3758,
      "num_input_tokens_seen": 55328,
      "step": 430
    },
    {
      "epoch": 0.1831578947368421,
      "grad_norm": 2.4367990493774414,
      "learning_rate": 4.604595399983463e-05,
      "loss": 2.7601,
      "num_input_tokens_seen": 56064,
      "step": 435
    },
    {
      "epoch": 0.18526315789473685,
      "grad_norm": 3.071305751800537,
      "learning_rate": 4.59562516088428e-05,
      "loss": 3.4554,
      "num_input_tokens_seen": 56624,
      "step": 440
    },
    {
      "epoch": 0.18736842105263157,
      "grad_norm": 5.66983699798584,
      "learning_rate": 4.5865632523516754e-05,
      "loss": 3.1669,
      "num_input_tokens_seen": 57216,
      "step": 445
    },
    {
      "epoch": 0.18947368421052632,
      "grad_norm": 4.650711536407471,
      "learning_rate": 4.577410070782885e-05,
      "loss": 3.0806,
      "num_input_tokens_seen": 57936,
      "step": 450
    },
    {
      "epoch": 0.19157894736842104,
      "grad_norm": 5.459670066833496,
      "learning_rate": 4.5681660165677236e-05,
      "loss": 3.4798,
      "num_input_tokens_seen": 58528,
      "step": 455
    },
    {
      "epoch": 0.1936842105263158,
      "grad_norm": 6.794463157653809,
      "learning_rate": 4.558831494071069e-05,
      "loss": 3.1879,
      "num_input_tokens_seen": 59152,
      "step": 460
    },
    {
      "epoch": 0.1957894736842105,
      "grad_norm": 5.128958702087402,
      "learning_rate": 4.549406911615174e-05,
      "loss": 2.96,
      "num_input_tokens_seen": 59904,
      "step": 465
    },
    {
      "epoch": 0.19789473684210526,
      "grad_norm": 5.253583908081055,
      "learning_rate": 4.539892681461808e-05,
      "loss": 3.4593,
      "num_input_tokens_seen": 60432,
      "step": 470
    },
    {
      "epoch": 0.2,
      "grad_norm": 9.341363906860352,
      "learning_rate": 4.530289219794218e-05,
      "loss": 3.1928,
      "num_input_tokens_seen": 61104,
      "step": 475
    },
    {
      "epoch": 0.20210526315789473,
      "grad_norm": 2.279937267303467,
      "learning_rate": 4.5205969466989304e-05,
      "loss": 2.6037,
      "num_input_tokens_seen": 61968,
      "step": 480
    },
    {
      "epoch": 0.20421052631578948,
      "grad_norm": 5.586785793304443,
      "learning_rate": 4.5108162861473665e-05,
      "loss": 2.8718,
      "num_input_tokens_seen": 62832,
      "step": 485
    },
    {
      "epoch": 0.2063157894736842,
      "grad_norm": 6.161190509796143,
      "learning_rate": 4.500947665977306e-05,
      "loss": 2.8371,
      "num_input_tokens_seen": 63360,
      "step": 490
    },
    {
      "epoch": 0.20842105263157895,
      "grad_norm": 5.741548538208008,
      "learning_rate": 4.490991517874165e-05,
      "loss": 3.188,
      "num_input_tokens_seen": 63968,
      "step": 495
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 5.338064670562744,
      "learning_rate": 4.480948277352113e-05,
      "loss": 2.4265,
      "num_input_tokens_seen": 64896,
      "step": 500
    },
    {
      "epoch": 0.21052631578947367,
      "eval_loss": 3.229081153869629,
      "eval_runtime": 22.1222,
      "eval_samples_per_second": 22.602,
      "eval_steps_per_second": 11.301,
      "num_input_tokens_seen": 64896,
      "step": 500
    },
    {
      "epoch": 0.21263157894736842,
      "grad_norm": 6.277206897735596,
      "learning_rate": 4.470818383735027e-05,
      "loss": 3.0783,
      "num_input_tokens_seen": 65488,
      "step": 505
    },
    {
      "epoch": 0.21473684210526317,
      "grad_norm": 4.257750034332275,
      "learning_rate": 4.460602280137271e-05,
      "loss": 3.1631,
      "num_input_tokens_seen": 66144,
      "step": 510
    },
    {
      "epoch": 0.2168421052631579,
      "grad_norm": 3.1334800720214844,
      "learning_rate": 4.45030041344431e-05,
      "loss": 3.0374,
      "num_input_tokens_seen": 66688,
      "step": 515
    },
    {
      "epoch": 0.21894736842105264,
      "grad_norm": 5.618471145629883,
      "learning_rate": 4.4399132342931684e-05,
      "loss": 2.7939,
      "num_input_tokens_seen": 67296,
      "step": 520
    },
    {
      "epoch": 0.22105263157894736,
      "grad_norm": 5.193708896636963,
      "learning_rate": 4.4294411970527116e-05,
      "loss": 3.0484,
      "num_input_tokens_seen": 67984,
      "step": 525
    },
    {
      "epoch": 0.2231578947368421,
      "grad_norm": 6.637373447418213,
      "learning_rate": 4.418884759803773e-05,
      "loss": 2.873,
      "num_input_tokens_seen": 68560,
      "step": 530
    },
    {
      "epoch": 0.22526315789473683,
      "grad_norm": 2.414778709411621,
      "learning_rate": 4.408244384319116e-05,
      "loss": 2.8347,
      "num_input_tokens_seen": 69360,
      "step": 535
    },
    {
      "epoch": 0.22736842105263158,
      "grad_norm": 5.371325969696045,
      "learning_rate": 4.397520536043234e-05,
      "loss": 3.0505,
      "num_input_tokens_seen": 69888,
      "step": 540
    },
    {
      "epoch": 0.2294736842105263,
      "grad_norm": 8.659576416015625,
      "learning_rate": 4.386713684071992e-05,
      "loss": 3.4455,
      "num_input_tokens_seen": 70608,
      "step": 545
    },
    {
      "epoch": 0.23157894736842105,
      "grad_norm": 5.507298469543457,
      "learning_rate": 4.375824301132103e-05,
      "loss": 3.1131,
      "num_input_tokens_seen": 71184,
      "step": 550
    },
    {
      "epoch": 0.2336842105263158,
      "grad_norm": 7.195556163787842,
      "learning_rate": 4.3648528635604556e-05,
      "loss": 2.922,
      "num_input_tokens_seen": 71728,
      "step": 555
    },
    {
      "epoch": 0.23578947368421052,
      "grad_norm": 2.615894079208374,
      "learning_rate": 4.35379985128327e-05,
      "loss": 3.0054,
      "num_input_tokens_seen": 72400,
      "step": 560
    },
    {
      "epoch": 0.23789473684210527,
      "grad_norm": 4.415563583374023,
      "learning_rate": 4.3426657477951105e-05,
      "loss": 2.7888,
      "num_input_tokens_seen": 73024,
      "step": 565
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.531663417816162,
      "learning_rate": 4.331451040137734e-05,
      "loss": 2.649,
      "num_input_tokens_seen": 73792,
      "step": 570
    },
    {
      "epoch": 0.24210526315789474,
      "grad_norm": 5.618805408477783,
      "learning_rate": 4.320156218878783e-05,
      "loss": 3.2557,
      "num_input_tokens_seen": 74432,
      "step": 575
    },
    {
      "epoch": 0.24421052631578946,
      "grad_norm": 2.6014719009399414,
      "learning_rate": 4.308781778090329e-05,
      "loss": 3.3816,
      "num_input_tokens_seen": 75088,
      "step": 580
    },
    {
      "epoch": 0.2463157894736842,
      "grad_norm": 7.288875102996826,
      "learning_rate": 4.297328215327261e-05,
      "loss": 3.619,
      "num_input_tokens_seen": 75760,
      "step": 585
    },
    {
      "epoch": 0.24842105263157896,
      "grad_norm": 7.931623458862305,
      "learning_rate": 4.285796031605519e-05,
      "loss": 3.4897,
      "num_input_tokens_seen": 76288,
      "step": 590
    },
    {
      "epoch": 0.2505263157894737,
      "grad_norm": 2.327460527420044,
      "learning_rate": 4.274185731380178e-05,
      "loss": 3.3372,
      "num_input_tokens_seen": 76912,
      "step": 595
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 6.344222545623779,
      "learning_rate": 4.262497822523381e-05,
      "loss": 3.2421,
      "num_input_tokens_seen": 77584,
      "step": 600
    },
    {
      "epoch": 0.25263157894736843,
      "eval_loss": 3.197930097579956,
      "eval_runtime": 22.9116,
      "eval_samples_per_second": 21.823,
      "eval_steps_per_second": 10.912,
      "num_input_tokens_seen": 77584,
      "step": 600
    },
    {
      "epoch": 0.25473684210526315,
      "grad_norm": 7.213596820831299,
      "learning_rate": 4.2507328163021264e-05,
      "loss": 3.5072,
      "num_input_tokens_seen": 78144,
      "step": 605
    },
    {
      "epoch": 0.25684210526315787,
      "grad_norm": 4.491611480712891,
      "learning_rate": 4.241265646947705e-05,
      "loss": 3.2118,
      "num_input_tokens_seen": 78672,
      "step": 610
    },
    {
      "epoch": 0.25894736842105265,
      "grad_norm": 5.271435260772705,
      "learning_rate": 4.229363164613873e-05,
      "loss": 3.6353,
      "num_input_tokens_seen": 79232,
      "step": 615
    },
    {
      "epoch": 0.26105263157894737,
      "grad_norm": 2.8192007541656494,
      "learning_rate": 4.217385034332861e-05,
      "loss": 3.0386,
      "num_input_tokens_seen": 79872,
      "step": 620
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 6.350308418273926,
      "learning_rate": 4.205331780066892e-05,
      "loss": 3.7191,
      "num_input_tokens_seen": 80368,
      "step": 625
    },
    {
      "epoch": 0.26526315789473687,
      "grad_norm": 4.168824672698975,
      "learning_rate": 4.193203929064353e-05,
      "loss": 2.6762,
      "num_input_tokens_seen": 81024,
      "step": 630
    },
    {
      "epoch": 0.2673684210526316,
      "grad_norm": 2.9309401512145996,
      "learning_rate": 4.181002011836737e-05,
      "loss": 3.0327,
      "num_input_tokens_seen": 81792,
      "step": 635
    },
    {
      "epoch": 0.2694736842105263,
      "grad_norm": 5.757573127746582,
      "learning_rate": 4.1687265621354314e-05,
      "loss": 3.6899,
      "num_input_tokens_seen": 82336,
      "step": 640
    },
    {
      "epoch": 0.27157894736842103,
      "grad_norm": 6.026342868804932,
      "learning_rate": 4.156378116928375e-05,
      "loss": 2.6871,
      "num_input_tokens_seen": 83040,
      "step": 645
    },
    {
      "epoch": 0.2736842105263158,
      "grad_norm": 6.627229690551758,
      "learning_rate": 4.143957216376561e-05,
      "loss": 3.0722,
      "num_input_tokens_seen": 83776,
      "step": 650
    },
    {
      "epoch": 0.27578947368421053,
      "grad_norm": 3.4329299926757812,
      "learning_rate": 4.131464403810422e-05,
      "loss": 2.9647,
      "num_input_tokens_seen": 84384,
      "step": 655
    },
    {
      "epoch": 0.27789473684210525,
      "grad_norm": 7.016841888427734,
      "learning_rate": 4.118900225706047e-05,
      "loss": 3.4247,
      "num_input_tokens_seen": 84992,
      "step": 660
    },
    {
      "epoch": 0.28,
      "grad_norm": 4.475071907043457,
      "learning_rate": 4.106265231661292e-05,
      "loss": 3.0979,
      "num_input_tokens_seen": 85600,
      "step": 665
    },
    {
      "epoch": 0.28210526315789475,
      "grad_norm": 3.230617046356201,
      "learning_rate": 4.093559974371725e-05,
      "loss": 3.0953,
      "num_input_tokens_seen": 86208,
      "step": 670
    },
    {
      "epoch": 0.28421052631578947,
      "grad_norm": 6.390127658843994,
      "learning_rate": 4.0807850096064605e-05,
      "loss": 3.0413,
      "num_input_tokens_seen": 86784,
      "step": 675
    },
    {
      "epoch": 0.2863157894736842,
      "grad_norm": 7.769253730773926,
      "learning_rate": 4.067940896183843e-05,
      "loss": 3.3108,
      "num_input_tokens_seen": 87360,
      "step": 680
    },
    {
      "epoch": 0.28842105263157897,
      "grad_norm": 3.7311196327209473,
      "learning_rate": 4.0550281959470023e-05,
      "loss": 3.2254,
      "num_input_tokens_seen": 88048,
      "step": 685
    },
    {
      "epoch": 0.2905263157894737,
      "grad_norm": 6.714389324188232,
      "learning_rate": 4.042047473739278e-05,
      "loss": 2.8943,
      "num_input_tokens_seen": 88688,
      "step": 690
    },
    {
      "epoch": 0.2926315789473684,
      "grad_norm": 6.58662223815918,
      "learning_rate": 4.028999297379511e-05,
      "loss": 2.8755,
      "num_input_tokens_seen": 89408,
      "step": 695
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 7.6045403480529785,
      "learning_rate": 4.0158842376372064e-05,
      "loss": 3.1406,
      "num_input_tokens_seen": 89952,
      "step": 700
    },
    {
      "epoch": 0.29473684210526313,
      "eval_loss": 3.176912784576416,
      "eval_runtime": 21.4574,
      "eval_samples_per_second": 23.302,
      "eval_steps_per_second": 11.651,
      "num_input_tokens_seen": 89952,
      "step": 700
    },
    {
      "epoch": 0.2968421052631579,
      "grad_norm": 7.232762813568115,
      "learning_rate": 4.002702868207563e-05,
      "loss": 2.9851,
      "num_input_tokens_seen": 90656,
      "step": 705
    },
    {
      "epoch": 0.29894736842105263,
      "grad_norm": 1.8625717163085938,
      "learning_rate": 3.9894557656863823e-05,
      "loss": 3.2503,
      "num_input_tokens_seen": 91616,
      "step": 710
    },
    {
      "epoch": 0.30105263157894735,
      "grad_norm": 5.469936370849609,
      "learning_rate": 3.976143509544843e-05,
      "loss": 2.9929,
      "num_input_tokens_seen": 92128,
      "step": 715
    },
    {
      "epoch": 0.3031578947368421,
      "grad_norm": 6.398062229156494,
      "learning_rate": 3.9627666821041545e-05,
      "loss": 3.2811,
      "num_input_tokens_seen": 92688,
      "step": 720
    },
    {
      "epoch": 0.30526315789473685,
      "grad_norm": 7.9203715324401855,
      "learning_rate": 3.949325868510083e-05,
      "loss": 3.6912,
      "num_input_tokens_seen": 93312,
      "step": 725
    },
    {
      "epoch": 0.30736842105263157,
      "grad_norm": 7.217921733856201,
      "learning_rate": 3.935821656707359e-05,
      "loss": 2.7706,
      "num_input_tokens_seen": 93872,
      "step": 730
    },
    {
      "epoch": 0.3094736842105263,
      "grad_norm": 3.361421823501587,
      "learning_rate": 3.9222546374139533e-05,
      "loss": 3.7275,
      "num_input_tokens_seen": 94480,
      "step": 735
    },
    {
      "epoch": 0.31157894736842107,
      "grad_norm": 5.755743980407715,
      "learning_rate": 3.9086254040952416e-05,
      "loss": 3.4091,
      "num_input_tokens_seen": 95040,
      "step": 740
    },
    {
      "epoch": 0.3136842105263158,
      "grad_norm": 2.512439012527466,
      "learning_rate": 3.894934552938041e-05,
      "loss": 3.2579,
      "num_input_tokens_seen": 95696,
      "step": 745
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 6.869725227355957,
      "learning_rate": 3.8811826828245334e-05,
      "loss": 2.8736,
      "num_input_tokens_seen": 96352,
      "step": 750
    },
    {
      "epoch": 0.3178947368421053,
      "grad_norm": 4.974020957946777,
      "learning_rate": 3.867370395306067e-05,
      "loss": 2.5466,
      "num_input_tokens_seen": 97184,
      "step": 755
    },
    {
      "epoch": 0.32,
      "grad_norm": 8.7909574508667,
      "learning_rate": 3.853498294576845e-05,
      "loss": 3.1885,
      "num_input_tokens_seen": 97744,
      "step": 760
    },
    {
      "epoch": 0.32210526315789473,
      "grad_norm": 8.156033515930176,
      "learning_rate": 3.8395669874474915e-05,
      "loss": 3.6878,
      "num_input_tokens_seen": 98272,
      "step": 765
    },
    {
      "epoch": 0.32421052631578945,
      "grad_norm": 6.218159198760986,
      "learning_rate": 3.825577083318512e-05,
      "loss": 3.2699,
      "num_input_tokens_seen": 98992,
      "step": 770
    },
    {
      "epoch": 0.3263157894736842,
      "grad_norm": 7.318730354309082,
      "learning_rate": 3.8115291941536345e-05,
      "loss": 3.1887,
      "num_input_tokens_seen": 99632,
      "step": 775
    },
    {
      "epoch": 0.32842105263157895,
      "grad_norm": 7.037641525268555,
      "learning_rate": 3.797423934453038e-05,
      "loss": 3.2489,
      "num_input_tokens_seen": 100192,
      "step": 780
    },
    {
      "epoch": 0.33052631578947367,
      "grad_norm": 5.422025680541992,
      "learning_rate": 3.783261921226479e-05,
      "loss": 3.6458,
      "num_input_tokens_seen": 100704,
      "step": 785
    },
    {
      "epoch": 0.33263157894736844,
      "grad_norm": 7.227110385894775,
      "learning_rate": 3.7690437739662924e-05,
      "loss": 2.6975,
      "num_input_tokens_seen": 101504,
      "step": 790
    },
    {
      "epoch": 0.33473684210526317,
      "grad_norm": 3.2489235401153564,
      "learning_rate": 3.7547701146203005e-05,
      "loss": 2.5729,
      "num_input_tokens_seen": 102336,
      "step": 795
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 5.875162124633789,
      "learning_rate": 3.7404415675646054e-05,
      "loss": 3.1426,
      "num_input_tokens_seen": 102976,
      "step": 800
    },
    {
      "epoch": 0.3368421052631579,
      "eval_loss": 3.157048225402832,
      "eval_runtime": 21.3317,
      "eval_samples_per_second": 23.439,
      "eval_steps_per_second": 11.72,
      "num_input_tokens_seen": 102976,
      "step": 800
    },
    {
      "epoch": 0.3389473684210526,
      "grad_norm": 5.515801906585693,
      "learning_rate": 3.726058759576271e-05,
      "loss": 2.5607,
      "num_input_tokens_seen": 103760,
      "step": 805
    },
    {
      "epoch": 0.3410526315789474,
      "grad_norm": 6.290920257568359,
      "learning_rate": 3.711622319805913e-05,
      "loss": 3.128,
      "num_input_tokens_seen": 104272,
      "step": 810
    },
    {
      "epoch": 0.3431578947368421,
      "grad_norm": 7.0145087242126465,
      "learning_rate": 3.697132879750174e-05,
      "loss": 2.8597,
      "num_input_tokens_seen": 104880,
      "step": 815
    },
    {
      "epoch": 0.3452631578947368,
      "grad_norm": 6.190591812133789,
      "learning_rate": 3.6825910732241026e-05,
      "loss": 3.5617,
      "num_input_tokens_seen": 105440,
      "step": 820
    },
    {
      "epoch": 0.3473684210526316,
      "grad_norm": 8.005406379699707,
      "learning_rate": 3.667997536333424e-05,
      "loss": 3.2266,
      "num_input_tokens_seen": 105984,
      "step": 825
    },
    {
      "epoch": 0.3494736842105263,
      "grad_norm": 7.040084362030029,
      "learning_rate": 3.65335290744672e-05,
      "loss": 3.0301,
      "num_input_tokens_seen": 106720,
      "step": 830
    },
    {
      "epoch": 0.35157894736842105,
      "grad_norm": 5.619962215423584,
      "learning_rate": 3.6386578271674984e-05,
      "loss": 2.9117,
      "num_input_tokens_seen": 107392,
      "step": 835
    },
    {
      "epoch": 0.35368421052631577,
      "grad_norm": 8.019217491149902,
      "learning_rate": 3.623912938306176e-05,
      "loss": 3.4049,
      "num_input_tokens_seen": 107920,
      "step": 840
    },
    {
      "epoch": 0.35578947368421054,
      "grad_norm": 2.9099061489105225,
      "learning_rate": 3.6091188858519607e-05,
      "loss": 3.2627,
      "num_input_tokens_seen": 108528,
      "step": 845
    },
    {
      "epoch": 0.35789473684210527,
      "grad_norm": 7.037422180175781,
      "learning_rate": 3.5942763169446295e-05,
      "loss": 2.5905,
      "num_input_tokens_seen": 109376,
      "step": 850
    },
    {
      "epoch": 0.36,
      "grad_norm": 6.472799301147461,
      "learning_rate": 3.579385880846232e-05,
      "loss": 3.5297,
      "num_input_tokens_seen": 110048,
      "step": 855
    },
    {
      "epoch": 0.36210526315789476,
      "grad_norm": 7.292938232421875,
      "learning_rate": 3.564448228912682e-05,
      "loss": 2.3537,
      "num_input_tokens_seen": 110720,
      "step": 860
    },
    {
      "epoch": 0.3642105263157895,
      "grad_norm": 6.758236885070801,
      "learning_rate": 3.549464014565265e-05,
      "loss": 2.8941,
      "num_input_tokens_seen": 111408,
      "step": 865
    },
    {
      "epoch": 0.3663157894736842,
      "grad_norm": 4.073180198669434,
      "learning_rate": 3.534433893262058e-05,
      "loss": 3.6315,
      "num_input_tokens_seen": 112032,
      "step": 870
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 9.296513557434082,
      "learning_rate": 3.519358522469259e-05,
      "loss": 3.3729,
      "num_input_tokens_seen": 112592,
      "step": 875
    },
    {
      "epoch": 0.3705263157894737,
      "grad_norm": 6.642693519592285,
      "learning_rate": 3.504238561632424e-05,
      "loss": 3.4379,
      "num_input_tokens_seen": 113136,
      "step": 880
    },
    {
      "epoch": 0.3726315789473684,
      "grad_norm": 5.044366359710693,
      "learning_rate": 3.489074672147621e-05,
      "loss": 3.3523,
      "num_input_tokens_seen": 113728,
      "step": 885
    },
    {
      "epoch": 0.37473684210526315,
      "grad_norm": 5.917368412017822,
      "learning_rate": 3.473867517332501e-05,
      "loss": 3.5194,
      "num_input_tokens_seen": 114416,
      "step": 890
    },
    {
      "epoch": 0.37684210526315787,
      "grad_norm": 10.968324661254883,
      "learning_rate": 3.458617762397279e-05,
      "loss": 3.3524,
      "num_input_tokens_seen": 114944,
      "step": 895
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 6.620034694671631,
      "learning_rate": 3.4433260744156396e-05,
      "loss": 3.101,
      "num_input_tokens_seen": 115536,
      "step": 900
    },
    {
      "epoch": 0.37894736842105264,
      "eval_loss": 3.1375341415405273,
      "eval_runtime": 20.7823,
      "eval_samples_per_second": 24.059,
      "eval_steps_per_second": 12.029,
      "num_input_tokens_seen": 115536,
      "step": 900
    },
    {
      "epoch": 0.38105263157894737,
      "grad_norm": 5.726701259613037,
      "learning_rate": 3.427993122295552e-05,
      "loss": 3.3473,
      "num_input_tokens_seen": 116160,
      "step": 905
    },
    {
      "epoch": 0.3831578947368421,
      "grad_norm": 6.464386463165283,
      "learning_rate": 3.412619576750014e-05,
      "loss": 3.0013,
      "num_input_tokens_seen": 116720,
      "step": 910
    },
    {
      "epoch": 0.38526315789473686,
      "grad_norm": 6.10352087020874,
      "learning_rate": 3.397206110267713e-05,
      "loss": 3.0683,
      "num_input_tokens_seen": 117360,
      "step": 915
    },
    {
      "epoch": 0.3873684210526316,
      "grad_norm": 6.360663414001465,
      "learning_rate": 3.381753397083604e-05,
      "loss": 3.2696,
      "num_input_tokens_seen": 118112,
      "step": 920
    },
    {
      "epoch": 0.3894736842105263,
      "grad_norm": 6.36335563659668,
      "learning_rate": 3.3662621131494204e-05,
      "loss": 3.1551,
      "num_input_tokens_seen": 118784,
      "step": 925
    },
    {
      "epoch": 0.391578947368421,
      "grad_norm": 7.843230247497559,
      "learning_rate": 3.350732936104108e-05,
      "loss": 2.4573,
      "num_input_tokens_seen": 119616,
      "step": 930
    },
    {
      "epoch": 0.3936842105263158,
      "grad_norm": 4.782888412475586,
      "learning_rate": 3.335166545244178e-05,
      "loss": 3.0543,
      "num_input_tokens_seen": 120448,
      "step": 935
    },
    {
      "epoch": 0.3957894736842105,
      "grad_norm": 5.410915851593018,
      "learning_rate": 3.319563621493994e-05,
      "loss": 3.6002,
      "num_input_tokens_seen": 121008,
      "step": 940
    },
    {
      "epoch": 0.39789473684210525,
      "grad_norm": 7.571856498718262,
      "learning_rate": 3.3039248473759885e-05,
      "loss": 3.206,
      "num_input_tokens_seen": 121552,
      "step": 945
    },
    {
      "epoch": 0.4,
      "grad_norm": 8.490530967712402,
      "learning_rate": 3.2882509069808044e-05,
      "loss": 2.6314,
      "num_input_tokens_seen": 122256,
      "step": 950
    },
    {
      "epoch": 0.40210526315789474,
      "grad_norm": 6.643606662750244,
      "learning_rate": 3.272542485937369e-05,
      "loss": 2.9055,
      "num_input_tokens_seen": 122928,
      "step": 955
    },
    {
      "epoch": 0.40421052631578946,
      "grad_norm": 4.136000156402588,
      "learning_rate": 3.2568002713829084e-05,
      "loss": 2.8356,
      "num_input_tokens_seen": 123664,
      "step": 960
    },
    {
      "epoch": 0.4063157894736842,
      "grad_norm": 3.604309558868408,
      "learning_rate": 3.241024951932885e-05,
      "loss": 2.9538,
      "num_input_tokens_seen": 124256,
      "step": 965
    },
    {
      "epoch": 0.40842105263157896,
      "grad_norm": 7.433534145355225,
      "learning_rate": 3.225217217650876e-05,
      "loss": 3.2552,
      "num_input_tokens_seen": 124864,
      "step": 970
    },
    {
      "epoch": 0.4105263157894737,
      "grad_norm": 6.31335973739624,
      "learning_rate": 3.2093777600183875e-05,
      "loss": 3.399,
      "num_input_tokens_seen": 125616,
      "step": 975
    },
    {
      "epoch": 0.4126315789473684,
      "grad_norm": 5.291494846343994,
      "learning_rate": 3.1935072719046115e-05,
      "loss": 3.2976,
      "num_input_tokens_seen": 126160,
      "step": 980
    },
    {
      "epoch": 0.4147368421052632,
      "grad_norm": 6.659648418426514,
      "learning_rate": 3.1776064475361114e-05,
      "loss": 2.6787,
      "num_input_tokens_seen": 126928,
      "step": 985
    },
    {
      "epoch": 0.4168421052631579,
      "grad_norm": 6.5285258293151855,
      "learning_rate": 3.161675982466454e-05,
      "loss": 3.0148,
      "num_input_tokens_seen": 127824,
      "step": 990
    },
    {
      "epoch": 0.4189473684210526,
      "grad_norm": 3.0640006065368652,
      "learning_rate": 3.145716573545792e-05,
      "loss": 2.9179,
      "num_input_tokens_seen": 128496,
      "step": 995
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 3.517613649368286,
      "learning_rate": 3.129728918890371e-05,
      "loss": 3.3436,
      "num_input_tokens_seen": 129104,
      "step": 1000
    },
    {
      "epoch": 0.42105263157894735,
      "eval_loss": 3.1259236335754395,
      "eval_runtime": 20.5644,
      "eval_samples_per_second": 24.314,
      "eval_steps_per_second": 12.157,
      "num_input_tokens_seen": 129104,
      "step": 1000
    },
    {
      "epoch": 0.4231578947368421,
      "grad_norm": 3.706281900405884,
      "learning_rate": 3.1137137178519985e-05,
      "loss": 3.0361,
      "num_input_tokens_seen": 129728,
      "step": 1005
    },
    {
      "epoch": 0.42526315789473684,
      "grad_norm": 6.263809680938721,
      "learning_rate": 3.0976716709874496e-05,
      "loss": 4.0744,
      "num_input_tokens_seen": 130256,
      "step": 1010
    },
    {
      "epoch": 0.42736842105263156,
      "grad_norm": 7.12708044052124,
      "learning_rate": 3.081603480027826e-05,
      "loss": 3.0169,
      "num_input_tokens_seen": 130880,
      "step": 1015
    },
    {
      "epoch": 0.42947368421052634,
      "grad_norm": 3.2109436988830566,
      "learning_rate": 3.065509847847851e-05,
      "loss": 2.7457,
      "num_input_tokens_seen": 131568,
      "step": 1020
    },
    {
      "epoch": 0.43157894736842106,
      "grad_norm": 4.197145462036133,
      "learning_rate": 3.0493914784351328e-05,
      "loss": 2.7009,
      "num_input_tokens_seen": 132128,
      "step": 1025
    },
    {
      "epoch": 0.4336842105263158,
      "grad_norm": 7.7999091148376465,
      "learning_rate": 3.0332490768593675e-05,
      "loss": 3.2792,
      "num_input_tokens_seen": 132832,
      "step": 1030
    },
    {
      "epoch": 0.4357894736842105,
      "grad_norm": 7.479982852935791,
      "learning_rate": 3.017083349241492e-05,
      "loss": 2.6386,
      "num_input_tokens_seen": 133488,
      "step": 1035
    },
    {
      "epoch": 0.4378947368421053,
      "grad_norm": 2.8392317295074463,
      "learning_rate": 3.0008950027228033e-05,
      "loss": 2.6743,
      "num_input_tokens_seen": 134288,
      "step": 1040
    },
    {
      "epoch": 0.44,
      "grad_norm": 5.863373756408691,
      "learning_rate": 2.984684745434021e-05,
      "loss": 2.6163,
      "num_input_tokens_seen": 135088,
      "step": 1045
    },
    {
      "epoch": 0.4421052631578947,
      "grad_norm": 3.552429437637329,
      "learning_rate": 2.9684532864643122e-05,
      "loss": 3.0771,
      "num_input_tokens_seen": 135808,
      "step": 1050
    },
    {
      "epoch": 0.4442105263157895,
      "grad_norm": 3.51658296585083,
      "learning_rate": 2.952201335830275e-05,
      "loss": 3.0567,
      "num_input_tokens_seen": 136464,
      "step": 1055
    },
    {
      "epoch": 0.4463157894736842,
      "grad_norm": 6.143110752105713,
      "learning_rate": 2.9359296044448794e-05,
      "loss": 2.6468,
      "num_input_tokens_seen": 137120,
      "step": 1060
    },
    {
      "epoch": 0.44842105263157894,
      "grad_norm": 9.046653747558594,
      "learning_rate": 2.9196388040863693e-05,
      "loss": 3.7056,
      "num_input_tokens_seen": 137744,
      "step": 1065
    },
    {
      "epoch": 0.45052631578947366,
      "grad_norm": 9.399657249450684,
      "learning_rate": 2.9033296473671278e-05,
      "loss": 2.609,
      "num_input_tokens_seen": 138384,
      "step": 1070
    },
    {
      "epoch": 0.45263157894736844,
      "grad_norm": 6.991403102874756,
      "learning_rate": 2.8870028477025042e-05,
      "loss": 2.9277,
      "num_input_tokens_seen": 139072,
      "step": 1075
    },
    {
      "epoch": 0.45473684210526316,
      "grad_norm": 6.533243656158447,
      "learning_rate": 2.870659119279605e-05,
      "loss": 3.3649,
      "num_input_tokens_seen": 139680,
      "step": 1080
    },
    {
      "epoch": 0.4568421052631579,
      "grad_norm": 7.684907913208008,
      "learning_rate": 2.8542991770260608e-05,
      "loss": 2.743,
      "num_input_tokens_seen": 140384,
      "step": 1085
    },
    {
      "epoch": 0.4589473684210526,
      "grad_norm": 6.984562873840332,
      "learning_rate": 2.8379237365787426e-05,
      "loss": 2.8093,
      "num_input_tokens_seen": 141024,
      "step": 1090
    },
    {
      "epoch": 0.4610526315789474,
      "grad_norm": 7.385660171508789,
      "learning_rate": 2.8215335142524657e-05,
      "loss": 3.2659,
      "num_input_tokens_seen": 141520,
      "step": 1095
    },
    {
      "epoch": 0.4631578947368421,
      "grad_norm": 8.597911834716797,
      "learning_rate": 2.8051292270086503e-05,
      "loss": 3.2059,
      "num_input_tokens_seen": 142176,
      "step": 1100
    },
    {
      "epoch": 0.4631578947368421,
      "eval_loss": 3.088874101638794,
      "eval_runtime": 21.713,
      "eval_samples_per_second": 23.028,
      "eval_steps_per_second": 11.514,
      "num_input_tokens_seen": 142176,
      "step": 1100
    },
    {
      "epoch": 0.4652631578947368,
      "grad_norm": 3.734241008758545,
      "learning_rate": 2.788711592423966e-05,
      "loss": 2.5484,
      "num_input_tokens_seen": 143008,
      "step": 1105
    },
    {
      "epoch": 0.4673684210526316,
      "grad_norm": 9.137406349182129,
      "learning_rate": 2.7722813286589316e-05,
      "loss": 3.5024,
      "num_input_tokens_seen": 143568,
      "step": 1110
    },
    {
      "epoch": 0.4694736842105263,
      "grad_norm": 6.121912479400635,
      "learning_rate": 2.755839154426513e-05,
      "loss": 2.9845,
      "num_input_tokens_seen": 144272,
      "step": 1115
    },
    {
      "epoch": 0.47157894736842104,
      "grad_norm": 7.885390281677246,
      "learning_rate": 2.7393857889606756e-05,
      "loss": 2.6784,
      "num_input_tokens_seen": 144944,
      "step": 1120
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 8.315886497497559,
      "learning_rate": 2.722921951984927e-05,
      "loss": 3.1619,
      "num_input_tokens_seen": 145632,
      "step": 1125
    },
    {
      "epoch": 0.47578947368421054,
      "grad_norm": 9.628824234008789,
      "learning_rate": 2.7064483636808313e-05,
      "loss": 3.1394,
      "num_input_tokens_seen": 146256,
      "step": 1130
    },
    {
      "epoch": 0.47789473684210526,
      "grad_norm": 7.490114212036133,
      "learning_rate": 2.689965744656508e-05,
      "loss": 3.4272,
      "num_input_tokens_seen": 146848,
      "step": 1135
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.576737880706787,
      "learning_rate": 2.6734748159151102e-05,
      "loss": 3.1131,
      "num_input_tokens_seen": 147456,
      "step": 1140
    },
    {
      "epoch": 0.48210526315789476,
      "grad_norm": 6.783941745758057,
      "learning_rate": 2.656976298823284e-05,
      "loss": 2.4894,
      "num_input_tokens_seen": 148240,
      "step": 1145
    },
    {
      "epoch": 0.4842105263157895,
      "grad_norm": 5.665360450744629,
      "learning_rate": 2.6404709150796137e-05,
      "loss": 2.961,
      "num_input_tokens_seen": 148864,
      "step": 1150
    },
    {
      "epoch": 0.4863157894736842,
      "grad_norm": 7.185101509094238,
      "learning_rate": 2.623959386683056e-05,
      "loss": 3.0204,
      "num_input_tokens_seen": 149568,
      "step": 1155
    },
    {
      "epoch": 0.4884210526315789,
      "grad_norm": 3.023557186126709,
      "learning_rate": 2.6074424359013517e-05,
      "loss": 3.104,
      "num_input_tokens_seen": 150320,
      "step": 1160
    },
    {
      "epoch": 0.4905263157894737,
      "grad_norm": 2.9198670387268066,
      "learning_rate": 2.5909207852394363e-05,
      "loss": 2.9611,
      "num_input_tokens_seen": 151040,
      "step": 1165
    },
    {
      "epoch": 0.4926315789473684,
      "grad_norm": 6.565184593200684,
      "learning_rate": 2.5743951574078314e-05,
      "loss": 2.8824,
      "num_input_tokens_seen": 151808,
      "step": 1170
    },
    {
      "epoch": 0.49473684210526314,
      "grad_norm": 7.136038780212402,
      "learning_rate": 2.5578662752910347e-05,
      "loss": 3.1135,
      "num_input_tokens_seen": 152352,
      "step": 1175
    },
    {
      "epoch": 0.4968421052631579,
      "grad_norm": 7.393047332763672,
      "learning_rate": 2.5413348619158967e-05,
      "loss": 2.9705,
      "num_input_tokens_seen": 152992,
      "step": 1180
    },
    {
      "epoch": 0.49894736842105264,
      "grad_norm": 7.747554779052734,
      "learning_rate": 2.5248016404199908e-05,
      "loss": 2.6599,
      "num_input_tokens_seen": 153648,
      "step": 1185
    },
    {
      "epoch": 0.5010526315789474,
      "grad_norm": 6.3294219970703125,
      "learning_rate": 2.508267334019988e-05,
      "loss": 3.4948,
      "num_input_tokens_seen": 154192,
      "step": 1190
    },
    {
      "epoch": 0.5031578947368421,
      "grad_norm": 3.7944674491882324,
      "learning_rate": 2.4917326659800123e-05,
      "loss": 2.7937,
      "num_input_tokens_seen": 154768,
      "step": 1195
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 8.302847862243652,
      "learning_rate": 2.475198359580009e-05,
      "loss": 3.2607,
      "num_input_tokens_seen": 155360,
      "step": 1200
    },
    {
      "epoch": 0.5052631578947369,
      "eval_loss": 3.0811386108398438,
      "eval_runtime": 19.3295,
      "eval_samples_per_second": 25.867,
      "eval_steps_per_second": 12.934,
      "num_input_tokens_seen": 155360,
      "step": 1200
    },
    {
      "epoch": 0.5073684210526316,
      "grad_norm": 4.6022186279296875,
      "learning_rate": 2.458665138084104e-05,
      "loss": 2.8833,
      "num_input_tokens_seen": 156176,
      "step": 1205
    },
    {
      "epoch": 0.5094736842105263,
      "grad_norm": 9.753704071044922,
      "learning_rate": 2.4421337247089655e-05,
      "loss": 3.0525,
      "num_input_tokens_seen": 156784,
      "step": 1210
    },
    {
      "epoch": 0.511578947368421,
      "grad_norm": 6.436235427856445,
      "learning_rate": 2.425604842592169e-05,
      "loss": 3.5911,
      "num_input_tokens_seen": 157424,
      "step": 1215
    },
    {
      "epoch": 0.5136842105263157,
      "grad_norm": 7.303015232086182,
      "learning_rate": 2.4090792147605647e-05,
      "loss": 2.8173,
      "num_input_tokens_seen": 158128,
      "step": 1220
    },
    {
      "epoch": 0.5157894736842106,
      "grad_norm": 8.383377075195312,
      "learning_rate": 2.392557564098649e-05,
      "loss": 2.9395,
      "num_input_tokens_seen": 158784,
      "step": 1225
    },
    {
      "epoch": 0.5178947368421053,
      "grad_norm": 8.447127342224121,
      "learning_rate": 2.3760406133169443e-05,
      "loss": 3.643,
      "num_input_tokens_seen": 159280,
      "step": 1230
    },
    {
      "epoch": 0.52,
      "grad_norm": 6.229928493499756,
      "learning_rate": 2.3595290849203862e-05,
      "loss": 3.1131,
      "num_input_tokens_seen": 159840,
      "step": 1235
    },
    {
      "epoch": 0.5221052631578947,
      "grad_norm": 7.979696750640869,
      "learning_rate": 2.3430237011767167e-05,
      "loss": 2.9351,
      "num_input_tokens_seen": 160416,
      "step": 1240
    },
    {
      "epoch": 0.5242105263157895,
      "grad_norm": 9.784839630126953,
      "learning_rate": 2.32652518408489e-05,
      "loss": 2.2889,
      "num_input_tokens_seen": 161120,
      "step": 1245
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 7.285416126251221,
      "learning_rate": 2.3100342553434924e-05,
      "loss": 3.7082,
      "num_input_tokens_seen": 161728,
      "step": 1250
    },
    {
      "epoch": 0.5284210526315789,
      "grad_norm": 8.140905380249023,
      "learning_rate": 2.2935516363191693e-05,
      "loss": 3.1545,
      "num_input_tokens_seen": 162496,
      "step": 1255
    },
    {
      "epoch": 0.5305263157894737,
      "grad_norm": 7.0224480628967285,
      "learning_rate": 2.2770780480150744e-05,
      "loss": 3.3456,
      "num_input_tokens_seen": 162976,
      "step": 1260
    },
    {
      "epoch": 0.5326315789473685,
      "grad_norm": 3.493330717086792,
      "learning_rate": 2.2606142110393247e-05,
      "loss": 3.2902,
      "num_input_tokens_seen": 163600,
      "step": 1265
    },
    {
      "epoch": 0.5347368421052632,
      "grad_norm": 8.467061996459961,
      "learning_rate": 2.2441608455734873e-05,
      "loss": 2.9682,
      "num_input_tokens_seen": 164128,
      "step": 1270
    },
    {
      "epoch": 0.5368421052631579,
      "grad_norm": 8.28370475769043,
      "learning_rate": 2.2277186713410687e-05,
      "loss": 3.363,
      "num_input_tokens_seen": 164848,
      "step": 1275
    },
    {
      "epoch": 0.5389473684210526,
      "grad_norm": 6.318331241607666,
      "learning_rate": 2.2112884075760347e-05,
      "loss": 2.636,
      "num_input_tokens_seen": 165552,
      "step": 1280
    },
    {
      "epoch": 0.5410526315789473,
      "grad_norm": 6.917469501495361,
      "learning_rate": 2.19487077299135e-05,
      "loss": 3.3949,
      "num_input_tokens_seen": 166176,
      "step": 1285
    },
    {
      "epoch": 0.5431578947368421,
      "grad_norm": 6.556621551513672,
      "learning_rate": 2.1784664857475352e-05,
      "loss": 3.1511,
      "num_input_tokens_seen": 166672,
      "step": 1290
    },
    {
      "epoch": 0.5452631578947369,
      "grad_norm": 3.6133346557617188,
      "learning_rate": 2.1620762634212586e-05,
      "loss": 2.8032,
      "num_input_tokens_seen": 167392,
      "step": 1295
    },
    {
      "epoch": 0.5473684210526316,
      "grad_norm": 6.281291484832764,
      "learning_rate": 2.1457008229739394e-05,
      "loss": 2.8431,
      "num_input_tokens_seen": 168080,
      "step": 1300
    },
    {
      "epoch": 0.5473684210526316,
      "eval_loss": 3.060394763946533,
      "eval_runtime": 18.9707,
      "eval_samples_per_second": 26.356,
      "eval_steps_per_second": 13.178,
      "num_input_tokens_seen": 168080,
      "step": 1300
    },
    {
      "epoch": 0.5494736842105263,
      "grad_norm": 7.926747798919678,
      "learning_rate": 2.1293408807203947e-05,
      "loss": 2.9994,
      "num_input_tokens_seen": 168816,
      "step": 1305
    },
    {
      "epoch": 0.5515789473684211,
      "grad_norm": 7.976357460021973,
      "learning_rate": 2.1129971522974967e-05,
      "loss": 3.1117,
      "num_input_tokens_seen": 169456,
      "step": 1310
    },
    {
      "epoch": 0.5536842105263158,
      "grad_norm": 14.778353691101074,
      "learning_rate": 2.0966703526328728e-05,
      "loss": 3.0262,
      "num_input_tokens_seen": 170032,
      "step": 1315
    },
    {
      "epoch": 0.5557894736842105,
      "grad_norm": 6.6729230880737305,
      "learning_rate": 2.080361195913631e-05,
      "loss": 2.9819,
      "num_input_tokens_seen": 170560,
      "step": 1320
    },
    {
      "epoch": 0.5578947368421052,
      "grad_norm": 9.366944313049316,
      "learning_rate": 2.0640703955551212e-05,
      "loss": 3.2148,
      "num_input_tokens_seen": 171168,
      "step": 1325
    },
    {
      "epoch": 0.56,
      "grad_norm": 6.804953575134277,
      "learning_rate": 2.047798664169726e-05,
      "loss": 3.6074,
      "num_input_tokens_seen": 171744,
      "step": 1330
    },
    {
      "epoch": 0.5621052631578948,
      "grad_norm": 9.633087158203125,
      "learning_rate": 2.031546713535688e-05,
      "loss": 2.5756,
      "num_input_tokens_seen": 172560,
      "step": 1335
    },
    {
      "epoch": 0.5642105263157895,
      "grad_norm": 9.197052955627441,
      "learning_rate": 2.0153152545659798e-05,
      "loss": 3.2645,
      "num_input_tokens_seen": 173200,
      "step": 1340
    },
    {
      "epoch": 0.5663157894736842,
      "grad_norm": 7.425899028778076,
      "learning_rate": 1.9991049972771972e-05,
      "loss": 2.6577,
      "num_input_tokens_seen": 174080,
      "step": 1345
    },
    {
      "epoch": 0.5684210526315789,
      "grad_norm": 11.483154296875,
      "learning_rate": 1.9829166507585083e-05,
      "loss": 3.6653,
      "num_input_tokens_seen": 174560,
      "step": 1350
    },
    {
      "epoch": 0.5705263157894737,
      "grad_norm": 4.616602897644043,
      "learning_rate": 1.9667509231406334e-05,
      "loss": 3.3071,
      "num_input_tokens_seen": 175184,
      "step": 1355
    },
    {
      "epoch": 0.5726315789473684,
      "grad_norm": 6.981142044067383,
      "learning_rate": 1.9506085215648675e-05,
      "loss": 3.0256,
      "num_input_tokens_seen": 175872,
      "step": 1360
    },
    {
      "epoch": 0.5747368421052632,
      "grad_norm": 10.552703857421875,
      "learning_rate": 1.93449015215215e-05,
      "loss": 3.2127,
      "num_input_tokens_seen": 176432,
      "step": 1365
    },
    {
      "epoch": 0.5768421052631579,
      "grad_norm": 7.310719966888428,
      "learning_rate": 1.9183965199721745e-05,
      "loss": 2.8919,
      "num_input_tokens_seen": 177072,
      "step": 1370
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 4.100451469421387,
      "learning_rate": 1.90232832901255e-05,
      "loss": 2.2461,
      "num_input_tokens_seen": 177792,
      "step": 1375
    },
    {
      "epoch": 0.5810526315789474,
      "grad_norm": 9.246238708496094,
      "learning_rate": 1.8862862821480025e-05,
      "loss": 2.5727,
      "num_input_tokens_seen": 179168,
      "step": 1380
    },
    {
      "epoch": 0.5831578947368421,
      "grad_norm": 6.5480875968933105,
      "learning_rate": 1.87027108110963e-05,
      "loss": 3.5131,
      "num_input_tokens_seen": 179712,
      "step": 1385
    },
    {
      "epoch": 0.5852631578947368,
      "grad_norm": 7.038618564605713,
      "learning_rate": 1.8542834264542092e-05,
      "loss": 3.0084,
      "num_input_tokens_seen": 180256,
      "step": 1390
    },
    {
      "epoch": 0.5873684210526315,
      "grad_norm": 8.34747314453125,
      "learning_rate": 1.8383240175335464e-05,
      "loss": 3.0598,
      "num_input_tokens_seen": 180864,
      "step": 1395
    },
    {
      "epoch": 0.5894736842105263,
      "grad_norm": 7.6554155349731445,
      "learning_rate": 1.8223935524638898e-05,
      "loss": 3.3622,
      "num_input_tokens_seen": 181424,
      "step": 1400
    },
    {
      "epoch": 0.5894736842105263,
      "eval_loss": 3.044952630996704,
      "eval_runtime": 20.7836,
      "eval_samples_per_second": 24.057,
      "eval_steps_per_second": 12.029,
      "num_input_tokens_seen": 181424,
      "step": 1400
    },
{
|
|
"epoch": 0.5915789473684211,
|
|
"grad_norm": 7.480663776397705,
|
|
"learning_rate": 1.806492728095389e-05,
|
|
"loss": 3.0682,
|
|
"num_input_tokens_seen": 181984,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.5936842105263158,
|
|
"grad_norm": 9.622962951660156,
|
|
"learning_rate": 1.7906222399816124e-05,
|
|
"loss": 3.5743,
|
|
"num_input_tokens_seen": 182496,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.5957894736842105,
|
|
"grad_norm": 6.27373743057251,
|
|
"learning_rate": 1.7747827823491252e-05,
|
|
"loss": 2.965,
|
|
"num_input_tokens_seen": 183072,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.5978947368421053,
|
|
"grad_norm": 2.8175039291381836,
|
|
"learning_rate": 1.758975048067116e-05,
|
|
"loss": 2.9435,
|
|
"num_input_tokens_seen": 183664,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 7.884466648101807,
|
|
"learning_rate": 1.7431997286170922e-05,
|
|
"loss": 3.6214,
|
|
"num_input_tokens_seen": 184192,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.6021052631578947,
|
|
"grad_norm": 5.639321804046631,
|
|
"learning_rate": 1.7274575140626318e-05,
|
|
"loss": 2.9336,
|
|
"num_input_tokens_seen": 184896,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.6042105263157894,
|
|
"grad_norm": 10.029356002807617,
|
|
"learning_rate": 1.7117490930191965e-05,
|
|
"loss": 3.0541,
|
|
"num_input_tokens_seen": 185392,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.6063157894736843,
|
|
"grad_norm": 5.941510200500488,
|
|
"learning_rate": 1.696075152624012e-05,
|
|
"loss": 3.0067,
|
|
"num_input_tokens_seen": 185952,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.608421052631579,
|
|
"grad_norm": 7.775386810302734,
|
|
"learning_rate": 1.6804363785060056e-05,
|
|
"loss": 3.6286,
|
|
"num_input_tokens_seen": 186512,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.6105263157894737,
|
|
"grad_norm": 7.570580005645752,
|
|
"learning_rate": 1.6648334547558226e-05,
|
|
"loss": 3.5453,
|
|
"num_input_tokens_seen": 187072,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.6126315789473684,
|
|
"grad_norm": 4.141610145568848,
|
|
"learning_rate": 1.6492670638958924e-05,
|
|
"loss": 2.7635,
|
|
"num_input_tokens_seen": 187712,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.6147368421052631,
|
|
"grad_norm": 6.558705806732178,
|
|
"learning_rate": 1.6337378868505805e-05,
|
|
"loss": 3.2797,
|
|
"num_input_tokens_seen": 188384,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.6168421052631579,
|
|
"grad_norm": 7.0916748046875,
|
|
"learning_rate": 1.6182466029163975e-05,
|
|
"loss": 3.1194,
|
|
"num_input_tokens_seen": 188960,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.6189473684210526,
|
|
"grad_norm": 7.959896564483643,
|
|
"learning_rate": 1.602793889732288e-05,
|
|
"loss": 2.7758,
|
|
"num_input_tokens_seen": 189616,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.6210526315789474,
|
|
"grad_norm": 6.758187770843506,
|
|
"learning_rate": 1.5873804232499863e-05,
|
|
"loss": 3.2369,
|
|
"num_input_tokens_seen": 190160,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.6231578947368421,
|
|
"grad_norm": 7.454896450042725,
|
|
"learning_rate": 1.5720068777044476e-05,
|
|
"loss": 2.9654,
|
|
"num_input_tokens_seen": 190816,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.6252631578947369,
|
|
"grad_norm": 4.9395341873168945,
|
|
"learning_rate": 1.5566739255843606e-05,
|
|
"loss": 3.2288,
|
|
"num_input_tokens_seen": 191520,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.6273684210526316,
|
|
"grad_norm": 6.551896095275879,
|
|
"learning_rate": 1.541382237602721e-05,
|
|
"loss": 2.9903,
|
|
"num_input_tokens_seen": 192256,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.6294736842105263,
|
|
"grad_norm": 3.614539861679077,
|
|
"learning_rate": 1.5261324826675e-05,
|
|
"loss": 2.4733,
|
|
"num_input_tokens_seen": 193008,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.631578947368421,
|
|
"grad_norm": 9.163614273071289,
|
|
"learning_rate": 1.5109253278523799e-05,
|
|
"loss": 2.2921,
|
|
"num_input_tokens_seen": 193696,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.631578947368421,
|
|
"eval_loss": 3.0401594638824463,
|
|
"eval_runtime": 19.007,
|
|
"eval_samples_per_second": 26.306,
|
|
"eval_steps_per_second": 13.153,
|
|
"num_input_tokens_seen": 193696,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.6336842105263157,
|
|
"grad_norm": 8.729575157165527,
|
|
"learning_rate": 1.495761438367577e-05,
|
|
"loss": 3.5125,
|
|
"num_input_tokens_seen": 194272,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.6357894736842106,
|
|
"grad_norm": 8.781404495239258,
|
|
"learning_rate": 1.4806414775307418e-05,
|
|
"loss": 2.9632,
|
|
"num_input_tokens_seen": 194976,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.6378947368421053,
|
|
"grad_norm": 7.9863739013671875,
|
|
"learning_rate": 1.465566106737942e-05,
|
|
"loss": 2.9676,
|
|
"num_input_tokens_seen": 195568,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 6.997982501983643,
|
|
"learning_rate": 1.4505359854347361e-05,
|
|
"loss": 2.4359,
|
|
"num_input_tokens_seen": 196608,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.6421052631578947,
|
|
"grad_norm": 8.560687065124512,
|
|
"learning_rate": 1.4355517710873184e-05,
|
|
"loss": 3.0732,
|
|
"num_input_tokens_seen": 197280,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.6442105263157895,
|
|
"grad_norm": 3.5838141441345215,
|
|
"learning_rate": 1.4206141191537682e-05,
|
|
"loss": 2.4828,
|
|
"num_input_tokens_seen": 198096,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.6463157894736842,
|
|
"grad_norm": 6.076948642730713,
|
|
"learning_rate": 1.4057236830553704e-05,
|
|
"loss": 2.8096,
|
|
"num_input_tokens_seen": 198816,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.6484210526315789,
|
|
"grad_norm": 2.3415188789367676,
|
|
"learning_rate": 1.3908811141480408e-05,
|
|
"loss": 2.2974,
|
|
"num_input_tokens_seen": 199504,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.6505263157894737,
|
|
"grad_norm": 9.881010055541992,
|
|
"learning_rate": 1.3760870616938248e-05,
|
|
"loss": 3.609,
|
|
"num_input_tokens_seen": 200080,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.6526315789473685,
|
|
"grad_norm": 3.0654964447021484,
|
|
"learning_rate": 1.3613421728325018e-05,
|
|
"loss": 2.8781,
|
|
"num_input_tokens_seen": 200928,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.6547368421052632,
|
|
"grad_norm": 8.141066551208496,
|
|
"learning_rate": 1.346647092553281e-05,
|
|
"loss": 3.2405,
|
|
"num_input_tokens_seen": 201472,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.6568421052631579,
|
|
"grad_norm": 6.4536943435668945,
|
|
"learning_rate": 1.3320024636665757e-05,
|
|
"loss": 3.5873,
|
|
"num_input_tokens_seen": 202048,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.6589473684210526,
|
|
"grad_norm": 3.8144352436065674,
|
|
"learning_rate": 1.3174089267758983e-05,
|
|
"loss": 3.0432,
|
|
"num_input_tokens_seen": 202656,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.6610526315789473,
|
|
"grad_norm": 6.641602993011475,
|
|
"learning_rate": 1.3028671202498261e-05,
|
|
"loss": 2.9072,
|
|
"num_input_tokens_seen": 203312,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.6631578947368421,
|
|
"grad_norm": 8.284623146057129,
|
|
"learning_rate": 1.2883776801940884e-05,
|
|
"loss": 3.0814,
|
|
"num_input_tokens_seen": 203872,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.6652631578947369,
|
|
"grad_norm": 3.8963799476623535,
|
|
"learning_rate": 1.2739412404237306e-05,
|
|
"loss": 2.6138,
|
|
"num_input_tokens_seen": 204512,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.6673684210526316,
|
|
"grad_norm": 2.5593197345733643,
|
|
"learning_rate": 1.2595584324353943e-05,
|
|
"loss": 2.8958,
|
|
"num_input_tokens_seen": 205120,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.6694736842105263,
|
|
"grad_norm": 9.582676887512207,
|
|
"learning_rate": 1.245229885379699e-05,
|
|
"loss": 3.1137,
|
|
"num_input_tokens_seen": 205712,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.671578947368421,
|
|
"grad_norm": 7.700749397277832,
|
|
"learning_rate": 1.2309562260337073e-05,
|
|
"loss": 3.2855,
|
|
"num_input_tokens_seen": 206256,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.6736842105263158,
|
|
"grad_norm": 6.43610954284668,
|
|
"learning_rate": 1.216738078773522e-05,
|
|
"loss": 3.1937,
|
|
"num_input_tokens_seen": 206800,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.6736842105263158,
|
|
"eval_loss": 3.031867742538452,
|
|
"eval_runtime": 20.3668,
|
|
"eval_samples_per_second": 24.55,
|
|
"eval_steps_per_second": 12.275,
|
|
"num_input_tokens_seen": 206800,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.6757894736842105,
|
|
"grad_norm": 4.286380767822266,
|
|
"learning_rate": 1.202576065546963e-05,
|
|
"loss": 3.0851,
|
|
"num_input_tokens_seen": 207488,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.6778947368421052,
|
|
"grad_norm": 7.998393535614014,
|
|
"learning_rate": 1.1884708058463668e-05,
|
|
"loss": 3.2747,
|
|
"num_input_tokens_seen": 208128,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 4.960210800170898,
|
|
"learning_rate": 1.1744229166814888e-05,
|
|
"loss": 3.3051,
|
|
"num_input_tokens_seen": 208688,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.6821052631578948,
|
|
"grad_norm": 7.372851371765137,
|
|
"learning_rate": 1.1604330125525079e-05,
|
|
"loss": 2.5032,
|
|
"num_input_tokens_seen": 209808,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.6842105263157895,
|
|
"grad_norm": 7.572737693786621,
|
|
"learning_rate": 1.146501705423155e-05,
|
|
"loss": 2.986,
|
|
"num_input_tokens_seen": 210496,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.6863157894736842,
|
|
"grad_norm": 8.84771728515625,
|
|
"learning_rate": 1.1326296046939333e-05,
|
|
"loss": 3.1372,
|
|
"num_input_tokens_seen": 211088,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.6884210526315789,
|
|
"grad_norm": 7.353373050689697,
|
|
"learning_rate": 1.1188173171754673e-05,
|
|
"loss": 3.011,
|
|
"num_input_tokens_seen": 211680,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.6905263157894737,
|
|
"grad_norm": 3.834928035736084,
|
|
"learning_rate": 1.1050654470619601e-05,
|
|
"loss": 3.3625,
|
|
"num_input_tokens_seen": 212336,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.6926315789473684,
|
|
"grad_norm": 7.206390857696533,
|
|
"learning_rate": 1.091374595904759e-05,
|
|
"loss": 3.3342,
|
|
"num_input_tokens_seen": 212864,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.6947368421052632,
|
|
"grad_norm": 7.2623820304870605,
|
|
"learning_rate": 1.0777453625860472e-05,
|
|
"loss": 3.0298,
|
|
"num_input_tokens_seen": 213456,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.6968421052631579,
|
|
"grad_norm": 7.93752908706665,
|
|
"learning_rate": 1.064178343292641e-05,
|
|
"loss": 3.4735,
|
|
"num_input_tokens_seen": 214016,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.6989473684210527,
|
|
"grad_norm": 3.989210605621338,
|
|
"learning_rate": 1.0506741314899166e-05,
|
|
"loss": 2.811,
|
|
"num_input_tokens_seen": 214736,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.7010526315789474,
|
|
"grad_norm": 6.527041435241699,
|
|
"learning_rate": 1.0372333178958462e-05,
|
|
"loss": 2.6078,
|
|
"num_input_tokens_seen": 215376,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.7031578947368421,
|
|
"grad_norm": 7.541158676147461,
|
|
"learning_rate": 1.0238564904551574e-05,
|
|
"loss": 3.1118,
|
|
"num_input_tokens_seen": 216000,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.7052631578947368,
|
|
"grad_norm": 8.171647071838379,
|
|
"learning_rate": 1.0105442343136184e-05,
|
|
"loss": 2.7288,
|
|
"num_input_tokens_seen": 216672,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.7073684210526315,
|
|
"grad_norm": 7.328914642333984,
|
|
"learning_rate": 9.972971317924374e-06,
|
|
"loss": 2.736,
|
|
"num_input_tokens_seen": 217344,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.7094736842105264,
|
|
"grad_norm": 10.017664909362793,
|
|
"learning_rate": 9.841157623627947e-06,
|
|
"loss": 2.043,
|
|
"num_input_tokens_seen": 218048,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.7115789473684211,
|
|
"grad_norm": 8.35176944732666,
|
|
"learning_rate": 9.710007026204895e-06,
|
|
"loss": 3.133,
|
|
"num_input_tokens_seen": 218672,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.7136842105263158,
|
|
"grad_norm": 4.202718734741211,
|
|
"learning_rate": 9.579525262607226e-06,
|
|
"loss": 2.7477,
|
|
"num_input_tokens_seen": 219408,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.7157894736842105,
|
|
"grad_norm": 8.910264015197754,
|
|
"learning_rate": 9.449718040529987e-06,
|
|
"loss": 3.2635,
|
|
"num_input_tokens_seen": 219920,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.7157894736842105,
|
|
"eval_loss": 3.0285239219665527,
|
|
"eval_runtime": 21.5591,
|
|
"eval_samples_per_second": 23.192,
|
|
"eval_steps_per_second": 11.596,
|
|
"num_input_tokens_seen": 219920,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.7178947368421053,
|
|
"grad_norm": 7.725574016571045,
|
|
"learning_rate": 9.320591038161574e-06,
|
|
"loss": 3.1978,
|
|
"num_input_tokens_seen": 220512,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 8.27135181427002,
|
|
"learning_rate": 9.192149903935405e-06,
|
|
"loss": 3.2038,
|
|
"num_input_tokens_seen": 221040,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.7221052631578947,
|
|
"grad_norm": 7.715824127197266,
|
|
"learning_rate": 9.064400256282757e-06,
|
|
"loss": 3.3379,
|
|
"num_input_tokens_seen": 221696,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.7242105263157895,
|
|
"grad_norm": 6.266040325164795,
|
|
"learning_rate": 8.937347683387095e-06,
|
|
"loss": 3.035,
|
|
"num_input_tokens_seen": 222272,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.7263157894736842,
|
|
"grad_norm": 9.326272964477539,
|
|
"learning_rate": 8.810997742939531e-06,
|
|
"loss": 3.2204,
|
|
"num_input_tokens_seen": 222784,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.728421052631579,
|
|
"grad_norm": 7.447837829589844,
|
|
"learning_rate": 8.685355961895784e-06,
|
|
"loss": 3.0992,
|
|
"num_input_tokens_seen": 223344,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.7305263157894737,
|
|
"grad_norm": 7.900444030761719,
|
|
"learning_rate": 8.56042783623439e-06,
|
|
"loss": 2.8659,
|
|
"num_input_tokens_seen": 223840,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.7326315789473684,
|
|
"grad_norm": 7.262746334075928,
|
|
"learning_rate": 8.436218830716258e-06,
|
|
"loss": 2.8752,
|
|
"num_input_tokens_seen": 224624,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.7347368421052631,
|
|
"grad_norm": 8.595057487487793,
|
|
"learning_rate": 8.31273437864569e-06,
|
|
"loss": 2.854,
|
|
"num_input_tokens_seen": 225216,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.7368421052631579,
|
|
"grad_norm": 11.285130500793457,
|
|
"learning_rate": 8.189979881632634e-06,
|
|
"loss": 2.9706,
|
|
"num_input_tokens_seen": 225776,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.7389473684210527,
|
|
"grad_norm": 8.038933753967285,
|
|
"learning_rate": 8.067960709356478e-06,
|
|
"loss": 2.9734,
|
|
"num_input_tokens_seen": 226480,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.7410526315789474,
|
|
"grad_norm": 10.75576400756836,
|
|
"learning_rate": 7.946682199331088e-06,
|
|
"loss": 3.0971,
|
|
"num_input_tokens_seen": 227120,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.7431578947368421,
|
|
"grad_norm": 11.199416160583496,
|
|
"learning_rate": 7.826149656671386e-06,
|
|
"loss": 3.1954,
|
|
"num_input_tokens_seen": 227744,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.7452631578947368,
|
|
"grad_norm": 5.3302836418151855,
|
|
"learning_rate": 7.706368353861269e-06,
|
|
"loss": 2.899,
|
|
"num_input_tokens_seen": 228368,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.7473684210526316,
|
|
"grad_norm": 3.570517063140869,
|
|
"learning_rate": 7.587343530522945e-06,
|
|
"loss": 2.5961,
|
|
"num_input_tokens_seen": 229088,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.7494736842105263,
|
|
"grad_norm": 8.551164627075195,
|
|
"learning_rate": 7.469080393187786e-06,
|
|
"loss": 3.042,
|
|
"num_input_tokens_seen": 229728,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.751578947368421,
|
|
"grad_norm": 8.114623069763184,
|
|
"learning_rate": 7.351584115068535e-06,
|
|
"loss": 3.7636,
|
|
"num_input_tokens_seen": 230240,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.7536842105263157,
|
|
"grad_norm": 10.003973007202148,
|
|
"learning_rate": 7.234859835833021e-06,
|
|
"loss": 3.3422,
|
|
"num_input_tokens_seen": 230880,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.7557894736842106,
|
|
"grad_norm": 6.54116153717041,
|
|
"learning_rate": 7.118912661379368e-06,
|
|
"loss": 2.8134,
|
|
"num_input_tokens_seen": 231568,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.7578947368421053,
|
|
"grad_norm": 7.931966781616211,
|
|
"learning_rate": 7.003747663612581e-06,
|
|
"loss": 2.9374,
|
|
"num_input_tokens_seen": 232224,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.7578947368421053,
|
|
"eval_loss": 3.024580955505371,
|
|
"eval_runtime": 19.415,
|
|
"eval_samples_per_second": 25.753,
|
|
"eval_steps_per_second": 12.877,
|
|
"num_input_tokens_seen": 232224,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 8.41895866394043,
|
|
"learning_rate": 6.889369880222776e-06,
|
|
"loss": 3.1164,
|
|
"num_input_tokens_seen": 232880,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.7621052631578947,
|
|
"grad_norm": 9.935205459594727,
|
|
"learning_rate": 6.775784314464717e-06,
|
|
"loss": 3.4611,
|
|
"num_input_tokens_seen": 233456,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.7642105263157895,
|
|
"grad_norm": 7.748201370239258,
|
|
"learning_rate": 6.662995934939007e-06,
|
|
"loss": 2.6114,
|
|
"num_input_tokens_seen": 234080,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.7663157894736842,
|
|
"grad_norm": 4.554218292236328,
|
|
"learning_rate": 6.551009675374764e-06,
|
|
"loss": 2.9599,
|
|
"num_input_tokens_seen": 234688,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.7684210526315789,
|
|
"grad_norm": 8.75670051574707,
|
|
"learning_rate": 6.439830434413754e-06,
|
|
"loss": 3.4321,
|
|
"num_input_tokens_seen": 235248,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.7705263157894737,
|
|
"grad_norm": 8.14771556854248,
|
|
"learning_rate": 6.329463075396161e-06,
|
|
"loss": 3.1403,
|
|
"num_input_tokens_seen": 235824,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.7726315789473684,
|
|
"grad_norm": 8.070094108581543,
|
|
"learning_rate": 6.219912426147795e-06,
|
|
"loss": 3.1174,
|
|
"num_input_tokens_seen": 236528,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.7747368421052632,
|
|
"grad_norm": 12.188468933105469,
|
|
"learning_rate": 6.111183278768956e-06,
|
|
"loss": 2.9941,
|
|
"num_input_tokens_seen": 237200,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.7768421052631579,
|
|
"grad_norm": 8.94471263885498,
|
|
"learning_rate": 6.003280389424789e-06,
|
|
"loss": 2.8323,
|
|
"num_input_tokens_seen": 237840,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.7789473684210526,
|
|
"grad_norm": 8.164437294006348,
|
|
"learning_rate": 5.896208478137222e-06,
|
|
"loss": 2.867,
|
|
"num_input_tokens_seen": 238560,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.7810526315789473,
|
|
"grad_norm": 3.1200382709503174,
|
|
"learning_rate": 5.78997222857853e-06,
|
|
"loss": 2.7932,
|
|
"num_input_tokens_seen": 239248,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.783157894736842,
|
|
"grad_norm": 9.779284477233887,
|
|
"learning_rate": 5.684576287866411e-06,
|
|
"loss": 2.9749,
|
|
"num_input_tokens_seen": 239808,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.7852631578947369,
|
|
"grad_norm": 8.78986644744873,
|
|
"learning_rate": 5.5800252663607665e-06,
|
|
"loss": 3.3494,
|
|
"num_input_tokens_seen": 240320,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 0.7873684210526316,
|
|
"grad_norm": 7.762701988220215,
|
|
"learning_rate": 5.476323737461955e-06,
|
|
"loss": 3.056,
|
|
"num_input_tokens_seen": 241040,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.7894736842105263,
|
|
"grad_norm": 7.796496868133545,
|
|
"learning_rate": 5.373476237410807e-06,
|
|
"loss": 3.0191,
|
|
"num_input_tokens_seen": 241616,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 0.791578947368421,
|
|
"grad_norm": 8.419198989868164,
|
|
"learning_rate": 5.271487265090163e-06,
|
|
"loss": 3.2889,
|
|
"num_input_tokens_seen": 242144,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.7936842105263158,
|
|
"grad_norm": 7.434835910797119,
|
|
"learning_rate": 5.170361281828054e-06,
|
|
"loss": 2.777,
|
|
"num_input_tokens_seen": 242800,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 0.7957894736842105,
|
|
"grad_norm": 8.441527366638184,
|
|
"learning_rate": 5.070102711202607e-06,
|
|
"loss": 3.3625,
|
|
"num_input_tokens_seen": 243360,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.7978947368421052,
|
|
"grad_norm": 3.879971742630005,
|
|
"learning_rate": 4.970715938848478e-06,
|
|
"loss": 3.1517,
|
|
"num_input_tokens_seen": 244320,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 3.3906750679016113,
|
|
"learning_rate": 4.872205312265074e-06,
|
|
"loss": 3.3592,
|
|
"num_input_tokens_seen": 244976,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"eval_loss": 3.0196211338043213,
|
|
"eval_runtime": 19.5552,
|
|
"eval_samples_per_second": 25.569,
|
|
"eval_steps_per_second": 12.784,
|
|
"num_input_tokens_seen": 244976,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.8021052631578948,
|
|
"grad_norm": 8.258299827575684,
|
|
"learning_rate": 4.7745751406263165e-06,
|
|
"loss": 2.8077,
|
|
"num_input_tokens_seen": 245520,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 0.8042105263157895,
|
|
"grad_norm": 7.004968166351318,
|
|
"learning_rate": 4.677829694592198e-06,
|
|
"loss": 2.9133,
|
|
"num_input_tokens_seen": 246192,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.8063157894736842,
|
|
"grad_norm": 9.79128360748291,
|
|
"learning_rate": 4.581973206121948e-06,
|
|
"loss": 2.9371,
|
|
"num_input_tokens_seen": 246688,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 0.8084210526315789,
|
|
"grad_norm": 4.271524429321289,
|
|
"learning_rate": 4.487009868288888e-06,
|
|
"loss": 2.6443,
|
|
"num_input_tokens_seen": 247312,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.8105263157894737,
|
|
"grad_norm": 6.933757781982422,
|
|
"learning_rate": 4.392943835097069e-06,
|
|
"loss": 2.6296,
|
|
"num_input_tokens_seen": 248080,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 0.8126315789473684,
|
|
"grad_norm": 7.335968017578125,
|
|
"learning_rate": 4.299779221299499e-06,
|
|
"loss": 3.0604,
|
|
"num_input_tokens_seen": 248608,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.8147368421052632,
|
|
"grad_norm": 2.3223562240600586,
|
|
"learning_rate": 4.207520102218213e-06,
|
|
"loss": 2.9399,
|
|
"num_input_tokens_seen": 249216,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 0.8168421052631579,
|
|
"grad_norm": 8.753087043762207,
|
|
"learning_rate": 4.116170513565942e-06,
|
|
"loss": 3.5264,
|
|
"num_input_tokens_seen": 249792,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.8189473684210526,
|
|
"grad_norm": 8.94509506225586,
|
|
"learning_rate": 4.025734451269636e-06,
|
|
"loss": 3.2601,
|
|
"num_input_tokens_seen": 250336,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 0.8210526315789474,
|
|
"grad_norm": 6.4770917892456055,
|
|
"learning_rate": 3.936215871295634e-06,
|
|
"loss": 3.1531,
|
|
"num_input_tokens_seen": 250896,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.8231578947368421,
|
|
"grad_norm": 8.961740493774414,
|
|
"learning_rate": 3.847618689476612e-06,
|
|
"loss": 2.8972,
|
|
"num_input_tokens_seen": 251456,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 0.8252631578947368,
|
|
"grad_norm": 7.780544281005859,
|
|
"learning_rate": 3.7599467813403344e-06,
|
|
"loss": 3.6235,
|
|
"num_input_tokens_seen": 252048,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.8273684210526315,
|
|
"grad_norm": 7.865116596221924,
|
|
"learning_rate": 3.6732039819400683e-06,
|
|
"loss": 2.6362,
|
|
"num_input_tokens_seen": 252672,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 0.8294736842105264,
|
|
"grad_norm": 10.95207691192627,
|
|
"learning_rate": 3.5873940856868656e-06,
|
|
"loss": 3.9149,
|
|
"num_input_tokens_seen": 253232,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.8315789473684211,
|
|
"grad_norm": 12.879585266113281,
|
|
"learning_rate": 3.502520846183577e-06,
|
|
"loss": 3.1847,
|
|
"num_input_tokens_seen": 253968,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 0.8336842105263158,
|
|
"grad_norm": 9.403814315795898,
|
|
"learning_rate": 3.418587976060653e-06,
|
|
"loss": 3.2093,
|
|
"num_input_tokens_seen": 254576,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.8357894736842105,
|
|
"grad_norm": 6.636196136474609,
|
|
"learning_rate": 3.3355991468137394e-06,
|
|
"loss": 3.2709,
|
|
"num_input_tokens_seen": 255120,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 0.8378947368421052,
|
|
"grad_norm": 6.675850868225098,
|
|
"learning_rate": 3.2535579886430718e-06,
|
|
"loss": 3.1014,
|
|
"num_input_tokens_seen": 255728,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 11.94994831085205,
|
|
"learning_rate": 3.1724680902946753e-06,
|
|
"loss": 3.3848,
|
|
"num_input_tokens_seen": 256336,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 0.8421052631578947,
|
|
"grad_norm": 9.998152732849121,
|
|
"learning_rate": 3.0923329989034132e-06,
|
|
"loss": 3.1163,
|
|
"num_input_tokens_seen": 256912,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.8421052631578947,
|
|
"eval_loss": 3.0172553062438965,
|
|
"eval_runtime": 20.2205,
|
|
"eval_samples_per_second": 24.727,
|
|
"eval_steps_per_second": 12.364,
|
|
"num_input_tokens_seen": 256912,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.8442105263157895,
|
|
"grad_norm": 9.193185806274414,
|
|
"learning_rate": 3.013156219837776e-06,
|
|
"loss": 3.1348,
|
|
"num_input_tokens_seen": 257552,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 0.8463157894736842,
|
|
"grad_norm": 0.8085441589355469,
|
|
"learning_rate": 2.9349412165465773e-06,
|
|
"loss": 2.287,
|
|
"num_input_tokens_seen": 259152,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.848421052631579,
|
|
"grad_norm": 8.64601993560791,
|
|
"learning_rate": 2.8576914104074425e-06,
|
|
"loss": 2.9002,
|
|
"num_input_tokens_seen": 259792,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 0.8505263157894737,
|
|
"grad_norm": 6.8055572509765625,
|
|
"learning_rate": 2.781410180577157e-06,
|
|
"loss": 3.3239,
|
|
"num_input_tokens_seen": 260448,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.8526315789473684,
|
|
"grad_norm": 6.6297993659973145,
|
|
"learning_rate": 2.706100863843822e-06,
|
|
"loss": 2.6339,
|
|
"num_input_tokens_seen": 261168,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 0.8547368421052631,
|
|
"grad_norm": 7.018527030944824,
|
|
"learning_rate": 2.6317667544809134e-06,
|
|
"loss": 3.0074,
|
|
"num_input_tokens_seen": 261872,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.8568421052631578,
|
|
"grad_norm": 10.741174697875977,
|
|
"learning_rate": 2.558411104103198e-06,
|
|
"loss": 3.0096,
|
|
"num_input_tokens_seen": 262416,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 0.8589473684210527,
|
|
"grad_norm": 4.673122406005859,
|
|
"learning_rate": 2.4860371215244484e-06,
|
|
"loss": 3.1151,
|
|
"num_input_tokens_seen": 263040,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.8610526315789474,
|
|
"grad_norm": 11.751906394958496,
|
|
"learning_rate": 2.414647972617129e-06,
|
|
"loss": 2.9073,
|
|
"num_input_tokens_seen": 263584,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 0.8631578947368421,
|
|
"grad_norm": 7.23284912109375,
|
|
"learning_rate": 2.3442467801738863e-06,
|
|
"loss": 2.8287,
|
|
"num_input_tokens_seen": 264176,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.8652631578947368,
|
|
"grad_norm": 5.624141693115234,
|
|
"learning_rate": 2.2748366237709374e-06,
|
|
"loss": 3.2398,
|
|
"num_input_tokens_seen": 264800,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 0.8673684210526316,
|
|
"grad_norm": 6.689244270324707,
|
|
"learning_rate": 2.2064205396333886e-06,
|
|
"loss": 3.0842,
|
|
"num_input_tokens_seen": 265424,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.8694736842105263,
|
|
"grad_norm": 4.412909984588623,
|
|
"learning_rate": 2.13900152050239e-06,
|
|
"loss": 2.7342,
|
|
"num_input_tokens_seen": 266048,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 0.871578947368421,
|
|
"grad_norm": 7.338773727416992,
|
|
"learning_rate": 2.072582515504254e-06,
|
|
"loss": 3.8273,
|
|
"num_input_tokens_seen": 266608,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.8736842105263158,
|
|
"grad_norm": 9.580443382263184,
|
|
"learning_rate": 2.007166430021415e-06,
|
|
"loss": 3.3556,
|
|
"num_input_tokens_seen": 267184,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 0.8757894736842106,
|
|
"grad_norm": 7.67100715637207,
|
|
"learning_rate": 1.9427561255653816e-06,
|
|
"loss": 3.136,
|
|
"num_input_tokens_seen": 267840,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.8778947368421053,
|
|
"grad_norm": 9.706231117248535,
|
|
"learning_rate": 1.87935441965153e-06,
|
|
"loss": 3.18,
|
|
"num_input_tokens_seen": 268336,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 10.20802116394043,
|
|
"learning_rate": 1.8169640856758651e-06,
|
|
"loss": 2.7568,
|
|
"num_input_tokens_seen": 268864,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.8821052631578947,
|
|
"grad_norm": 7.580144882202148,
|
|
"learning_rate": 1.7555878527937164e-06,
|
|
"loss": 3.4589,
|
|
"num_input_tokens_seen": 269408,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 0.8842105263157894,
|
|
"grad_norm": 4.091537952423096,
|
|
"learning_rate": 1.6952284058003366e-06,
|
|
"loss": 2.8533,
|
|
"num_input_tokens_seen": 270112,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.8842105263157894,
|
|
"eval_loss": 3.016836404800415,
|
|
"eval_runtime": 20.1028,
|
|
"eval_samples_per_second": 24.872,
|
|
"eval_steps_per_second": 12.436,
|
|
"num_input_tokens_seen": 270112,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.8863157894736842,
|
|
"grad_norm": 7.614329814910889,
|
|
"learning_rate": 1.6358883850134816e-06,
|
|
"loss": 2.816,
|
|
"num_input_tokens_seen": 270848,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 0.888421052631579,
|
|
"grad_norm": 4.826380729675293,
|
|
"learning_rate": 1.5775703861578866e-06,
|
|
"loss": 3.0902,
|
|
"num_input_tokens_seen": 271472,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.8905263157894737,
|
|
"grad_norm": 6.646672248840332,
|
|
"learning_rate": 1.5202769602517515e-06,
|
|
"loss": 2.4841,
|
|
"num_input_tokens_seen": 272416,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 0.8926315789473684,
|
|
"grad_norm": 4.4659528732299805,
|
|
"learning_rate": 1.4640106134951316e-06,
|
|
"loss": 2.9784,
|
|
"num_input_tokens_seen": 273056,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.8947368421052632,
|
|
"grad_norm": 10.427908897399902,
|
|
"learning_rate": 1.4087738071603075e-06,
|
|
"loss": 3.1002,
|
|
"num_input_tokens_seen": 273680,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 0.8968421052631579,
|
|
"grad_norm": 9.523606300354004,
|
|
"learning_rate": 1.3545689574841342e-06,
|
|
"loss": 3.1575,
|
|
"num_input_tokens_seen": 274256,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.8989473684210526,
|
|
"grad_norm": 10.486383438110352,
|
|
"learning_rate": 1.3013984355623315e-06,
|
|
"loss": 3.2515,
|
|
"num_input_tokens_seen": 274864,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 0.9010526315789473,
|
|
"grad_norm": 7.348026752471924,
|
|
"learning_rate": 1.2492645672457837e-06,
|
|
"loss": 2.6269,
|
|
"num_input_tokens_seen": 275488,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.9031578947368422,
|
|
"grad_norm": 4.190880298614502,
|
|
"learning_rate": 1.1981696330387787e-06,
|
|
"loss": 2.8124,
|
|
"num_input_tokens_seen": 276112,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 0.9052631578947369,
|
|
"grad_norm": 6.005728721618652,
|
|
"learning_rate": 1.1481158679992555e-06,
|
|
"loss": 2.5941,
|
|
"num_input_tokens_seen": 276768,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.9073684210526316,
|
|
"grad_norm": 8.46441650390625,
|
|
"learning_rate": 1.0991054616410589e-06,
|
|
"loss": 2.898,
|
|
"num_input_tokens_seen": 277344,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 0.9094736842105263,
|
|
"grad_norm": 7.817598819732666,
|
|
"learning_rate": 1.051140557838129e-06,
|
|
"loss": 2.9216,
|
|
"num_input_tokens_seen": 277936,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.911578947368421,
|
|
"grad_norm": 4.694936752319336,
|
|
"learning_rate": 1.004223254730749e-06,
|
|
"loss": 2.9048,
|
|
"num_input_tokens_seen": 278480,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 0.9136842105263158,
|
|
"grad_norm": 11.393693923950195,
|
|
"learning_rate": 9.5835560463374e-07,
|
|
"loss": 3.0062,
|
|
"num_input_tokens_seen": 279104,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.9157894736842105,
|
|
"grad_norm": 6.442198753356934,
|
|
"learning_rate": 9.135396139467151e-07,
|
|
"loss": 3.1778,
|
|
"num_input_tokens_seen": 279680,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 0.9178947368421052,
|
|
"grad_norm": 7.988010406494141,
|
|
"learning_rate": 8.697772430662859e-07,
|
|
"loss": 3.4734,
|
|
"num_input_tokens_seen": 280320,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 6.969496250152588,
|
|
"learning_rate": 8.270704063003232e-07,
|
|
"loss": 3.112,
|
|
"num_input_tokens_seen": 280944,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 0.9221052631578948,
|
|
"grad_norm": 9.206937789916992,
|
|
"learning_rate": 7.854209717842231e-07,
|
|
"loss": 3.3519,
|
|
"num_input_tokens_seen": 281456,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.9242105263157895,
|
|
"grad_norm": 12.280099868774414,
|
|
"learning_rate": 7.448307613991734e-07,
|
|
"loss": 3.118,
|
|
"num_input_tokens_seen": 282032,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 0.9263157894736842,
|
|
"grad_norm": 8.12384033203125,
|
|
"learning_rate": 7.053015506924748e-07,
|
|
"loss": 3.6021,
|
|
"num_input_tokens_seen": 282512,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.9263157894736842,
|
|
"eval_loss": 3.0150835514068604,
|
|
"eval_runtime": 20.7343,
|
|
"eval_samples_per_second": 24.115,
|
|
"eval_steps_per_second": 12.057,
|
|
"num_input_tokens_seen": 282512,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.9284210526315789,
|
|
"grad_norm": 6.287796497344971,
|
|
"learning_rate": 6.668350687998565e-07,
|
|
"loss": 3.0778,
|
|
"num_input_tokens_seen": 283152,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 0.9305263157894736,
|
|
"grad_norm": 3.7126476764678955,
|
|
"learning_rate": 6.2943299836985e-07,
|
|
"loss": 2.407,
|
|
"num_input_tokens_seen": 283888,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.9326315789473684,
|
|
"grad_norm": 8.16634750366211,
|
|
"learning_rate": 5.930969754901843e-07,
|
|
"loss": 2.6046,
|
|
"num_input_tokens_seen": 284544,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 0.9347368421052632,
|
|
"grad_norm": 9.50013542175293,
|
|
"learning_rate": 5.578285896162106e-07,
|
|
"loss": 2.7371,
|
|
"num_input_tokens_seen": 285232,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.9368421052631579,
|
|
"grad_norm": 10.298531532287598,
|
|
"learning_rate": 5.236293835013839e-07,
|
|
"loss": 2.9548,
|
|
"num_input_tokens_seen": 285808,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 0.9389473684210526,
|
|
"grad_norm": 4.417308807373047,
|
|
"learning_rate": 4.905008531297661e-07,
|
|
"loss": 2.6968,
|
|
"num_input_tokens_seen": 286448,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.9410526315789474,
|
|
"grad_norm": 9.6083402633667,
|
|
"learning_rate": 4.5844444765059945e-07,
|
|
"loss": 3.5563,
|
|
"num_input_tokens_seen": 286960,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 0.9431578947368421,
|
|
"grad_norm": 2.603585958480835,
|
|
"learning_rate": 4.2746156931490754e-07,
|
|
"loss": 2.1315,
|
|
"num_input_tokens_seen": 287728,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.9452631578947368,
|
|
"grad_norm": 7.709352016448975,
|
|
"learning_rate": 3.9755357341415835e-07,
|
|
"loss": 2.559,
|
|
"num_input_tokens_seen": 288496,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 0.9473684210526315,
|
|
"grad_norm": 12.336277961730957,
|
|
"learning_rate": 3.687217682209837e-07,
|
|
"loss": 3.3418,
|
|
"num_input_tokens_seen": 289072,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.9494736842105264,
|
|
"grad_norm": 6.97348165512085,
|
|
"learning_rate": 3.4096741493194197e-07,
|
|
"loss": 2.5656,
|
|
"num_input_tokens_seen": 290000,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 0.9515789473684211,
|
|
"grad_norm": 4.049922466278076,
|
|
"learning_rate": 3.142917276123564e-07,
|
|
"loss": 2.9834,
|
|
"num_input_tokens_seen": 290672,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.9536842105263158,
|
|
"grad_norm": 7.198146820068359,
|
|
"learning_rate": 2.886958731432132e-07,
|
|
"loss": 2.6608,
|
|
"num_input_tokens_seen": 291520,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 0.9557894736842105,
|
|
"grad_norm": 10.9363374710083,
|
|
"learning_rate": 2.641809711700999e-07,
|
|
"loss": 2.6429,
|
|
"num_input_tokens_seen": 292192,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.9578947368421052,
|
|
"grad_norm": 7.181972026824951,
|
|
"learning_rate": 2.4074809405425225e-07,
|
|
"loss": 2.8782,
|
|
"num_input_tokens_seen": 292880,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 8.074187278747559,
|
|
"learning_rate": 2.1839826682562015e-07,
|
|
"loss": 3.6088,
|
|
"num_input_tokens_seen": 293488,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.9621052631578947,
|
|
"grad_norm": 9.010050773620605,
|
|
"learning_rate": 1.9713246713805588e-07,
|
|
"loss": 3.6666,
|
|
"num_input_tokens_seen": 294048,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 0.9642105263157895,
|
|
"grad_norm": 7.33037805557251,
|
|
"learning_rate": 1.7695162522652353e-07,
|
|
"loss": 3.5345,
|
|
"num_input_tokens_seen": 294624,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.9663157894736842,
|
|
"grad_norm": 11.428391456604004,
|
|
"learning_rate": 1.578566238664314e-07,
|
|
"loss": 3.5267,
|
|
"num_input_tokens_seen": 295136,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 0.968421052631579,
|
|
"grad_norm": 8.888762474060059,
|
|
"learning_rate": 1.3984829833499636e-07,
|
|
"loss": 3.2839,
|
|
"num_input_tokens_seen": 295680,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.968421052631579,
|
|
"eval_loss": 3.0145645141601562,
|
|
"eval_runtime": 19.0351,
|
|
"eval_samples_per_second": 26.267,
|
|
"eval_steps_per_second": 13.134,
|
|
"num_input_tokens_seen": 295680,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.9705263157894737,
|
|
"grad_norm": 8.029891014099121,
|
|
"learning_rate": 1.229274363747146e-07,
|
|
"loss": 2.6669,
|
|
"num_input_tokens_seen": 296320,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 0.9726315789473684,
|
|
"grad_norm": 7.007322788238525,
|
|
"learning_rate": 1.0709477815890601e-07,
|
|
"loss": 3.453,
|
|
"num_input_tokens_seen": 296864,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.9747368421052631,
|
|
"grad_norm": 8.719565391540527,
|
|
"learning_rate": 9.235101625932885e-08,
|
|
"loss": 3.5526,
|
|
"num_input_tokens_seen": 297456,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 0.9768421052631578,
|
|
"grad_norm": 10.41721248626709,
|
|
"learning_rate": 7.869679561589293e-08,
|
|
"loss": 3.2058,
|
|
"num_input_tokens_seen": 298000,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.9789473684210527,
|
|
"grad_norm": 8.100107192993164,
|
|
"learning_rate": 6.613271350844608e-08,
|
|
"loss": 3.3422,
|
|
"num_input_tokens_seen": 298576,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 0.9810526315789474,
|
|
"grad_norm": 5.350499629974365,
|
|
"learning_rate": 5.4659319530636633e-08,
|
|
"loss": 1.9689,
|
|
"num_input_tokens_seen": 299408,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.9831578947368421,
|
|
"grad_norm": 7.671536445617676,
|
|
"learning_rate": 4.427711556588832e-08,
|
|
"loss": 3.3022,
|
|
"num_input_tokens_seen": 299936,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 0.9852631578947368,
|
|
"grad_norm": 8.892446517944336,
|
|
"learning_rate": 3.4986555765434413e-08,
|
|
"loss": 3.0765,
|
|
"num_input_tokens_seen": 300432,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.9873684210526316,
|
|
"grad_norm": 7.4744415283203125,
|
|
"learning_rate": 2.6788046528461453e-08,
|
|
"loss": 3.8128,
|
|
"num_input_tokens_seen": 301072,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 0.9894736842105263,
|
|
"grad_norm": 7.746840476989746,
|
|
"learning_rate": 1.9681946484320644e-08,
|
|
"loss": 3.1598,
|
|
"num_input_tokens_seen": 301664,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.991578947368421,
|
|
"grad_norm": 7.51800012588501,
|
|
"learning_rate": 1.3668566476848777e-08,
|
|
"loss": 3.4258,
|
|
"num_input_tokens_seen": 302192,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 0.9936842105263158,
|
|
"grad_norm": 14.097600936889648,
|
|
"learning_rate": 8.74816955076796e-09,
|
|
"loss": 3.2759,
|
|
"num_input_tokens_seen": 302720,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.9957894736842106,
|
|
"grad_norm": 7.83034610748291,
|
|
"learning_rate": 4.920970940180958e-09,
|
|
"loss": 2.9216,
|
|
"num_input_tokens_seen": 303328,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 0.9978947368421053,
|
|
"grad_norm": 3.084580421447754,
|
|
"learning_rate": 2.1871380591509392e-09,
|
|
"loss": 2.5628,
|
|
"num_input_tokens_seen": 303936,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 10.095595359802246,
|
|
"learning_rate": 5.467904943851077e-10,
|
|
"loss": 2.9564,
|
|
"num_input_tokens_seen": 304528,
|
|
"step": 2375
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2375,
|
|
"num_input_tokens_seen": 304528,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 100,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5097324650299392.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|