|
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 10.0,
|
|
"eval_steps": 500,
|
|
"global_step": 80,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.125,
|
|
"grad_norm": 52.583274841308594,
|
|
"learning_rate": 1.57035175879397e-09,
|
|
"loss": 4.4832,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 52.1717414855957,
|
|
"learning_rate": 3.14070351758794e-09,
|
|
"loss": 4.4526,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.375,
|
|
"grad_norm": 51.840389251708984,
|
|
"learning_rate": 4.71105527638191e-09,
|
|
"loss": 4.4294,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 52.11858367919922,
|
|
"learning_rate": 6.28140703517588e-09,
|
|
"loss": 4.4392,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.625,
|
|
"grad_norm": 51.991600036621094,
|
|
"learning_rate": 7.85175879396985e-09,
|
|
"loss": 4.4464,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 52.67415237426758,
|
|
"learning_rate": 9.42211055276382e-09,
|
|
"loss": 4.4485,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.875,
|
|
"grad_norm": 52.06214904785156,
|
|
"learning_rate": 1.099246231155779e-08,
|
|
"loss": 4.445,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 52.24057388305664,
|
|
"learning_rate": 1.256281407035176e-08,
|
|
"loss": 4.4631,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_loss": 4.591891765594482,
|
|
"eval_runtime": 48.4152,
|
|
"eval_samples_per_second": 28.214,
|
|
"eval_steps_per_second": 0.289,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 1.125,
|
|
"grad_norm": 52.291419982910156,
|
|
"learning_rate": 1.413316582914573e-08,
|
|
"loss": 4.4435,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 1.25,
|
|
"grad_norm": 51.911685943603516,
|
|
"learning_rate": 1.57035175879397e-08,
|
|
"loss": 4.4448,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 1.375,
|
|
"grad_norm": 52.54676055908203,
|
|
"learning_rate": 1.7273869346733672e-08,
|
|
"loss": 4.4468,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 1.5,
|
|
"grad_norm": 52.45462417602539,
|
|
"learning_rate": 1.884422110552764e-08,
|
|
"loss": 4.4229,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 1.625,
|
|
"grad_norm": 52.290565490722656,
|
|
"learning_rate": 2.041457286432161e-08,
|
|
"loss": 4.4212,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 1.75,
|
|
"grad_norm": 51.737483978271484,
|
|
"learning_rate": 2.198492462311558e-08,
|
|
"loss": 4.4379,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 1.875,
|
|
"grad_norm": 52.21713638305664,
|
|
"learning_rate": 2.3555276381909547e-08,
|
|
"loss": 4.426,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 52.21146011352539,
|
|
"learning_rate": 2.512562814070352e-08,
|
|
"loss": 4.4206,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 4.538872241973877,
|
|
"eval_runtime": 45.8583,
|
|
"eval_samples_per_second": 29.787,
|
|
"eval_steps_per_second": 0.305,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 2.125,
|
|
"grad_norm": 52.31325149536133,
|
|
"learning_rate": 2.669597989949749e-08,
|
|
"loss": 4.4286,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 2.25,
|
|
"grad_norm": 52.39244842529297,
|
|
"learning_rate": 2.826633165829146e-08,
|
|
"loss": 4.4363,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 2.375,
|
|
"grad_norm": 51.97914123535156,
|
|
"learning_rate": 2.983668341708543e-08,
|
|
"loss": 4.4068,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 52.424015045166016,
|
|
"learning_rate": 3.14070351758794e-08,
|
|
"loss": 4.4079,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 2.625,
|
|
"grad_norm": 51.720726013183594,
|
|
"learning_rate": 3.297738693467337e-08,
|
|
"loss": 4.4012,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 2.75,
|
|
"grad_norm": 51.92622375488281,
|
|
"learning_rate": 3.4547738693467345e-08,
|
|
"loss": 4.3656,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 2.875,
|
|
"grad_norm": 51.43586730957031,
|
|
"learning_rate": 3.611809045226131e-08,
|
|
"loss": 4.3986,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 52.25431823730469,
|
|
"learning_rate": 3.768844221105528e-08,
|
|
"loss": 4.3742,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"eval_loss": 4.466385841369629,
|
|
"eval_runtime": 43.0073,
|
|
"eval_samples_per_second": 31.762,
|
|
"eval_steps_per_second": 0.326,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 3.125,
|
|
"grad_norm": 52.38350296020508,
|
|
"learning_rate": 3.925879396984925e-08,
|
|
"loss": 4.3569,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 3.25,
|
|
"grad_norm": 52.09661865234375,
|
|
"learning_rate": 4.082914572864322e-08,
|
|
"loss": 4.3329,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 3.375,
|
|
"grad_norm": 52.156551361083984,
|
|
"learning_rate": 4.239949748743719e-08,
|
|
"loss": 4.3611,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 3.5,
|
|
"grad_norm": 51.66514587402344,
|
|
"learning_rate": 4.396984924623116e-08,
|
|
"loss": 4.3414,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 3.625,
|
|
"grad_norm": 51.9528694152832,
|
|
"learning_rate": 4.5540201005025126e-08,
|
|
"loss": 4.3208,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 3.75,
|
|
"grad_norm": 51.63153839111328,
|
|
"learning_rate": 4.7110552763819094e-08,
|
|
"loss": 4.3323,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 3.875,
|
|
"grad_norm": 51.82636642456055,
|
|
"learning_rate": 4.8680904522613075e-08,
|
|
"loss": 4.2969,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 50.99885177612305,
|
|
"learning_rate": 5.025125628140704e-08,
|
|
"loss": 4.2848,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"eval_loss": 4.35823392868042,
|
|
"eval_runtime": 42.4893,
|
|
"eval_samples_per_second": 32.149,
|
|
"eval_steps_per_second": 0.329,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 4.125,
|
|
"grad_norm": 51.07362365722656,
|
|
"learning_rate": 5.182160804020101e-08,
|
|
"loss": 4.2605,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 4.25,
|
|
"grad_norm": 51.94071578979492,
|
|
"learning_rate": 5.339195979899498e-08,
|
|
"loss": 4.2979,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 4.375,
|
|
"grad_norm": 50.928531646728516,
|
|
"learning_rate": 5.496231155778895e-08,
|
|
"loss": 4.2821,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 4.5,
|
|
"grad_norm": 51.46839141845703,
|
|
"learning_rate": 5.653266331658292e-08,
|
|
"loss": 4.2471,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 4.625,
|
|
"grad_norm": 51.3604621887207,
|
|
"learning_rate": 5.810301507537689e-08,
|
|
"loss": 4.2608,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 4.75,
|
|
"grad_norm": 51.4367561340332,
|
|
"learning_rate": 5.967336683417086e-08,
|
|
"loss": 4.24,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 4.875,
|
|
"grad_norm": 51.37506103515625,
|
|
"learning_rate": 6.124371859296483e-08,
|
|
"loss": 4.2268,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 51.0534782409668,
|
|
"learning_rate": 6.28140703517588e-08,
|
|
"loss": 4.1992,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"eval_loss": 4.225412845611572,
|
|
"eval_runtime": 62.0906,
|
|
"eval_samples_per_second": 22.0,
|
|
"eval_steps_per_second": 0.225,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 5.125,
|
|
"grad_norm": 50.854610443115234,
|
|
"learning_rate": 6.438442211055277e-08,
|
|
"loss": 4.1707,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 5.25,
|
|
"grad_norm": 51.0251579284668,
|
|
"learning_rate": 6.595477386934674e-08,
|
|
"loss": 4.1889,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 5.375,
|
|
"grad_norm": 51.27962875366211,
|
|
"learning_rate": 6.75251256281407e-08,
|
|
"loss": 4.1892,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 5.5,
|
|
"grad_norm": 50.632904052734375,
|
|
"learning_rate": 6.909547738693469e-08,
|
|
"loss": 4.1684,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 5.625,
|
|
"grad_norm": 51.070587158203125,
|
|
"learning_rate": 7.066582914572865e-08,
|
|
"loss": 4.154,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 5.75,
|
|
"grad_norm": 50.42203903198242,
|
|
"learning_rate": 7.223618090452263e-08,
|
|
"loss": 4.1407,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 5.875,
|
|
"grad_norm": 50.98355484008789,
|
|
"learning_rate": 7.380653266331659e-08,
|
|
"loss": 4.129,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"grad_norm": 50.823097229003906,
|
|
"learning_rate": 7.537688442211056e-08,
|
|
"loss": 4.1172,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"eval_loss": 4.049016952514648,
|
|
"eval_runtime": 46.6978,
|
|
"eval_samples_per_second": 29.252,
|
|
"eval_steps_per_second": 0.3,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 6.125,
|
|
"grad_norm": 51.02920150756836,
|
|
"learning_rate": 7.694723618090454e-08,
|
|
"loss": 4.1099,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 6.25,
|
|
"grad_norm": 50.04145812988281,
|
|
"learning_rate": 7.85175879396985e-08,
|
|
"loss": 4.076,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 6.375,
|
|
"grad_norm": 50.59613037109375,
|
|
"learning_rate": 8.008793969849247e-08,
|
|
"loss": 4.0718,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 6.5,
|
|
"grad_norm": 50.145957946777344,
|
|
"learning_rate": 8.165829145728645e-08,
|
|
"loss": 4.0557,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 6.625,
|
|
"grad_norm": 49.8894157409668,
|
|
"learning_rate": 8.322864321608041e-08,
|
|
"loss": 4.0178,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 6.75,
|
|
"grad_norm": 49.86162185668945,
|
|
"learning_rate": 8.479899497487438e-08,
|
|
"loss": 4.0299,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 6.875,
|
|
"grad_norm": 50.02029800415039,
|
|
"learning_rate": 8.636934673366834e-08,
|
|
"loss": 3.9924,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"grad_norm": 49.66703414916992,
|
|
"learning_rate": 8.793969849246232e-08,
|
|
"loss": 3.9685,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"eval_loss": 3.8612220287323,
|
|
"eval_runtime": 46.1098,
|
|
"eval_samples_per_second": 29.625,
|
|
"eval_steps_per_second": 0.304,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 7.125,
|
|
"grad_norm": 49.38985824584961,
|
|
"learning_rate": 8.951005025125629e-08,
|
|
"loss": 3.9575,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 7.25,
|
|
"grad_norm": 49.38282012939453,
|
|
"learning_rate": 9.108040201005025e-08,
|
|
"loss": 3.9457,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 7.375,
|
|
"grad_norm": 49.578147888183594,
|
|
"learning_rate": 9.265075376884423e-08,
|
|
"loss": 3.9147,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 7.5,
|
|
"grad_norm": 49.263980865478516,
|
|
"learning_rate": 9.422110552763819e-08,
|
|
"loss": 3.9124,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 7.625,
|
|
"grad_norm": 48.740360260009766,
|
|
"learning_rate": 9.579145728643216e-08,
|
|
"loss": 3.9056,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 7.75,
|
|
"grad_norm": 49.30016326904297,
|
|
"learning_rate": 9.736180904522615e-08,
|
|
"loss": 3.8962,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 7.875,
|
|
"grad_norm": 48.78429412841797,
|
|
"learning_rate": 9.89321608040201e-08,
|
|
"loss": 3.8614,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"grad_norm": 48.219146728515625,
|
|
"learning_rate": 1.0050251256281409e-07,
|
|
"loss": 3.8165,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"eval_loss": 3.6316096782684326,
|
|
"eval_runtime": 45.5729,
|
|
"eval_samples_per_second": 29.974,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 8.125,
|
|
"grad_norm": 48.37006759643555,
|
|
"learning_rate": 1.0207286432160806e-07,
|
|
"loss": 3.8137,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 8.25,
|
|
"grad_norm": 47.9218635559082,
|
|
"learning_rate": 1.0364321608040202e-07,
|
|
"loss": 3.7862,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 8.375,
|
|
"grad_norm": 48.484378814697266,
|
|
"learning_rate": 1.05213567839196e-07,
|
|
"loss": 3.789,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 8.5,
|
|
"grad_norm": 48.121212005615234,
|
|
"learning_rate": 1.0678391959798996e-07,
|
|
"loss": 3.7609,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 8.625,
|
|
"grad_norm": 48.182281494140625,
|
|
"learning_rate": 1.0835427135678393e-07,
|
|
"loss": 3.7594,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 8.75,
|
|
"grad_norm": 48.09780502319336,
|
|
"learning_rate": 1.099246231155779e-07,
|
|
"loss": 3.6978,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 8.875,
|
|
"grad_norm": 47.829345703125,
|
|
"learning_rate": 1.1149497487437187e-07,
|
|
"loss": 3.7101,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"grad_norm": 47.34735107421875,
|
|
"learning_rate": 1.1306532663316584e-07,
|
|
"loss": 3.6672,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"eval_loss": 3.3924500942230225,
|
|
"eval_runtime": 42.4081,
|
|
"eval_samples_per_second": 32.211,
|
|
"eval_steps_per_second": 0.33,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 9.125,
|
|
"grad_norm": 47.713768005371094,
|
|
"learning_rate": 1.146356783919598e-07,
|
|
"loss": 3.6657,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 9.25,
|
|
"grad_norm": 48.04665756225586,
|
|
"learning_rate": 1.1620603015075378e-07,
|
|
"loss": 3.635,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 9.375,
|
|
"grad_norm": 47.147972106933594,
|
|
"learning_rate": 1.1777638190954775e-07,
|
|
"loss": 3.6149,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 9.5,
|
|
"grad_norm": 47.14889144897461,
|
|
"learning_rate": 1.193467336683417e-07,
|
|
"loss": 3.5891,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 9.625,
|
|
"grad_norm": 47.27253723144531,
|
|
"learning_rate": 1.209170854271357e-07,
|
|
"loss": 3.56,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 9.75,
|
|
"grad_norm": 47.30030822753906,
|
|
"learning_rate": 1.2248743718592966e-07,
|
|
"loss": 3.554,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 9.875,
|
|
"grad_norm": 46.53535461425781,
|
|
"learning_rate": 1.2405778894472362e-07,
|
|
"loss": 3.5081,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"grad_norm": 46.524356842041016,
|
|
"learning_rate": 1.256281407035176e-07,
|
|
"loss": 3.4906,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"eval_loss": 3.130697011947632,
|
|
"eval_runtime": 44.7299,
|
|
"eval_samples_per_second": 30.539,
|
|
"eval_steps_per_second": 0.313,
|
|
"step": 80
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 80,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 100,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|