{
"best_metric": 0.4131671190261841,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.005134458635517617,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.5672293177588086e-05,
"grad_norm": 1.026159644126892,
"learning_rate": 1.0060000000000002e-05,
"loss": 1.2569,
"step": 1
},
{
"epoch": 2.5672293177588086e-05,
"eval_loss": 1.1921963691711426,
"eval_runtime": 1183.8788,
"eval_samples_per_second": 13.854,
"eval_steps_per_second": 3.464,
"step": 1
},
{
"epoch": 5.134458635517617e-05,
"grad_norm": 1.255319595336914,
"learning_rate": 2.0120000000000004e-05,
"loss": 1.2297,
"step": 2
},
{
"epoch": 7.701687953276427e-05,
"grad_norm": 1.6948285102844238,
"learning_rate": 3.018e-05,
"loss": 1.6159,
"step": 3
},
{
"epoch": 0.00010268917271035235,
"grad_norm": 1.570288062095642,
"learning_rate": 4.024000000000001e-05,
"loss": 1.5701,
"step": 4
},
{
"epoch": 0.00012836146588794044,
"grad_norm": 1.4998438358306885,
"learning_rate": 5.03e-05,
"loss": 1.7115,
"step": 5
},
{
"epoch": 0.00015403375906552854,
"grad_norm": 1.805379867553711,
"learning_rate": 6.036e-05,
"loss": 1.7774,
"step": 6
},
{
"epoch": 0.00017970605224311662,
"grad_norm": 2.1143808364868164,
"learning_rate": 7.042e-05,
"loss": 2.1946,
"step": 7
},
{
"epoch": 0.0002053783454207047,
"grad_norm": 2.083022356033325,
"learning_rate": 8.048000000000002e-05,
"loss": 1.9931,
"step": 8
},
{
"epoch": 0.0002310506385982928,
"grad_norm": 2.2490415573120117,
"learning_rate": 9.054000000000001e-05,
"loss": 2.7258,
"step": 9
},
{
"epoch": 0.00025672293177588087,
"grad_norm": 1.9612363576889038,
"learning_rate": 0.0001006,
"loss": 1.7879,
"step": 10
},
{
"epoch": 0.00028239522495346895,
"grad_norm": 2.7282867431640625,
"learning_rate": 0.00010007052631578948,
"loss": 2.3426,
"step": 11
},
{
"epoch": 0.0003080675181310571,
"grad_norm": 2.3359768390655518,
"learning_rate": 9.954105263157895e-05,
"loss": 1.8971,
"step": 12
},
{
"epoch": 0.00033373981130864515,
"grad_norm": 2.0164449214935303,
"learning_rate": 9.901157894736842e-05,
"loss": 1.8824,
"step": 13
},
{
"epoch": 0.00035941210448623323,
"grad_norm": 2.7772645950317383,
"learning_rate": 9.84821052631579e-05,
"loss": 2.6385,
"step": 14
},
{
"epoch": 0.0003850843976638213,
"grad_norm": 2.503121852874756,
"learning_rate": 9.795263157894737e-05,
"loss": 1.9396,
"step": 15
},
{
"epoch": 0.0004107566908414094,
"grad_norm": 2.5371038913726807,
"learning_rate": 9.742315789473686e-05,
"loss": 2.1417,
"step": 16
},
{
"epoch": 0.0004364289840189975,
"grad_norm": 3.1160120964050293,
"learning_rate": 9.689368421052633e-05,
"loss": 2.4972,
"step": 17
},
{
"epoch": 0.0004621012771965856,
"grad_norm": 3.7234385013580322,
"learning_rate": 9.63642105263158e-05,
"loss": 2.569,
"step": 18
},
{
"epoch": 0.00048777357037417367,
"grad_norm": 3.364891290664673,
"learning_rate": 9.583473684210527e-05,
"loss": 2.4822,
"step": 19
},
{
"epoch": 0.0005134458635517617,
"grad_norm": 9.361184120178223,
"learning_rate": 9.530526315789474e-05,
"loss": 1.3435,
"step": 20
},
{
"epoch": 0.0005391181567293499,
"grad_norm": 5.888495445251465,
"learning_rate": 9.477578947368422e-05,
"loss": 0.8447,
"step": 21
},
{
"epoch": 0.0005647904499069379,
"grad_norm": 4.862771511077881,
"learning_rate": 9.424631578947369e-05,
"loss": 0.4383,
"step": 22
},
{
"epoch": 0.000590462743084526,
"grad_norm": 1.6988191604614258,
"learning_rate": 9.371684210526316e-05,
"loss": 0.1793,
"step": 23
},
{
"epoch": 0.0006161350362621142,
"grad_norm": 0.5420213341712952,
"learning_rate": 9.318736842105263e-05,
"loss": 0.0468,
"step": 24
},
{
"epoch": 0.0006418073294397022,
"grad_norm": 0.26038220524787903,
"learning_rate": 9.26578947368421e-05,
"loss": 0.0137,
"step": 25
},
{
"epoch": 0.0006674796226172903,
"grad_norm": 0.11139990389347076,
"learning_rate": 9.212842105263159e-05,
"loss": 0.004,
"step": 26
},
{
"epoch": 0.0006931519157948783,
"grad_norm": 0.030027125030755997,
"learning_rate": 9.159894736842107e-05,
"loss": 0.0012,
"step": 27
},
{
"epoch": 0.0007188242089724665,
"grad_norm": 0.015038474462926388,
"learning_rate": 9.106947368421054e-05,
"loss": 0.0005,
"step": 28
},
{
"epoch": 0.0007444965021500546,
"grad_norm": 0.008915350772440434,
"learning_rate": 9.054000000000001e-05,
"loss": 0.0002,
"step": 29
},
{
"epoch": 0.0007701687953276426,
"grad_norm": 0.006986498832702637,
"learning_rate": 9.001052631578948e-05,
"loss": 0.0002,
"step": 30
},
{
"epoch": 0.0007958410885052307,
"grad_norm": 0.0063736108131706715,
"learning_rate": 8.948105263157895e-05,
"loss": 0.0001,
"step": 31
},
{
"epoch": 0.0008215133816828188,
"grad_norm": 0.007324839010834694,
"learning_rate": 8.895157894736842e-05,
"loss": 0.0001,
"step": 32
},
{
"epoch": 0.0008471856748604069,
"grad_norm": 0.010746892541646957,
"learning_rate": 8.842210526315789e-05,
"loss": 0.0002,
"step": 33
},
{
"epoch": 0.000872857968037995,
"grad_norm": 0.014329284429550171,
"learning_rate": 8.789263157894738e-05,
"loss": 0.0002,
"step": 34
},
{
"epoch": 0.000898530261215583,
"grad_norm": 0.012248692102730274,
"learning_rate": 8.736315789473685e-05,
"loss": 0.0002,
"step": 35
},
{
"epoch": 0.0009242025543931712,
"grad_norm": 0.009668469429016113,
"learning_rate": 8.683368421052632e-05,
"loss": 0.0001,
"step": 36
},
{
"epoch": 0.0009498748475707593,
"grad_norm": 0.007297456730157137,
"learning_rate": 8.63042105263158e-05,
"loss": 0.0001,
"step": 37
},
{
"epoch": 0.0009755471407483473,
"grad_norm": 0.005695601459592581,
"learning_rate": 8.577473684210527e-05,
"loss": 0.0001,
"step": 38
},
{
"epoch": 0.0010012194339259354,
"grad_norm": 0.004724627826362848,
"learning_rate": 8.524526315789474e-05,
"loss": 0.0001,
"step": 39
},
{
"epoch": 0.0010268917271035235,
"grad_norm": 0.0041098096407949924,
"learning_rate": 8.471578947368421e-05,
"loss": 0.0001,
"step": 40
},
{
"epoch": 0.0010525640202811116,
"grad_norm": 0.003558989381417632,
"learning_rate": 8.41863157894737e-05,
"loss": 0.0001,
"step": 41
},
{
"epoch": 0.0010782363134586997,
"grad_norm": 0.00332538690418005,
"learning_rate": 8.365684210526317e-05,
"loss": 0.0001,
"step": 42
},
{
"epoch": 0.0011039086066362879,
"grad_norm": 0.0027524020988494158,
"learning_rate": 8.312736842105264e-05,
"loss": 0.0001,
"step": 43
},
{
"epoch": 0.0011295808998138758,
"grad_norm": 0.002535044215619564,
"learning_rate": 8.259789473684211e-05,
"loss": 0.0001,
"step": 44
},
{
"epoch": 0.001155253192991464,
"grad_norm": 0.00231631426140666,
"learning_rate": 8.206842105263158e-05,
"loss": 0.0,
"step": 45
},
{
"epoch": 0.001180925486169052,
"grad_norm": 0.0019382525933906436,
"learning_rate": 8.153894736842105e-05,
"loss": 0.0,
"step": 46
},
{
"epoch": 0.0012065977793466402,
"grad_norm": 0.022770335897803307,
"learning_rate": 8.100947368421053e-05,
"loss": 0.0001,
"step": 47
},
{
"epoch": 0.0012322700725242283,
"grad_norm": 0.0018689745338633657,
"learning_rate": 8.048000000000002e-05,
"loss": 0.0,
"step": 48
},
{
"epoch": 0.0012579423657018162,
"grad_norm": 20.690397262573242,
"learning_rate": 7.995052631578949e-05,
"loss": 4.2531,
"step": 49
},
{
"epoch": 0.0012836146588794044,
"grad_norm": 18.80559730529785,
"learning_rate": 7.942105263157896e-05,
"loss": 3.8468,
"step": 50
},
{
"epoch": 0.0012836146588794044,
"eval_loss": 0.5789145231246948,
"eval_runtime": 1182.6905,
"eval_samples_per_second": 13.868,
"eval_steps_per_second": 3.468,
"step": 50
},
{
"epoch": 0.0013092869520569925,
"grad_norm": 2.402365207672119,
"learning_rate": 7.889157894736843e-05,
"loss": 1.2955,
"step": 51
},
{
"epoch": 0.0013349592452345806,
"grad_norm": 2.3085319995880127,
"learning_rate": 7.83621052631579e-05,
"loss": 1.9171,
"step": 52
},
{
"epoch": 0.0013606315384121687,
"grad_norm": 1.7359583377838135,
"learning_rate": 7.783263157894737e-05,
"loss": 1.5831,
"step": 53
},
{
"epoch": 0.0013863038315897567,
"grad_norm": 1.780971884727478,
"learning_rate": 7.730315789473684e-05,
"loss": 1.9551,
"step": 54
},
{
"epoch": 0.0014119761247673448,
"grad_norm": 1.7606279850006104,
"learning_rate": 7.677368421052632e-05,
"loss": 1.7301,
"step": 55
},
{
"epoch": 0.001437648417944933,
"grad_norm": 1.6475731134414673,
"learning_rate": 7.624421052631579e-05,
"loss": 1.8897,
"step": 56
},
{
"epoch": 0.001463320711122521,
"grad_norm": 1.541284203529358,
"learning_rate": 7.571473684210526e-05,
"loss": 1.8631,
"step": 57
},
{
"epoch": 0.0014889930043001092,
"grad_norm": 1.9083011150360107,
"learning_rate": 7.518526315789475e-05,
"loss": 1.9604,
"step": 58
},
{
"epoch": 0.001514665297477697,
"grad_norm": 1.855704665184021,
"learning_rate": 7.465578947368422e-05,
"loss": 2.3663,
"step": 59
},
{
"epoch": 0.0015403375906552852,
"grad_norm": 1.8971003293991089,
"learning_rate": 7.412631578947369e-05,
"loss": 2.0379,
"step": 60
},
{
"epoch": 0.0015660098838328734,
"grad_norm": 1.7835102081298828,
"learning_rate": 7.359684210526317e-05,
"loss": 2.1501,
"step": 61
},
{
"epoch": 0.0015916821770104615,
"grad_norm": 1.6390843391418457,
"learning_rate": 7.306736842105264e-05,
"loss": 2.1538,
"step": 62
},
{
"epoch": 0.0016173544701880496,
"grad_norm": 1.7119994163513184,
"learning_rate": 7.253789473684211e-05,
"loss": 1.7663,
"step": 63
},
{
"epoch": 0.0016430267633656375,
"grad_norm": 1.6038694381713867,
"learning_rate": 7.200842105263158e-05,
"loss": 2.2391,
"step": 64
},
{
"epoch": 0.0016686990565432257,
"grad_norm": 1.7166171073913574,
"learning_rate": 7.147894736842105e-05,
"loss": 2.1004,
"step": 65
},
{
"epoch": 0.0016943713497208138,
"grad_norm": 1.7185120582580566,
"learning_rate": 7.094947368421052e-05,
"loss": 1.732,
"step": 66
},
{
"epoch": 0.001720043642898402,
"grad_norm": 1.7218092679977417,
"learning_rate": 7.042e-05,
"loss": 1.92,
"step": 67
},
{
"epoch": 0.00174571593607599,
"grad_norm": 1.944508671760559,
"learning_rate": 6.989052631578948e-05,
"loss": 1.8669,
"step": 68
},
{
"epoch": 0.001771388229253578,
"grad_norm": 2.296661615371704,
"learning_rate": 6.936105263157896e-05,
"loss": 1.5057,
"step": 69
},
{
"epoch": 0.001797060522431166,
"grad_norm": 2.606893539428711,
"learning_rate": 6.883157894736843e-05,
"loss": 2.3624,
"step": 70
},
{
"epoch": 0.0018227328156087542,
"grad_norm": 5.274344444274902,
"learning_rate": 6.83021052631579e-05,
"loss": 2.6868,
"step": 71
},
{
"epoch": 0.0018484051087863424,
"grad_norm": 2.480908155441284,
"learning_rate": 6.777263157894737e-05,
"loss": 1.9085,
"step": 72
},
{
"epoch": 0.0018740774019639305,
"grad_norm": 1.2263275384902954,
"learning_rate": 6.724315789473684e-05,
"loss": 0.1152,
"step": 73
},
{
"epoch": 0.0018997496951415186,
"grad_norm": 0.09067212790250778,
"learning_rate": 6.671368421052631e-05,
"loss": 0.0036,
"step": 74
},
{
"epoch": 0.0019254219883191065,
"grad_norm": 0.04749782010912895,
"learning_rate": 6.61842105263158e-05,
"loss": 0.002,
"step": 75
},
{
"epoch": 0.0019510942814966947,
"grad_norm": 0.03309526666998863,
"learning_rate": 6.565473684210527e-05,
"loss": 0.0014,
"step": 76
},
{
"epoch": 0.0019767665746742826,
"grad_norm": 0.021400053054094315,
"learning_rate": 6.512526315789474e-05,
"loss": 0.0009,
"step": 77
},
{
"epoch": 0.0020024388678518707,
"grad_norm": 0.017963755875825882,
"learning_rate": 6.459578947368421e-05,
"loss": 0.0008,
"step": 78
},
{
"epoch": 0.002028111161029459,
"grad_norm": 1.1963677406311035,
"learning_rate": 6.406631578947369e-05,
"loss": 0.2937,
"step": 79
},
{
"epoch": 0.002053783454207047,
"grad_norm": 0.010341464541852474,
"learning_rate": 6.353684210526316e-05,
"loss": 0.0005,
"step": 80
},
{
"epoch": 0.002079455747384635,
"grad_norm": 0.00791017897427082,
"learning_rate": 6.300736842105263e-05,
"loss": 0.0004,
"step": 81
},
{
"epoch": 0.0021051280405622232,
"grad_norm": 0.00704535935074091,
"learning_rate": 6.247789473684212e-05,
"loss": 0.0003,
"step": 82
},
{
"epoch": 0.0021308003337398114,
"grad_norm": 0.005926445592194796,
"learning_rate": 6.194842105263159e-05,
"loss": 0.0003,
"step": 83
},
{
"epoch": 0.0021564726269173995,
"grad_norm": 0.005512417294085026,
"learning_rate": 6.141894736842106e-05,
"loss": 0.0003,
"step": 84
},
{
"epoch": 0.0021821449200949876,
"grad_norm": 0.005944954231381416,
"learning_rate": 6.088947368421053e-05,
"loss": 0.0003,
"step": 85
},
{
"epoch": 0.0022078172132725758,
"grad_norm": 0.005636818241328001,
"learning_rate": 6.036e-05,
"loss": 0.0003,
"step": 86
},
{
"epoch": 0.0022334895064501635,
"grad_norm": 0.005355035420507193,
"learning_rate": 5.9830526315789475e-05,
"loss": 0.0003,
"step": 87
},
{
"epoch": 0.0022591617996277516,
"grad_norm": 0.004906694870442152,
"learning_rate": 5.9301052631578946e-05,
"loss": 0.0002,
"step": 88
},
{
"epoch": 0.0022848340928053397,
"grad_norm": 0.00455674109980464,
"learning_rate": 5.877157894736843e-05,
"loss": 0.0002,
"step": 89
},
{
"epoch": 0.002310506385982928,
"grad_norm": 0.004359242040663958,
"learning_rate": 5.82421052631579e-05,
"loss": 0.0002,
"step": 90
},
{
"epoch": 0.002336178679160516,
"grad_norm": 0.004265242256224155,
"learning_rate": 5.771263157894737e-05,
"loss": 0.0002,
"step": 91
},
{
"epoch": 0.002361850972338104,
"grad_norm": 0.00403448985889554,
"learning_rate": 5.718315789473685e-05,
"loss": 0.0002,
"step": 92
},
{
"epoch": 0.0023875232655156922,
"grad_norm": 0.003738979110494256,
"learning_rate": 5.665368421052632e-05,
"loss": 0.0002,
"step": 93
},
{
"epoch": 0.0024131955586932804,
"grad_norm": 0.0035066159907728434,
"learning_rate": 5.612421052631579e-05,
"loss": 0.0002,
"step": 94
},
{
"epoch": 0.0024388678518708685,
"grad_norm": 0.0033452173229306936,
"learning_rate": 5.559473684210527e-05,
"loss": 0.0001,
"step": 95
},
{
"epoch": 0.0024645401450484566,
"grad_norm": 0.0031449589878320694,
"learning_rate": 5.506526315789474e-05,
"loss": 0.0001,
"step": 96
},
{
"epoch": 0.0024902124382260448,
"grad_norm": 0.8770474791526794,
"learning_rate": 5.453578947368421e-05,
"loss": 0.1372,
"step": 97
},
{
"epoch": 0.0025158847314036325,
"grad_norm": 0.006977991200983524,
"learning_rate": 5.400631578947369e-05,
"loss": 0.0002,
"step": 98
},
{
"epoch": 0.0025415570245812206,
"grad_norm": 2.6762304306030273,
"learning_rate": 5.347684210526316e-05,
"loss": 1.5861,
"step": 99
},
{
"epoch": 0.0025672293177588087,
"grad_norm": 6.186676502227783,
"learning_rate": 5.294736842105263e-05,
"loss": 3.1351,
"step": 100
},
{
"epoch": 0.0025672293177588087,
"eval_loss": 0.444609671831131,
"eval_runtime": 1180.4892,
"eval_samples_per_second": 13.894,
"eval_steps_per_second": 3.474,
"step": 100
},
{
"epoch": 0.002592901610936397,
"grad_norm": 1.0906559228897095,
"learning_rate": 5.24178947368421e-05,
"loss": 1.5745,
"step": 101
},
{
"epoch": 0.002618573904113985,
"grad_norm": 1.057900071144104,
"learning_rate": 5.1888421052631585e-05,
"loss": 1.359,
"step": 102
},
{
"epoch": 0.002644246197291573,
"grad_norm": 1.1936546564102173,
"learning_rate": 5.135894736842106e-05,
"loss": 1.8506,
"step": 103
},
{
"epoch": 0.0026699184904691612,
"grad_norm": 1.1742337942123413,
"learning_rate": 5.082947368421053e-05,
"loss": 1.4235,
"step": 104
},
{
"epoch": 0.0026955907836467494,
"grad_norm": 1.4636642932891846,
"learning_rate": 5.03e-05,
"loss": 1.9341,
"step": 105
},
{
"epoch": 0.0027212630768243375,
"grad_norm": 1.366550087928772,
"learning_rate": 4.977052631578947e-05,
"loss": 1.7735,
"step": 106
},
{
"epoch": 0.0027469353700019256,
"grad_norm": 1.4252630472183228,
"learning_rate": 4.924105263157895e-05,
"loss": 1.8694,
"step": 107
},
{
"epoch": 0.0027726076631795133,
"grad_norm": 1.5163335800170898,
"learning_rate": 4.871157894736843e-05,
"loss": 1.8035,
"step": 108
},
{
"epoch": 0.0027982799563571015,
"grad_norm": 1.6696662902832031,
"learning_rate": 4.81821052631579e-05,
"loss": 2.4422,
"step": 109
},
{
"epoch": 0.0028239522495346896,
"grad_norm": 1.823583960533142,
"learning_rate": 4.765263157894737e-05,
"loss": 2.0223,
"step": 110
},
{
"epoch": 0.0028496245427122777,
"grad_norm": 1.7153486013412476,
"learning_rate": 4.7123157894736845e-05,
"loss": 2.3425,
"step": 111
},
{
"epoch": 0.002875296835889866,
"grad_norm": 1.597476601600647,
"learning_rate": 4.6593684210526316e-05,
"loss": 1.9461,
"step": 112
},
{
"epoch": 0.002900969129067454,
"grad_norm": 1.6382873058319092,
"learning_rate": 4.606421052631579e-05,
"loss": 1.7633,
"step": 113
},
{
"epoch": 0.002926641422245042,
"grad_norm": 1.7374053001403809,
"learning_rate": 4.553473684210527e-05,
"loss": 2.0122,
"step": 114
},
{
"epoch": 0.0029523137154226302,
"grad_norm": 1.976195216178894,
"learning_rate": 4.500526315789474e-05,
"loss": 2.218,
"step": 115
},
{
"epoch": 0.0029779860086002184,
"grad_norm": 2.0346622467041016,
"learning_rate": 4.447578947368421e-05,
"loss": 2.3013,
"step": 116
},
{
"epoch": 0.0030036583017778065,
"grad_norm": 1.756709337234497,
"learning_rate": 4.394631578947369e-05,
"loss": 1.9596,
"step": 117
},
{
"epoch": 0.003029330594955394,
"grad_norm": 1.9882556200027466,
"learning_rate": 4.341684210526316e-05,
"loss": 2.2595,
"step": 118
},
{
"epoch": 0.0030550028881329823,
"grad_norm": 2.60469388961792,
"learning_rate": 4.2887368421052636e-05,
"loss": 1.5153,
"step": 119
},
{
"epoch": 0.0030806751813105705,
"grad_norm": 0.9778649806976318,
"learning_rate": 4.2357894736842106e-05,
"loss": 0.0113,
"step": 120
},
{
"epoch": 0.0031063474744881586,
"grad_norm": 0.011567816138267517,
"learning_rate": 4.182842105263158e-05,
"loss": 0.0004,
"step": 121
},
{
"epoch": 0.0031320197676657467,
"grad_norm": 0.010046404786407948,
"learning_rate": 4.1298947368421053e-05,
"loss": 0.0004,
"step": 122
},
{
"epoch": 0.003157692060843335,
"grad_norm": 0.009395002387464046,
"learning_rate": 4.0769473684210524e-05,
"loss": 0.0003,
"step": 123
},
{
"epoch": 0.003183364354020923,
"grad_norm": 0.008893662132322788,
"learning_rate": 4.024000000000001e-05,
"loss": 0.0003,
"step": 124
},
{
"epoch": 0.003209036647198511,
"grad_norm": 0.008500020019710064,
"learning_rate": 3.971052631578948e-05,
"loss": 0.0003,
"step": 125
},
{
"epoch": 0.0032347089403760992,
"grad_norm": 0.0076544624753296375,
"learning_rate": 3.918105263157895e-05,
"loss": 0.0003,
"step": 126
},
{
"epoch": 0.0032603812335536874,
"grad_norm": 0.007298609241843224,
"learning_rate": 3.865157894736842e-05,
"loss": 0.0003,
"step": 127
},
{
"epoch": 0.003286053526731275,
"grad_norm": 0.006563248578459024,
"learning_rate": 3.8122105263157896e-05,
"loss": 0.0002,
"step": 128
},
{
"epoch": 0.003311725819908863,
"grad_norm": 0.006430802401155233,
"learning_rate": 3.759263157894737e-05,
"loss": 0.0002,
"step": 129
},
{
"epoch": 0.0033373981130864513,
"grad_norm": 0.006386774126440287,
"learning_rate": 3.7063157894736844e-05,
"loss": 0.0002,
"step": 130
},
{
"epoch": 0.0033630704062640395,
"grad_norm": 0.6408913731575012,
"learning_rate": 3.653368421052632e-05,
"loss": 0.0931,
"step": 131
},
{
"epoch": 0.0033887426994416276,
"grad_norm": 0.005842825397849083,
"learning_rate": 3.600421052631579e-05,
"loss": 0.0002,
"step": 132
},
{
"epoch": 0.0034144149926192157,
"grad_norm": 0.005725574214011431,
"learning_rate": 3.547473684210526e-05,
"loss": 0.0002,
"step": 133
},
{
"epoch": 0.003440087285796804,
"grad_norm": 0.006068081129342318,
"learning_rate": 3.494526315789474e-05,
"loss": 0.0002,
"step": 134
},
{
"epoch": 0.003465759578974392,
"grad_norm": 0.00514700124040246,
"learning_rate": 3.4415789473684216e-05,
"loss": 0.0002,
"step": 135
},
{
"epoch": 0.00349143187215198,
"grad_norm": 0.005522563587874174,
"learning_rate": 3.3886315789473686e-05,
"loss": 0.0002,
"step": 136
},
{
"epoch": 0.0035171041653295682,
"grad_norm": 0.0057330005802214146,
"learning_rate": 3.3356842105263156e-05,
"loss": 0.0002,
"step": 137
},
{
"epoch": 0.003542776458507156,
"grad_norm": 0.0057954080402851105,
"learning_rate": 3.2827368421052634e-05,
"loss": 0.0002,
"step": 138
},
{
"epoch": 0.003568448751684744,
"grad_norm": 0.005006751511245966,
"learning_rate": 3.2297894736842104e-05,
"loss": 0.0002,
"step": 139
},
{
"epoch": 0.003594121044862332,
"grad_norm": 0.005032096989452839,
"learning_rate": 3.176842105263158e-05,
"loss": 0.0002,
"step": 140
},
{
"epoch": 0.0036197933380399203,
"grad_norm": 0.004935144912451506,
"learning_rate": 3.123894736842106e-05,
"loss": 0.0002,
"step": 141
},
{
"epoch": 0.0036454656312175085,
"grad_norm": 0.00508884247392416,
"learning_rate": 3.070947368421053e-05,
"loss": 0.0002,
"step": 142
},
{
"epoch": 0.0036711379243950966,
"grad_norm": 0.004769055638462305,
"learning_rate": 3.018e-05,
"loss": 0.0002,
"step": 143
},
{
"epoch": 0.0036968102175726847,
"grad_norm": 0.004287198651582003,
"learning_rate": 2.9650526315789473e-05,
"loss": 0.0002,
"step": 144
},
{
"epoch": 0.003722482510750273,
"grad_norm": 0.004471136257052422,
"learning_rate": 2.912105263157895e-05,
"loss": 0.0002,
"step": 145
},
{
"epoch": 0.003748154803927861,
"grad_norm": 0.004509101156145334,
"learning_rate": 2.8591578947368424e-05,
"loss": 0.0002,
"step": 146
},
{
"epoch": 0.003773827097105449,
"grad_norm": 0.005288159940391779,
"learning_rate": 2.8062105263157894e-05,
"loss": 0.0002,
"step": 147
},
{
"epoch": 0.0037994993902830372,
"grad_norm": 1.4431769847869873,
"learning_rate": 2.753263157894737e-05,
"loss": 0.4431,
"step": 148
},
{
"epoch": 0.003825171683460625,
"grad_norm": 2.757993459701538,
"learning_rate": 2.7003157894736845e-05,
"loss": 1.9569,
"step": 149
},
{
"epoch": 0.003850843976638213,
"grad_norm": 6.777806758880615,
"learning_rate": 2.6473684210526315e-05,
"loss": 2.2696,
"step": 150
},
{
"epoch": 0.003850843976638213,
"eval_loss": 0.41922199726104736,
"eval_runtime": 1184.8981,
"eval_samples_per_second": 13.843,
"eval_steps_per_second": 3.461,
"step": 150
},
{
"epoch": 0.003876516269815801,
"grad_norm": 0.7326381802558899,
"learning_rate": 2.5944210526315793e-05,
"loss": 1.3282,
"step": 151
},
{
"epoch": 0.0039021885629933893,
"grad_norm": 0.931328535079956,
"learning_rate": 2.5414736842105266e-05,
"loss": 1.2657,
"step": 152
},
{
"epoch": 0.003927860856170978,
"grad_norm": 0.7979090213775635,
"learning_rate": 2.4885263157894737e-05,
"loss": 1.0431,
"step": 153
},
{
"epoch": 0.003953533149348565,
"grad_norm": 0.8406013250350952,
"learning_rate": 2.4355789473684214e-05,
"loss": 1.5444,
"step": 154
},
{
"epoch": 0.003979205442526153,
"grad_norm": 1.172977328300476,
"learning_rate": 2.3826315789473684e-05,
"loss": 2.2357,
"step": 155
},
{
"epoch": 0.004004877735703741,
"grad_norm": 1.081085443496704,
"learning_rate": 2.3296842105263158e-05,
"loss": 1.9574,
"step": 156
},
{
"epoch": 0.0040305500288813296,
"grad_norm": 1.213394284248352,
"learning_rate": 2.2767368421052635e-05,
"loss": 1.7966,
"step": 157
},
{
"epoch": 0.004056222322058918,
"grad_norm": 1.3481943607330322,
"learning_rate": 2.2237894736842105e-05,
"loss": 2.1099,
"step": 158
},
{
"epoch": 0.004081894615236506,
"grad_norm": 1.1967196464538574,
"learning_rate": 2.170842105263158e-05,
"loss": 1.7427,
"step": 159
},
{
"epoch": 0.004107566908414094,
"grad_norm": 1.6480809450149536,
"learning_rate": 2.1178947368421053e-05,
"loss": 2.2222,
"step": 160
},
{
"epoch": 0.004133239201591682,
"grad_norm": 1.4550552368164062,
"learning_rate": 2.0649473684210527e-05,
"loss": 1.5513,
"step": 161
},
{
"epoch": 0.00415891149476927,
"grad_norm": 1.5787893533706665,
"learning_rate": 2.0120000000000004e-05,
"loss": 2.0812,
"step": 162
},
{
"epoch": 0.004184583787946858,
"grad_norm": 1.6700499057769775,
"learning_rate": 1.9590526315789474e-05,
"loss": 2.0889,
"step": 163
},
{
"epoch": 0.0042102560811244465,
"grad_norm": 1.2719978094100952,
"learning_rate": 1.9061052631578948e-05,
"loss": 1.6051,
"step": 164
},
{
"epoch": 0.004235928374302035,
"grad_norm": 2.337200164794922,
"learning_rate": 1.8531578947368422e-05,
"loss": 2.2032,
"step": 165
},
{
"epoch": 0.004261600667479623,
"grad_norm": 2.0356595516204834,
"learning_rate": 1.8002105263157896e-05,
"loss": 2.3998,
"step": 166
},
{
"epoch": 0.004287272960657211,
"grad_norm": 1.9695961475372314,
"learning_rate": 1.747263157894737e-05,
"loss": 2.3592,
"step": 167
},
{
"epoch": 0.004312945253834799,
"grad_norm": 2.0183303356170654,
"learning_rate": 1.6943157894736843e-05,
"loss": 2.1365,
"step": 168
},
{
"epoch": 0.004338617547012387,
"grad_norm": 2.894932270050049,
"learning_rate": 1.6413684210526317e-05,
"loss": 2.4704,
"step": 169
},
{
"epoch": 0.004364289840189975,
"grad_norm": 1.7332582473754883,
"learning_rate": 1.588421052631579e-05,
"loss": 0.8739,
"step": 170
},
{
"epoch": 0.004389962133367563,
"grad_norm": 0.0018881710711866617,
"learning_rate": 1.5354736842105264e-05,
"loss": 0.0001,
"step": 171
},
{
"epoch": 0.0044156344265451515,
"grad_norm": 0.0019908491522073746,
"learning_rate": 1.4825263157894736e-05,
"loss": 0.0001,
"step": 172
},
{
"epoch": 0.00444130671972274,
"grad_norm": 0.0019480012124404311,
"learning_rate": 1.4295789473684212e-05,
"loss": 0.0001,
"step": 173
},
{
"epoch": 0.004466979012900327,
"grad_norm": 0.0019311723299324512,
"learning_rate": 1.3766315789473686e-05,
"loss": 0.0001,
"step": 174
},
{
"epoch": 0.004492651306077915,
"grad_norm": 0.0019914451986551285,
"learning_rate": 1.3236842105263158e-05,
"loss": 0.0001,
"step": 175
},
{
"epoch": 0.004518323599255503,
"grad_norm": 0.001993614248931408,
"learning_rate": 1.2707368421052633e-05,
"loss": 0.0001,
"step": 176
},
{
"epoch": 0.004543995892433091,
"grad_norm": 0.0019910179544240236,
"learning_rate": 1.2177894736842107e-05,
"loss": 0.0001,
"step": 177
},
{
"epoch": 0.004569668185610679,
"grad_norm": 0.002013832563534379,
"learning_rate": 1.1648421052631579e-05,
"loss": 0.0001,
"step": 178
},
{
"epoch": 0.0045953404787882676,
"grad_norm": 0.0020234170369803905,
"learning_rate": 1.1118947368421053e-05,
"loss": 0.0001,
"step": 179
},
{
"epoch": 0.004621012771965856,
"grad_norm": 0.0019844514317810535,
"learning_rate": 1.0589473684210526e-05,
"loss": 0.0001,
"step": 180
},
{
"epoch": 0.004646685065143444,
"grad_norm": 0.00206298241391778,
"learning_rate": 1.0060000000000002e-05,
"loss": 0.0001,
"step": 181
},
{
"epoch": 0.004672357358321032,
"grad_norm": 0.0020082283299416304,
"learning_rate": 9.530526315789474e-06,
"loss": 0.0001,
"step": 182
},
{
"epoch": 0.00469802965149862,
"grad_norm": 0.0020704076159745455,
"learning_rate": 9.001052631578948e-06,
"loss": 0.0001,
"step": 183
},
{
"epoch": 0.004723701944676208,
"grad_norm": 0.002045375294983387,
"learning_rate": 8.471578947368422e-06,
"loss": 0.0001,
"step": 184
},
{
"epoch": 0.004749374237853796,
"grad_norm": 0.0021098172292113304,
"learning_rate": 7.942105263157895e-06,
"loss": 0.0001,
"step": 185
},
{
"epoch": 0.0047750465310313845,
"grad_norm": 0.0021194566506892443,
"learning_rate": 7.412631578947368e-06,
"loss": 0.0001,
"step": 186
},
{
"epoch": 0.004800718824208973,
"grad_norm": 0.0020953835919499397,
"learning_rate": 6.883157894736843e-06,
"loss": 0.0001,
"step": 187
},
{
"epoch": 0.004826391117386561,
"grad_norm": 0.0021119611337780952,
"learning_rate": 6.3536842105263166e-06,
"loss": 0.0001,
"step": 188
},
{
"epoch": 0.004852063410564149,
"grad_norm": 0.0021464722231030464,
"learning_rate": 5.8242105263157895e-06,
"loss": 0.0001,
"step": 189
},
{
"epoch": 0.004877735703741737,
"grad_norm": 0.0022363984026014805,
"learning_rate": 5.294736842105263e-06,
"loss": 0.0001,
"step": 190
},
{
"epoch": 0.004903407996919325,
"grad_norm": 0.002060087164863944,
"learning_rate": 4.765263157894737e-06,
"loss": 0.0001,
"step": 191
},
{
"epoch": 0.004929080290096913,
"grad_norm": 0.0019431081600487232,
"learning_rate": 4.235789473684211e-06,
"loss": 0.0001,
"step": 192
},
{
"epoch": 0.004954752583274501,
"grad_norm": 0.0021071808878332376,
"learning_rate": 3.706315789473684e-06,
"loss": 0.0001,
"step": 193
},
{
"epoch": 0.0049804248764520895,
"grad_norm": 0.0020984155125916004,
"learning_rate": 3.1768421052631583e-06,
"loss": 0.0001,
"step": 194
},
{
"epoch": 0.005006097169629677,
"grad_norm": 0.002161442069336772,
"learning_rate": 2.6473684210526316e-06,
"loss": 0.0001,
"step": 195
},
{
"epoch": 0.005031769462807265,
"grad_norm": 0.002335567260161042,
"learning_rate": 2.1178947368421054e-06,
"loss": 0.0001,
"step": 196
},
{
"epoch": 0.005057441755984853,
"grad_norm": 1.0611015558242798,
"learning_rate": 1.5884210526315791e-06,
"loss": 0.2117,
"step": 197
},
{
"epoch": 0.005083114049162441,
"grad_norm": 0.39899060130119324,
"learning_rate": 1.0589473684210527e-06,
"loss": 0.089,
"step": 198
},
{
"epoch": 0.005108786342340029,
"grad_norm": 3.193357467651367,
"learning_rate": 5.294736842105263e-07,
"loss": 1.4026,
"step": 199
},
{
"epoch": 0.005134458635517617,
"grad_norm": 3.7854483127593994,
"learning_rate": 0.0,
"loss": 1.5641,
"step": 200
},
{
"epoch": 0.005134458635517617,
"eval_loss": 0.4131671190261841,
"eval_runtime": 1189.4105,
"eval_samples_per_second": 13.79,
"eval_steps_per_second": 3.448,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.795839081879962e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}