{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.7392996108949417,
"eval_steps": 8,
"global_step": 88,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0311284046692607,
"grad_norm": 0.5707955360412598,
"learning_rate": 1e-05,
"loss": 1.8935,
"step": 1
},
{
"epoch": 0.0311284046692607,
"eval_loss": 1.8884226083755493,
"eval_runtime": 34.2567,
"eval_samples_per_second": 29.104,
"eval_steps_per_second": 0.321,
"step": 1
},
{
"epoch": 0.0622568093385214,
"grad_norm": 0.5781293511390686,
"learning_rate": 2e-05,
"loss": 1.879,
"step": 2
},
{
"epoch": 0.0933852140077821,
"grad_norm": 0.5720934271812439,
"learning_rate": 3e-05,
"loss": 1.8848,
"step": 3
},
{
"epoch": 0.1245136186770428,
"grad_norm": 0.580179750919342,
"learning_rate": 4e-05,
"loss": 1.8845,
"step": 4
},
{
"epoch": 0.1556420233463035,
"grad_norm": 0.6264262795448303,
"learning_rate": 5e-05,
"loss": 1.8758,
"step": 5
},
{
"epoch": 0.1867704280155642,
"grad_norm": 0.643973708152771,
"learning_rate": 6e-05,
"loss": 1.8309,
"step": 6
},
{
"epoch": 0.2178988326848249,
"grad_norm": 0.6367993354797363,
"learning_rate": 7e-05,
"loss": 1.7743,
"step": 7
},
{
"epoch": 0.2490272373540856,
"grad_norm": 0.5833392143249512,
"learning_rate": 8e-05,
"loss": 1.6965,
"step": 8
},
{
"epoch": 0.2490272373540856,
"eval_loss": 1.5970573425292969,
"eval_runtime": 34.32,
"eval_samples_per_second": 29.05,
"eval_steps_per_second": 0.321,
"step": 8
},
{
"epoch": 0.2801556420233463,
"grad_norm": 0.5133880972862244,
"learning_rate": 9e-05,
"loss": 1.5915,
"step": 9
},
{
"epoch": 0.311284046692607,
"grad_norm": 0.42409589886665344,
"learning_rate": 0.0001,
"loss": 1.5128,
"step": 10
},
{
"epoch": 0.3424124513618677,
"grad_norm": 0.3264746069908142,
"learning_rate": 0.00011000000000000002,
"loss": 1.4567,
"step": 11
},
{
"epoch": 0.3735408560311284,
"grad_norm": 0.2589164078235626,
"learning_rate": 0.00012,
"loss": 1.4249,
"step": 12
},
{
"epoch": 0.4046692607003891,
"grad_norm": 0.3931436538696289,
"learning_rate": 0.00013000000000000002,
"loss": 1.4125,
"step": 13
},
{
"epoch": 0.4357976653696498,
"grad_norm": 0.5455179810523987,
"learning_rate": 0.00014,
"loss": 1.4079,
"step": 14
},
{
"epoch": 0.4669260700389105,
"grad_norm": 0.5418187379837036,
"learning_rate": 0.00015000000000000001,
"loss": 1.4031,
"step": 15
},
{
"epoch": 0.4980544747081712,
"grad_norm": 0.42387455701828003,
"learning_rate": 0.00016,
"loss": 1.3733,
"step": 16
},
{
"epoch": 0.4980544747081712,
"eval_loss": 1.3445571660995483,
"eval_runtime": 34.3152,
"eval_samples_per_second": 29.054,
"eval_steps_per_second": 0.321,
"step": 16
},
{
"epoch": 0.5291828793774319,
"grad_norm": 0.2986687123775482,
"learning_rate": 0.00017,
"loss": 1.3505,
"step": 17
},
{
"epoch": 0.5603112840466926,
"grad_norm": 0.2099975347518921,
"learning_rate": 0.00018,
"loss": 1.3243,
"step": 18
},
{
"epoch": 0.5914396887159533,
"grad_norm": 0.16759291291236877,
"learning_rate": 0.00019,
"loss": 1.3056,
"step": 19
},
{
"epoch": 0.622568093385214,
"grad_norm": 0.16132138669490814,
"learning_rate": 0.0002,
"loss": 1.3014,
"step": 20
},
{
"epoch": 0.6536964980544747,
"grad_norm": 0.17767557501792908,
"learning_rate": 0.0001999145758387301,
"loss": 1.2932,
"step": 21
},
{
"epoch": 0.6848249027237354,
"grad_norm": 0.19573098421096802,
"learning_rate": 0.000199658449300667,
"loss": 1.2771,
"step": 22
},
{
"epoch": 0.7159533073929961,
"grad_norm": 0.19915379583835602,
"learning_rate": 0.0001992320579737045,
"loss": 1.2762,
"step": 23
},
{
"epoch": 0.7470817120622568,
"grad_norm": 0.17230945825576782,
"learning_rate": 0.00019863613034027224,
"loss": 1.2466,
"step": 24
},
{
"epoch": 0.7470817120622568,
"eval_loss": 1.2462533712387085,
"eval_runtime": 34.3129,
"eval_samples_per_second": 29.056,
"eval_steps_per_second": 0.321,
"step": 24
},
{
"epoch": 0.7782101167315175,
"grad_norm": 0.13044685125350952,
"learning_rate": 0.00019787168453273544,
"loss": 1.2402,
"step": 25
},
{
"epoch": 0.8093385214007782,
"grad_norm": 0.09282781183719635,
"learning_rate": 0.00019694002659393305,
"loss": 1.234,
"step": 26
},
{
"epoch": 0.8404669260700389,
"grad_norm": 0.10575597733259201,
"learning_rate": 0.0001958427482458253,
"loss": 1.2214,
"step": 27
},
{
"epoch": 0.8715953307392996,
"grad_norm": 0.14210504293441772,
"learning_rate": 0.00019458172417006347,
"loss": 1.2185,
"step": 28
},
{
"epoch": 0.9027237354085603,
"grad_norm": 0.17919066548347473,
"learning_rate": 0.0001931591088051279,
"loss": 1.2025,
"step": 29
},
{
"epoch": 0.933852140077821,
"grad_norm": 0.16358336806297302,
"learning_rate": 0.00019157733266550575,
"loss": 1.2032,
"step": 30
},
{
"epoch": 0.9649805447470817,
"grad_norm": 0.13862887024879456,
"learning_rate": 0.0001898390981891979,
"loss": 1.197,
"step": 31
},
{
"epoch": 0.9961089494163424,
"grad_norm": 0.11003394424915314,
"learning_rate": 0.0001879473751206489,
"loss": 1.1852,
"step": 32
},
{
"epoch": 0.9961089494163424,
"eval_loss": 1.1821681261062622,
"eval_runtime": 34.3117,
"eval_samples_per_second": 29.057,
"eval_steps_per_second": 0.321,
"step": 32
},
{
"epoch": 1.027237354085603,
"grad_norm": 0.08200129121541977,
"learning_rate": 0.00018590539543698854,
"loss": 1.178,
"step": 33
},
{
"epoch": 1.0583657587548638,
"grad_norm": 0.07455576211214066,
"learning_rate": 0.00018371664782625287,
"loss": 1.1725,
"step": 34
},
{
"epoch": 1.0894941634241244,
"grad_norm": 0.08433058857917786,
"learning_rate": 0.0001813848717270195,
"loss": 1.1569,
"step": 35
},
{
"epoch": 1.1206225680933852,
"grad_norm": 0.09246356040239334,
"learning_rate": 0.00017891405093963938,
"loss": 1.1627,
"step": 36
},
{
"epoch": 1.1517509727626458,
"grad_norm": 0.09312273561954498,
"learning_rate": 0.00017630840681998066,
"loss": 1.1526,
"step": 37
},
{
"epoch": 1.1828793774319066,
"grad_norm": 0.08373520523309708,
"learning_rate": 0.00017357239106731317,
"loss": 1.1456,
"step": 38
},
{
"epoch": 1.2140077821011672,
"grad_norm": 0.07111110538244247,
"learning_rate": 0.00017071067811865476,
"loss": 1.1531,
"step": 39
},
{
"epoch": 1.245136186770428,
"grad_norm": 0.06889671832323074,
"learning_rate": 0.00016772815716257412,
"loss": 1.1444,
"step": 40
},
{
"epoch": 1.245136186770428,
"eval_loss": 1.1379262208938599,
"eval_runtime": 34.3236,
"eval_samples_per_second": 29.047,
"eval_steps_per_second": 0.32,
"step": 40
},
{
"epoch": 1.2762645914396886,
"grad_norm": 0.06582967936992645,
"learning_rate": 0.00016462992378609407,
"loss": 1.1335,
"step": 41
},
{
"epoch": 1.3073929961089494,
"grad_norm": 0.07529184967279434,
"learning_rate": 0.0001614212712689668,
"loss": 1.1292,
"step": 42
},
{
"epoch": 1.3385214007782102,
"grad_norm": 0.07816017419099808,
"learning_rate": 0.00015810768154019385,
"loss": 1.1293,
"step": 43
},
{
"epoch": 1.3696498054474708,
"grad_norm": 0.08063483238220215,
"learning_rate": 0.00015469481581224272,
"loss": 1.1161,
"step": 44
},
{
"epoch": 1.4007782101167314,
"grad_norm": 0.06947366893291473,
"learning_rate": 0.00015118850490896012,
"loss": 1.1168,
"step": 45
},
{
"epoch": 1.4319066147859922,
"grad_norm": 0.05603436380624771,
"learning_rate": 0.00014759473930370736,
"loss": 1.1147,
"step": 46
},
{
"epoch": 1.463035019455253,
"grad_norm": 0.055858004838228226,
"learning_rate": 0.00014391965888473703,
"loss": 1.1123,
"step": 47
},
{
"epoch": 1.4941634241245136,
"grad_norm": 0.0600324422121048,
"learning_rate": 0.00014016954246529696,
"loss": 1.0986,
"step": 48
},
{
"epoch": 1.4941634241245136,
"eval_loss": 1.1052128076553345,
"eval_runtime": 34.2952,
"eval_samples_per_second": 29.071,
"eval_steps_per_second": 0.321,
"step": 48
},
{
"epoch": 1.5252918287937742,
"grad_norm": 0.0596173070371151,
"learning_rate": 0.00013635079705638298,
"loss": 1.0949,
"step": 49
},
{
"epoch": 1.556420233463035,
"grad_norm": 0.06981530040502548,
"learning_rate": 0.00013246994692046836,
"loss": 1.1,
"step": 50
},
{
"epoch": 1.5875486381322959,
"grad_norm": 0.058555856347084045,
"learning_rate": 0.00012853362242491053,
"loss": 1.0946,
"step": 51
},
{
"epoch": 1.6186770428015564,
"grad_norm": 0.052131447941064835,
"learning_rate": 0.00012454854871407994,
"loss": 1.096,
"step": 52
},
{
"epoch": 1.649805447470817,
"grad_norm": 0.05138020217418671,
"learning_rate": 0.00012052153421956342,
"loss": 1.0948,
"step": 53
},
{
"epoch": 1.6809338521400778,
"grad_norm": 0.055884215980768204,
"learning_rate": 0.00011645945902807341,
"loss": 1.0868,
"step": 54
},
{
"epoch": 1.7120622568093387,
"grad_norm": 0.056635960936546326,
"learning_rate": 0.00011236926312693479,
"loss": 1.0782,
"step": 55
},
{
"epoch": 1.7431906614785992,
"grad_norm": 0.05791952833533287,
"learning_rate": 0.00010825793454723325,
"loss": 1.0774,
"step": 56
},
{
"epoch": 1.7431906614785992,
"eval_loss": 1.0816473960876465,
"eval_runtime": 34.308,
"eval_samples_per_second": 29.06,
"eval_steps_per_second": 0.321,
"step": 56
},
{
"epoch": 1.7743190661478598,
"grad_norm": 0.05655137449502945,
"learning_rate": 0.00010413249742488131,
"loss": 1.0793,
"step": 57
},
{
"epoch": 1.8054474708171206,
"grad_norm": 0.05930772423744202,
"learning_rate": 0.0001,
"loss": 1.0765,
"step": 58
},
{
"epoch": 1.8365758754863815,
"grad_norm": 0.056934159249067307,
"learning_rate": 9.586750257511867e-05,
"loss": 1.0825,
"step": 59
},
{
"epoch": 1.867704280155642,
"grad_norm": 0.05056174844503403,
"learning_rate": 9.174206545276677e-05,
"loss": 1.074,
"step": 60
},
{
"epoch": 1.8988326848249026,
"grad_norm": 0.05416735261678696,
"learning_rate": 8.763073687306524e-05,
"loss": 1.0731,
"step": 61
},
{
"epoch": 1.9299610894941635,
"grad_norm": 0.05306009575724602,
"learning_rate": 8.35405409719266e-05,
"loss": 1.0646,
"step": 62
},
{
"epoch": 1.9610894941634243,
"grad_norm": 0.054572440683841705,
"learning_rate": 7.947846578043659e-05,
"loss": 1.0697,
"step": 63
},
{
"epoch": 1.9922178988326849,
"grad_norm": 0.051973506808280945,
"learning_rate": 7.54514512859201e-05,
"loss": 1.065,
"step": 64
},
{
"epoch": 1.9922178988326849,
"eval_loss": 1.0657449960708618,
"eval_runtime": 34.2892,
"eval_samples_per_second": 29.076,
"eval_steps_per_second": 0.321,
"step": 64
},
{
"epoch": 2.0233463035019454,
"grad_norm": 0.048152584582567215,
"learning_rate": 7.146637757508949e-05,
"loss": 1.0629,
"step": 65
},
{
"epoch": 2.054474708171206,
"grad_norm": 0.04994530603289604,
"learning_rate": 6.753005307953167e-05,
"loss": 1.0516,
"step": 66
},
{
"epoch": 2.085603112840467,
"grad_norm": 0.05009295791387558,
"learning_rate": 6.3649202943617e-05,
"loss": 1.0526,
"step": 67
},
{
"epoch": 2.1167315175097277,
"grad_norm": 0.05345555767416954,
"learning_rate": 5.983045753470308e-05,
"loss": 1.0553,
"step": 68
},
{
"epoch": 2.1478599221789882,
"grad_norm": 0.04756650701165199,
"learning_rate": 5.608034111526298e-05,
"loss": 1.059,
"step": 69
},
{
"epoch": 2.178988326848249,
"grad_norm": 0.04925397038459778,
"learning_rate": 5.240526069629265e-05,
"loss": 1.0508,
"step": 70
},
{
"epoch": 2.21011673151751,
"grad_norm": 0.05096421390771866,
"learning_rate": 4.8811495091039926e-05,
"loss": 1.0472,
"step": 71
},
{
"epoch": 2.2412451361867705,
"grad_norm": 0.047330863773822784,
"learning_rate": 4.530518418775733e-05,
"loss": 1.055,
"step": 72
},
{
"epoch": 2.2412451361867705,
"eval_loss": 1.0550851821899414,
"eval_runtime": 34.2738,
"eval_samples_per_second": 29.089,
"eval_steps_per_second": 0.321,
"step": 72
},
{
"epoch": 2.272373540856031,
"grad_norm": 0.04690932855010033,
"learning_rate": 4.189231845980618e-05,
"loss": 1.0495,
"step": 73
},
{
"epoch": 2.3035019455252916,
"grad_norm": 0.04692551866173744,
"learning_rate": 3.857872873103322e-05,
"loss": 1.0561,
"step": 74
},
{
"epoch": 2.3346303501945527,
"grad_norm": 0.04910856485366821,
"learning_rate": 3.53700762139059e-05,
"loss": 1.0459,
"step": 75
},
{
"epoch": 2.3657587548638133,
"grad_norm": 0.04869484528899193,
"learning_rate": 3.227184283742591e-05,
"loss": 1.0373,
"step": 76
},
{
"epoch": 2.396887159533074,
"grad_norm": 0.045992154628038406,
"learning_rate": 2.9289321881345254e-05,
"loss": 1.0306,
"step": 77
},
{
"epoch": 2.4280155642023344,
"grad_norm": 0.04799241945147514,
"learning_rate": 2.6427608932686843e-05,
"loss": 1.051,
"step": 78
},
{
"epoch": 2.4591439688715955,
"grad_norm": 0.04848311096429825,
"learning_rate": 2.3691593180019366e-05,
"loss": 1.0408,
"step": 79
},
{
"epoch": 2.490272373540856,
"grad_norm": 0.04728139936923981,
"learning_rate": 2.1085949060360654e-05,
"loss": 1.0438,
"step": 80
},
{
"epoch": 2.490272373540856,
"eval_loss": 1.0484414100646973,
"eval_runtime": 34.2834,
"eval_samples_per_second": 29.081,
"eval_steps_per_second": 0.321,
"step": 80
},
{
"epoch": 2.5214007782101167,
"grad_norm": 0.04541860893368721,
"learning_rate": 1.861512827298051e-05,
"loss": 1.0422,
"step": 81
},
{
"epoch": 2.5525291828793772,
"grad_norm": 0.04615321755409241,
"learning_rate": 1.6283352173747145e-05,
"loss": 1.0388,
"step": 82
},
{
"epoch": 2.5836575875486383,
"grad_norm": 0.04621463268995285,
"learning_rate": 1.4094604563011472e-05,
"loss": 1.0442,
"step": 83
},
{
"epoch": 2.614785992217899,
"grad_norm": 0.045208945870399475,
"learning_rate": 1.2052624879351104e-05,
"loss": 1.0441,
"step": 84
},
{
"epoch": 2.6459143968871595,
"grad_norm": 0.04617554694414139,
"learning_rate": 1.0160901810802115e-05,
"loss": 1.0395,
"step": 85
},
{
"epoch": 2.6770428015564205,
"grad_norm": 0.043534088879823685,
"learning_rate": 8.422667334494249e-06,
"loss": 1.0463,
"step": 86
},
{
"epoch": 2.708171206225681,
"grad_norm": 0.04501954838633537,
"learning_rate": 6.840891194872112e-06,
"loss": 1.0426,
"step": 87
},
{
"epoch": 2.7392996108949417,
"grad_norm": 0.04564449191093445,
"learning_rate": 5.418275829936537e-06,
"loss": 1.0394,
"step": 88
},
{
"epoch": 2.7392996108949417,
"eval_loss": 1.0463460683822632,
"eval_runtime": 34.2795,
"eval_samples_per_second": 29.084,
"eval_steps_per_second": 0.321,
"step": 88
}
],
"logging_steps": 1,
"max_steps": 96,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 11,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5197330643132875e+19,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}