{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9241877256317688,
"eval_steps": 25,
"global_step": 204,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019253910950661854,
"grad_norm": 23.54662322998047,
"learning_rate": 0.00019901960784313727,
"loss": 9.4209,
"step": 1
},
{
"epoch": 0.03850782190132371,
"grad_norm": 22.151025772094727,
"learning_rate": 0.00019803921568627454,
"loss": 9.3584,
"step": 2
},
{
"epoch": 0.05776173285198556,
"grad_norm": 32.229759216308594,
"learning_rate": 0.00019705882352941177,
"loss": 9.1469,
"step": 3
},
{
"epoch": 0.07701564380264742,
"grad_norm": 42.96324920654297,
"learning_rate": 0.000196078431372549,
"loss": 8.5595,
"step": 4
},
{
"epoch": 0.09626955475330927,
"grad_norm": 32.40974044799805,
"learning_rate": 0.00019509803921568628,
"loss": 8.3043,
"step": 5
},
{
"epoch": 0.11552346570397112,
"grad_norm": 32.838134765625,
"learning_rate": 0.00019411764705882354,
"loss": 8.1422,
"step": 6
},
{
"epoch": 0.13477737665463296,
"grad_norm": 34.38292694091797,
"learning_rate": 0.0001931372549019608,
"loss": 7.7643,
"step": 7
},
{
"epoch": 0.15403128760529483,
"grad_norm": 31.947425842285156,
"learning_rate": 0.00019215686274509807,
"loss": 7.4565,
"step": 8
},
{
"epoch": 0.17328519855595667,
"grad_norm": 242.39166259765625,
"learning_rate": 0.0001911764705882353,
"loss": 7.436,
"step": 9
},
{
"epoch": 0.19253910950661854,
"grad_norm": 25.68425750732422,
"learning_rate": 0.00019019607843137254,
"loss": 7.1307,
"step": 10
},
{
"epoch": 0.21179302045728038,
"grad_norm": 24.717641830444336,
"learning_rate": 0.0001892156862745098,
"loss": 7.1206,
"step": 11
},
{
"epoch": 0.23104693140794225,
"grad_norm": 36.47980880737305,
"learning_rate": 0.00018823529411764707,
"loss": 6.6912,
"step": 12
},
{
"epoch": 0.2503008423586041,
"grad_norm": 28.181612014770508,
"learning_rate": 0.00018725490196078433,
"loss": 6.6547,
"step": 13
},
{
"epoch": 0.2695547533092659,
"grad_norm": 24.55516242980957,
"learning_rate": 0.00018627450980392157,
"loss": 6.9486,
"step": 14
},
{
"epoch": 0.2888086642599278,
"grad_norm": 32.426963806152344,
"learning_rate": 0.00018529411764705883,
"loss": 7.1069,
"step": 15
},
{
"epoch": 0.30806257521058966,
"grad_norm": 20.413976669311523,
"learning_rate": 0.00018431372549019607,
"loss": 6.6628,
"step": 16
},
{
"epoch": 0.32731648616125153,
"grad_norm": 28.58907699584961,
"learning_rate": 0.00018333333333333334,
"loss": 6.5333,
"step": 17
},
{
"epoch": 0.34657039711191334,
"grad_norm": 24.02996253967285,
"learning_rate": 0.0001823529411764706,
"loss": 6.5981,
"step": 18
},
{
"epoch": 0.3658243080625752,
"grad_norm": 23.250669479370117,
"learning_rate": 0.00018137254901960786,
"loss": 6.4779,
"step": 19
},
{
"epoch": 0.3850782190132371,
"grad_norm": 15.006091117858887,
"learning_rate": 0.0001803921568627451,
"loss": 6.6096,
"step": 20
},
{
"epoch": 0.4043321299638989,
"grad_norm": 16.560985565185547,
"learning_rate": 0.00017941176470588236,
"loss": 6.6496,
"step": 21
},
{
"epoch": 0.42358604091456076,
"grad_norm": 31.329875946044922,
"learning_rate": 0.00017843137254901963,
"loss": 6.9627,
"step": 22
},
{
"epoch": 0.4428399518652226,
"grad_norm": 12.381958961486816,
"learning_rate": 0.00017745098039215687,
"loss": 6.398,
"step": 23
},
{
"epoch": 0.4620938628158845,
"grad_norm": 9.271923065185547,
"learning_rate": 0.00017647058823529413,
"loss": 6.6,
"step": 24
},
{
"epoch": 0.4813477737665463,
"grad_norm": 12.544185638427734,
"learning_rate": 0.00017549019607843137,
"loss": 6.4684,
"step": 25
},
{
"epoch": 0.4813477737665463,
"eval_clap": 0.09883298724889755,
"eval_loss": 6.00625467300415,
"eval_runtime": 166.3531,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.096,
"step": 25
},
{
"epoch": 0.5006016847172082,
"grad_norm": 11.769013404846191,
"learning_rate": 0.00017450980392156863,
"loss": 6.5248,
"step": 26
},
{
"epoch": 0.51985559566787,
"grad_norm": 11.039627075195312,
"learning_rate": 0.0001735294117647059,
"loss": 6.6403,
"step": 27
},
{
"epoch": 0.5391095066185319,
"grad_norm": 17.4042911529541,
"learning_rate": 0.00017254901960784316,
"loss": 6.8092,
"step": 28
},
{
"epoch": 0.5583634175691937,
"grad_norm": 12.926351547241211,
"learning_rate": 0.0001715686274509804,
"loss": 6.5886,
"step": 29
},
{
"epoch": 0.5776173285198556,
"grad_norm": 12.865156173706055,
"learning_rate": 0.00017058823529411766,
"loss": 6.6176,
"step": 30
},
{
"epoch": 0.5968712394705175,
"grad_norm": 15.517515182495117,
"learning_rate": 0.0001696078431372549,
"loss": 6.4096,
"step": 31
},
{
"epoch": 0.6161251504211793,
"grad_norm": 12.356785774230957,
"learning_rate": 0.00016862745098039216,
"loss": 6.4528,
"step": 32
},
{
"epoch": 0.6353790613718412,
"grad_norm": 15.226251602172852,
"learning_rate": 0.00016764705882352942,
"loss": 6.3188,
"step": 33
},
{
"epoch": 0.6546329723225031,
"grad_norm": 13.221582412719727,
"learning_rate": 0.0001666666666666667,
"loss": 6.542,
"step": 34
},
{
"epoch": 0.6738868832731648,
"grad_norm": 13.414304733276367,
"learning_rate": 0.00016568627450980395,
"loss": 6.4272,
"step": 35
},
{
"epoch": 0.6931407942238267,
"grad_norm": 27.81321907043457,
"learning_rate": 0.0001647058823529412,
"loss": 6.7035,
"step": 36
},
{
"epoch": 0.7123947051744886,
"grad_norm": 17.882911682128906,
"learning_rate": 0.00016372549019607843,
"loss": 6.6117,
"step": 37
},
{
"epoch": 0.7316486161251504,
"grad_norm": 10.675613403320312,
"learning_rate": 0.0001627450980392157,
"loss": 6.4818,
"step": 38
},
{
"epoch": 0.7509025270758123,
"grad_norm": 11.32511043548584,
"learning_rate": 0.00016176470588235295,
"loss": 6.4717,
"step": 39
},
{
"epoch": 0.7701564380264742,
"grad_norm": 13.292048454284668,
"learning_rate": 0.00016078431372549022,
"loss": 6.4119,
"step": 40
},
{
"epoch": 0.789410348977136,
"grad_norm": 9.824177742004395,
"learning_rate": 0.00015980392156862746,
"loss": 6.6399,
"step": 41
},
{
"epoch": 0.8086642599277978,
"grad_norm": 18.48476791381836,
"learning_rate": 0.0001588235294117647,
"loss": 6.4116,
"step": 42
},
{
"epoch": 0.8279181708784596,
"grad_norm": 10.409250259399414,
"learning_rate": 0.00015784313725490196,
"loss": 6.4832,
"step": 43
},
{
"epoch": 0.8471720818291215,
"grad_norm": 18.297466278076172,
"learning_rate": 0.00015686274509803922,
"loss": 6.308,
"step": 44
},
{
"epoch": 0.8664259927797834,
"grad_norm": 12.408952713012695,
"learning_rate": 0.00015588235294117648,
"loss": 6.3373,
"step": 45
},
{
"epoch": 0.8856799037304453,
"grad_norm": 12.280571937561035,
"learning_rate": 0.00015490196078431375,
"loss": 6.3173,
"step": 46
},
{
"epoch": 0.9049338146811071,
"grad_norm": 12.348167419433594,
"learning_rate": 0.00015392156862745098,
"loss": 6.2873,
"step": 47
},
{
"epoch": 0.924187725631769,
"grad_norm": 28.005126953125,
"learning_rate": 0.00015294117647058822,
"loss": 6.7117,
"step": 48
},
{
"epoch": 0.9434416365824309,
"grad_norm": 16.248571395874023,
"learning_rate": 0.00015196078431372549,
"loss": 6.3493,
"step": 49
},
{
"epoch": 0.9626955475330926,
"grad_norm": 19.102869033813477,
"learning_rate": 0.00015098039215686275,
"loss": 6.4209,
"step": 50
},
{
"epoch": 0.9626955475330926,
"eval_clap": 0.13957397639751434,
"eval_loss": 6.070012092590332,
"eval_runtime": 165.6113,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.097,
"step": 50
},
{
"epoch": 0.9819494584837545,
"grad_norm": 6.675487995147705,
"learning_rate": 0.00015000000000000001,
"loss": 6.1695,
"step": 51
},
{
"epoch": 1.0,
"grad_norm": 14.88092041015625,
"learning_rate": 0.00014901960784313728,
"loss": 5.6169,
"step": 52
},
{
"epoch": 1.0192539109506618,
"grad_norm": 19.78269386291504,
"learning_rate": 0.00014803921568627451,
"loss": 6.5455,
"step": 53
},
{
"epoch": 1.0385078219013237,
"grad_norm": 7.873740196228027,
"learning_rate": 0.00014705882352941178,
"loss": 6.3154,
"step": 54
},
{
"epoch": 1.0577617328519855,
"grad_norm": 10.514632225036621,
"learning_rate": 0.00014607843137254902,
"loss": 6.5085,
"step": 55
},
{
"epoch": 1.0770156438026475,
"grad_norm": 10.021757125854492,
"learning_rate": 0.00014509803921568628,
"loss": 6.5109,
"step": 56
},
{
"epoch": 1.0962695547533092,
"grad_norm": 8.690667152404785,
"learning_rate": 0.00014411764705882354,
"loss": 6.5515,
"step": 57
},
{
"epoch": 1.1155234657039712,
"grad_norm": 12.78662109375,
"learning_rate": 0.00014313725490196078,
"loss": 6.5425,
"step": 58
},
{
"epoch": 1.134777376654633,
"grad_norm": 10.592965126037598,
"learning_rate": 0.00014215686274509804,
"loss": 6.5105,
"step": 59
},
{
"epoch": 1.154031287605295,
"grad_norm": 7.947122573852539,
"learning_rate": 0.0001411764705882353,
"loss": 6.6142,
"step": 60
},
{
"epoch": 1.1732851985559567,
"grad_norm": 6.823319911956787,
"learning_rate": 0.00014019607843137255,
"loss": 6.5339,
"step": 61
},
{
"epoch": 1.1925391095066185,
"grad_norm": 16.670989990234375,
"learning_rate": 0.0001392156862745098,
"loss": 6.3022,
"step": 62
},
{
"epoch": 1.2117930204572804,
"grad_norm": 20.09317398071289,
"learning_rate": 0.00013823529411764707,
"loss": 6.0779,
"step": 63
},
{
"epoch": 1.2310469314079422,
"grad_norm": 8.030014991760254,
"learning_rate": 0.0001372549019607843,
"loss": 6.3284,
"step": 64
},
{
"epoch": 1.2503008423586042,
"grad_norm": 10.324827194213867,
"learning_rate": 0.00013627450980392157,
"loss": 6.4022,
"step": 65
},
{
"epoch": 1.269554753309266,
"grad_norm": 29.070960998535156,
"learning_rate": 0.00013529411764705884,
"loss": 6.7835,
"step": 66
},
{
"epoch": 1.288808664259928,
"grad_norm": 17.838394165039062,
"learning_rate": 0.00013431372549019608,
"loss": 6.5344,
"step": 67
},
{
"epoch": 1.3080625752105897,
"grad_norm": 10.388354301452637,
"learning_rate": 0.00013333333333333334,
"loss": 6.3438,
"step": 68
},
{
"epoch": 1.3273164861612514,
"grad_norm": 9.607653617858887,
"learning_rate": 0.0001323529411764706,
"loss": 6.4325,
"step": 69
},
{
"epoch": 1.3465703971119134,
"grad_norm": 9.639688491821289,
"learning_rate": 0.00013137254901960784,
"loss": 6.3907,
"step": 70
},
{
"epoch": 1.3658243080625752,
"grad_norm": 9.424043655395508,
"learning_rate": 0.0001303921568627451,
"loss": 6.605,
"step": 71
},
{
"epoch": 1.3850782190132371,
"grad_norm": 8.21303653717041,
"learning_rate": 0.00012941176470588237,
"loss": 6.6275,
"step": 72
},
{
"epoch": 1.404332129963899,
"grad_norm": 10.479741096496582,
"learning_rate": 0.00012843137254901963,
"loss": 6.4801,
"step": 73
},
{
"epoch": 1.4235860409145609,
"grad_norm": 21.424253463745117,
"learning_rate": 0.00012745098039215687,
"loss": 6.3391,
"step": 74
},
{
"epoch": 1.4428399518652226,
"grad_norm": 6.5513224601745605,
"learning_rate": 0.0001264705882352941,
"loss": 6.7252,
"step": 75
},
{
"epoch": 1.4428399518652226,
"eval_clap": 0.10309316217899323,
"eval_loss": 6.036521911621094,
"eval_runtime": 165.4554,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.097,
"step": 75
},
{
"epoch": 1.4620938628158844,
"grad_norm": 32.52528762817383,
"learning_rate": 0.00012549019607843137,
"loss": 6.1922,
"step": 76
},
{
"epoch": 1.4813477737665464,
"grad_norm": 23.51795196533203,
"learning_rate": 0.00012450980392156863,
"loss": 6.3506,
"step": 77
},
{
"epoch": 1.5006016847172083,
"grad_norm": 10.925686836242676,
"learning_rate": 0.0001235294117647059,
"loss": 6.4783,
"step": 78
},
{
"epoch": 1.5198555956678699,
"grad_norm": 7.924820899963379,
"learning_rate": 0.00012254901960784316,
"loss": 6.6288,
"step": 79
},
{
"epoch": 1.5391095066185319,
"grad_norm": 6.946601390838623,
"learning_rate": 0.00012156862745098039,
"loss": 6.4085,
"step": 80
},
{
"epoch": 1.5583634175691938,
"grad_norm": 10.120043754577637,
"learning_rate": 0.00012058823529411765,
"loss": 6.4667,
"step": 81
},
{
"epoch": 1.5776173285198556,
"grad_norm": 9.635017395019531,
"learning_rate": 0.0001196078431372549,
"loss": 6.3742,
"step": 82
},
{
"epoch": 1.5968712394705173,
"grad_norm": 6.578627586364746,
"learning_rate": 0.00011862745098039216,
"loss": 6.1956,
"step": 83
},
{
"epoch": 1.6161251504211793,
"grad_norm": 18.30640983581543,
"learning_rate": 0.00011764705882352942,
"loss": 6.4804,
"step": 84
},
{
"epoch": 1.6353790613718413,
"grad_norm": 11.166876792907715,
"learning_rate": 0.00011666666666666668,
"loss": 6.4495,
"step": 85
},
{
"epoch": 1.654632972322503,
"grad_norm": 8.15738582611084,
"learning_rate": 0.00011568627450980394,
"loss": 6.1371,
"step": 86
},
{
"epoch": 1.6738868832731648,
"grad_norm": 9.473989486694336,
"learning_rate": 0.00011470588235294118,
"loss": 6.366,
"step": 87
},
{
"epoch": 1.6931407942238268,
"grad_norm": 16.634380340576172,
"learning_rate": 0.00011372549019607843,
"loss": 6.1748,
"step": 88
},
{
"epoch": 1.7123947051744886,
"grad_norm": 20.92518424987793,
"learning_rate": 0.0001127450980392157,
"loss": 6.0918,
"step": 89
},
{
"epoch": 1.7316486161251503,
"grad_norm": 10.186667442321777,
"learning_rate": 0.00011176470588235294,
"loss": 6.1072,
"step": 90
},
{
"epoch": 1.7509025270758123,
"grad_norm": 21.300180435180664,
"learning_rate": 0.00011078431372549021,
"loss": 6.724,
"step": 91
},
{
"epoch": 1.7701564380264743,
"grad_norm": 17.833845138549805,
"learning_rate": 0.00010980392156862746,
"loss": 6.2231,
"step": 92
},
{
"epoch": 1.789410348977136,
"grad_norm": 12.850127220153809,
"learning_rate": 0.0001088235294117647,
"loss": 6.4846,
"step": 93
},
{
"epoch": 1.8086642599277978,
"grad_norm": 16.229764938354492,
"learning_rate": 0.00010784313725490196,
"loss": 6.6046,
"step": 94
},
{
"epoch": 1.8279181708784598,
"grad_norm": 41.6049690246582,
"learning_rate": 0.00010686274509803922,
"loss": 6.5044,
"step": 95
},
{
"epoch": 1.8471720818291215,
"grad_norm": 8.0320463180542,
"learning_rate": 0.00010588235294117647,
"loss": 6.4836,
"step": 96
},
{
"epoch": 1.8664259927797833,
"grad_norm": 19.129127502441406,
"learning_rate": 0.00010490196078431374,
"loss": 6.1962,
"step": 97
},
{
"epoch": 1.8856799037304453,
"grad_norm": 14.464997291564941,
"learning_rate": 0.00010392156862745099,
"loss": 6.2694,
"step": 98
},
{
"epoch": 1.9049338146811072,
"grad_norm": 25.245752334594727,
"learning_rate": 0.00010294117647058823,
"loss": 6.0148,
"step": 99
},
{
"epoch": 1.924187725631769,
"grad_norm": 12.66399097442627,
"learning_rate": 0.00010196078431372549,
"loss": 6.1879,
"step": 100
},
{
"epoch": 1.924187725631769,
"eval_clap": 0.12328307330608368,
"eval_loss": 5.896579742431641,
"eval_runtime": 165.5834,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.097,
"step": 100
},
{
"epoch": 1.9434416365824307,
"grad_norm": 12.162952423095703,
"learning_rate": 0.00010098039215686274,
"loss": 6.1875,
"step": 101
},
{
"epoch": 1.9626955475330927,
"grad_norm": 16.754629135131836,
"learning_rate": 0.0001,
"loss": 6.5483,
"step": 102
},
{
"epoch": 1.9819494584837545,
"grad_norm": 9.804841995239258,
"learning_rate": 9.901960784313727e-05,
"loss": 6.0631,
"step": 103
},
{
"epoch": 2.0,
"grad_norm": 26.169551849365234,
"learning_rate": 9.80392156862745e-05,
"loss": 6.3384,
"step": 104
},
{
"epoch": 2.019253910950662,
"grad_norm": 22.054380416870117,
"learning_rate": 9.705882352941177e-05,
"loss": 6.5192,
"step": 105
},
{
"epoch": 2.0385078219013235,
"grad_norm": 13.319371223449707,
"learning_rate": 9.607843137254903e-05,
"loss": 6.1904,
"step": 106
},
{
"epoch": 2.0577617328519855,
"grad_norm": 13.158707618713379,
"learning_rate": 9.509803921568627e-05,
"loss": 6.4906,
"step": 107
},
{
"epoch": 2.0770156438026475,
"grad_norm": 7.972289562225342,
"learning_rate": 9.411764705882353e-05,
"loss": 6.4551,
"step": 108
},
{
"epoch": 2.0962695547533094,
"grad_norm": 14.052528381347656,
"learning_rate": 9.313725490196079e-05,
"loss": 6.2028,
"step": 109
},
{
"epoch": 2.115523465703971,
"grad_norm": 21.128631591796875,
"learning_rate": 9.215686274509804e-05,
"loss": 6.121,
"step": 110
},
{
"epoch": 2.134777376654633,
"grad_norm": 9.11488151550293,
"learning_rate": 9.11764705882353e-05,
"loss": 6.559,
"step": 111
},
{
"epoch": 2.154031287605295,
"grad_norm": 10.081767082214355,
"learning_rate": 9.019607843137255e-05,
"loss": 6.4236,
"step": 112
},
{
"epoch": 2.1732851985559565,
"grad_norm": 7.397235870361328,
"learning_rate": 8.921568627450981e-05,
"loss": 6.5415,
"step": 113
},
{
"epoch": 2.1925391095066185,
"grad_norm": 9.652939796447754,
"learning_rate": 8.823529411764706e-05,
"loss": 6.3744,
"step": 114
},
{
"epoch": 2.2117930204572804,
"grad_norm": 12.823005676269531,
"learning_rate": 8.725490196078432e-05,
"loss": 5.9683,
"step": 115
},
{
"epoch": 2.2310469314079424,
"grad_norm": 9.981169700622559,
"learning_rate": 8.627450980392158e-05,
"loss": 6.2714,
"step": 116
},
{
"epoch": 2.250300842358604,
"grad_norm": 11.026590347290039,
"learning_rate": 8.529411764705883e-05,
"loss": 6.1287,
"step": 117
},
{
"epoch": 2.269554753309266,
"grad_norm": 14.469505310058594,
"learning_rate": 8.431372549019608e-05,
"loss": 6.2634,
"step": 118
},
{
"epoch": 2.288808664259928,
"grad_norm": 10.639300346374512,
"learning_rate": 8.333333333333334e-05,
"loss": 6.1014,
"step": 119
},
{
"epoch": 2.30806257521059,
"grad_norm": 10.407938003540039,
"learning_rate": 8.23529411764706e-05,
"loss": 6.2487,
"step": 120
},
{
"epoch": 2.3273164861612514,
"grad_norm": 18.310867309570312,
"learning_rate": 8.137254901960785e-05,
"loss": 6.025,
"step": 121
},
{
"epoch": 2.3465703971119134,
"grad_norm": 13.314108848571777,
"learning_rate": 8.039215686274511e-05,
"loss": 6.1319,
"step": 122
},
{
"epoch": 2.3658243080625754,
"grad_norm": 12.528412818908691,
"learning_rate": 7.941176470588235e-05,
"loss": 6.27,
"step": 123
},
{
"epoch": 2.385078219013237,
"grad_norm": 10.71603775024414,
"learning_rate": 7.843137254901961e-05,
"loss": 6.4118,
"step": 124
},
{
"epoch": 2.404332129963899,
"grad_norm": 8.234016418457031,
"learning_rate": 7.745098039215687e-05,
"loss": 6.3642,
"step": 125
},
{
"epoch": 2.404332129963899,
"eval_clap": 0.10650094598531723,
"eval_loss": 6.806448936462402,
"eval_runtime": 165.8182,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.096,
"step": 125
},
{
"epoch": 2.423586040914561,
"grad_norm": 13.84628963470459,
"learning_rate": 7.647058823529411e-05,
"loss": 6.0872,
"step": 126
},
{
"epoch": 2.4428399518652224,
"grad_norm": 7.576101779937744,
"learning_rate": 7.549019607843137e-05,
"loss": 6.3515,
"step": 127
},
{
"epoch": 2.4620938628158844,
"grad_norm": 9.205301284790039,
"learning_rate": 7.450980392156864e-05,
"loss": 6.0883,
"step": 128
},
{
"epoch": 2.4813477737665464,
"grad_norm": 8.85059928894043,
"learning_rate": 7.352941176470589e-05,
"loss": 5.824,
"step": 129
},
{
"epoch": 2.5006016847172083,
"grad_norm": 6.963297367095947,
"learning_rate": 7.254901960784314e-05,
"loss": 6.4633,
"step": 130
},
{
"epoch": 2.51985559566787,
"grad_norm": 6.612102508544922,
"learning_rate": 7.156862745098039e-05,
"loss": 6.3979,
"step": 131
},
{
"epoch": 2.539109506618532,
"grad_norm": 11.322911262512207,
"learning_rate": 7.058823529411765e-05,
"loss": 6.2103,
"step": 132
},
{
"epoch": 2.558363417569194,
"grad_norm": 21.0396671295166,
"learning_rate": 6.96078431372549e-05,
"loss": 5.6772,
"step": 133
},
{
"epoch": 2.577617328519856,
"grad_norm": 13.040122985839844,
"learning_rate": 6.862745098039216e-05,
"loss": 6.0072,
"step": 134
},
{
"epoch": 2.5968712394705173,
"grad_norm": 13.392056465148926,
"learning_rate": 6.764705882352942e-05,
"loss": 6.0408,
"step": 135
},
{
"epoch": 2.6161251504211793,
"grad_norm": 9.345407485961914,
"learning_rate": 6.666666666666667e-05,
"loss": 6.345,
"step": 136
},
{
"epoch": 2.6353790613718413,
"grad_norm": 9.068965911865234,
"learning_rate": 6.568627450980392e-05,
"loss": 6.0518,
"step": 137
},
{
"epoch": 2.654632972322503,
"grad_norm": 9.924796104431152,
"learning_rate": 6.470588235294118e-05,
"loss": 6.404,
"step": 138
},
{
"epoch": 2.673886883273165,
"grad_norm": 11.512860298156738,
"learning_rate": 6.372549019607843e-05,
"loss": 5.849,
"step": 139
},
{
"epoch": 2.693140794223827,
"grad_norm": 9.558600425720215,
"learning_rate": 6.274509803921569e-05,
"loss": 6.0751,
"step": 140
},
{
"epoch": 2.7123947051744883,
"grad_norm": 14.465291976928711,
"learning_rate": 6.176470588235295e-05,
"loss": 5.5432,
"step": 141
},
{
"epoch": 2.7316486161251503,
"grad_norm": 14.843960762023926,
"learning_rate": 6.078431372549019e-05,
"loss": 5.8858,
"step": 142
},
{
"epoch": 2.7509025270758123,
"grad_norm": 8.04920768737793,
"learning_rate": 5.980392156862745e-05,
"loss": 5.8131,
"step": 143
},
{
"epoch": 2.7701564380264743,
"grad_norm": 9.71105670928955,
"learning_rate": 5.882352941176471e-05,
"loss": 5.9374,
"step": 144
},
{
"epoch": 2.7894103489771362,
"grad_norm": 5.949017524719238,
"learning_rate": 5.784313725490197e-05,
"loss": 6.4545,
"step": 145
},
{
"epoch": 2.808664259927798,
"grad_norm": 7.233414649963379,
"learning_rate": 5.6862745098039215e-05,
"loss": 6.1215,
"step": 146
},
{
"epoch": 2.8279181708784598,
"grad_norm": 9.445034980773926,
"learning_rate": 5.588235294117647e-05,
"loss": 5.7711,
"step": 147
},
{
"epoch": 2.8471720818291217,
"grad_norm": 6.351881980895996,
"learning_rate": 5.490196078431373e-05,
"loss": 6.3073,
"step": 148
},
{
"epoch": 2.8664259927797833,
"grad_norm": 5.955877304077148,
"learning_rate": 5.392156862745098e-05,
"loss": 6.2675,
"step": 149
},
{
"epoch": 2.8856799037304453,
"grad_norm": 7.2687764167785645,
"learning_rate": 5.294117647058824e-05,
"loss": 6.2382,
"step": 150
},
{
"epoch": 2.8856799037304453,
"eval_clap": 0.07656023651361465,
"eval_loss": 6.118464469909668,
"eval_runtime": 165.7635,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.097,
"step": 150
},
{
"epoch": 2.9049338146811072,
"grad_norm": 7.581653594970703,
"learning_rate": 5.1960784313725495e-05,
"loss": 6.1951,
"step": 151
},
{
"epoch": 2.9241877256317688,
"grad_norm": 5.309889793395996,
"learning_rate": 5.0980392156862745e-05,
"loss": 6.1416,
"step": 152
},
{
"epoch": 2.9434416365824307,
"grad_norm": 10.804561614990234,
"learning_rate": 5e-05,
"loss": 6.4203,
"step": 153
},
{
"epoch": 2.9626955475330927,
"grad_norm": 7.452890872955322,
"learning_rate": 4.901960784313725e-05,
"loss": 6.3695,
"step": 154
},
{
"epoch": 2.9819494584837543,
"grad_norm": 7.373142719268799,
"learning_rate": 4.803921568627452e-05,
"loss": 6.0469,
"step": 155
},
{
"epoch": 3.0,
"grad_norm": 6.503188610076904,
"learning_rate": 4.705882352941177e-05,
"loss": 5.5774,
"step": 156
},
{
"epoch": 3.019253910950662,
"grad_norm": 6.571235656738281,
"learning_rate": 4.607843137254902e-05,
"loss": 6.3784,
"step": 157
},
{
"epoch": 3.0385078219013235,
"grad_norm": 6.059790134429932,
"learning_rate": 4.5098039215686275e-05,
"loss": 6.2638,
"step": 158
},
{
"epoch": 3.0577617328519855,
"grad_norm": 7.978560447692871,
"learning_rate": 4.411764705882353e-05,
"loss": 6.2388,
"step": 159
},
{
"epoch": 3.0770156438026475,
"grad_norm": 4.5174479484558105,
"learning_rate": 4.313725490196079e-05,
"loss": 6.1811,
"step": 160
},
{
"epoch": 3.0962695547533094,
"grad_norm": 16.497093200683594,
"learning_rate": 4.215686274509804e-05,
"loss": 5.8567,
"step": 161
},
{
"epoch": 3.115523465703971,
"grad_norm": 10.036762237548828,
"learning_rate": 4.11764705882353e-05,
"loss": 5.7851,
"step": 162
},
{
"epoch": 3.134777376654633,
"grad_norm": 8.312905311584473,
"learning_rate": 4.0196078431372555e-05,
"loss": 6.3701,
"step": 163
},
{
"epoch": 3.154031287605295,
"grad_norm": 6.305182456970215,
"learning_rate": 3.9215686274509805e-05,
"loss": 6.2461,
"step": 164
},
{
"epoch": 3.1732851985559565,
"grad_norm": 6.297240257263184,
"learning_rate": 3.8235294117647055e-05,
"loss": 6.1583,
"step": 165
},
{
"epoch": 3.1925391095066185,
"grad_norm": 6.377700328826904,
"learning_rate": 3.725490196078432e-05,
"loss": 5.8368,
"step": 166
},
{
"epoch": 3.2117930204572804,
"grad_norm": 6.20255708694458,
"learning_rate": 3.627450980392157e-05,
"loss": 6.1394,
"step": 167
},
{
"epoch": 3.2310469314079424,
"grad_norm": 10.172269821166992,
"learning_rate": 3.529411764705883e-05,
"loss": 5.99,
"step": 168
},
{
"epoch": 3.250300842358604,
"grad_norm": 12.56449031829834,
"learning_rate": 3.431372549019608e-05,
"loss": 6.2823,
"step": 169
},
{
"epoch": 3.269554753309266,
"grad_norm": 6.517347812652588,
"learning_rate": 3.3333333333333335e-05,
"loss": 6.4417,
"step": 170
},
{
"epoch": 3.288808664259928,
"grad_norm": 7.165337085723877,
"learning_rate": 3.235294117647059e-05,
"loss": 6.1048,
"step": 171
},
{
"epoch": 3.30806257521059,
"grad_norm": 14.79480266571045,
"learning_rate": 3.137254901960784e-05,
"loss": 5.9012,
"step": 172
},
{
"epoch": 3.3273164861612514,
"grad_norm": 10.55307388305664,
"learning_rate": 3.0392156862745097e-05,
"loss": 6.0419,
"step": 173
},
{
"epoch": 3.3465703971119134,
"grad_norm": 7.354953289031982,
"learning_rate": 2.9411764705882354e-05,
"loss": 5.9871,
"step": 174
},
{
"epoch": 3.3658243080625754,
"grad_norm": 7.013256549835205,
"learning_rate": 2.8431372549019608e-05,
"loss": 6.3169,
"step": 175
},
{
"epoch": 3.3658243080625754,
"eval_clap": 0.09689466655254364,
"eval_loss": 6.116217613220215,
"eval_runtime": 165.7689,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.097,
"step": 175
},
{
"epoch": 3.385078219013237,
"grad_norm": 8.007953643798828,
"learning_rate": 2.7450980392156865e-05,
"loss": 6.0573,
"step": 176
},
{
"epoch": 3.404332129963899,
"grad_norm": 7.166982173919678,
"learning_rate": 2.647058823529412e-05,
"loss": 6.3097,
"step": 177
},
{
"epoch": 3.423586040914561,
"grad_norm": 5.868830680847168,
"learning_rate": 2.5490196078431373e-05,
"loss": 6.1856,
"step": 178
},
{
"epoch": 3.4428399518652224,
"grad_norm": 7.172518253326416,
"learning_rate": 2.4509803921568626e-05,
"loss": 6.284,
"step": 179
},
{
"epoch": 3.4620938628158844,
"grad_norm": 5.972955226898193,
"learning_rate": 2.3529411764705884e-05,
"loss": 6.1067,
"step": 180
},
{
"epoch": 3.4813477737665464,
"grad_norm": 5.716938495635986,
"learning_rate": 2.2549019607843138e-05,
"loss": 6.2792,
"step": 181
},
{
"epoch": 3.5006016847172083,
"grad_norm": 5.647866249084473,
"learning_rate": 2.1568627450980395e-05,
"loss": 6.336,
"step": 182
},
{
"epoch": 3.51985559566787,
"grad_norm": 7.596288204193115,
"learning_rate": 2.058823529411765e-05,
"loss": 6.1188,
"step": 183
},
{
"epoch": 3.539109506618532,
"grad_norm": 9.767680168151855,
"learning_rate": 1.9607843137254903e-05,
"loss": 6.3607,
"step": 184
},
{
"epoch": 3.558363417569194,
"grad_norm": 5.301209926605225,
"learning_rate": 1.862745098039216e-05,
"loss": 6.0671,
"step": 185
},
{
"epoch": 3.577617328519856,
"grad_norm": 6.347781658172607,
"learning_rate": 1.7647058823529414e-05,
"loss": 6.1538,
"step": 186
},
{
"epoch": 3.5968712394705173,
"grad_norm": 6.653684139251709,
"learning_rate": 1.6666666666666667e-05,
"loss": 6.1422,
"step": 187
},
{
"epoch": 3.6161251504211793,
"grad_norm": 9.340754508972168,
"learning_rate": 1.568627450980392e-05,
"loss": 5.6681,
"step": 188
},
{
"epoch": 3.6353790613718413,
"grad_norm": 6.159310340881348,
"learning_rate": 1.4705882352941177e-05,
"loss": 5.8408,
"step": 189
},
{
"epoch": 3.654632972322503,
"grad_norm": 7.5495195388793945,
"learning_rate": 1.3725490196078432e-05,
"loss": 6.1853,
"step": 190
},
{
"epoch": 3.673886883273165,
"grad_norm": 6.215287208557129,
"learning_rate": 1.2745098039215686e-05,
"loss": 6.082,
"step": 191
},
{
"epoch": 3.693140794223827,
"grad_norm": 5.863905906677246,
"learning_rate": 1.1764705882352942e-05,
"loss": 6.0772,
"step": 192
},
{
"epoch": 3.7123947051744883,
"grad_norm": 5.785052299499512,
"learning_rate": 1.0784313725490197e-05,
"loss": 6.2809,
"step": 193
},
{
"epoch": 3.7316486161251503,
"grad_norm": 8.62579345703125,
"learning_rate": 9.803921568627451e-06,
"loss": 5.9173,
"step": 194
},
{
"epoch": 3.7509025270758123,
"grad_norm": 8.095368385314941,
"learning_rate": 8.823529411764707e-06,
"loss": 6.2614,
"step": 195
},
{
"epoch": 3.7701564380264743,
"grad_norm": 6.416041851043701,
"learning_rate": 7.84313725490196e-06,
"loss": 5.7276,
"step": 196
},
{
"epoch": 3.7894103489771362,
"grad_norm": 6.0362868309021,
"learning_rate": 6.862745098039216e-06,
"loss": 6.1875,
"step": 197
},
{
"epoch": 3.808664259927798,
"grad_norm": 6.641626834869385,
"learning_rate": 5.882352941176471e-06,
"loss": 6.0641,
"step": 198
},
{
"epoch": 3.8279181708784598,
"grad_norm": 6.249925136566162,
"learning_rate": 4.901960784313726e-06,
"loss": 6.4255,
"step": 199
},
{
"epoch": 3.8471720818291217,
"grad_norm": 7.856912136077881,
"learning_rate": 3.92156862745098e-06,
"loss": 5.7667,
"step": 200
},
{
"epoch": 3.8471720818291217,
"eval_clap": 0.11432015895843506,
"eval_loss": 6.130455017089844,
"eval_runtime": 165.7823,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.097,
"step": 200
},
{
"epoch": 3.8664259927797833,
"grad_norm": 8.209946632385254,
"learning_rate": 2.9411764705882355e-06,
"loss": 6.1598,
"step": 201
},
{
"epoch": 3.8856799037304453,
"grad_norm": 7.541530609130859,
"learning_rate": 1.96078431372549e-06,
"loss": 5.7201,
"step": 202
},
{
"epoch": 3.9049338146811072,
"grad_norm": 36.531105041503906,
"learning_rate": 9.80392156862745e-07,
"loss": 6.0873,
"step": 203
},
{
"epoch": 3.9241877256317688,
"grad_norm": 6.220560073852539,
"learning_rate": 0.0,
"loss": 6.0892,
"step": 204
},
{
"epoch": 3.9241877256317688,
"step": 204,
"total_flos": 784195045500888.0,
"train_loss": 6.39456293629665,
"train_runtime": 14405.0011,
"train_samples_per_second": 0.231,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1.0,
"max_steps": 204,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 784195045500888.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}