{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.931506849315067,
"eval_steps": 500,
"global_step": 1455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.136986301369863,
"grad_norm": 7.088261127471924,
"learning_rate": 2.7397260273972603e-05,
"loss": 1.4774,
"step": 10
},
{
"epoch": 0.273972602739726,
"grad_norm": 3.0658137798309326,
"learning_rate": 5.479452054794521e-05,
"loss": 0.5535,
"step": 20
},
{
"epoch": 0.410958904109589,
"grad_norm": 1.415347695350647,
"learning_rate": 8.219178082191781e-05,
"loss": 0.2769,
"step": 30
},
{
"epoch": 0.547945205479452,
"grad_norm": 1.2462260723114014,
"learning_rate": 0.00010958904109589041,
"loss": 0.2071,
"step": 40
},
{
"epoch": 0.684931506849315,
"grad_norm": 1.1219278573989868,
"learning_rate": 0.000136986301369863,
"loss": 0.179,
"step": 50
},
{
"epoch": 0.821917808219178,
"grad_norm": 2.4104228019714355,
"learning_rate": 0.00016438356164383562,
"loss": 0.1587,
"step": 60
},
{
"epoch": 0.958904109589041,
"grad_norm": 1.2239787578582764,
"learning_rate": 0.0001917808219178082,
"loss": 0.1366,
"step": 70
},
{
"epoch": 1.095890410958904,
"grad_norm": 0.9942715167999268,
"learning_rate": 0.00019998733979961563,
"loss": 0.1218,
"step": 80
},
{
"epoch": 1.2328767123287672,
"grad_norm": 0.6293880939483643,
"learning_rate": 0.0001999253383717226,
"loss": 0.1168,
"step": 90
},
{
"epoch": 1.36986301369863,
"grad_norm": 0.7170248031616211,
"learning_rate": 0.00019981170237143067,
"loss": 0.1052,
"step": 100
},
{
"epoch": 1.5068493150684932,
"grad_norm": 0.7464343905448914,
"learning_rate": 0.00019964649051804355,
"loss": 0.1066,
"step": 110
},
{
"epoch": 1.643835616438356,
"grad_norm": 0.6828764081001282,
"learning_rate": 0.000199429788181734,
"loss": 0.1057,
"step": 120
},
{
"epoch": 1.7808219178082192,
"grad_norm": 0.6028720736503601,
"learning_rate": 0.0001991617073394306,
"loss": 0.0843,
"step": 130
},
{
"epoch": 1.9178082191780823,
"grad_norm": 0.5440357327461243,
"learning_rate": 0.00019884238651695556,
"loss": 0.0948,
"step": 140
},
{
"epoch": 2.0547945205479454,
"grad_norm": 0.8612964749336243,
"learning_rate": 0.00019847199071744415,
"loss": 0.085,
"step": 150
},
{
"epoch": 2.191780821917808,
"grad_norm": 0.889124870300293,
"learning_rate": 0.00019805071133608242,
"loss": 0.0962,
"step": 160
},
{
"epoch": 2.328767123287671,
"grad_norm": 0.45466411113739014,
"learning_rate": 0.0001975787660612072,
"loss": 0.0763,
"step": 170
},
{
"epoch": 2.4657534246575343,
"grad_norm": 0.42088282108306885,
"learning_rate": 0.00019705639876181969,
"loss": 0.0635,
"step": 180
},
{
"epoch": 2.602739726027397,
"grad_norm": 0.5170985460281372,
"learning_rate": 0.00019648387936157068,
"loss": 0.0726,
"step": 190
},
{
"epoch": 2.73972602739726,
"grad_norm": 0.4313249886035919,
"learning_rate": 0.00019586150369928245,
"loss": 0.0669,
"step": 200
},
{
"epoch": 2.8767123287671232,
"grad_norm": 0.3355115056037903,
"learning_rate": 0.00019518959337607957,
"loss": 0.0682,
"step": 210
},
{
"epoch": 3.0136986301369864,
"grad_norm": 0.34427109360694885,
"learning_rate": 0.0001944684955892075,
"loss": 0.0638,
"step": 220
},
{
"epoch": 3.1506849315068495,
"grad_norm": 0.2929873466491699,
"learning_rate": 0.0001936985829526247,
"loss": 0.0632,
"step": 230
},
{
"epoch": 3.287671232876712,
"grad_norm": 0.3884938657283783,
"learning_rate": 0.00019288025330446126,
"loss": 0.0655,
"step": 240
},
{
"epoch": 3.4246575342465753,
"grad_norm": 0.27399152517318726,
"learning_rate": 0.00019201392950144363,
"loss": 0.0533,
"step": 250
},
{
"epoch": 3.5616438356164384,
"grad_norm": 0.2924444079399109,
"learning_rate": 0.0001911000592003909,
"loss": 0.0589,
"step": 260
},
{
"epoch": 3.6986301369863015,
"grad_norm": 0.43013861775398254,
"learning_rate": 0.00019013911462689668,
"loss": 0.0615,
"step": 270
},
{
"epoch": 3.8356164383561646,
"grad_norm": 0.5247001647949219,
"learning_rate": 0.000189131592331315,
"loss": 0.0583,
"step": 280
},
{
"epoch": 3.9726027397260273,
"grad_norm": 0.5796880722045898,
"learning_rate": 0.00018807801293217735,
"loss": 0.0556,
"step": 290
},
{
"epoch": 4.109589041095891,
"grad_norm": 0.5179729461669922,
"learning_rate": 0.00018697892084717238,
"loss": 0.056,
"step": 300
},
{
"epoch": 4.2465753424657535,
"grad_norm": 0.42960262298583984,
"learning_rate": 0.00018583488401182843,
"loss": 0.0637,
"step": 310
},
{
"epoch": 4.383561643835616,
"grad_norm": 0.3196163773536682,
"learning_rate": 0.0001846464935860431,
"loss": 0.0518,
"step": 320
},
{
"epoch": 4.52054794520548,
"grad_norm": 0.4424096643924713,
"learning_rate": 0.0001834143636486124,
"loss": 0.0524,
"step": 330
},
{
"epoch": 4.657534246575342,
"grad_norm": 0.50010746717453,
"learning_rate": 0.00018213913087991685,
"loss": 0.0629,
"step": 340
},
{
"epoch": 4.794520547945205,
"grad_norm": 0.4036540389060974,
"learning_rate": 0.00018082145423292868,
"loss": 0.0531,
"step": 350
},
{
"epoch": 4.931506849315069,
"grad_norm": 0.36036092042922974,
"learning_rate": 0.0001794620145927101,
"loss": 0.0556,
"step": 360
},
{
"epoch": 5.068493150684931,
"grad_norm": 0.22472509741783142,
"learning_rate": 0.00017806151442457827,
"loss": 0.0446,
"step": 370
},
{
"epoch": 5.205479452054795,
"grad_norm": 0.3514921963214874,
"learning_rate": 0.00017662067741111974,
"loss": 0.0443,
"step": 380
},
{
"epoch": 5.342465753424658,
"grad_norm": 0.2920095920562744,
"learning_rate": 0.00017514024807824055,
"loss": 0.0451,
"step": 390
},
{
"epoch": 5.47945205479452,
"grad_norm": 0.21051590144634247,
"learning_rate": 0.00017362099141044626,
"loss": 0.0476,
"step": 400
},
{
"epoch": 5.616438356164384,
"grad_norm": 0.36196619272232056,
"learning_rate": 0.00017206369245555036,
"loss": 0.0521,
"step": 410
},
{
"epoch": 5.7534246575342465,
"grad_norm": 0.3503723442554474,
"learning_rate": 0.0001704691559190155,
"loss": 0.0472,
"step": 420
},
{
"epoch": 5.890410958904109,
"grad_norm": 0.3881896734237671,
"learning_rate": 0.0001688382057481364,
"loss": 0.0537,
"step": 430
},
{
"epoch": 6.027397260273973,
"grad_norm": 0.29409492015838623,
"learning_rate": 0.00016717168470628077,
"loss": 0.0436,
"step": 440
},
{
"epoch": 6.164383561643835,
"grad_norm": 0.2455558031797409,
"learning_rate": 0.0001654704539374066,
"loss": 0.0429,
"step": 450
},
{
"epoch": 6.301369863013699,
"grad_norm": 0.30749672651290894,
"learning_rate": 0.00016373539252108202,
"loss": 0.042,
"step": 460
},
{
"epoch": 6.438356164383562,
"grad_norm": 0.4117829501628876,
"learning_rate": 0.00016196739701823716,
"loss": 0.0422,
"step": 470
},
{
"epoch": 6.575342465753424,
"grad_norm": 0.3047957718372345,
"learning_rate": 0.00016016738100788297,
"loss": 0.0456,
"step": 480
},
{
"epoch": 6.712328767123288,
"grad_norm": 0.3104310631752014,
"learning_rate": 0.00015833627461503595,
"loss": 0.0405,
"step": 490
},
{
"epoch": 6.8493150684931505,
"grad_norm": 0.3713166415691376,
"learning_rate": 0.0001564750240300934,
"loss": 0.0451,
"step": 500
},
{
"epoch": 6.986301369863014,
"grad_norm": 0.23804673552513123,
"learning_rate": 0.00015458459101990693,
"loss": 0.0387,
"step": 510
},
{
"epoch": 7.123287671232877,
"grad_norm": 0.4476951062679291,
"learning_rate": 0.00015266595243080714,
"loss": 0.0406,
"step": 520
},
{
"epoch": 7.260273972602739,
"grad_norm": 0.27973777055740356,
"learning_rate": 0.00015072009968383656,
"loss": 0.0464,
"step": 530
},
{
"epoch": 7.397260273972603,
"grad_norm": 0.3597777783870697,
"learning_rate": 0.00014874803826245089,
"loss": 0.0459,
"step": 540
},
{
"epoch": 7.534246575342466,
"grad_norm": 0.27027377486228943,
"learning_rate": 0.00014675078719295415,
"loss": 0.0375,
"step": 550
},
{
"epoch": 7.671232876712329,
"grad_norm": 0.27681443095207214,
"learning_rate": 0.00014472937851793557,
"loss": 0.0421,
"step": 560
},
{
"epoch": 7.808219178082192,
"grad_norm": 0.3312411904335022,
"learning_rate": 0.00014268485676298078,
"loss": 0.048,
"step": 570
},
{
"epoch": 7.945205479452055,
"grad_norm": 0.2358381599187851,
"learning_rate": 0.0001406182783969324,
"loss": 0.0409,
"step": 580
},
{
"epoch": 8.082191780821917,
"grad_norm": 0.19072838127613068,
"learning_rate": 0.00013853071128597924,
"loss": 0.0417,
"step": 590
},
{
"epoch": 8.219178082191782,
"grad_norm": 0.3328644931316376,
"learning_rate": 0.0001364232341418564,
"loss": 0.0397,
"step": 600
},
{
"epoch": 8.356164383561644,
"grad_norm": 0.27157458662986755,
"learning_rate": 0.00013429693596444067,
"loss": 0.0395,
"step": 610
},
{
"epoch": 8.493150684931507,
"grad_norm": 0.2969032824039459,
"learning_rate": 0.00013215291547903006,
"loss": 0.0406,
"step": 620
},
{
"epoch": 8.63013698630137,
"grad_norm": 0.2864357829093933,
"learning_rate": 0.00012999228056859784,
"loss": 0.0424,
"step": 630
},
{
"epoch": 8.767123287671232,
"grad_norm": 0.25885725021362305,
"learning_rate": 0.00012781614770131442,
"loss": 0.0392,
"step": 640
},
{
"epoch": 8.904109589041095,
"grad_norm": 0.2456735372543335,
"learning_rate": 0.00012562564135363313,
"loss": 0.0415,
"step": 650
},
{
"epoch": 9.04109589041096,
"grad_norm": 0.41431066393852234,
"learning_rate": 0.0001234218934292376,
"loss": 0.0407,
"step": 660
},
{
"epoch": 9.178082191780822,
"grad_norm": 0.260213702917099,
"learning_rate": 0.00012120604267415172,
"loss": 0.0393,
"step": 670
},
{
"epoch": 9.315068493150685,
"grad_norm": 0.3395901322364807,
"learning_rate": 0.00011897923408831346,
"loss": 0.035,
"step": 680
},
{
"epoch": 9.452054794520548,
"grad_norm": 0.3405311405658722,
"learning_rate": 0.0001167426183339174,
"loss": 0.0342,
"step": 690
},
{
"epoch": 9.58904109589041,
"grad_norm": 0.20802819728851318,
"learning_rate": 0.00011449735114083127,
"loss": 0.0347,
"step": 700
},
{
"epoch": 9.726027397260275,
"grad_norm": 0.5094506144523621,
"learning_rate": 0.00011224459270939384,
"loss": 0.0373,
"step": 710
},
{
"epoch": 9.863013698630137,
"grad_norm": 0.21799403429031372,
"learning_rate": 0.000109985507110903,
"loss": 0.0392,
"step": 720
},
{
"epoch": 10.0,
"grad_norm": 0.28433603048324585,
"learning_rate": 0.00010772126168610325,
"loss": 0.0373,
"step": 730
},
{
"epoch": 10.136986301369863,
"grad_norm": 0.3425813913345337,
"learning_rate": 0.00010545302644198405,
"loss": 0.0385,
"step": 740
},
{
"epoch": 10.273972602739725,
"grad_norm": 0.2662697434425354,
"learning_rate": 0.00010318197344720018,
"loss": 0.0347,
"step": 750
},
{
"epoch": 10.41095890410959,
"grad_norm": 0.2841816842556,
"learning_rate": 0.0001009092762264271,
"loss": 0.04,
"step": 760
},
{
"epoch": 10.547945205479452,
"grad_norm": 0.2933363914489746,
"learning_rate": 9.863610915396365e-05,
"loss": 0.0363,
"step": 770
},
{
"epoch": 10.684931506849315,
"grad_norm": 0.20692330598831177,
"learning_rate": 9.63636468468959e-05,
"loss": 0.0361,
"step": 780
},
{
"epoch": 10.821917808219178,
"grad_norm": 0.24741721153259277,
"learning_rate": 9.409306355813529e-05,
"loss": 0.0341,
"step": 790
},
{
"epoch": 10.95890410958904,
"grad_norm": 0.1948077529668808,
"learning_rate": 9.18255325696454e-05,
"loss": 0.0349,
"step": 800
},
{
"epoch": 11.095890410958905,
"grad_norm": 0.16165360808372498,
"learning_rate": 8.956222558616998e-05,
"loss": 0.0318,
"step": 810
},
{
"epoch": 11.232876712328768,
"grad_norm": 0.25702184438705444,
"learning_rate": 8.730431212977625e-05,
"loss": 0.0281,
"step": 820
},
{
"epoch": 11.36986301369863,
"grad_norm": 0.27587395906448364,
"learning_rate": 8.505295893552594e-05,
"loss": 0.0349,
"step": 830
},
{
"epoch": 11.506849315068493,
"grad_norm": 0.3140430152416229,
"learning_rate": 8.280932934858652e-05,
"loss": 0.0305,
"step": 840
},
{
"epoch": 11.643835616438356,
"grad_norm": 0.21165433526039124,
"learning_rate": 8.05745827230941e-05,
"loss": 0.0314,
"step": 850
},
{
"epoch": 11.780821917808218,
"grad_norm": 0.20445489883422852,
"learning_rate": 7.834987382307861e-05,
"loss": 0.0319,
"step": 860
},
{
"epoch": 11.917808219178083,
"grad_norm": 0.27832481265068054,
"learning_rate": 7.613635222576072e-05,
"loss": 0.0334,
"step": 870
},
{
"epoch": 12.054794520547945,
"grad_norm": 0.25728923082351685,
"learning_rate": 7.393516172752919e-05,
"loss": 0.033,
"step": 880
},
{
"epoch": 12.191780821917808,
"grad_norm": 0.2254086136817932,
"learning_rate": 7.174743975290513e-05,
"loss": 0.0346,
"step": 890
},
{
"epoch": 12.32876712328767,
"grad_norm": 0.31018713116645813,
"learning_rate": 6.957431676679896e-05,
"loss": 0.0329,
"step": 900
},
{
"epoch": 12.465753424657533,
"grad_norm": 0.32662343978881836,
"learning_rate": 6.741691569036338e-05,
"loss": 0.0342,
"step": 910
},
{
"epoch": 12.602739726027398,
"grad_norm": 0.2533169984817505,
"learning_rate": 6.527635132074493e-05,
"loss": 0.0264,
"step": 920
},
{
"epoch": 12.73972602739726,
"grad_norm": 0.27445635199546814,
"learning_rate": 6.315372975503285e-05,
"loss": 0.0281,
"step": 930
},
{
"epoch": 12.876712328767123,
"grad_norm": 0.21471256017684937,
"learning_rate": 6.1050147818704e-05,
"loss": 0.0321,
"step": 940
},
{
"epoch": 13.013698630136986,
"grad_norm": 0.19105984270572662,
"learning_rate": 5.896669249885851e-05,
"loss": 0.0273,
"step": 950
},
{
"epoch": 13.150684931506849,
"grad_norm": 0.3308360278606415,
"learning_rate": 5.690444038253935e-05,
"loss": 0.0343,
"step": 960
},
{
"epoch": 13.287671232876713,
"grad_norm": 0.1988590806722641,
"learning_rate": 5.4864457100425783e-05,
"loss": 0.028,
"step": 970
},
{
"epoch": 13.424657534246576,
"grad_norm": 0.1858794391155243,
"learning_rate": 5.284779677618841e-05,
"loss": 0.0273,
"step": 980
},
{
"epoch": 13.561643835616438,
"grad_norm": 0.29671627283096313,
"learning_rate": 5.0855501481790305e-05,
"loss": 0.0271,
"step": 990
},
{
"epoch": 13.698630136986301,
"grad_norm": 0.17693527042865753,
"learning_rate": 4.8888600699015496e-05,
"loss": 0.034,
"step": 1000
},
{
"epoch": 13.835616438356164,
"grad_norm": 0.31038013100624084,
"learning_rate": 4.694811078750338e-05,
"loss": 0.0251,
"step": 1010
},
{
"epoch": 13.972602739726028,
"grad_norm": 0.3317829668521881,
"learning_rate": 4.50350344595635e-05,
"loss": 0.0334,
"step": 1020
},
{
"epoch": 14.10958904109589,
"grad_norm": 0.1818408966064453,
"learning_rate": 4.315036026204262e-05,
"loss": 0.0272,
"step": 1030
},
{
"epoch": 14.246575342465754,
"grad_norm": 0.2105715572834015,
"learning_rate": 4.129506206551138e-05,
"loss": 0.025,
"step": 1040
},
{
"epoch": 14.383561643835616,
"grad_norm": 0.18613150715827942,
"learning_rate": 3.947009856103465e-05,
"loss": 0.0238,
"step": 1050
},
{
"epoch": 14.520547945205479,
"grad_norm": 0.2959461212158203,
"learning_rate": 3.767641276478563e-05,
"loss": 0.0249,
"step": 1060
},
{
"epoch": 14.657534246575342,
"grad_norm": 0.18495745956897736,
"learning_rate": 3.591493153075966e-05,
"loss": 0.0214,
"step": 1070
},
{
"epoch": 14.794520547945206,
"grad_norm": 0.1501263529062271,
"learning_rate": 3.41865650718396e-05,
"loss": 0.0266,
"step": 1080
},
{
"epoch": 14.931506849315069,
"grad_norm": 0.3387095332145691,
"learning_rate": 3.24922064894601e-05,
"loss": 0.0268,
"step": 1090
},
{
"epoch": 15.068493150684931,
"grad_norm": 0.23434942960739136,
"learning_rate": 3.083273131211382e-05,
"loss": 0.0272,
"step": 1100
},
{
"epoch": 15.205479452054794,
"grad_norm": 0.163187175989151,
"learning_rate": 2.920899704293849e-05,
"loss": 0.0232,
"step": 1110
},
{
"epoch": 15.342465753424657,
"grad_norm": 0.20000265538692474,
"learning_rate": 2.762184271661785e-05,
"loss": 0.0261,
"step": 1120
},
{
"epoch": 15.479452054794521,
"grad_norm": 0.18943333625793457,
"learning_rate": 2.6072088465826038e-05,
"loss": 0.0246,
"step": 1130
},
{
"epoch": 15.616438356164384,
"grad_norm": 0.2833252251148224,
"learning_rate": 2.4560535097439108e-05,
"loss": 0.0253,
"step": 1140
},
{
"epoch": 15.753424657534246,
"grad_norm": 0.1302843540906906,
"learning_rate": 2.308796367873296e-05,
"loss": 0.0246,
"step": 1150
},
{
"epoch": 15.89041095890411,
"grad_norm": 0.16615238785743713,
"learning_rate": 2.165513513378121e-05,
"loss": 0.0254,
"step": 1160
},
{
"epoch": 16.027397260273972,
"grad_norm": 0.17113815248012543,
"learning_rate": 2.0262789850261798e-05,
"loss": 0.0288,
"step": 1170
},
{
"epoch": 16.164383561643834,
"grad_norm": 0.21394069492816925,
"learning_rate": 1.8911647296875147e-05,
"loss": 0.025,
"step": 1180
},
{
"epoch": 16.301369863013697,
"grad_norm": 0.2763649523258209,
"learning_rate": 1.7602405651572275e-05,
"loss": 0.0219,
"step": 1190
},
{
"epoch": 16.438356164383563,
"grad_norm": 0.13925646245479584,
"learning_rate": 1.6335741440784035e-05,
"loss": 0.0217,
"step": 1200
},
{
"epoch": 16.575342465753426,
"grad_norm": 0.20826192200183868,
"learning_rate": 1.511230918983867e-05,
"loss": 0.023,
"step": 1210
},
{
"epoch": 16.71232876712329,
"grad_norm": 0.2256271094083786,
"learning_rate": 1.3932741084747913e-05,
"loss": 0.023,
"step": 1220
},
{
"epoch": 16.84931506849315,
"grad_norm": 0.27016547322273254,
"learning_rate": 1.2797646645536566e-05,
"loss": 0.0211,
"step": 1230
},
{
"epoch": 16.986301369863014,
"grad_norm": 0.26627489924430847,
"learning_rate": 1.1707612411284253e-05,
"loss": 0.0235,
"step": 1240
},
{
"epoch": 17.123287671232877,
"grad_norm": 0.18498767912387848,
"learning_rate": 1.0663201637042252e-05,
"loss": 0.022,
"step": 1250
},
{
"epoch": 17.26027397260274,
"grad_norm": 0.23852607607841492,
"learning_rate": 9.664954002781745e-06,
"loss": 0.0228,
"step": 1260
},
{
"epoch": 17.397260273972602,
"grad_norm": 0.15411531925201416,
"learning_rate": 8.713385334524283e-06,
"loss": 0.0198,
"step": 1270
},
{
"epoch": 17.534246575342465,
"grad_norm": 0.25403866171836853,
"learning_rate": 7.808987337798158e-06,
"loss": 0.0257,
"step": 1280
},
{
"epoch": 17.671232876712327,
"grad_norm": 0.14403975009918213,
"learning_rate": 6.952227343558671e-06,
"loss": 0.0215,
"step": 1290
},
{
"epoch": 17.80821917808219,
"grad_norm": 0.188527911901474,
"learning_rate": 6.143548066703475e-06,
"loss": 0.0224,
"step": 1300
},
{
"epoch": 17.945205479452056,
"grad_norm": 0.1309424489736557,
"learning_rate": 5.383367377307857e-06,
"loss": 0.0215,
"step": 1310
},
{
"epoch": 18.08219178082192,
"grad_norm": 0.11233002692461014,
"learning_rate": 4.672078084698095e-06,
"loss": 0.0211,
"step": 1320
},
{
"epoch": 18.21917808219178,
"grad_norm": 0.22869743406772614,
"learning_rate": 4.010047734474454e-06,
"loss": 0.0215,
"step": 1330
},
{
"epoch": 18.356164383561644,
"grad_norm": 0.11979719996452332,
"learning_rate": 3.397618418588877e-06,
"loss": 0.0273,
"step": 1340
},
{
"epoch": 18.493150684931507,
"grad_norm": 0.2112375795841217,
"learning_rate": 2.8351065985751766e-06,
"loss": 0.0228,
"step": 1350
},
{
"epoch": 18.63013698630137,
"grad_norm": 0.14134034514427185,
"learning_rate": 2.322802942023461e-06,
"loss": 0.0247,
"step": 1360
},
{
"epoch": 18.767123287671232,
"grad_norm": 0.09884881973266602,
"learning_rate": 1.8609721723830132e-06,
"loss": 0.0196,
"step": 1370
},
{
"epoch": 18.904109589041095,
"grad_norm": 0.14044946432113647,
"learning_rate": 1.4498529321713584e-06,
"loss": 0.0198,
"step": 1380
},
{
"epoch": 19.041095890410958,
"grad_norm": 0.13853876292705536,
"learning_rate": 1.0896576596600705e-06,
"loss": 0.0182,
"step": 1390
},
{
"epoch": 19.17808219178082,
"grad_norm": 0.1654110848903656,
"learning_rate": 7.80572479101327e-07,
"loss": 0.0229,
"step": 1400
},
{
"epoch": 19.315068493150687,
"grad_norm": 0.15151838958263397,
"learning_rate": 5.227571045515633e-07,
"loss": 0.0202,
"step": 1410
},
{
"epoch": 19.45205479452055,
"grad_norm": 0.2258201688528061,
"learning_rate": 3.163447573422351e-07,
"loss": 0.0197,
"step": 1420
},
{
"epoch": 19.589041095890412,
"grad_norm": 0.24640779197216034,
"learning_rate": 1.614420972401165e-07,
"loss": 0.0187,
"step": 1430
},
{
"epoch": 19.726027397260275,
"grad_norm": 0.21181590855121613,
"learning_rate": 5.812916733284324e-08,
"loss": 0.0198,
"step": 1440
},
{
"epoch": 19.863013698630137,
"grad_norm": 0.14787183701992035,
"learning_rate": 6.459352668164442e-09,
"loss": 0.0186,
"step": 1450
},
{
"epoch": 19.931506849315067,
"step": 1455,
"total_flos": 1.1504025698630573e+17,
"train_loss": 0.05927258820058554,
"train_runtime": 1048.7401,
"train_samples_per_second": 88.792,
"train_steps_per_second": 1.387
}
],
"logging_steps": 10,
"max_steps": 1455,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1504025698630573e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}