cube_to_round_box_146-neivonm3xj / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.908045977011493,
"eval_steps": 500,
"global_step": 1732,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11494252873563218,
"grad_norm": 5.833221435546875,
"learning_rate": 2.2988505747126437e-05,
"loss": 1.1691,
"step": 10
},
{
"epoch": 0.22988505747126436,
"grad_norm": 3.3890936374664307,
"learning_rate": 4.597701149425287e-05,
"loss": 0.4962,
"step": 20
},
{
"epoch": 0.3448275862068966,
"grad_norm": 1.4635858535766602,
"learning_rate": 6.896551724137931e-05,
"loss": 0.2852,
"step": 30
},
{
"epoch": 0.45977011494252873,
"grad_norm": 1.3263287544250488,
"learning_rate": 9.195402298850575e-05,
"loss": 0.2068,
"step": 40
},
{
"epoch": 0.5747126436781609,
"grad_norm": 1.4933586120605469,
"learning_rate": 0.00011494252873563218,
"loss": 0.1671,
"step": 50
},
{
"epoch": 0.6896551724137931,
"grad_norm": 3.461280584335327,
"learning_rate": 0.00013793103448275863,
"loss": 0.163,
"step": 60
},
{
"epoch": 0.8045977011494253,
"grad_norm": 0.8741048574447632,
"learning_rate": 0.00016091954022988506,
"loss": 0.1625,
"step": 70
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.9238091111183167,
"learning_rate": 0.0001839080459770115,
"loss": 0.1292,
"step": 80
},
{
"epoch": 1.0344827586206897,
"grad_norm": 1.1964523792266846,
"learning_rate": 0.00019999835873288997,
"loss": 0.1137,
"step": 90
},
{
"epoch": 1.1494252873563218,
"grad_norm": 0.6014072895050049,
"learning_rate": 0.0001999691821496584,
"loss": 0.1066,
"step": 100
},
{
"epoch": 1.264367816091954,
"grad_norm": 0.6493586897850037,
"learning_rate": 0.00019990354521250803,
"loss": 0.1051,
"step": 110
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.6839510202407837,
"learning_rate": 0.00019980147186027586,
"loss": 0.0926,
"step": 120
},
{
"epoch": 1.4942528735632183,
"grad_norm": 0.5401099920272827,
"learning_rate": 0.00019966299932074023,
"loss": 0.0895,
"step": 130
},
{
"epoch": 1.6091954022988506,
"grad_norm": 1.1563774347305298,
"learning_rate": 0.000199488178097043,
"loss": 0.0915,
"step": 140
},
{
"epoch": 1.7241379310344827,
"grad_norm": 2.1474926471710205,
"learning_rate": 0.00019927707194927066,
"loss": 0.0853,
"step": 150
},
{
"epoch": 1.839080459770115,
"grad_norm": 0.7267495393753052,
"learning_rate": 0.00019902975787119956,
"loss": 0.0873,
"step": 160
},
{
"epoch": 1.9540229885057472,
"grad_norm": 0.5461205244064331,
"learning_rate": 0.00019874632606221545,
"loss": 0.0739,
"step": 170
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.43381860852241516,
"learning_rate": 0.00019842687989441604,
"loss": 0.0682,
"step": 180
},
{
"epoch": 2.1839080459770113,
"grad_norm": 0.619968593120575,
"learning_rate": 0.00019807153587490963,
"loss": 0.0725,
"step": 190
},
{
"epoch": 2.2988505747126435,
"grad_norm": 0.5531813502311707,
"learning_rate": 0.00019768042360332325,
"loss": 0.0649,
"step": 200
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.4325454831123352,
"learning_rate": 0.00019725368572453539,
"loss": 0.0629,
"step": 210
},
{
"epoch": 2.528735632183908,
"grad_norm": 1.132866382598877,
"learning_rate": 0.00019679147787665126,
"loss": 0.0597,
"step": 220
},
{
"epoch": 2.6436781609195403,
"grad_norm": 0.5158783793449402,
"learning_rate": 0.00019629396863423911,
"loss": 0.0658,
"step": 230
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.5275442600250244,
"learning_rate": 0.0001957613394468484,
"loss": 0.0624,
"step": 240
},
{
"epoch": 2.873563218390805,
"grad_norm": 0.26212960481643677,
"learning_rate": 0.0001951937845728321,
"loss": 0.0565,
"step": 250
},
{
"epoch": 2.9885057471264367,
"grad_norm": 0.4064450263977051,
"learning_rate": 0.00019459151100849784,
"loss": 0.0586,
"step": 260
},
{
"epoch": 3.103448275862069,
"grad_norm": 0.497156023979187,
"learning_rate": 0.0001939547384126128,
"loss": 0.0569,
"step": 270
},
{
"epoch": 3.218390804597701,
"grad_norm": 0.23238833248615265,
"learning_rate": 0.00019328369902629083,
"loss": 0.048,
"step": 280
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.33057811856269836,
"learning_rate": 0.00019257863758829035,
"loss": 0.0508,
"step": 290
},
{
"epoch": 3.4482758620689653,
"grad_norm": 0.2923976182937622,
"learning_rate": 0.00019183981124575418,
"loss": 0.059,
"step": 300
},
{
"epoch": 3.5632183908045976,
"grad_norm": 0.40444961190223694,
"learning_rate": 0.00019106748946042407,
"loss": 0.0589,
"step": 310
},
{
"epoch": 3.67816091954023,
"grad_norm": 0.3618530333042145,
"learning_rate": 0.00019026195391036338,
"loss": 0.0493,
"step": 320
},
{
"epoch": 3.793103448275862,
"grad_norm": 0.2655580937862396,
"learning_rate": 0.00018942349838722486,
"loss": 0.0502,
"step": 330
},
{
"epoch": 3.9080459770114944,
"grad_norm": 0.30788642168045044,
"learning_rate": 0.0001885524286891002,
"loss": 0.0562,
"step": 340
},
{
"epoch": 4.022988505747127,
"grad_norm": 0.3801023066043854,
"learning_rate": 0.00018764906250899027,
"loss": 0.0536,
"step": 350
},
{
"epoch": 4.137931034482759,
"grad_norm": 0.34299996495246887,
"learning_rate": 0.00018671372931893773,
"loss": 0.0583,
"step": 360
},
{
"epoch": 4.252873563218391,
"grad_norm": 0.5825142860412598,
"learning_rate": 0.0001857467702498633,
"loss": 0.0465,
"step": 370
},
{
"epoch": 4.3678160919540225,
"grad_norm": 0.46258264780044556,
"learning_rate": 0.0001847485379671496,
"loss": 0.0469,
"step": 380
},
{
"epoch": 4.482758620689655,
"grad_norm": 0.23550616204738617,
"learning_rate": 0.0001837193965420188,
"loss": 0.0477,
"step": 390
},
{
"epoch": 4.597701149425287,
"grad_norm": 0.609255850315094,
"learning_rate": 0.00018265972131874987,
"loss": 0.0467,
"step": 400
},
{
"epoch": 4.712643678160919,
"grad_norm": 0.3701482117176056,
"learning_rate": 0.00018156989877778461,
"loss": 0.0471,
"step": 410
},
{
"epoch": 4.827586206896552,
"grad_norm": 0.4651474356651306,
"learning_rate": 0.00018045032639477194,
"loss": 0.0434,
"step": 420
},
{
"epoch": 4.942528735632184,
"grad_norm": 0.34028705954551697,
"learning_rate": 0.00017930141249560233,
"loss": 0.0452,
"step": 430
},
{
"epoch": 5.057471264367816,
"grad_norm": 0.2748933732509613,
"learning_rate": 0.00017812357610748488,
"loss": 0.0413,
"step": 440
},
{
"epoch": 5.172413793103448,
"grad_norm": 0.4612014591693878,
"learning_rate": 0.00017691724680612118,
"loss": 0.0423,
"step": 450
},
{
"epoch": 5.287356321839081,
"grad_norm": 0.37961891293525696,
"learning_rate": 0.00017568286455903258,
"loss": 0.0422,
"step": 460
},
{
"epoch": 5.402298850574713,
"grad_norm": 0.3245999813079834,
"learning_rate": 0.00017442087956509665,
"loss": 0.0396,
"step": 470
},
{
"epoch": 5.517241379310345,
"grad_norm": 0.5230941772460938,
"learning_rate": 0.00017313175209035268,
"loss": 0.0405,
"step": 480
},
{
"epoch": 5.6321839080459775,
"grad_norm": 0.2870311737060547,
"learning_rate": 0.00017181595230013525,
"loss": 0.0343,
"step": 490
},
{
"epoch": 5.747126436781609,
"grad_norm": 0.2876773774623871,
"learning_rate": 0.00017047396008759754,
"loss": 0.0436,
"step": 500
},
{
"epoch": 5.862068965517241,
"grad_norm": 0.4095667898654938,
"learning_rate": 0.00016910626489868649,
"loss": 0.0408,
"step": 510
},
{
"epoch": 5.977011494252873,
"grad_norm": 0.377605140209198,
"learning_rate": 0.00016771336555363418,
"loss": 0.0415,
"step": 520
},
{
"epoch": 6.091954022988506,
"grad_norm": 0.28248798847198486,
"learning_rate": 0.00016629577006503009,
"loss": 0.0386,
"step": 530
},
{
"epoch": 6.206896551724138,
"grad_norm": 0.36199840903282166,
"learning_rate": 0.0001648539954525409,
"loss": 0.0405,
"step": 540
},
{
"epoch": 6.32183908045977,
"grad_norm": 0.2778664529323578,
"learning_rate": 0.00016338856755434503,
"loss": 0.0359,
"step": 550
},
{
"epoch": 6.436781609195402,
"grad_norm": 0.23418012261390686,
"learning_rate": 0.00016190002083535122,
"loss": 0.0382,
"step": 560
},
{
"epoch": 6.551724137931035,
"grad_norm": 0.3027312457561493,
"learning_rate": 0.00016038889819227045,
"loss": 0.0394,
"step": 570
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.2858007550239563,
"learning_rate": 0.00015885575075561326,
"loss": 0.042,
"step": 580
},
{
"epoch": 6.781609195402299,
"grad_norm": 0.2762337923049927,
"learning_rate": 0.00015730113768868312,
"loss": 0.039,
"step": 590
},
{
"epoch": 6.896551724137931,
"grad_norm": 0.40732237696647644,
"learning_rate": 0.0001557256259836412,
"loss": 0.0404,
"step": 600
},
{
"epoch": 7.011494252873563,
"grad_norm": 0.36847805976867676,
"learning_rate": 0.00015412979025471488,
"loss": 0.0368,
"step": 610
},
{
"epoch": 7.126436781609195,
"grad_norm": 0.2492237538099289,
"learning_rate": 0.00015251421252862707,
"loss": 0.0336,
"step": 620
},
{
"epoch": 7.241379310344827,
"grad_norm": 0.2626156210899353,
"learning_rate": 0.00015087948203232156,
"loss": 0.0352,
"step": 630
},
{
"epoch": 7.35632183908046,
"grad_norm": 0.6365396976470947,
"learning_rate": 0.00014922619497806277,
"loss": 0.0342,
"step": 640
},
{
"epoch": 7.471264367816092,
"grad_norm": 0.3000635802745819,
"learning_rate": 0.00014755495434598745,
"loss": 0.037,
"step": 650
},
{
"epoch": 7.586206896551724,
"grad_norm": 0.21869853138923645,
"learning_rate": 0.0001458663696641884,
"loss": 0.0365,
"step": 660
},
{
"epoch": 7.7011494252873565,
"grad_norm": 0.22284150123596191,
"learning_rate": 0.0001441610567864096,
"loss": 0.035,
"step": 670
},
{
"epoch": 7.816091954022989,
"grad_norm": 0.267621248960495,
"learning_rate": 0.00014243963766743495,
"loss": 0.029,
"step": 680
},
{
"epoch": 7.931034482758621,
"grad_norm": 0.2817297875881195,
"learning_rate": 0.00014070274013625096,
"loss": 0.0303,
"step": 690
},
{
"epoch": 8.045977011494253,
"grad_norm": 0.3712492287158966,
"learning_rate": 0.00013895099766706784,
"loss": 0.0297,
"step": 700
},
{
"epoch": 8.160919540229886,
"grad_norm": 0.4549995958805084,
"learning_rate": 0.00013718504914828135,
"loss": 0.033,
"step": 710
},
{
"epoch": 8.275862068965518,
"grad_norm": 0.28695234656333923,
"learning_rate": 0.00013540553864945976,
"loss": 0.0306,
"step": 720
},
{
"epoch": 8.39080459770115,
"grad_norm": 0.34577062726020813,
"learning_rate": 0.00013361311518644172,
"loss": 0.0325,
"step": 730
},
{
"epoch": 8.505747126436782,
"grad_norm": 0.3214464783668518,
"learning_rate": 0.00013180843248462973,
"loss": 0.0337,
"step": 740
},
{
"epoch": 8.620689655172415,
"grad_norm": 0.33310961723327637,
"learning_rate": 0.00012999214874056595,
"loss": 0.0344,
"step": 750
},
{
"epoch": 8.735632183908045,
"grad_norm": 0.25606226921081543,
"learning_rate": 0.00012816492638187762,
"loss": 0.0396,
"step": 760
},
{
"epoch": 8.850574712643677,
"grad_norm": 0.36330148577690125,
"learning_rate": 0.00012632743182567905,
"loss": 0.0348,
"step": 770
},
{
"epoch": 8.96551724137931,
"grad_norm": 0.314394474029541,
"learning_rate": 0.00012448033523551865,
"loss": 0.0399,
"step": 780
},
{
"epoch": 9.080459770114942,
"grad_norm": 0.28129351139068604,
"learning_rate": 0.00012262431027695964,
"loss": 0.0298,
"step": 790
},
{
"epoch": 9.195402298850574,
"grad_norm": 0.256881982088089,
"learning_rate": 0.00012076003387188353,
"loss": 0.0292,
"step": 800
},
{
"epoch": 9.310344827586206,
"grad_norm": 0.1919921189546585,
"learning_rate": 0.00011888818595160584,
"loss": 0.0269,
"step": 810
},
{
"epoch": 9.425287356321839,
"grad_norm": 0.2719796895980835,
"learning_rate": 0.00011700944920889436,
"loss": 0.0265,
"step": 820
},
{
"epoch": 9.540229885057471,
"grad_norm": 0.2269754707813263,
"learning_rate": 0.00011512450884898022,
"loss": 0.0316,
"step": 830
},
{
"epoch": 9.655172413793103,
"grad_norm": 0.23504453897476196,
"learning_rate": 0.00011323405233965256,
"loss": 0.0273,
"step": 840
},
{
"epoch": 9.770114942528735,
"grad_norm": 0.22570957243442535,
"learning_rate": 0.00011133876916052821,
"loss": 0.0304,
"step": 850
},
{
"epoch": 9.885057471264368,
"grad_norm": 0.19824576377868652,
"learning_rate": 0.00010943935055158734,
"loss": 0.0283,
"step": 860
},
{
"epoch": 10.0,
"grad_norm": 0.41852012276649475,
"learning_rate": 0.00010753648926106723,
"loss": 0.0319,
"step": 870
},
{
"epoch": 10.114942528735632,
"grad_norm": 0.20548714697360992,
"learning_rate": 0.00010563087929280613,
"loss": 0.0285,
"step": 880
},
{
"epoch": 10.229885057471265,
"grad_norm": 0.22767336666584015,
"learning_rate": 0.00010372321565312872,
"loss": 0.031,
"step": 890
},
{
"epoch": 10.344827586206897,
"grad_norm": 0.20542040467262268,
"learning_rate": 0.00010181419409736647,
"loss": 0.0316,
"step": 900
},
{
"epoch": 10.459770114942529,
"grad_norm": 0.3105849027633667,
"learning_rate": 9.990451087610448e-05,
"loss": 0.027,
"step": 910
},
{
"epoch": 10.574712643678161,
"grad_norm": 0.31816890835762024,
"learning_rate": 9.799486248124775e-05,
"loss": 0.025,
"step": 920
},
{
"epoch": 10.689655172413794,
"grad_norm": 0.3295416235923767,
"learning_rate": 9.608594539199957e-05,
"loss": 0.0247,
"step": 930
},
{
"epoch": 10.804597701149426,
"grad_norm": 0.17071272432804108,
"learning_rate": 9.417845582084448e-05,
"loss": 0.0291,
"step": 940
},
{
"epoch": 10.919540229885058,
"grad_norm": 0.189552441239357,
"learning_rate": 9.227308945962827e-05,
"loss": 0.0243,
"step": 950
},
{
"epoch": 11.03448275862069,
"grad_norm": 0.3179641664028168,
"learning_rate": 9.037054122582839e-05,
"loss": 0.0308,
"step": 960
},
{
"epoch": 11.149425287356323,
"grad_norm": 0.3051457703113556,
"learning_rate": 8.847150500910618e-05,
"loss": 0.0275,
"step": 970
},
{
"epoch": 11.264367816091955,
"grad_norm": 0.29757606983184814,
"learning_rate": 8.657667341823448e-05,
"loss": 0.0264,
"step": 980
},
{
"epoch": 11.379310344827585,
"grad_norm": 0.2845855951309204,
"learning_rate": 8.4686737528492e-05,
"loss": 0.0249,
"step": 990
},
{
"epoch": 11.494252873563218,
"grad_norm": 0.239737406373024,
"learning_rate": 8.280238662961728e-05,
"loss": 0.027,
"step": 1000
},
{
"epoch": 11.60919540229885,
"grad_norm": 0.2692360281944275,
"learning_rate": 8.092430797441364e-05,
"loss": 0.0216,
"step": 1010
},
{
"epoch": 11.724137931034482,
"grad_norm": 0.18495500087738037,
"learning_rate": 7.905318652809728e-05,
"loss": 0.0255,
"step": 1020
},
{
"epoch": 11.839080459770114,
"grad_norm": 0.2230875939130783,
"learning_rate": 7.718970471847923e-05,
"loss": 0.0262,
"step": 1030
},
{
"epoch": 11.954022988505747,
"grad_norm": 0.14376775920391083,
"learning_rate": 7.53345421870735e-05,
"loss": 0.0209,
"step": 1040
},
{
"epoch": 12.068965517241379,
"grad_norm": 0.20623371005058289,
"learning_rate": 7.348837554122057e-05,
"loss": 0.0192,
"step": 1050
},
{
"epoch": 12.183908045977011,
"grad_norm": 0.27209600806236267,
"learning_rate": 7.165187810731823e-05,
"loss": 0.0208,
"step": 1060
},
{
"epoch": 12.298850574712644,
"grad_norm": 0.19447851181030273,
"learning_rate": 6.982571968524847e-05,
"loss": 0.0201,
"step": 1070
},
{
"epoch": 12.413793103448276,
"grad_norm": 0.18613241612911224,
"learning_rate": 6.801056630409098e-05,
"loss": 0.0248,
"step": 1080
},
{
"epoch": 12.528735632183908,
"grad_norm": 0.24156583845615387,
"learning_rate": 6.620707997921157e-05,
"loss": 0.0197,
"step": 1090
},
{
"epoch": 12.64367816091954,
"grad_norm": 0.16912145912647247,
"learning_rate": 6.441591847081476e-05,
"loss": 0.022,
"step": 1100
},
{
"epoch": 12.758620689655173,
"grad_norm": 0.14165754616260529,
"learning_rate": 6.263773504404801e-05,
"loss": 0.0199,
"step": 1110
},
{
"epoch": 12.873563218390805,
"grad_norm": 0.3424724340438843,
"learning_rate": 6.087317823074565e-05,
"loss": 0.0209,
"step": 1120
},
{
"epoch": 12.988505747126437,
"grad_norm": 0.2658204138278961,
"learning_rate": 5.912289159289883e-05,
"loss": 0.0242,
"step": 1130
},
{
"epoch": 13.10344827586207,
"grad_norm": 0.21321730315685272,
"learning_rate": 5.7387513487938386e-05,
"loss": 0.0216,
"step": 1140
},
{
"epoch": 13.218390804597702,
"grad_norm": 0.2854823172092438,
"learning_rate": 5.566767683591553e-05,
"loss": 0.0227,
"step": 1150
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.28919658064842224,
"learning_rate": 5.396400888866601e-05,
"loss": 0.0195,
"step": 1160
},
{
"epoch": 13.448275862068966,
"grad_norm": 0.22510255873203278,
"learning_rate": 5.2277131001041125e-05,
"loss": 0.0241,
"step": 1170
},
{
"epoch": 13.563218390804598,
"grad_norm": 0.21545900404453278,
"learning_rate": 5.060765840429e-05,
"loss": 0.023,
"step": 1180
},
{
"epoch": 13.678160919540229,
"grad_norm": 0.20618782937526703,
"learning_rate": 4.8956199981674656e-05,
"loss": 0.0181,
"step": 1190
},
{
"epoch": 13.793103448275861,
"grad_norm": 0.22331970930099487,
"learning_rate": 4.7323358046400844e-05,
"loss": 0.0212,
"step": 1200
},
{
"epoch": 13.908045977011493,
"grad_norm": 0.14791706204414368,
"learning_rate": 4.570972812194457e-05,
"loss": 0.0195,
"step": 1210
},
{
"epoch": 14.022988505747126,
"grad_norm": 0.1526448130607605,
"learning_rate": 4.4115898724855455e-05,
"loss": 0.0188,
"step": 1220
},
{
"epoch": 14.137931034482758,
"grad_norm": 0.18956783413887024,
"learning_rate": 4.254245115011506e-05,
"loss": 0.0188,
"step": 1230
},
{
"epoch": 14.25287356321839,
"grad_norm": 0.1313301920890808,
"learning_rate": 4.098995925912972e-05,
"loss": 0.019,
"step": 1240
},
{
"epoch": 14.367816091954023,
"grad_norm": 0.13764789700508118,
"learning_rate": 3.945898927043372e-05,
"loss": 0.0175,
"step": 1250
},
{
"epoch": 14.482758620689655,
"grad_norm": 0.19556942582130432,
"learning_rate": 3.7950099553180804e-05,
"loss": 0.0196,
"step": 1260
},
{
"epoch": 14.597701149425287,
"grad_norm": 0.14027345180511475,
"learning_rate": 3.646384042349764e-05,
"loss": 0.0177,
"step": 1270
},
{
"epoch": 14.71264367816092,
"grad_norm": 0.2918284833431244,
"learning_rate": 3.500075394377511e-05,
"loss": 0.0204,
"step": 1280
},
{
"epoch": 14.827586206896552,
"grad_norm": 0.12948164343833923,
"learning_rate": 3.3561373724969224e-05,
"loss": 0.0188,
"step": 1290
},
{
"epoch": 14.942528735632184,
"grad_norm": 0.1773224174976349,
"learning_rate": 3.214622473198492e-05,
"loss": 0.0212,
"step": 1300
},
{
"epoch": 15.057471264367816,
"grad_norm": 0.29863160848617554,
"learning_rate": 3.075582309221289e-05,
"loss": 0.0157,
"step": 1310
},
{
"epoch": 15.172413793103448,
"grad_norm": 0.18764474987983704,
"learning_rate": 2.939067590728972e-05,
"loss": 0.0175,
"step": 1320
},
{
"epoch": 15.28735632183908,
"grad_norm": 0.16273990273475647,
"learning_rate": 2.8051281068149803e-05,
"loss": 0.0135,
"step": 1330
},
{
"epoch": 15.402298850574713,
"grad_norm": 0.25088945031166077,
"learning_rate": 2.673812707343669e-05,
"loss": 0.0242,
"step": 1340
},
{
"epoch": 15.517241379310345,
"grad_norm": 0.25521960854530334,
"learning_rate": 2.545169285133965e-05,
"loss": 0.0188,
"step": 1350
},
{
"epoch": 15.632183908045977,
"grad_norm": 0.15780223906040192,
"learning_rate": 2.4192447584921195e-05,
"loss": 0.0194,
"step": 1360
},
{
"epoch": 15.74712643678161,
"grad_norm": 0.13658417761325836,
"learning_rate": 2.296085054099828e-05,
"loss": 0.0179,
"step": 1370
},
{
"epoch": 15.862068965517242,
"grad_norm": 0.14593394100666046,
"learning_rate": 2.175735090264058e-05,
"loss": 0.016,
"step": 1380
},
{
"epoch": 15.977011494252874,
"grad_norm": 0.20093883574008942,
"learning_rate": 2.0582387605346088e-05,
"loss": 0.0157,
"step": 1390
},
{
"epoch": 16.091954022988507,
"grad_norm": 0.22261527180671692,
"learning_rate": 1.943638917695453e-05,
"loss": 0.0175,
"step": 1400
},
{
"epoch": 16.20689655172414,
"grad_norm": 0.17486433684825897,
"learning_rate": 1.831977358135625e-05,
"loss": 0.0166,
"step": 1410
},
{
"epoch": 16.32183908045977,
"grad_norm": 0.2138216346502304,
"learning_rate": 1.723294806605428e-05,
"loss": 0.0146,
"step": 1420
},
{
"epoch": 16.436781609195403,
"grad_norm": 0.20112960040569305,
"learning_rate": 1.6176309013634517e-05,
"loss": 0.0159,
"step": 1430
},
{
"epoch": 16.551724137931036,
"grad_norm": 0.15377485752105713,
"learning_rate": 1.5150241797198883e-05,
"loss": 0.016,
"step": 1440
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.23132722079753876,
"learning_rate": 1.415512063981339e-05,
"loss": 0.0134,
"step": 1450
},
{
"epoch": 16.7816091954023,
"grad_norm": 0.15262600779533386,
"learning_rate": 1.3191308478023212e-05,
"loss": 0.017,
"step": 1460
},
{
"epoch": 16.896551724137932,
"grad_norm": 0.0991855040192604,
"learning_rate": 1.2259156829483842e-05,
"loss": 0.0167,
"step": 1470
},
{
"epoch": 17.011494252873565,
"grad_norm": 0.12278055399656296,
"learning_rate": 1.1359005664756994e-05,
"loss": 0.0146,
"step": 1480
},
{
"epoch": 17.126436781609197,
"grad_norm": 0.17124158143997192,
"learning_rate": 1.0491183283317997e-05,
"loss": 0.0173,
"step": 1490
},
{
"epoch": 17.24137931034483,
"grad_norm": 0.1300356686115265,
"learning_rate": 9.656006193819633e-06,
"loss": 0.0143,
"step": 1500
},
{
"epoch": 17.35632183908046,
"grad_norm": 0.17946338653564453,
"learning_rate": 8.853778998656537e-06,
"loss": 0.0154,
"step": 1510
},
{
"epoch": 17.47126436781609,
"grad_norm": 0.28736400604248047,
"learning_rate": 8.084794282871689e-06,
"loss": 0.0166,
"step": 1520
},
{
"epoch": 17.586206896551722,
"grad_norm": 0.13112574815750122,
"learning_rate": 7.3493325074461165e-06,
"loss": 0.0132,
"step": 1530
},
{
"epoch": 17.701149425287355,
"grad_norm": 0.12864838540554047,
"learning_rate": 6.647661907010183e-06,
"loss": 0.0171,
"step": 1540
},
{
"epoch": 17.816091954022987,
"grad_norm": 0.16958807408809662,
"learning_rate": 5.980038392014309e-06,
"loss": 0.0161,
"step": 1550
},
{
"epoch": 17.93103448275862,
"grad_norm": 0.36121729016304016,
"learning_rate": 5.3467054553941405e-06,
"loss": 0.0158,
"step": 1560
},
{
"epoch": 18.04597701149425,
"grad_norm": 0.2107989490032196,
"learning_rate": 4.7478940837649924e-06,
"loss": 0.0147,
"step": 1570
},
{
"epoch": 18.160919540229884,
"grad_norm": 0.15654149651527405,
"learning_rate": 4.183822673177229e-06,
"loss": 0.0164,
"step": 1580
},
{
"epoch": 18.275862068965516,
"grad_norm": 0.1438828557729721,
"learning_rate": 3.6546969494637986e-06,
"loss": 0.0131,
"step": 1590
},
{
"epoch": 18.39080459770115,
"grad_norm": 0.2543192207813263,
"learning_rate": 3.16070989320868e-06,
"loss": 0.0157,
"step": 1600
},
{
"epoch": 18.50574712643678,
"grad_norm": 0.13453112542629242,
"learning_rate": 2.702041669363875e-06,
"loss": 0.017,
"step": 1610
},
{
"epoch": 18.620689655172413,
"grad_norm": 0.16369780898094177,
"learning_rate": 2.2788595615403474e-06,
"loss": 0.0157,
"step": 1620
},
{
"epoch": 18.735632183908045,
"grad_norm": 0.14639818668365479,
"learning_rate": 1.8913179109969482e-06,
"loss": 0.0122,
"step": 1630
},
{
"epoch": 18.850574712643677,
"grad_norm": 0.23813354969024658,
"learning_rate": 1.5395580603498328e-06,
"loss": 0.0157,
"step": 1640
},
{
"epoch": 18.96551724137931,
"grad_norm": 0.15577834844589233,
"learning_rate": 1.2237083020224526e-06,
"loss": 0.0144,
"step": 1650
},
{
"epoch": 19.080459770114942,
"grad_norm": 0.06880059838294983,
"learning_rate": 9.438838314553056e-07,
"loss": 0.0109,
"step": 1660
},
{
"epoch": 19.195402298850574,
"grad_norm": 0.19819270074367523,
"learning_rate": 7.001867050923095e-07,
"loss": 0.0134,
"step": 1670
},
{
"epoch": 19.310344827586206,
"grad_norm": 0.10673543065786362,
"learning_rate": 4.92705803159188e-07,
"loss": 0.0155,
"step": 1680
},
{
"epoch": 19.42528735632184,
"grad_norm": 0.16529639065265656,
"learning_rate": 3.2151679724748975e-07,
"loss": 0.0175,
"step": 1690
},
{
"epoch": 19.54022988505747,
"grad_norm": 0.1206677109003067,
"learning_rate": 1.8668212271585327e-07,
"loss": 0.0188,
"step": 1700
},
{
"epoch": 19.655172413793103,
"grad_norm": 0.10180158913135529,
"learning_rate": 8.825095591891152e-08,
"loss": 0.0158,
"step": 1710
},
{
"epoch": 19.770114942528735,
"grad_norm": 0.14544282853603363,
"learning_rate": 2.625919627188278e-08,
"loss": 0.015,
"step": 1720
},
{
"epoch": 19.885057471264368,
"grad_norm": 0.14054107666015625,
"learning_rate": 7.294531574553176e-10,
"loss": 0.0139,
"step": 1730
},
{
"epoch": 19.908045977011493,
"step": 1732,
"total_flos": 2.431760592612004e+17,
"train_loss": 0.04785273781403314,
"train_runtime": 1980.9382,
"train_samples_per_second": 55.957,
"train_steps_per_second": 0.874
}
],
"logging_steps": 10,
"max_steps": 1732,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.431760592612004e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}
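
A minimal sketch of how the state above can be inspected, assuming the JSON is saved locally as trainer_state.json (only the field names that actually appear above are relied on; the script itself is illustrative, not part of the original run):

# Load the trainer state and summarize the logged training curve.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Intermediate entries log loss, learning_rate and grad_norm every 10 steps
# (logging_steps above); the final entry carries the run-level summary instead.
logs = [e for e in state["log_history"] if "loss" in e]
summary = state["log_history"][-1]

print(f"logged points: {len(logs)}")
print(f"first/last loss: {logs[0]['loss']:.4f} -> {logs[-1]['loss']:.4f}")
print(f"mean train loss: {summary['train_loss']:.4f} over {summary['step']} steps")

Run against the values recorded above, this would report the loss falling from 1.1691 at step 10 to 0.0139 at step 1730, with an average train_loss of 0.0479 over 1732 steps.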