zheminh's picture
Add files using upload-large-folder tool
823513a verified
raw
history blame contribute delete
49.7 kB
{
"best_metric": 0.40588063,
"best_model_checkpoint": "/home/ubuntu/output/v0-20250315-052746/checkpoint-800",
"epoch": 0.9549388242315726,
"eval_steps": 100,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001193673530289466,
"grad_norm": 22.298568401591066,
"learning_rate": 7.936507936507937e-08,
"loss": 1.0442615747451782,
"memory(GiB)": 30.7,
"step": 1,
"token_acc": 0.7699836867862969,
"train_speed(iter/s)": 0.093757
},
{
"epoch": 0.005968367651447329,
"grad_norm": 19.63376949197587,
"learning_rate": 3.9682539682539683e-07,
"loss": 0.9844925403594971,
"memory(GiB)": 36.92,
"step": 5,
"token_acc": 0.7549315386400557,
"train_speed(iter/s)": 0.149789
},
{
"epoch": 0.011936735302894658,
"grad_norm": 16.540823173599176,
"learning_rate": 7.936507936507937e-07,
"loss": 0.9816521644592285,
"memory(GiB)": 36.92,
"step": 10,
"token_acc": 0.7844917012448133,
"train_speed(iter/s)": 0.162277
},
{
"epoch": 0.017905102954341987,
"grad_norm": 7.069008652065498,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.8435724258422852,
"memory(GiB)": 36.92,
"step": 15,
"token_acc": 0.7823455233291299,
"train_speed(iter/s)": 0.176636
},
{
"epoch": 0.023873470605789315,
"grad_norm": 5.623338707216517,
"learning_rate": 1.5873015873015873e-06,
"loss": 0.7347106456756591,
"memory(GiB)": 36.92,
"step": 20,
"token_acc": 0.8210757409440176,
"train_speed(iter/s)": 0.177914
},
{
"epoch": 0.029841838257236644,
"grad_norm": 4.003256651554691,
"learning_rate": 1.984126984126984e-06,
"loss": 0.654999828338623,
"memory(GiB)": 36.92,
"step": 25,
"token_acc": 0.7765267826680314,
"train_speed(iter/s)": 0.183726
},
{
"epoch": 0.03581020590868397,
"grad_norm": 3.4287088973952624,
"learning_rate": 2.380952380952381e-06,
"loss": 0.5874819755554199,
"memory(GiB)": 36.92,
"step": 30,
"token_acc": 0.8138706921105098,
"train_speed(iter/s)": 0.184018
},
{
"epoch": 0.0417785735601313,
"grad_norm": 2.8169959397852256,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.6081454753875732,
"memory(GiB)": 36.92,
"step": 35,
"token_acc": 0.81794500723589,
"train_speed(iter/s)": 0.183529
},
{
"epoch": 0.04774694121157863,
"grad_norm": 3.325157673054176,
"learning_rate": 3.1746031746031746e-06,
"loss": 0.5828543663024902,
"memory(GiB)": 36.92,
"step": 40,
"token_acc": 0.8394230769230769,
"train_speed(iter/s)": 0.184431
},
{
"epoch": 0.05371530886302596,
"grad_norm": 2.7879997870731166,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.5575875282287598,
"memory(GiB)": 36.92,
"step": 45,
"token_acc": 0.8293939393939394,
"train_speed(iter/s)": 0.187841
},
{
"epoch": 0.05968367651447329,
"grad_norm": 2.778252157541432,
"learning_rate": 3.968253968253968e-06,
"loss": 0.5551129341125488,
"memory(GiB)": 36.92,
"step": 50,
"token_acc": 0.8260184559981995,
"train_speed(iter/s)": 0.189026
},
{
"epoch": 0.06565204416592062,
"grad_norm": 3.0281694802961816,
"learning_rate": 4.365079365079366e-06,
"loss": 0.5619981765747071,
"memory(GiB)": 36.92,
"step": 55,
"token_acc": 0.8104107766505904,
"train_speed(iter/s)": 0.188889
},
{
"epoch": 0.07162041181736795,
"grad_norm": 3.102265765306523,
"learning_rate": 4.761904761904762e-06,
"loss": 0.5332321166992188,
"memory(GiB)": 36.92,
"step": 60,
"token_acc": 0.8470640768028578,
"train_speed(iter/s)": 0.189797
},
{
"epoch": 0.07758877946881527,
"grad_norm": 2.8144373694536444,
"learning_rate": 5.15873015873016e-06,
"loss": 0.5349865436553956,
"memory(GiB)": 36.92,
"step": 65,
"token_acc": 0.8722838137472284,
"train_speed(iter/s)": 0.190181
},
{
"epoch": 0.0835571471202626,
"grad_norm": 3.337016219899744,
"learning_rate": 5.555555555555557e-06,
"loss": 0.5452562808990479,
"memory(GiB)": 36.92,
"step": 70,
"token_acc": 0.8432369942196531,
"train_speed(iter/s)": 0.189336
},
{
"epoch": 0.08952551477170993,
"grad_norm": 3.06136632501639,
"learning_rate": 5.9523809523809525e-06,
"loss": 0.49980897903442384,
"memory(GiB)": 36.92,
"step": 75,
"token_acc": 0.8338361568809468,
"train_speed(iter/s)": 0.190084
},
{
"epoch": 0.09549388242315726,
"grad_norm": 2.4877794848384633,
"learning_rate": 6.349206349206349e-06,
"loss": 0.539669418334961,
"memory(GiB)": 36.92,
"step": 80,
"token_acc": 0.8455654331197023,
"train_speed(iter/s)": 0.189397
},
{
"epoch": 0.10146225007460459,
"grad_norm": 3.221405214526254,
"learning_rate": 6.746031746031747e-06,
"loss": 0.5160573959350586,
"memory(GiB)": 36.92,
"step": 85,
"token_acc": 0.8462370242214533,
"train_speed(iter/s)": 0.190557
},
{
"epoch": 0.10743061772605192,
"grad_norm": 3.2868066256101085,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.4913814067840576,
"memory(GiB)": 36.92,
"step": 90,
"token_acc": 0.8452227659026526,
"train_speed(iter/s)": 0.190372
},
{
"epoch": 0.11339898537749925,
"grad_norm": 3.051253318495505,
"learning_rate": 7.53968253968254e-06,
"loss": 0.49474325180053713,
"memory(GiB)": 36.92,
"step": 95,
"token_acc": 0.8567275747508306,
"train_speed(iter/s)": 0.190208
},
{
"epoch": 0.11936735302894658,
"grad_norm": 2.912617922195808,
"learning_rate": 7.936507936507936e-06,
"loss": 0.4725308418273926,
"memory(GiB)": 36.92,
"step": 100,
"token_acc": 0.8608458390177354,
"train_speed(iter/s)": 0.190477
},
{
"epoch": 0.11936735302894658,
"eval_loss": 0.45056208968162537,
"eval_runtime": 10.9299,
"eval_samples_per_second": 24.611,
"eval_steps_per_second": 3.111,
"eval_token_acc": 0.8447598692022433,
"step": 100
},
{
"epoch": 0.12533572068039392,
"grad_norm": 2.8012333103883647,
"learning_rate": 8.333333333333334e-06,
"loss": 0.5016227722167969,
"memory(GiB)": 36.92,
"step": 105,
"token_acc": 0.842248243559719,
"train_speed(iter/s)": 0.17126
},
{
"epoch": 0.13130408833184123,
"grad_norm": 2.912876340051396,
"learning_rate": 8.730158730158731e-06,
"loss": 0.518134593963623,
"memory(GiB)": 36.92,
"step": 110,
"token_acc": 0.8509636604384287,
"train_speed(iter/s)": 0.172461
},
{
"epoch": 0.13727245598328858,
"grad_norm": 3.3516293509261117,
"learning_rate": 9.126984126984127e-06,
"loss": 0.5215555191040039,
"memory(GiB)": 36.92,
"step": 115,
"token_acc": 0.8062340503098797,
"train_speed(iter/s)": 0.172888
},
{
"epoch": 0.1432408236347359,
"grad_norm": 3.137187651305806,
"learning_rate": 9.523809523809525e-06,
"loss": 0.4938058853149414,
"memory(GiB)": 36.92,
"step": 120,
"token_acc": 0.8689788053949904,
"train_speed(iter/s)": 0.173312
},
{
"epoch": 0.14920919128618323,
"grad_norm": 2.9778302322354255,
"learning_rate": 9.920634920634922e-06,
"loss": 0.47763543128967284,
"memory(GiB)": 36.92,
"step": 125,
"token_acc": 0.8223684210526315,
"train_speed(iter/s)": 0.173984
},
{
"epoch": 0.15517755893763055,
"grad_norm": 2.454176368796001,
"learning_rate": 9.999930596405254e-06,
"loss": 0.5025428771972656,
"memory(GiB)": 36.92,
"step": 130,
"token_acc": 0.8584961515689757,
"train_speed(iter/s)": 0.17494
},
{
"epoch": 0.1611459265890779,
"grad_norm": 2.308752936530914,
"learning_rate": 9.999648647603774e-06,
"loss": 0.4561060905456543,
"memory(GiB)": 36.92,
"step": 135,
"token_acc": 0.8763222131814483,
"train_speed(iter/s)": 0.175709
},
{
"epoch": 0.1671142942405252,
"grad_norm": 3.2296962246861276,
"learning_rate": 9.999149828091632e-06,
"loss": 0.5205905437469482,
"memory(GiB)": 36.92,
"step": 140,
"token_acc": 0.8126463700234192,
"train_speed(iter/s)": 0.176096
},
{
"epoch": 0.17308266189197255,
"grad_norm": 2.8523931115518915,
"learning_rate": 9.998434159506211e-06,
"loss": 0.4669060230255127,
"memory(GiB)": 36.92,
"step": 145,
"token_acc": 0.8612167300380228,
"train_speed(iter/s)": 0.176748
},
{
"epoch": 0.17905102954341987,
"grad_norm": 2.689699959591659,
"learning_rate": 9.997501672891208e-06,
"loss": 0.4870173454284668,
"memory(GiB)": 36.92,
"step": 150,
"token_acc": 0.8250571369208394,
"train_speed(iter/s)": 0.177191
},
{
"epoch": 0.1850193971948672,
"grad_norm": 2.8278119407117517,
"learning_rate": 9.99635240869527e-06,
"loss": 0.47814245223999025,
"memory(GiB)": 36.92,
"step": 155,
"token_acc": 0.819718309859155,
"train_speed(iter/s)": 0.177751
},
{
"epoch": 0.19098776484631452,
"grad_norm": 3.0394183749078887,
"learning_rate": 9.99498641677025e-06,
"loss": 0.5187320232391357,
"memory(GiB)": 36.92,
"step": 160,
"token_acc": 0.8540501094624179,
"train_speed(iter/s)": 0.178072
},
{
"epoch": 0.19695613249776187,
"grad_norm": 2.4913040205595696,
"learning_rate": 9.993403756369037e-06,
"loss": 0.471418571472168,
"memory(GiB)": 36.92,
"step": 165,
"token_acc": 0.8507351108896087,
"train_speed(iter/s)": 0.178675
},
{
"epoch": 0.20292450014920918,
"grad_norm": 2.5344653445390937,
"learning_rate": 9.991604496142997e-06,
"loss": 0.5218185901641845,
"memory(GiB)": 36.92,
"step": 170,
"token_acc": 0.8207423580786026,
"train_speed(iter/s)": 0.179206
},
{
"epoch": 0.20889286780065652,
"grad_norm": 3.1475346404756737,
"learning_rate": 9.989588714138977e-06,
"loss": 0.4809536933898926,
"memory(GiB)": 36.92,
"step": 175,
"token_acc": 0.8179710144927537,
"train_speed(iter/s)": 0.179609
},
{
"epoch": 0.21486123545210384,
"grad_norm": 2.9443708297446927,
"learning_rate": 9.987356497795944e-06,
"loss": 0.5137897491455078,
"memory(GiB)": 36.92,
"step": 180,
"token_acc": 0.838412017167382,
"train_speed(iter/s)": 0.179743
},
{
"epoch": 0.22082960310355118,
"grad_norm": 2.3691979165448864,
"learning_rate": 9.984907943941164e-06,
"loss": 0.47942285537719725,
"memory(GiB)": 36.92,
"step": 185,
"token_acc": 0.8178681677864537,
"train_speed(iter/s)": 0.179758
},
{
"epoch": 0.2267979707549985,
"grad_norm": 2.7655733596568806,
"learning_rate": 9.98224315878603e-06,
"loss": 0.4850442886352539,
"memory(GiB)": 36.92,
"step": 190,
"token_acc": 0.8557343020238714,
"train_speed(iter/s)": 0.180113
},
{
"epoch": 0.23276633840644584,
"grad_norm": 2.9415771183832837,
"learning_rate": 9.979362257921428e-06,
"loss": 0.4999836921691895,
"memory(GiB)": 36.92,
"step": 195,
"token_acc": 0.851006381934217,
"train_speed(iter/s)": 0.180236
},
{
"epoch": 0.23873470605789315,
"grad_norm": 3.1074760910693797,
"learning_rate": 9.976265366312746e-06,
"loss": 0.5033563137054443,
"memory(GiB)": 36.92,
"step": 200,
"token_acc": 0.8293048128342246,
"train_speed(iter/s)": 0.180618
},
{
"epoch": 0.23873470605789315,
"eval_loss": 0.4415110647678375,
"eval_runtime": 10.926,
"eval_samples_per_second": 24.62,
"eval_steps_per_second": 3.112,
"eval_token_acc": 0.8473173672384502,
"step": 200
},
{
"epoch": 0.2447030737093405,
"grad_norm": 2.7742402422517016,
"learning_rate": 9.972952618294442e-06,
"loss": 0.48658447265625,
"memory(GiB)": 36.92,
"step": 205,
"token_acc": 0.8399616256759114,
"train_speed(iter/s)": 0.171547
},
{
"epoch": 0.25067144136078784,
"grad_norm": 2.9146485975442946,
"learning_rate": 9.969424157564215e-06,
"loss": 0.48202037811279297,
"memory(GiB)": 36.92,
"step": 210,
"token_acc": 0.8229777256740914,
"train_speed(iter/s)": 0.172058
},
{
"epoch": 0.25663980901223515,
"grad_norm": 2.6037700192849007,
"learning_rate": 9.965680137176778e-06,
"loss": 0.4780398368835449,
"memory(GiB)": 36.92,
"step": 215,
"token_acc": 0.8451862602806,
"train_speed(iter/s)": 0.172776
},
{
"epoch": 0.26260817666368247,
"grad_norm": 2.4624431066871555,
"learning_rate": 9.961720719537217e-06,
"loss": 0.46450080871582033,
"memory(GiB)": 36.92,
"step": 220,
"token_acc": 0.8089250493096647,
"train_speed(iter/s)": 0.173186
},
{
"epoch": 0.26857654431512984,
"grad_norm": 2.6192496099911624,
"learning_rate": 9.957546076393944e-06,
"loss": 0.44403810501098634,
"memory(GiB)": 36.92,
"step": 225,
"token_acc": 0.8560982743492249,
"train_speed(iter/s)": 0.173308
},
{
"epoch": 0.27454491196657715,
"grad_norm": 2.5789044914565227,
"learning_rate": 9.953156388831246e-06,
"loss": 0.4940804481506348,
"memory(GiB)": 36.92,
"step": 230,
"token_acc": 0.8385935769656699,
"train_speed(iter/s)": 0.173656
},
{
"epoch": 0.28051327961802447,
"grad_norm": 2.3813674364243984,
"learning_rate": 9.948551847261439e-06,
"loss": 0.4587420463562012,
"memory(GiB)": 36.92,
"step": 235,
"token_acc": 0.8549975381585426,
"train_speed(iter/s)": 0.173976
},
{
"epoch": 0.2864816472694718,
"grad_norm": 2.7610173702743865,
"learning_rate": 9.943732651416597e-06,
"loss": 0.4972860336303711,
"memory(GiB)": 36.92,
"step": 240,
"token_acc": 0.8406979379107183,
"train_speed(iter/s)": 0.174337
},
{
"epoch": 0.29245001492091915,
"grad_norm": 2.390894612477832,
"learning_rate": 9.938699010339898e-06,
"loss": 0.4903904438018799,
"memory(GiB)": 36.92,
"step": 245,
"token_acc": 0.8545253863134658,
"train_speed(iter/s)": 0.174579
},
{
"epoch": 0.29841838257236647,
"grad_norm": 2.3711824949447546,
"learning_rate": 9.933451142376545e-06,
"loss": 0.4524253845214844,
"memory(GiB)": 37.05,
"step": 250,
"token_acc": 0.8489612577203818,
"train_speed(iter/s)": 0.174973
},
{
"epoch": 0.3043867502238138,
"grad_norm": 2.2514671634568364,
"learning_rate": 9.927989275164305e-06,
"loss": 0.48909597396850585,
"memory(GiB)": 37.05,
"step": 255,
"token_acc": 0.8518639633747548,
"train_speed(iter/s)": 0.175028
},
{
"epoch": 0.3103551178752611,
"grad_norm": 2.3714755110814005,
"learning_rate": 9.922313645623634e-06,
"loss": 0.4785162448883057,
"memory(GiB)": 37.05,
"step": 260,
"token_acc": 0.8465215082315454,
"train_speed(iter/s)": 0.175714
},
{
"epoch": 0.31632348552670847,
"grad_norm": 2.648679383955696,
"learning_rate": 9.916424499947395e-06,
"loss": 0.46675701141357423,
"memory(GiB)": 37.05,
"step": 265,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.175927
},
{
"epoch": 0.3222918531781558,
"grad_norm": 2.579144815458166,
"learning_rate": 9.910322093590177e-06,
"loss": 0.47339348793029784,
"memory(GiB)": 37.05,
"step": 270,
"token_acc": 0.8505902192242834,
"train_speed(iter/s)": 0.176471
},
{
"epoch": 0.3282602208296031,
"grad_norm": 2.2898701347032793,
"learning_rate": 9.904006691257224e-06,
"loss": 0.49665226936340334,
"memory(GiB)": 37.05,
"step": 275,
"token_acc": 0.8427124366910523,
"train_speed(iter/s)": 0.17689
},
{
"epoch": 0.3342285884810504,
"grad_norm": 1.9441720928034771,
"learning_rate": 9.897478566892942e-06,
"loss": 0.44453701972961424,
"memory(GiB)": 37.05,
"step": 280,
"token_acc": 0.8629363449691991,
"train_speed(iter/s)": 0.177368
},
{
"epoch": 0.3401969561324978,
"grad_norm": 2.4637260658165,
"learning_rate": 9.890738003669029e-06,
"loss": 0.4563939094543457,
"memory(GiB)": 37.05,
"step": 285,
"token_acc": 0.8230596456201648,
"train_speed(iter/s)": 0.1776
},
{
"epoch": 0.3461653237839451,
"grad_norm": 2.287302517723748,
"learning_rate": 9.883785293972175e-06,
"loss": 0.504718017578125,
"memory(GiB)": 37.05,
"step": 290,
"token_acc": 0.7899543378995434,
"train_speed(iter/s)": 0.177582
},
{
"epoch": 0.3521336914353924,
"grad_norm": 2.328908891504034,
"learning_rate": 9.87662073939139e-06,
"loss": 0.4355961799621582,
"memory(GiB)": 37.05,
"step": 295,
"token_acc": 0.8636019960683502,
"train_speed(iter/s)": 0.177798
},
{
"epoch": 0.35810205908683973,
"grad_norm": 2.444070696496546,
"learning_rate": 9.869244650704924e-06,
"loss": 0.4655925750732422,
"memory(GiB)": 37.05,
"step": 300,
"token_acc": 0.8573033707865169,
"train_speed(iter/s)": 0.177836
},
{
"epoch": 0.35810205908683973,
"eval_loss": 0.4284290373325348,
"eval_runtime": 10.9831,
"eval_samples_per_second": 24.492,
"eval_steps_per_second": 3.096,
"eval_token_acc": 0.8515189711550757,
"step": 300
},
{
"epoch": 0.3640704267382871,
"grad_norm": 2.659611192736946,
"learning_rate": 9.861657347866778e-06,
"loss": 0.5253509521484375,
"memory(GiB)": 37.06,
"step": 305,
"token_acc": 0.828113750899928,
"train_speed(iter/s)": 0.171888
},
{
"epoch": 0.3700387943897344,
"grad_norm": 2.6984676971627226,
"learning_rate": 9.853859159992831e-06,
"loss": 0.47617392539978026,
"memory(GiB)": 37.06,
"step": 310,
"token_acc": 0.8316008316008316,
"train_speed(iter/s)": 0.172231
},
{
"epoch": 0.37600716204118173,
"grad_norm": 2.195598140600359,
"learning_rate": 9.845850425346563e-06,
"loss": 0.4360311508178711,
"memory(GiB)": 37.06,
"step": 315,
"token_acc": 0.848318462594372,
"train_speed(iter/s)": 0.172652
},
{
"epoch": 0.38197552969262905,
"grad_norm": 2.4976869303898597,
"learning_rate": 9.837631491324379e-06,
"loss": 0.46515851020812987,
"memory(GiB)": 37.06,
"step": 320,
"token_acc": 0.8522144522144522,
"train_speed(iter/s)": 0.172786
},
{
"epoch": 0.3879438973440764,
"grad_norm": 3.017469180784894,
"learning_rate": 9.829202714440544e-06,
"loss": 0.5420156478881836,
"memory(GiB)": 37.06,
"step": 325,
"token_acc": 0.8376052027543994,
"train_speed(iter/s)": 0.17318
},
{
"epoch": 0.39391226499552373,
"grad_norm": 2.5730220119384297,
"learning_rate": 9.820564460311719e-06,
"loss": 0.4916552543640137,
"memory(GiB)": 37.06,
"step": 330,
"token_acc": 0.8207920792079207,
"train_speed(iter/s)": 0.173365
},
{
"epoch": 0.39988063264697105,
"grad_norm": 2.798903385122773,
"learning_rate": 9.811717103641096e-06,
"loss": 0.4587296485900879,
"memory(GiB)": 37.06,
"step": 335,
"token_acc": 0.8592551001310126,
"train_speed(iter/s)": 0.173592
},
{
"epoch": 0.40584900029841836,
"grad_norm": 2.6409823275058653,
"learning_rate": 9.802661028202147e-06,
"loss": 0.48290514945983887,
"memory(GiB)": 37.06,
"step": 340,
"token_acc": 0.823793194407808,
"train_speed(iter/s)": 0.173952
},
{
"epoch": 0.41181736794986573,
"grad_norm": 3.0285812809146635,
"learning_rate": 9.79339662682198e-06,
"loss": 0.46567506790161134,
"memory(GiB)": 37.06,
"step": 345,
"token_acc": 0.8304556354916067,
"train_speed(iter/s)": 0.174192
},
{
"epoch": 0.41778573560131305,
"grad_norm": 2.4611578793858486,
"learning_rate": 9.783924301364297e-06,
"loss": 0.4647653579711914,
"memory(GiB)": 37.06,
"step": 350,
"token_acc": 0.8199260286638927,
"train_speed(iter/s)": 0.17443
},
{
"epoch": 0.42375410325276036,
"grad_norm": 2.154896994755901,
"learning_rate": 9.774244462711962e-06,
"loss": 0.4952418327331543,
"memory(GiB)": 37.06,
"step": 355,
"token_acc": 0.8217054263565892,
"train_speed(iter/s)": 0.174757
},
{
"epoch": 0.4297224709042077,
"grad_norm": 2.005838047714932,
"learning_rate": 9.764357530749178e-06,
"loss": 0.4674674034118652,
"memory(GiB)": 37.06,
"step": 360,
"token_acc": 0.841979596266551,
"train_speed(iter/s)": 0.174828
},
{
"epoch": 0.43569083855565505,
"grad_norm": 2.292609923640767,
"learning_rate": 9.754263934343272e-06,
"loss": 0.44636335372924807,
"memory(GiB)": 37.06,
"step": 365,
"token_acc": 0.8596112311015118,
"train_speed(iter/s)": 0.175118
},
{
"epoch": 0.44165920620710236,
"grad_norm": 2.477107058493794,
"learning_rate": 9.743964111326098e-06,
"loss": 0.4866192817687988,
"memory(GiB)": 37.06,
"step": 370,
"token_acc": 0.809440252675908,
"train_speed(iter/s)": 0.175357
},
{
"epoch": 0.4476275738585497,
"grad_norm": 2.3446291196746922,
"learning_rate": 9.733458508475038e-06,
"loss": 0.4887577533721924,
"memory(GiB)": 37.06,
"step": 375,
"token_acc": 0.8332948510736551,
"train_speed(iter/s)": 0.175371
},
{
"epoch": 0.453595941509997,
"grad_norm": 2.29799169108157,
"learning_rate": 9.722747581493625e-06,
"loss": 0.49045257568359374,
"memory(GiB)": 37.06,
"step": 380,
"token_acc": 0.8406266882766072,
"train_speed(iter/s)": 0.175414
},
{
"epoch": 0.45956430916144436,
"grad_norm": 2.563802674403576,
"learning_rate": 9.711831794991777e-06,
"loss": 0.4675490379333496,
"memory(GiB)": 37.06,
"step": 385,
"token_acc": 0.847358529964502,
"train_speed(iter/s)": 0.175567
},
{
"epoch": 0.4655326768128917,
"grad_norm": 2.480776446284018,
"learning_rate": 9.700711622465645e-06,
"loss": 0.4845867156982422,
"memory(GiB)": 37.06,
"step": 390,
"token_acc": 0.8422996998383745,
"train_speed(iter/s)": 0.17572
},
{
"epoch": 0.471501044464339,
"grad_norm": 2.721044012538843,
"learning_rate": 9.689387546277062e-06,
"loss": 0.46145071983337405,
"memory(GiB)": 37.06,
"step": 395,
"token_acc": 0.8513663630304377,
"train_speed(iter/s)": 0.175882
},
{
"epoch": 0.4774694121157863,
"grad_norm": 2.580126202957563,
"learning_rate": 9.677860057632642e-06,
"loss": 0.5093360424041748,
"memory(GiB)": 37.06,
"step": 400,
"token_acc": 0.8206378986866791,
"train_speed(iter/s)": 0.175987
},
{
"epoch": 0.4774694121157863,
"eval_loss": 0.42347872257232666,
"eval_runtime": 10.9358,
"eval_samples_per_second": 24.598,
"eval_steps_per_second": 3.109,
"eval_token_acc": 0.8527429166438318,
"step": 400
},
{
"epoch": 0.4834377797672337,
"grad_norm": 2.355447977882308,
"learning_rate": 9.66612965656245e-06,
"loss": 0.48992347717285156,
"memory(GiB)": 37.06,
"step": 405,
"token_acc": 0.8608419645840294,
"train_speed(iter/s)": 0.171561
},
{
"epoch": 0.489406147418681,
"grad_norm": 2.0174115419967773,
"learning_rate": 9.654196851898325e-06,
"loss": 0.4750755786895752,
"memory(GiB)": 37.06,
"step": 410,
"token_acc": 0.8274902615470228,
"train_speed(iter/s)": 0.171858
},
{
"epoch": 0.4953745150701283,
"grad_norm": 2.155026242929759,
"learning_rate": 9.642062161251807e-06,
"loss": 0.46627135276794435,
"memory(GiB)": 37.06,
"step": 415,
"token_acc": 0.8661600496277916,
"train_speed(iter/s)": 0.17197
},
{
"epoch": 0.5013428827215757,
"grad_norm": 2.8519922687228174,
"learning_rate": 9.62972611099168e-06,
"loss": 0.4620970726013184,
"memory(GiB)": 37.06,
"step": 420,
"token_acc": 0.8595988538681948,
"train_speed(iter/s)": 0.172268
},
{
"epoch": 0.5073112503730229,
"grad_norm": 2.5658438134794324,
"learning_rate": 9.617189236221143e-06,
"loss": 0.45318241119384767,
"memory(GiB)": 37.06,
"step": 425,
"token_acc": 0.8252274866645748,
"train_speed(iter/s)": 0.172438
},
{
"epoch": 0.5132796180244703,
"grad_norm": 2.2980368916312206,
"learning_rate": 9.604452080754601e-06,
"loss": 0.46477622985839845,
"memory(GiB)": 37.06,
"step": 430,
"token_acc": 0.8681318681318682,
"train_speed(iter/s)": 0.17271
},
{
"epoch": 0.5192479856759177,
"grad_norm": 2.3920351806796925,
"learning_rate": 9.591515197094064e-06,
"loss": 0.43802127838134763,
"memory(GiB)": 37.06,
"step": 435,
"token_acc": 0.8632865550022635,
"train_speed(iter/s)": 0.172963
},
{
"epoch": 0.5252163533273649,
"grad_norm": 2.3926322888936196,
"learning_rate": 9.578379146405202e-06,
"loss": 0.4414364814758301,
"memory(GiB)": 37.06,
"step": 440,
"token_acc": 0.8378196500672948,
"train_speed(iter/s)": 0.173049
},
{
"epoch": 0.5311847209788123,
"grad_norm": 2.5309415862721787,
"learning_rate": 9.565044498492984e-06,
"loss": 0.4737836837768555,
"memory(GiB)": 37.06,
"step": 445,
"token_acc": 0.8400094809196492,
"train_speed(iter/s)": 0.173413
},
{
"epoch": 0.5371530886302597,
"grad_norm": 2.574732220606661,
"learning_rate": 9.551511831776966e-06,
"loss": 0.4299252986907959,
"memory(GiB)": 37.06,
"step": 450,
"token_acc": 0.8394777265745008,
"train_speed(iter/s)": 0.173639
},
{
"epoch": 0.5431214562817069,
"grad_norm": 2.209862389780888,
"learning_rate": 9.53778173326621e-06,
"loss": 0.44927520751953126,
"memory(GiB)": 37.06,
"step": 455,
"token_acc": 0.8641338013627916,
"train_speed(iter/s)": 0.173751
},
{
"epoch": 0.5490898239331543,
"grad_norm": 2.524639918389781,
"learning_rate": 9.523854798533814e-06,
"loss": 0.44107656478881835,
"memory(GiB)": 37.06,
"step": 460,
"token_acc": 0.8868033496967946,
"train_speed(iter/s)": 0.174216
},
{
"epoch": 0.5550581915846016,
"grad_norm": 2.1182849441153215,
"learning_rate": 9.509731631691071e-06,
"loss": 0.43174285888671876,
"memory(GiB)": 37.06,
"step": 465,
"token_acc": 0.855464759959142,
"train_speed(iter/s)": 0.174365
},
{
"epoch": 0.5610265592360489,
"grad_norm": 2.2926487255366688,
"learning_rate": 9.495412845361279e-06,
"loss": 0.48258438110351565,
"memory(GiB)": 37.06,
"step": 470,
"token_acc": 0.8603872818551279,
"train_speed(iter/s)": 0.174664
},
{
"epoch": 0.5669949268874963,
"grad_norm": 2.192746026976168,
"learning_rate": 9.480899060653154e-06,
"loss": 0.4563854217529297,
"memory(GiB)": 37.06,
"step": 475,
"token_acc": 0.8394289067083904,
"train_speed(iter/s)": 0.17502
},
{
"epoch": 0.5729632945389436,
"grad_norm": 2.014209866578747,
"learning_rate": 9.466190907133901e-06,
"loss": 0.4754791259765625,
"memory(GiB)": 37.06,
"step": 480,
"token_acc": 0.8577712609970675,
"train_speed(iter/s)": 0.175025
},
{
"epoch": 0.5789316621903909,
"grad_norm": 2.559320864210838,
"learning_rate": 9.451289022801894e-06,
"loss": 0.47232685089111326,
"memory(GiB)": 37.06,
"step": 485,
"token_acc": 0.8380402225074882,
"train_speed(iter/s)": 0.175186
},
{
"epoch": 0.5849000298418383,
"grad_norm": 2.2053676509330433,
"learning_rate": 9.436194054058998e-06,
"loss": 0.4336155891418457,
"memory(GiB)": 37.06,
"step": 490,
"token_acc": 0.8529990167158309,
"train_speed(iter/s)": 0.175216
},
{
"epoch": 0.5908683974932856,
"grad_norm": 2.46940001428622,
"learning_rate": 9.420906655682553e-06,
"loss": 0.45275249481201174,
"memory(GiB)": 37.06,
"step": 495,
"token_acc": 0.8271080928126768,
"train_speed(iter/s)": 0.175432
},
{
"epoch": 0.5968367651447329,
"grad_norm": 2.3675730058319293,
"learning_rate": 9.405427490796941e-06,
"loss": 0.48803205490112306,
"memory(GiB)": 37.06,
"step": 500,
"token_acc": 0.8432593011741406,
"train_speed(iter/s)": 0.175539
},
{
"epoch": 0.5968367651447329,
"eval_loss": 0.4169776141643524,
"eval_runtime": 10.9599,
"eval_samples_per_second": 24.544,
"eval_steps_per_second": 3.102,
"eval_token_acc": 0.8532361484079575,
"step": 500
},
{
"epoch": 0.6028051327961802,
"grad_norm": 2.1414646330001217,
"learning_rate": 9.389757230844845e-06,
"loss": 0.46323652267456056,
"memory(GiB)": 37.06,
"step": 505,
"token_acc": 0.8552877345904119,
"train_speed(iter/s)": 0.159112
},
{
"epoch": 0.6087735004476276,
"grad_norm": 2.5503273386919667,
"learning_rate": 9.373896555558113e-06,
"loss": 0.4701972961425781,
"memory(GiB)": 37.06,
"step": 510,
"token_acc": 0.8592652620205294,
"train_speed(iter/s)": 0.159422
},
{
"epoch": 0.6147418680990749,
"grad_norm": 2.6125713791079996,
"learning_rate": 9.357846152928275e-06,
"loss": 0.4990544319152832,
"memory(GiB)": 37.06,
"step": 515,
"token_acc": 0.824811732065002,
"train_speed(iter/s)": 0.159707
},
{
"epoch": 0.6207102357505222,
"grad_norm": 1.9353177630019818,
"learning_rate": 9.341606719176695e-06,
"loss": 0.4381883144378662,
"memory(GiB)": 37.06,
"step": 520,
"token_acc": 0.867666063582321,
"train_speed(iter/s)": 0.159909
},
{
"epoch": 0.6266786034019696,
"grad_norm": 2.3284686918748667,
"learning_rate": 9.325178958724387e-06,
"loss": 0.45581645965576173,
"memory(GiB)": 37.06,
"step": 525,
"token_acc": 0.8706395348837209,
"train_speed(iter/s)": 0.160206
},
{
"epoch": 0.6326469710534169,
"grad_norm": 2.2369421417810926,
"learning_rate": 9.308563584161439e-06,
"loss": 0.4688922882080078,
"memory(GiB)": 37.06,
"step": 530,
"token_acc": 0.8338983050847457,
"train_speed(iter/s)": 0.160549
},
{
"epoch": 0.6386153387048642,
"grad_norm": 2.4187058758316202,
"learning_rate": 9.291761316216115e-06,
"loss": 0.43785710334777833,
"memory(GiB)": 37.06,
"step": 535,
"token_acc": 0.8175961715442666,
"train_speed(iter/s)": 0.160901
},
{
"epoch": 0.6445837063563116,
"grad_norm": 2.11230034988461,
"learning_rate": 9.274772883723587e-06,
"loss": 0.4285177707672119,
"memory(GiB)": 37.06,
"step": 540,
"token_acc": 0.8522423025435074,
"train_speed(iter/s)": 0.161093
},
{
"epoch": 0.6505520740077588,
"grad_norm": 2.340278397663115,
"learning_rate": 9.257599023594326e-06,
"loss": 0.4503736972808838,
"memory(GiB)": 37.06,
"step": 545,
"token_acc": 0.8704713049054184,
"train_speed(iter/s)": 0.161286
},
{
"epoch": 0.6565204416592062,
"grad_norm": 2.3913667503479705,
"learning_rate": 9.24024048078213e-06,
"loss": 0.42584834098815916,
"memory(GiB)": 37.06,
"step": 550,
"token_acc": 0.8828032979976443,
"train_speed(iter/s)": 0.161464
},
{
"epoch": 0.6624888093106536,
"grad_norm": 2.2991966974662628,
"learning_rate": 9.222698008251814e-06,
"loss": 0.48091468811035154,
"memory(GiB)": 37.06,
"step": 555,
"token_acc": 0.8286792452830188,
"train_speed(iter/s)": 0.161689
},
{
"epoch": 0.6684571769621008,
"grad_norm": 2.083499198931165,
"learning_rate": 9.204972366946546e-06,
"loss": 0.4586004734039307,
"memory(GiB)": 37.06,
"step": 560,
"token_acc": 0.8503009027081244,
"train_speed(iter/s)": 0.16188
},
{
"epoch": 0.6744255446135482,
"grad_norm": 2.475812664409812,
"learning_rate": 9.187064325754838e-06,
"loss": 0.4561641693115234,
"memory(GiB)": 37.06,
"step": 565,
"token_acc": 0.8384485031067596,
"train_speed(iter/s)": 0.162054
},
{
"epoch": 0.6803939122649956,
"grad_norm": 2.4413316196832984,
"learning_rate": 9.168974661477206e-06,
"loss": 0.43843851089477537,
"memory(GiB)": 37.06,
"step": 570,
"token_acc": 0.839965019676432,
"train_speed(iter/s)": 0.162185
},
{
"epoch": 0.6863622799164428,
"grad_norm": 2.1737549301105075,
"learning_rate": 9.150704158792456e-06,
"loss": 0.4771718502044678,
"memory(GiB)": 37.06,
"step": 575,
"token_acc": 0.8196035642844154,
"train_speed(iter/s)": 0.162359
},
{
"epoch": 0.6923306475678902,
"grad_norm": 2.1356874443108342,
"learning_rate": 9.13225361022366e-06,
"loss": 0.48221721649169924,
"memory(GiB)": 37.06,
"step": 580,
"token_acc": 0.8299897993879632,
"train_speed(iter/s)": 0.162445
},
{
"epoch": 0.6982990152193375,
"grad_norm": 2.3220256859553077,
"learning_rate": 9.113623816103775e-06,
"loss": 0.4806779384613037,
"memory(GiB)": 37.06,
"step": 585,
"token_acc": 0.8411007545494895,
"train_speed(iter/s)": 0.162682
},
{
"epoch": 0.7042673828707848,
"grad_norm": 2.069813477739464,
"learning_rate": 9.094815584540922e-06,
"loss": 0.4947704792022705,
"memory(GiB)": 37.06,
"step": 590,
"token_acc": 0.862796833773087,
"train_speed(iter/s)": 0.162845
},
{
"epoch": 0.7102357505222322,
"grad_norm": 2.252802103709778,
"learning_rate": 9.075829731383342e-06,
"loss": 0.4306300163269043,
"memory(GiB)": 37.06,
"step": 595,
"token_acc": 0.8425353797089894,
"train_speed(iter/s)": 0.163154
},
{
"epoch": 0.7162041181736795,
"grad_norm": 2.241419478853809,
"learning_rate": 9.056667080184004e-06,
"loss": 0.4567378520965576,
"memory(GiB)": 37.06,
"step": 600,
"token_acc": 0.8388354561996361,
"train_speed(iter/s)": 0.163286
},
{
"epoch": 0.7162041181736795,
"eval_loss": 0.41334930062294006,
"eval_runtime": 10.9312,
"eval_samples_per_second": 24.608,
"eval_steps_per_second": 3.11,
"eval_token_acc": 0.8542591476224403,
"step": 600
},
{
"epoch": 0.7221724858251268,
"grad_norm": 2.1208660287310384,
"learning_rate": 9.037328462164866e-06,
"loss": 0.44713678359985354,
"memory(GiB)": 37.06,
"step": 605,
"token_acc": 0.8356246777796872,
"train_speed(iter/s)": 0.151305
},
{
"epoch": 0.7281408534765742,
"grad_norm": 1.9420061515865858,
"learning_rate": 9.01781471618085e-06,
"loss": 0.45147147178649905,
"memory(GiB)": 37.06,
"step": 610,
"token_acc": 0.8882771277816013,
"train_speed(iter/s)": 0.151579
},
{
"epoch": 0.7341092211280215,
"grad_norm": 2.370549361627338,
"learning_rate": 8.998126688683423e-06,
"loss": 0.4287998199462891,
"memory(GiB)": 37.06,
"step": 615,
"token_acc": 0.8318122555410691,
"train_speed(iter/s)": 0.15183
},
{
"epoch": 0.7400775887794688,
"grad_norm": 2.003208951467392,
"learning_rate": 8.978265233683903e-06,
"loss": 0.4494300842285156,
"memory(GiB)": 37.06,
"step": 620,
"token_acc": 0.8252328878088295,
"train_speed(iter/s)": 0.15205
},
{
"epoch": 0.7460459564309161,
"grad_norm": 2.602367805333985,
"learning_rate": 8.9582312127164e-06,
"loss": 0.46652889251708984,
"memory(GiB)": 37.06,
"step": 625,
"token_acc": 0.8474077428118633,
"train_speed(iter/s)": 0.152311
},
{
"epoch": 0.7520143240823635,
"grad_norm": 2.3007477614457765,
"learning_rate": 8.938025494800454e-06,
"loss": 0.46235361099243166,
"memory(GiB)": 37.06,
"step": 630,
"token_acc": 0.8234998744664825,
"train_speed(iter/s)": 0.152632
},
{
"epoch": 0.7579826917338108,
"grad_norm": 2.403260011722763,
"learning_rate": 8.917648956403338e-06,
"loss": 0.4329329490661621,
"memory(GiB)": 37.06,
"step": 635,
"token_acc": 0.8512756689483509,
"train_speed(iter/s)": 0.152969
},
{
"epoch": 0.7639510593852581,
"grad_norm": 1.8459463363591184,
"learning_rate": 8.897102481402031e-06,
"loss": 0.45981664657592775,
"memory(GiB)": 37.06,
"step": 640,
"token_acc": 0.8598321614878657,
"train_speed(iter/s)": 0.153182
},
{
"epoch": 0.7699194270367055,
"grad_norm": 2.0204814112895044,
"learning_rate": 8.876386961044892e-06,
"loss": 0.46657752990722656,
"memory(GiB)": 37.06,
"step": 645,
"token_acc": 0.8745874587458746,
"train_speed(iter/s)": 0.153345
},
{
"epoch": 0.7758877946881528,
"grad_norm": 1.8481808083298177,
"learning_rate": 8.855503293912987e-06,
"loss": 0.4649078369140625,
"memory(GiB)": 37.06,
"step": 650,
"token_acc": 0.8592820512820513,
"train_speed(iter/s)": 0.153498
},
{
"epoch": 0.7818561623396001,
"grad_norm": 2.2884914044841698,
"learning_rate": 8.834452385881121e-06,
"loss": 0.4653633117675781,
"memory(GiB)": 37.06,
"step": 655,
"token_acc": 0.8515602216389618,
"train_speed(iter/s)": 0.153659
},
{
"epoch": 0.7878245299910475,
"grad_norm": 2.173340273942357,
"learning_rate": 8.813235150078532e-06,
"loss": 0.46648712158203126,
"memory(GiB)": 37.06,
"step": 660,
"token_acc": 0.8156269959548648,
"train_speed(iter/s)": 0.153953
},
{
"epoch": 0.7937928976424948,
"grad_norm": 2.2191296614587563,
"learning_rate": 8.791852506849301e-06,
"loss": 0.45751609802246096,
"memory(GiB)": 37.06,
"step": 665,
"token_acc": 0.8260312580066616,
"train_speed(iter/s)": 0.154161
},
{
"epoch": 0.7997612652939421,
"grad_norm": 2.2870388856485335,
"learning_rate": 8.770305383712407e-06,
"loss": 0.4709470748901367,
"memory(GiB)": 37.06,
"step": 670,
"token_acc": 0.842337607735968,
"train_speed(iter/s)": 0.154453
},
{
"epoch": 0.8057296329453895,
"grad_norm": 2.3046312751781866,
"learning_rate": 8.748594715321512e-06,
"loss": 0.44265017509460447,
"memory(GiB)": 37.06,
"step": 675,
"token_acc": 0.8602195071443363,
"train_speed(iter/s)": 0.154677
},
{
"epoch": 0.8116980005968367,
"grad_norm": 2.2464744707673985,
"learning_rate": 8.726721443424409e-06,
"loss": 0.4592324733734131,
"memory(GiB)": 37.06,
"step": 680,
"token_acc": 0.8654945054945055,
"train_speed(iter/s)": 0.154905
},
{
"epoch": 0.8176663682482841,
"grad_norm": 2.194092144648434,
"learning_rate": 8.704686516822177e-06,
"loss": 0.43160429000854494,
"memory(GiB)": 37.06,
"step": 685,
"token_acc": 0.8649193548387096,
"train_speed(iter/s)": 0.155078
},
{
"epoch": 0.8236347358997315,
"grad_norm": 2.247411516392796,
"learning_rate": 8.682490891328016e-06,
"loss": 0.45626983642578123,
"memory(GiB)": 37.06,
"step": 690,
"token_acc": 0.8643364928909952,
"train_speed(iter/s)": 0.155279
},
{
"epoch": 0.8296031035511787,
"grad_norm": 2.035754411138357,
"learning_rate": 8.660135529725799e-06,
"loss": 0.4315452575683594,
"memory(GiB)": 37.06,
"step": 695,
"token_acc": 0.8554044380816035,
"train_speed(iter/s)": 0.155502
},
{
"epoch": 0.8355714712026261,
"grad_norm": 2.292286762424394,
"learning_rate": 8.6376214017283e-06,
"loss": 0.4535685539245605,
"memory(GiB)": 37.06,
"step": 700,
"token_acc": 0.833079268292683,
"train_speed(iter/s)": 0.155636
},
{
"epoch": 0.8355714712026261,
"eval_loss": 0.4100053906440735,
"eval_runtime": 10.9163,
"eval_samples_per_second": 24.642,
"eval_steps_per_second": 3.115,
"eval_token_acc": 0.8548802542883762,
"step": 700
},
{
"epoch": 0.8415398388540735,
"grad_norm": 2.6314360636405714,
"learning_rate": 8.61494948393513e-06,
"loss": 0.4539949417114258,
"memory(GiB)": 37.06,
"step": 705,
"token_acc": 0.8583042973286876,
"train_speed(iter/s)": 0.146478
},
{
"epoch": 0.8475082065055207,
"grad_norm": 2.1848010999728715,
"learning_rate": 8.592120759790383e-06,
"loss": 0.46171207427978517,
"memory(GiB)": 37.06,
"step": 710,
"token_acc": 0.8417105263157895,
"train_speed(iter/s)": 0.146671
},
{
"epoch": 0.8534765741569681,
"grad_norm": 2.447774461275868,
"learning_rate": 8.56913621953997e-06,
"loss": 0.4798592567443848,
"memory(GiB)": 37.06,
"step": 715,
"token_acc": 0.8562048588312541,
"train_speed(iter/s)": 0.146953
},
{
"epoch": 0.8594449418084154,
"grad_norm": 2.596951485691162,
"learning_rate": 8.545996860188668e-06,
"loss": 0.4231537342071533,
"memory(GiB)": 37.06,
"step": 720,
"token_acc": 0.831799700406591,
"train_speed(iter/s)": 0.147232
},
{
"epoch": 0.8654133094598627,
"grad_norm": 2.0232163854750027,
"learning_rate": 8.522703685456866e-06,
"loss": 0.44301156997680663,
"memory(GiB)": 37.06,
"step": 725,
"token_acc": 0.8794139744552968,
"train_speed(iter/s)": 0.1475
},
{
"epoch": 0.8713816771113101,
"grad_norm": 2.281907577430269,
"learning_rate": 8.49925770573704e-06,
"loss": 0.46319947242736814,
"memory(GiB)": 37.06,
"step": 730,
"token_acc": 0.8430570505920344,
"train_speed(iter/s)": 0.147765
},
{
"epoch": 0.8773500447627574,
"grad_norm": 2.190179810988922,
"learning_rate": 8.475659938049912e-06,
"loss": 0.4825079917907715,
"memory(GiB)": 37.06,
"step": 735,
"token_acc": 0.839588377723971,
"train_speed(iter/s)": 0.147996
},
{
"epoch": 0.8833184124142047,
"grad_norm": 2.014804370593861,
"learning_rate": 8.45191140600034e-06,
"loss": 0.454302978515625,
"memory(GiB)": 37.06,
"step": 740,
"token_acc": 0.8007774538386784,
"train_speed(iter/s)": 0.148279
},
{
"epoch": 0.8892867800656521,
"grad_norm": 2.1256355584342077,
"learning_rate": 8.42801313973292e-06,
"loss": 0.4445801258087158,
"memory(GiB)": 37.06,
"step": 745,
"token_acc": 0.846286205907657,
"train_speed(iter/s)": 0.148536
},
{
"epoch": 0.8952551477170994,
"grad_norm": 2.6544295779283575,
"learning_rate": 8.403966175887293e-06,
"loss": 0.4630784511566162,
"memory(GiB)": 37.06,
"step": 750,
"token_acc": 0.8537764350453172,
"train_speed(iter/s)": 0.148704
},
{
"epoch": 0.9012235153685467,
"grad_norm": 2.4745309667627255,
"learning_rate": 8.379771557553184e-06,
"loss": 0.43903446197509766,
"memory(GiB)": 37.06,
"step": 755,
"token_acc": 0.8682237600922722,
"train_speed(iter/s)": 0.148945
},
{
"epoch": 0.907191883019994,
"grad_norm": 2.167884085714607,
"learning_rate": 8.355430334225159e-06,
"loss": 0.445455265045166,
"memory(GiB)": 37.06,
"step": 760,
"token_acc": 0.852589641434263,
"train_speed(iter/s)": 0.149189
},
{
"epoch": 0.9131602506714414,
"grad_norm": 2.3516013470748116,
"learning_rate": 8.330943561757092e-06,
"loss": 0.44769630432128904,
"memory(GiB)": 37.06,
"step": 765,
"token_acc": 0.8217955651703623,
"train_speed(iter/s)": 0.149338
},
{
"epoch": 0.9191286183228887,
"grad_norm": 2.0619205640970506,
"learning_rate": 8.30631230231637e-06,
"loss": 0.46817874908447266,
"memory(GiB)": 37.06,
"step": 770,
"token_acc": 0.8363870967741935,
"train_speed(iter/s)": 0.149487
},
{
"epoch": 0.925096985974336,
"grad_norm": 2.3440589362137993,
"learning_rate": 8.281537624337823e-06,
"loss": 0.4982964038848877,
"memory(GiB)": 37.06,
"step": 775,
"token_acc": 0.8594432314410481,
"train_speed(iter/s)": 0.149779
},
{
"epoch": 0.9310653536257834,
"grad_norm": 2.0757541904974097,
"learning_rate": 8.256620602477372e-06,
"loss": 0.4509378433227539,
"memory(GiB)": 37.06,
"step": 780,
"token_acc": 0.8259721555448872,
"train_speed(iter/s)": 0.149971
},
{
"epoch": 0.9370337212772307,
"grad_norm": 2.086378932611534,
"learning_rate": 8.231562317565412e-06,
"loss": 0.43694629669189455,
"memory(GiB)": 37.06,
"step": 785,
"token_acc": 0.856384262611634,
"train_speed(iter/s)": 0.150204
},
{
"epoch": 0.943002088928678,
"grad_norm": 2.308538899901496,
"learning_rate": 8.206363856559935e-06,
"loss": 0.4430408477783203,
"memory(GiB)": 37.06,
"step": 790,
"token_acc": 0.8422222222222222,
"train_speed(iter/s)": 0.15035
},
{
"epoch": 0.9489704565801254,
"grad_norm": 1.8314796079076852,
"learning_rate": 8.181026312499383e-06,
"loss": 0.44437146186828613,
"memory(GiB)": 37.06,
"step": 795,
"token_acc": 0.8529804865009356,
"train_speed(iter/s)": 0.150549
},
{
"epoch": 0.9549388242315726,
"grad_norm": 2.2397424826021792,
"learning_rate": 8.155550784455224e-06,
"loss": 0.4815809726715088,
"memory(GiB)": 37.06,
"step": 800,
"token_acc": 0.8588266107909901,
"train_speed(iter/s)": 0.150753
},
{
"epoch": 0.9549388242315726,
"eval_loss": 0.4058806300163269,
"eval_runtime": 11.0737,
"eval_samples_per_second": 24.292,
"eval_steps_per_second": 3.07,
"eval_token_acc": 0.8572368060503096,
"step": 800
}
],
"logging_steps": 5,
"max_steps": 2511,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 88119181914112.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}