|
{ |
|
"best_metric": 0.40588063, |
|
"best_model_checkpoint": "/home/ubuntu/output/v0-20250315-052746/checkpoint-800", |
|
"epoch": 0.9549388242315726, |
|
"eval_steps": 100, |
|
"global_step": 800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001193673530289466, |
|
"grad_norm": 22.298568401591066, |
|
"learning_rate": 7.936507936507937e-08, |
|
"loss": 1.0442615747451782, |
|
"memory(GiB)": 30.7, |
|
"step": 1, |
|
"token_acc": 0.7699836867862969, |
|
"train_speed(iter/s)": 0.093757 |
|
}, |
|
{ |
|
"epoch": 0.005968367651447329, |
|
"grad_norm": 19.63376949197587, |
|
"learning_rate": 3.9682539682539683e-07, |
|
"loss": 0.9844925403594971, |
|
"memory(GiB)": 36.92, |
|
"step": 5, |
|
"token_acc": 0.7549315386400557, |
|
"train_speed(iter/s)": 0.149789 |
|
}, |
|
{ |
|
"epoch": 0.011936735302894658, |
|
"grad_norm": 16.540823173599176, |
|
"learning_rate": 7.936507936507937e-07, |
|
"loss": 0.9816521644592285, |
|
"memory(GiB)": 36.92, |
|
"step": 10, |
|
"token_acc": 0.7844917012448133, |
|
"train_speed(iter/s)": 0.162277 |
|
}, |
|
{ |
|
"epoch": 0.017905102954341987, |
|
"grad_norm": 7.069008652065498, |
|
"learning_rate": 1.1904761904761906e-06, |
|
"loss": 0.8435724258422852, |
|
"memory(GiB)": 36.92, |
|
"step": 15, |
|
"token_acc": 0.7823455233291299, |
|
"train_speed(iter/s)": 0.176636 |
|
}, |
|
{ |
|
"epoch": 0.023873470605789315, |
|
"grad_norm": 5.623338707216517, |
|
"learning_rate": 1.5873015873015873e-06, |
|
"loss": 0.7347106456756591, |
|
"memory(GiB)": 36.92, |
|
"step": 20, |
|
"token_acc": 0.8210757409440176, |
|
"train_speed(iter/s)": 0.177914 |
|
}, |
|
{ |
|
"epoch": 0.029841838257236644, |
|
"grad_norm": 4.003256651554691, |
|
"learning_rate": 1.984126984126984e-06, |
|
"loss": 0.654999828338623, |
|
"memory(GiB)": 36.92, |
|
"step": 25, |
|
"token_acc": 0.7765267826680314, |
|
"train_speed(iter/s)": 0.183726 |
|
}, |
|
{ |
|
"epoch": 0.03581020590868397, |
|
"grad_norm": 3.4287088973952624, |
|
"learning_rate": 2.380952380952381e-06, |
|
"loss": 0.5874819755554199, |
|
"memory(GiB)": 36.92, |
|
"step": 30, |
|
"token_acc": 0.8138706921105098, |
|
"train_speed(iter/s)": 0.184018 |
|
}, |
|
{ |
|
"epoch": 0.0417785735601313, |
|
"grad_norm": 2.8169959397852256, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 0.6081454753875732, |
|
"memory(GiB)": 36.92, |
|
"step": 35, |
|
"token_acc": 0.81794500723589, |
|
"train_speed(iter/s)": 0.183529 |
|
}, |
|
{ |
|
"epoch": 0.04774694121157863, |
|
"grad_norm": 3.325157673054176, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 0.5828543663024902, |
|
"memory(GiB)": 36.92, |
|
"step": 40, |
|
"token_acc": 0.8394230769230769, |
|
"train_speed(iter/s)": 0.184431 |
|
}, |
|
{ |
|
"epoch": 0.05371530886302596, |
|
"grad_norm": 2.7879997870731166, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 0.5575875282287598, |
|
"memory(GiB)": 36.92, |
|
"step": 45, |
|
"token_acc": 0.8293939393939394, |
|
"train_speed(iter/s)": 0.187841 |
|
}, |
|
{ |
|
"epoch": 0.05968367651447329, |
|
"grad_norm": 2.778252157541432, |
|
"learning_rate": 3.968253968253968e-06, |
|
"loss": 0.5551129341125488, |
|
"memory(GiB)": 36.92, |
|
"step": 50, |
|
"token_acc": 0.8260184559981995, |
|
"train_speed(iter/s)": 0.189026 |
|
}, |
|
{ |
|
"epoch": 0.06565204416592062, |
|
"grad_norm": 3.0281694802961816, |
|
"learning_rate": 4.365079365079366e-06, |
|
"loss": 0.5619981765747071, |
|
"memory(GiB)": 36.92, |
|
"step": 55, |
|
"token_acc": 0.8104107766505904, |
|
"train_speed(iter/s)": 0.188889 |
|
}, |
|
{ |
|
"epoch": 0.07162041181736795, |
|
"grad_norm": 3.102265765306523, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 0.5332321166992188, |
|
"memory(GiB)": 36.92, |
|
"step": 60, |
|
"token_acc": 0.8470640768028578, |
|
"train_speed(iter/s)": 0.189797 |
|
}, |
|
{ |
|
"epoch": 0.07758877946881527, |
|
"grad_norm": 2.8144373694536444, |
|
"learning_rate": 5.15873015873016e-06, |
|
"loss": 0.5349865436553956, |
|
"memory(GiB)": 36.92, |
|
"step": 65, |
|
"token_acc": 0.8722838137472284, |
|
"train_speed(iter/s)": 0.190181 |
|
}, |
|
{ |
|
"epoch": 0.0835571471202626, |
|
"grad_norm": 3.337016219899744, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.5452562808990479, |
|
"memory(GiB)": 36.92, |
|
"step": 70, |
|
"token_acc": 0.8432369942196531, |
|
"train_speed(iter/s)": 0.189336 |
|
}, |
|
{ |
|
"epoch": 0.08952551477170993, |
|
"grad_norm": 3.06136632501639, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 0.49980897903442384, |
|
"memory(GiB)": 36.92, |
|
"step": 75, |
|
"token_acc": 0.8338361568809468, |
|
"train_speed(iter/s)": 0.190084 |
|
}, |
|
{ |
|
"epoch": 0.09549388242315726, |
|
"grad_norm": 2.4877794848384633, |
|
"learning_rate": 6.349206349206349e-06, |
|
"loss": 0.539669418334961, |
|
"memory(GiB)": 36.92, |
|
"step": 80, |
|
"token_acc": 0.8455654331197023, |
|
"train_speed(iter/s)": 0.189397 |
|
}, |
|
{ |
|
"epoch": 0.10146225007460459, |
|
"grad_norm": 3.221405214526254, |
|
"learning_rate": 6.746031746031747e-06, |
|
"loss": 0.5160573959350586, |
|
"memory(GiB)": 36.92, |
|
"step": 85, |
|
"token_acc": 0.8462370242214533, |
|
"train_speed(iter/s)": 0.190557 |
|
}, |
|
{ |
|
"epoch": 0.10743061772605192, |
|
"grad_norm": 3.2868066256101085, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.4913814067840576, |
|
"memory(GiB)": 36.92, |
|
"step": 90, |
|
"token_acc": 0.8452227659026526, |
|
"train_speed(iter/s)": 0.190372 |
|
}, |
|
{ |
|
"epoch": 0.11339898537749925, |
|
"grad_norm": 3.051253318495505, |
|
"learning_rate": 7.53968253968254e-06, |
|
"loss": 0.49474325180053713, |
|
"memory(GiB)": 36.92, |
|
"step": 95, |
|
"token_acc": 0.8567275747508306, |
|
"train_speed(iter/s)": 0.190208 |
|
}, |
|
{ |
|
"epoch": 0.11936735302894658, |
|
"grad_norm": 2.912617922195808, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.4725308418273926, |
|
"memory(GiB)": 36.92, |
|
"step": 100, |
|
"token_acc": 0.8608458390177354, |
|
"train_speed(iter/s)": 0.190477 |
|
}, |
|
{ |
|
"epoch": 0.11936735302894658, |
|
"eval_loss": 0.45056208968162537, |
|
"eval_runtime": 10.9299, |
|
"eval_samples_per_second": 24.611, |
|
"eval_steps_per_second": 3.111, |
|
"eval_token_acc": 0.8447598692022433, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12533572068039392, |
|
"grad_norm": 2.8012333103883647, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.5016227722167969, |
|
"memory(GiB)": 36.92, |
|
"step": 105, |
|
"token_acc": 0.842248243559719, |
|
"train_speed(iter/s)": 0.17126 |
|
}, |
|
{ |
|
"epoch": 0.13130408833184123, |
|
"grad_norm": 2.912876340051396, |
|
"learning_rate": 8.730158730158731e-06, |
|
"loss": 0.518134593963623, |
|
"memory(GiB)": 36.92, |
|
"step": 110, |
|
"token_acc": 0.8509636604384287, |
|
"train_speed(iter/s)": 0.172461 |
|
}, |
|
{ |
|
"epoch": 0.13727245598328858, |
|
"grad_norm": 3.3516293509261117, |
|
"learning_rate": 9.126984126984127e-06, |
|
"loss": 0.5215555191040039, |
|
"memory(GiB)": 36.92, |
|
"step": 115, |
|
"token_acc": 0.8062340503098797, |
|
"train_speed(iter/s)": 0.172888 |
|
}, |
|
{ |
|
"epoch": 0.1432408236347359, |
|
"grad_norm": 3.137187651305806, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 0.4938058853149414, |
|
"memory(GiB)": 36.92, |
|
"step": 120, |
|
"token_acc": 0.8689788053949904, |
|
"train_speed(iter/s)": 0.173312 |
|
}, |
|
{ |
|
"epoch": 0.14920919128618323, |
|
"grad_norm": 2.9778302322354255, |
|
"learning_rate": 9.920634920634922e-06, |
|
"loss": 0.47763543128967284, |
|
"memory(GiB)": 36.92, |
|
"step": 125, |
|
"token_acc": 0.8223684210526315, |
|
"train_speed(iter/s)": 0.173984 |
|
}, |
|
{ |
|
"epoch": 0.15517755893763055, |
|
"grad_norm": 2.454176368796001, |
|
"learning_rate": 9.999930596405254e-06, |
|
"loss": 0.5025428771972656, |
|
"memory(GiB)": 36.92, |
|
"step": 130, |
|
"token_acc": 0.8584961515689757, |
|
"train_speed(iter/s)": 0.17494 |
|
}, |
|
{ |
|
"epoch": 0.1611459265890779, |
|
"grad_norm": 2.308752936530914, |
|
"learning_rate": 9.999648647603774e-06, |
|
"loss": 0.4561060905456543, |
|
"memory(GiB)": 36.92, |
|
"step": 135, |
|
"token_acc": 0.8763222131814483, |
|
"train_speed(iter/s)": 0.175709 |
|
}, |
|
{ |
|
"epoch": 0.1671142942405252, |
|
"grad_norm": 3.2296962246861276, |
|
"learning_rate": 9.999149828091632e-06, |
|
"loss": 0.5205905437469482, |
|
"memory(GiB)": 36.92, |
|
"step": 140, |
|
"token_acc": 0.8126463700234192, |
|
"train_speed(iter/s)": 0.176096 |
|
}, |
|
{ |
|
"epoch": 0.17308266189197255, |
|
"grad_norm": 2.8523931115518915, |
|
"learning_rate": 9.998434159506211e-06, |
|
"loss": 0.4669060230255127, |
|
"memory(GiB)": 36.92, |
|
"step": 145, |
|
"token_acc": 0.8612167300380228, |
|
"train_speed(iter/s)": 0.176748 |
|
}, |
|
{ |
|
"epoch": 0.17905102954341987, |
|
"grad_norm": 2.689699959591659, |
|
"learning_rate": 9.997501672891208e-06, |
|
"loss": 0.4870173454284668, |
|
"memory(GiB)": 36.92, |
|
"step": 150, |
|
"token_acc": 0.8250571369208394, |
|
"train_speed(iter/s)": 0.177191 |
|
}, |
|
{ |
|
"epoch": 0.1850193971948672, |
|
"grad_norm": 2.8278119407117517, |
|
"learning_rate": 9.99635240869527e-06, |
|
"loss": 0.47814245223999025, |
|
"memory(GiB)": 36.92, |
|
"step": 155, |
|
"token_acc": 0.819718309859155, |
|
"train_speed(iter/s)": 0.177751 |
|
}, |
|
{ |
|
"epoch": 0.19098776484631452, |
|
"grad_norm": 3.0394183749078887, |
|
"learning_rate": 9.99498641677025e-06, |
|
"loss": 0.5187320232391357, |
|
"memory(GiB)": 36.92, |
|
"step": 160, |
|
"token_acc": 0.8540501094624179, |
|
"train_speed(iter/s)": 0.178072 |
|
}, |
|
{ |
|
"epoch": 0.19695613249776187, |
|
"grad_norm": 2.4913040205595696, |
|
"learning_rate": 9.993403756369037e-06, |
|
"loss": 0.471418571472168, |
|
"memory(GiB)": 36.92, |
|
"step": 165, |
|
"token_acc": 0.8507351108896087, |
|
"train_speed(iter/s)": 0.178675 |
|
}, |
|
{ |
|
"epoch": 0.20292450014920918, |
|
"grad_norm": 2.5344653445390937, |
|
"learning_rate": 9.991604496142997e-06, |
|
"loss": 0.5218185901641845, |
|
"memory(GiB)": 36.92, |
|
"step": 170, |
|
"token_acc": 0.8207423580786026, |
|
"train_speed(iter/s)": 0.179206 |
|
}, |
|
{ |
|
"epoch": 0.20889286780065652, |
|
"grad_norm": 3.1475346404756737, |
|
"learning_rate": 9.989588714138977e-06, |
|
"loss": 0.4809536933898926, |
|
"memory(GiB)": 36.92, |
|
"step": 175, |
|
"token_acc": 0.8179710144927537, |
|
"train_speed(iter/s)": 0.179609 |
|
}, |
|
{ |
|
"epoch": 0.21486123545210384, |
|
"grad_norm": 2.9443708297446927, |
|
"learning_rate": 9.987356497795944e-06, |
|
"loss": 0.5137897491455078, |
|
"memory(GiB)": 36.92, |
|
"step": 180, |
|
"token_acc": 0.838412017167382, |
|
"train_speed(iter/s)": 0.179743 |
|
}, |
|
{ |
|
"epoch": 0.22082960310355118, |
|
"grad_norm": 2.3691979165448864, |
|
"learning_rate": 9.984907943941164e-06, |
|
"loss": 0.47942285537719725, |
|
"memory(GiB)": 36.92, |
|
"step": 185, |
|
"token_acc": 0.8178681677864537, |
|
"train_speed(iter/s)": 0.179758 |
|
}, |
|
{ |
|
"epoch": 0.2267979707549985, |
|
"grad_norm": 2.7655733596568806, |
|
"learning_rate": 9.98224315878603e-06, |
|
"loss": 0.4850442886352539, |
|
"memory(GiB)": 36.92, |
|
"step": 190, |
|
"token_acc": 0.8557343020238714, |
|
"train_speed(iter/s)": 0.180113 |
|
}, |
|
{ |
|
"epoch": 0.23276633840644584, |
|
"grad_norm": 2.9415771183832837, |
|
"learning_rate": 9.979362257921428e-06, |
|
"loss": 0.4999836921691895, |
|
"memory(GiB)": 36.92, |
|
"step": 195, |
|
"token_acc": 0.851006381934217, |
|
"train_speed(iter/s)": 0.180236 |
|
}, |
|
{ |
|
"epoch": 0.23873470605789315, |
|
"grad_norm": 3.1074760910693797, |
|
"learning_rate": 9.976265366312746e-06, |
|
"loss": 0.5033563137054443, |
|
"memory(GiB)": 36.92, |
|
"step": 200, |
|
"token_acc": 0.8293048128342246, |
|
"train_speed(iter/s)": 0.180618 |
|
}, |
|
{ |
|
"epoch": 0.23873470605789315, |
|
"eval_loss": 0.4415110647678375, |
|
"eval_runtime": 10.926, |
|
"eval_samples_per_second": 24.62, |
|
"eval_steps_per_second": 3.112, |
|
"eval_token_acc": 0.8473173672384502, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2447030737093405, |
|
"grad_norm": 2.7742402422517016, |
|
"learning_rate": 9.972952618294442e-06, |
|
"loss": 0.48658447265625, |
|
"memory(GiB)": 36.92, |
|
"step": 205, |
|
"token_acc": 0.8399616256759114, |
|
"train_speed(iter/s)": 0.171547 |
|
}, |
|
{ |
|
"epoch": 0.25067144136078784, |
|
"grad_norm": 2.9146485975442946, |
|
"learning_rate": 9.969424157564215e-06, |
|
"loss": 0.48202037811279297, |
|
"memory(GiB)": 36.92, |
|
"step": 210, |
|
"token_acc": 0.8229777256740914, |
|
"train_speed(iter/s)": 0.172058 |
|
}, |
|
{ |
|
"epoch": 0.25663980901223515, |
|
"grad_norm": 2.6037700192849007, |
|
"learning_rate": 9.965680137176778e-06, |
|
"loss": 0.4780398368835449, |
|
"memory(GiB)": 36.92, |
|
"step": 215, |
|
"token_acc": 0.8451862602806, |
|
"train_speed(iter/s)": 0.172776 |
|
}, |
|
{ |
|
"epoch": 0.26260817666368247, |
|
"grad_norm": 2.4624431066871555, |
|
"learning_rate": 9.961720719537217e-06, |
|
"loss": 0.46450080871582033, |
|
"memory(GiB)": 36.92, |
|
"step": 220, |
|
"token_acc": 0.8089250493096647, |
|
"train_speed(iter/s)": 0.173186 |
|
}, |
|
{ |
|
"epoch": 0.26857654431512984, |
|
"grad_norm": 2.6192496099911624, |
|
"learning_rate": 9.957546076393944e-06, |
|
"loss": 0.44403810501098634, |
|
"memory(GiB)": 36.92, |
|
"step": 225, |
|
"token_acc": 0.8560982743492249, |
|
"train_speed(iter/s)": 0.173308 |
|
}, |
|
{ |
|
"epoch": 0.27454491196657715, |
|
"grad_norm": 2.5789044914565227, |
|
"learning_rate": 9.953156388831246e-06, |
|
"loss": 0.4940804481506348, |
|
"memory(GiB)": 36.92, |
|
"step": 230, |
|
"token_acc": 0.8385935769656699, |
|
"train_speed(iter/s)": 0.173656 |
|
}, |
|
{ |
|
"epoch": 0.28051327961802447, |
|
"grad_norm": 2.3813674364243984, |
|
"learning_rate": 9.948551847261439e-06, |
|
"loss": 0.4587420463562012, |
|
"memory(GiB)": 36.92, |
|
"step": 235, |
|
"token_acc": 0.8549975381585426, |
|
"train_speed(iter/s)": 0.173976 |
|
}, |
|
{ |
|
"epoch": 0.2864816472694718, |
|
"grad_norm": 2.7610173702743865, |
|
"learning_rate": 9.943732651416597e-06, |
|
"loss": 0.4972860336303711, |
|
"memory(GiB)": 36.92, |
|
"step": 240, |
|
"token_acc": 0.8406979379107183, |
|
"train_speed(iter/s)": 0.174337 |
|
}, |
|
{ |
|
"epoch": 0.29245001492091915, |
|
"grad_norm": 2.390894612477832, |
|
"learning_rate": 9.938699010339898e-06, |
|
"loss": 0.4903904438018799, |
|
"memory(GiB)": 36.92, |
|
"step": 245, |
|
"token_acc": 0.8545253863134658, |
|
"train_speed(iter/s)": 0.174579 |
|
}, |
|
{ |
|
"epoch": 0.29841838257236647, |
|
"grad_norm": 2.3711824949447546, |
|
"learning_rate": 9.933451142376545e-06, |
|
"loss": 0.4524253845214844, |
|
"memory(GiB)": 37.05, |
|
"step": 250, |
|
"token_acc": 0.8489612577203818, |
|
"train_speed(iter/s)": 0.174973 |
|
}, |
|
{ |
|
"epoch": 0.3043867502238138, |
|
"grad_norm": 2.2514671634568364, |
|
"learning_rate": 9.927989275164305e-06, |
|
"loss": 0.48909597396850585, |
|
"memory(GiB)": 37.05, |
|
"step": 255, |
|
"token_acc": 0.8518639633747548, |
|
"train_speed(iter/s)": 0.175028 |
|
}, |
|
{ |
|
"epoch": 0.3103551178752611, |
|
"grad_norm": 2.3714755110814005, |
|
"learning_rate": 9.922313645623634e-06, |
|
"loss": 0.4785162448883057, |
|
"memory(GiB)": 37.05, |
|
"step": 260, |
|
"token_acc": 0.8465215082315454, |
|
"train_speed(iter/s)": 0.175714 |
|
}, |
|
{ |
|
"epoch": 0.31632348552670847, |
|
"grad_norm": 2.648679383955696, |
|
"learning_rate": 9.916424499947395e-06, |
|
"loss": 0.46675701141357423, |
|
"memory(GiB)": 37.05, |
|
"step": 265, |
|
"token_acc": 0.8571428571428571, |
|
"train_speed(iter/s)": 0.175927 |
|
}, |
|
{ |
|
"epoch": 0.3222918531781558, |
|
"grad_norm": 2.579144815458166, |
|
"learning_rate": 9.910322093590177e-06, |
|
"loss": 0.47339348793029784, |
|
"memory(GiB)": 37.05, |
|
"step": 270, |
|
"token_acc": 0.8505902192242834, |
|
"train_speed(iter/s)": 0.176471 |
|
}, |
|
{ |
|
"epoch": 0.3282602208296031, |
|
"grad_norm": 2.2898701347032793, |
|
"learning_rate": 9.904006691257224e-06, |
|
"loss": 0.49665226936340334, |
|
"memory(GiB)": 37.05, |
|
"step": 275, |
|
"token_acc": 0.8427124366910523, |
|
"train_speed(iter/s)": 0.17689 |
|
}, |
|
{ |
|
"epoch": 0.3342285884810504, |
|
"grad_norm": 1.9441720928034771, |
|
"learning_rate": 9.897478566892942e-06, |
|
"loss": 0.44453701972961424, |
|
"memory(GiB)": 37.05, |
|
"step": 280, |
|
"token_acc": 0.8629363449691991, |
|
"train_speed(iter/s)": 0.177368 |
|
}, |
|
{ |
|
"epoch": 0.3401969561324978, |
|
"grad_norm": 2.4637260658165, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 0.4563939094543457, |
|
"memory(GiB)": 37.05, |
|
"step": 285, |
|
"token_acc": 0.8230596456201648, |
|
"train_speed(iter/s)": 0.1776 |
|
}, |
|
{ |
|
"epoch": 0.3461653237839451, |
|
"grad_norm": 2.287302517723748, |
|
"learning_rate": 9.883785293972175e-06, |
|
"loss": 0.504718017578125, |
|
"memory(GiB)": 37.05, |
|
"step": 290, |
|
"token_acc": 0.7899543378995434, |
|
"train_speed(iter/s)": 0.177582 |
|
}, |
|
{ |
|
"epoch": 0.3521336914353924, |
|
"grad_norm": 2.328908891504034, |
|
"learning_rate": 9.87662073939139e-06, |
|
"loss": 0.4355961799621582, |
|
"memory(GiB)": 37.05, |
|
"step": 295, |
|
"token_acc": 0.8636019960683502, |
|
"train_speed(iter/s)": 0.177798 |
|
}, |
|
{ |
|
"epoch": 0.35810205908683973, |
|
"grad_norm": 2.444070696496546, |
|
"learning_rate": 9.869244650704924e-06, |
|
"loss": 0.4655925750732422, |
|
"memory(GiB)": 37.05, |
|
"step": 300, |
|
"token_acc": 0.8573033707865169, |
|
"train_speed(iter/s)": 0.177836 |
|
}, |
|
{ |
|
"epoch": 0.35810205908683973, |
|
"eval_loss": 0.4284290373325348, |
|
"eval_runtime": 10.9831, |
|
"eval_samples_per_second": 24.492, |
|
"eval_steps_per_second": 3.096, |
|
"eval_token_acc": 0.8515189711550757, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3640704267382871, |
|
"grad_norm": 2.659611192736946, |
|
"learning_rate": 9.861657347866778e-06, |
|
"loss": 0.5253509521484375, |
|
"memory(GiB)": 37.06, |
|
"step": 305, |
|
"token_acc": 0.828113750899928, |
|
"train_speed(iter/s)": 0.171888 |
|
}, |
|
{ |
|
"epoch": 0.3700387943897344, |
|
"grad_norm": 2.6984676971627226, |
|
"learning_rate": 9.853859159992831e-06, |
|
"loss": 0.47617392539978026, |
|
"memory(GiB)": 37.06, |
|
"step": 310, |
|
"token_acc": 0.8316008316008316, |
|
"train_speed(iter/s)": 0.172231 |
|
}, |
|
{ |
|
"epoch": 0.37600716204118173, |
|
"grad_norm": 2.195598140600359, |
|
"learning_rate": 9.845850425346563e-06, |
|
"loss": 0.4360311508178711, |
|
"memory(GiB)": 37.06, |
|
"step": 315, |
|
"token_acc": 0.848318462594372, |
|
"train_speed(iter/s)": 0.172652 |
|
}, |
|
{ |
|
"epoch": 0.38197552969262905, |
|
"grad_norm": 2.4976869303898597, |
|
"learning_rate": 9.837631491324379e-06, |
|
"loss": 0.46515851020812987, |
|
"memory(GiB)": 37.06, |
|
"step": 320, |
|
"token_acc": 0.8522144522144522, |
|
"train_speed(iter/s)": 0.172786 |
|
}, |
|
{ |
|
"epoch": 0.3879438973440764, |
|
"grad_norm": 3.017469180784894, |
|
"learning_rate": 9.829202714440544e-06, |
|
"loss": 0.5420156478881836, |
|
"memory(GiB)": 37.06, |
|
"step": 325, |
|
"token_acc": 0.8376052027543994, |
|
"train_speed(iter/s)": 0.17318 |
|
}, |
|
{ |
|
"epoch": 0.39391226499552373, |
|
"grad_norm": 2.5730220119384297, |
|
"learning_rate": 9.820564460311719e-06, |
|
"loss": 0.4916552543640137, |
|
"memory(GiB)": 37.06, |
|
"step": 330, |
|
"token_acc": 0.8207920792079207, |
|
"train_speed(iter/s)": 0.173365 |
|
}, |
|
{ |
|
"epoch": 0.39988063264697105, |
|
"grad_norm": 2.798903385122773, |
|
"learning_rate": 9.811717103641096e-06, |
|
"loss": 0.4587296485900879, |
|
"memory(GiB)": 37.06, |
|
"step": 335, |
|
"token_acc": 0.8592551001310126, |
|
"train_speed(iter/s)": 0.173592 |
|
}, |
|
{ |
|
"epoch": 0.40584900029841836, |
|
"grad_norm": 2.6409823275058653, |
|
"learning_rate": 9.802661028202147e-06, |
|
"loss": 0.48290514945983887, |
|
"memory(GiB)": 37.06, |
|
"step": 340, |
|
"token_acc": 0.823793194407808, |
|
"train_speed(iter/s)": 0.173952 |
|
}, |
|
{ |
|
"epoch": 0.41181736794986573, |
|
"grad_norm": 3.0285812809146635, |
|
"learning_rate": 9.79339662682198e-06, |
|
"loss": 0.46567506790161134, |
|
"memory(GiB)": 37.06, |
|
"step": 345, |
|
"token_acc": 0.8304556354916067, |
|
"train_speed(iter/s)": 0.174192 |
|
}, |
|
{ |
|
"epoch": 0.41778573560131305, |
|
"grad_norm": 2.4611578793858486, |
|
"learning_rate": 9.783924301364297e-06, |
|
"loss": 0.4647653579711914, |
|
"memory(GiB)": 37.06, |
|
"step": 350, |
|
"token_acc": 0.8199260286638927, |
|
"train_speed(iter/s)": 0.17443 |
|
}, |
|
{ |
|
"epoch": 0.42375410325276036, |
|
"grad_norm": 2.154896994755901, |
|
"learning_rate": 9.774244462711962e-06, |
|
"loss": 0.4952418327331543, |
|
"memory(GiB)": 37.06, |
|
"step": 355, |
|
"token_acc": 0.8217054263565892, |
|
"train_speed(iter/s)": 0.174757 |
|
}, |
|
{ |
|
"epoch": 0.4297224709042077, |
|
"grad_norm": 2.005838047714932, |
|
"learning_rate": 9.764357530749178e-06, |
|
"loss": 0.4674674034118652, |
|
"memory(GiB)": 37.06, |
|
"step": 360, |
|
"token_acc": 0.841979596266551, |
|
"train_speed(iter/s)": 0.174828 |
|
}, |
|
{ |
|
"epoch": 0.43569083855565505, |
|
"grad_norm": 2.292609923640767, |
|
"learning_rate": 9.754263934343272e-06, |
|
"loss": 0.44636335372924807, |
|
"memory(GiB)": 37.06, |
|
"step": 365, |
|
"token_acc": 0.8596112311015118, |
|
"train_speed(iter/s)": 0.175118 |
|
}, |
|
{ |
|
"epoch": 0.44165920620710236, |
|
"grad_norm": 2.477107058493794, |
|
"learning_rate": 9.743964111326098e-06, |
|
"loss": 0.4866192817687988, |
|
"memory(GiB)": 37.06, |
|
"step": 370, |
|
"token_acc": 0.809440252675908, |
|
"train_speed(iter/s)": 0.175357 |
|
}, |
|
{ |
|
"epoch": 0.4476275738585497, |
|
"grad_norm": 2.3446291196746922, |
|
"learning_rate": 9.733458508475038e-06, |
|
"loss": 0.4887577533721924, |
|
"memory(GiB)": 37.06, |
|
"step": 375, |
|
"token_acc": 0.8332948510736551, |
|
"train_speed(iter/s)": 0.175371 |
|
}, |
|
{ |
|
"epoch": 0.453595941509997, |
|
"grad_norm": 2.29799169108157, |
|
"learning_rate": 9.722747581493625e-06, |
|
"loss": 0.49045257568359374, |
|
"memory(GiB)": 37.06, |
|
"step": 380, |
|
"token_acc": 0.8406266882766072, |
|
"train_speed(iter/s)": 0.175414 |
|
}, |
|
{ |
|
"epoch": 0.45956430916144436, |
|
"grad_norm": 2.563802674403576, |
|
"learning_rate": 9.711831794991777e-06, |
|
"loss": 0.4675490379333496, |
|
"memory(GiB)": 37.06, |
|
"step": 385, |
|
"token_acc": 0.847358529964502, |
|
"train_speed(iter/s)": 0.175567 |
|
}, |
|
{ |
|
"epoch": 0.4655326768128917, |
|
"grad_norm": 2.480776446284018, |
|
"learning_rate": 9.700711622465645e-06, |
|
"loss": 0.4845867156982422, |
|
"memory(GiB)": 37.06, |
|
"step": 390, |
|
"token_acc": 0.8422996998383745, |
|
"train_speed(iter/s)": 0.17572 |
|
}, |
|
{ |
|
"epoch": 0.471501044464339, |
|
"grad_norm": 2.721044012538843, |
|
"learning_rate": 9.689387546277062e-06, |
|
"loss": 0.46145071983337405, |
|
"memory(GiB)": 37.06, |
|
"step": 395, |
|
"token_acc": 0.8513663630304377, |
|
"train_speed(iter/s)": 0.175882 |
|
}, |
|
{ |
|
"epoch": 0.4774694121157863, |
|
"grad_norm": 2.580126202957563, |
|
"learning_rate": 9.677860057632642e-06, |
|
"loss": 0.5093360424041748, |
|
"memory(GiB)": 37.06, |
|
"step": 400, |
|
"token_acc": 0.8206378986866791, |
|
"train_speed(iter/s)": 0.175987 |
|
}, |
|
{ |
|
"epoch": 0.4774694121157863, |
|
"eval_loss": 0.42347872257232666, |
|
"eval_runtime": 10.9358, |
|
"eval_samples_per_second": 24.598, |
|
"eval_steps_per_second": 3.109, |
|
"eval_token_acc": 0.8527429166438318, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4834377797672337, |
|
"grad_norm": 2.355447977882308, |
|
"learning_rate": 9.66612965656245e-06, |
|
"loss": 0.48992347717285156, |
|
"memory(GiB)": 37.06, |
|
"step": 405, |
|
"token_acc": 0.8608419645840294, |
|
"train_speed(iter/s)": 0.171561 |
|
}, |
|
{ |
|
"epoch": 0.489406147418681, |
|
"grad_norm": 2.0174115419967773, |
|
"learning_rate": 9.654196851898325e-06, |
|
"loss": 0.4750755786895752, |
|
"memory(GiB)": 37.06, |
|
"step": 410, |
|
"token_acc": 0.8274902615470228, |
|
"train_speed(iter/s)": 0.171858 |
|
}, |
|
{ |
|
"epoch": 0.4953745150701283, |
|
"grad_norm": 2.155026242929759, |
|
"learning_rate": 9.642062161251807e-06, |
|
"loss": 0.46627135276794435, |
|
"memory(GiB)": 37.06, |
|
"step": 415, |
|
"token_acc": 0.8661600496277916, |
|
"train_speed(iter/s)": 0.17197 |
|
}, |
|
{ |
|
"epoch": 0.5013428827215757, |
|
"grad_norm": 2.8519922687228174, |
|
"learning_rate": 9.62972611099168e-06, |
|
"loss": 0.4620970726013184, |
|
"memory(GiB)": 37.06, |
|
"step": 420, |
|
"token_acc": 0.8595988538681948, |
|
"train_speed(iter/s)": 0.172268 |
|
}, |
|
{ |
|
"epoch": 0.5073112503730229, |
|
"grad_norm": 2.5658438134794324, |
|
"learning_rate": 9.617189236221143e-06, |
|
"loss": 0.45318241119384767, |
|
"memory(GiB)": 37.06, |
|
"step": 425, |
|
"token_acc": 0.8252274866645748, |
|
"train_speed(iter/s)": 0.172438 |
|
}, |
|
{ |
|
"epoch": 0.5132796180244703, |
|
"grad_norm": 2.2980368916312206, |
|
"learning_rate": 9.604452080754601e-06, |
|
"loss": 0.46477622985839845, |
|
"memory(GiB)": 37.06, |
|
"step": 430, |
|
"token_acc": 0.8681318681318682, |
|
"train_speed(iter/s)": 0.17271 |
|
}, |
|
{ |
|
"epoch": 0.5192479856759177, |
|
"grad_norm": 2.3920351806796925, |
|
"learning_rate": 9.591515197094064e-06, |
|
"loss": 0.43802127838134763, |
|
"memory(GiB)": 37.06, |
|
"step": 435, |
|
"token_acc": 0.8632865550022635, |
|
"train_speed(iter/s)": 0.172963 |
|
}, |
|
{ |
|
"epoch": 0.5252163533273649, |
|
"grad_norm": 2.3926322888936196, |
|
"learning_rate": 9.578379146405202e-06, |
|
"loss": 0.4414364814758301, |
|
"memory(GiB)": 37.06, |
|
"step": 440, |
|
"token_acc": 0.8378196500672948, |
|
"train_speed(iter/s)": 0.173049 |
|
}, |
|
{ |
|
"epoch": 0.5311847209788123, |
|
"grad_norm": 2.5309415862721787, |
|
"learning_rate": 9.565044498492984e-06, |
|
"loss": 0.4737836837768555, |
|
"memory(GiB)": 37.06, |
|
"step": 445, |
|
"token_acc": 0.8400094809196492, |
|
"train_speed(iter/s)": 0.173413 |
|
}, |
|
{ |
|
"epoch": 0.5371530886302597, |
|
"grad_norm": 2.574732220606661, |
|
"learning_rate": 9.551511831776966e-06, |
|
"loss": 0.4299252986907959, |
|
"memory(GiB)": 37.06, |
|
"step": 450, |
|
"token_acc": 0.8394777265745008, |
|
"train_speed(iter/s)": 0.173639 |
|
}, |
|
{ |
|
"epoch": 0.5431214562817069, |
|
"grad_norm": 2.209862389780888, |
|
"learning_rate": 9.53778173326621e-06, |
|
"loss": 0.44927520751953126, |
|
"memory(GiB)": 37.06, |
|
"step": 455, |
|
"token_acc": 0.8641338013627916, |
|
"train_speed(iter/s)": 0.173751 |
|
}, |
|
{ |
|
"epoch": 0.5490898239331543, |
|
"grad_norm": 2.524639918389781, |
|
"learning_rate": 9.523854798533814e-06, |
|
"loss": 0.44107656478881835, |
|
"memory(GiB)": 37.06, |
|
"step": 460, |
|
"token_acc": 0.8868033496967946, |
|
"train_speed(iter/s)": 0.174216 |
|
}, |
|
{ |
|
"epoch": 0.5550581915846016, |
|
"grad_norm": 2.1182849441153215, |
|
"learning_rate": 9.509731631691071e-06, |
|
"loss": 0.43174285888671876, |
|
"memory(GiB)": 37.06, |
|
"step": 465, |
|
"token_acc": 0.855464759959142, |
|
"train_speed(iter/s)": 0.174365 |
|
}, |
|
{ |
|
"epoch": 0.5610265592360489, |
|
"grad_norm": 2.2926487255366688, |
|
"learning_rate": 9.495412845361279e-06, |
|
"loss": 0.48258438110351565, |
|
"memory(GiB)": 37.06, |
|
"step": 470, |
|
"token_acc": 0.8603872818551279, |
|
"train_speed(iter/s)": 0.174664 |
|
}, |
|
{ |
|
"epoch": 0.5669949268874963, |
|
"grad_norm": 2.192746026976168, |
|
"learning_rate": 9.480899060653154e-06, |
|
"loss": 0.4563854217529297, |
|
"memory(GiB)": 37.06, |
|
"step": 475, |
|
"token_acc": 0.8394289067083904, |
|
"train_speed(iter/s)": 0.17502 |
|
}, |
|
{ |
|
"epoch": 0.5729632945389436, |
|
"grad_norm": 2.014209866578747, |
|
"learning_rate": 9.466190907133901e-06, |
|
"loss": 0.4754791259765625, |
|
"memory(GiB)": 37.06, |
|
"step": 480, |
|
"token_acc": 0.8577712609970675, |
|
"train_speed(iter/s)": 0.175025 |
|
}, |
|
{ |
|
"epoch": 0.5789316621903909, |
|
"grad_norm": 2.559320864210838, |
|
"learning_rate": 9.451289022801894e-06, |
|
"loss": 0.47232685089111326, |
|
"memory(GiB)": 37.06, |
|
"step": 485, |
|
"token_acc": 0.8380402225074882, |
|
"train_speed(iter/s)": 0.175186 |
|
}, |
|
{ |
|
"epoch": 0.5849000298418383, |
|
"grad_norm": 2.2053676509330433, |
|
"learning_rate": 9.436194054058998e-06, |
|
"loss": 0.4336155891418457, |
|
"memory(GiB)": 37.06, |
|
"step": 490, |
|
"token_acc": 0.8529990167158309, |
|
"train_speed(iter/s)": 0.175216 |
|
}, |
|
{ |
|
"epoch": 0.5908683974932856, |
|
"grad_norm": 2.46940001428622, |
|
"learning_rate": 9.420906655682553e-06, |
|
"loss": 0.45275249481201174, |
|
"memory(GiB)": 37.06, |
|
"step": 495, |
|
"token_acc": 0.8271080928126768, |
|
"train_speed(iter/s)": 0.175432 |
|
}, |
|
{ |
|
"epoch": 0.5968367651447329, |
|
"grad_norm": 2.3675730058319293, |
|
"learning_rate": 9.405427490796941e-06, |
|
"loss": 0.48803205490112306, |
|
"memory(GiB)": 37.06, |
|
"step": 500, |
|
"token_acc": 0.8432593011741406, |
|
"train_speed(iter/s)": 0.175539 |
|
}, |
|
{ |
|
"epoch": 0.5968367651447329, |
|
"eval_loss": 0.4169776141643524, |
|
"eval_runtime": 10.9599, |
|
"eval_samples_per_second": 24.544, |
|
"eval_steps_per_second": 3.102, |
|
"eval_token_acc": 0.8532361484079575, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6028051327961802, |
|
"grad_norm": 2.1414646330001217, |
|
"learning_rate": 9.389757230844845e-06, |
|
"loss": 0.46323652267456056, |
|
"memory(GiB)": 37.06, |
|
"step": 505, |
|
"token_acc": 0.8552877345904119, |
|
"train_speed(iter/s)": 0.159112 |
|
}, |
|
{ |
|
"epoch": 0.6087735004476276, |
|
"grad_norm": 2.5503273386919667, |
|
"learning_rate": 9.373896555558113e-06, |
|
"loss": 0.4701972961425781, |
|
"memory(GiB)": 37.06, |
|
"step": 510, |
|
"token_acc": 0.8592652620205294, |
|
"train_speed(iter/s)": 0.159422 |
|
}, |
|
{ |
|
"epoch": 0.6147418680990749, |
|
"grad_norm": 2.6125713791079996, |
|
"learning_rate": 9.357846152928275e-06, |
|
"loss": 0.4990544319152832, |
|
"memory(GiB)": 37.06, |
|
"step": 515, |
|
"token_acc": 0.824811732065002, |
|
"train_speed(iter/s)": 0.159707 |
|
}, |
|
{ |
|
"epoch": 0.6207102357505222, |
|
"grad_norm": 1.9353177630019818, |
|
"learning_rate": 9.341606719176695e-06, |
|
"loss": 0.4381883144378662, |
|
"memory(GiB)": 37.06, |
|
"step": 520, |
|
"token_acc": 0.867666063582321, |
|
"train_speed(iter/s)": 0.159909 |
|
}, |
|
{ |
|
"epoch": 0.6266786034019696, |
|
"grad_norm": 2.3284686918748667, |
|
"learning_rate": 9.325178958724387e-06, |
|
"loss": 0.45581645965576173, |
|
"memory(GiB)": 37.06, |
|
"step": 525, |
|
"token_acc": 0.8706395348837209, |
|
"train_speed(iter/s)": 0.160206 |
|
}, |
|
{ |
|
"epoch": 0.6326469710534169, |
|
"grad_norm": 2.2369421417810926, |
|
"learning_rate": 9.308563584161439e-06, |
|
"loss": 0.4688922882080078, |
|
"memory(GiB)": 37.06, |
|
"step": 530, |
|
"token_acc": 0.8338983050847457, |
|
"train_speed(iter/s)": 0.160549 |
|
}, |
|
{ |
|
"epoch": 0.6386153387048642, |
|
"grad_norm": 2.4187058758316202, |
|
"learning_rate": 9.291761316216115e-06, |
|
"loss": 0.43785710334777833, |
|
"memory(GiB)": 37.06, |
|
"step": 535, |
|
"token_acc": 0.8175961715442666, |
|
"train_speed(iter/s)": 0.160901 |
|
}, |
|
{ |
|
"epoch": 0.6445837063563116, |
|
"grad_norm": 2.11230034988461, |
|
"learning_rate": 9.274772883723587e-06, |
|
"loss": 0.4285177707672119, |
|
"memory(GiB)": 37.06, |
|
"step": 540, |
|
"token_acc": 0.8522423025435074, |
|
"train_speed(iter/s)": 0.161093 |
|
}, |
|
{ |
|
"epoch": 0.6505520740077588, |
|
"grad_norm": 2.340278397663115, |
|
"learning_rate": 9.257599023594326e-06, |
|
"loss": 0.4503736972808838, |
|
"memory(GiB)": 37.06, |
|
"step": 545, |
|
"token_acc": 0.8704713049054184, |
|
"train_speed(iter/s)": 0.161286 |
|
}, |
|
{ |
|
"epoch": 0.6565204416592062, |
|
"grad_norm": 2.3913667503479705, |
|
"learning_rate": 9.24024048078213e-06, |
|
"loss": 0.42584834098815916, |
|
"memory(GiB)": 37.06, |
|
"step": 550, |
|
"token_acc": 0.8828032979976443, |
|
"train_speed(iter/s)": 0.161464 |
|
}, |
|
{ |
|
"epoch": 0.6624888093106536, |
|
"grad_norm": 2.2991966974662628, |
|
"learning_rate": 9.222698008251814e-06, |
|
"loss": 0.48091468811035154, |
|
"memory(GiB)": 37.06, |
|
"step": 555, |
|
"token_acc": 0.8286792452830188, |
|
"train_speed(iter/s)": 0.161689 |
|
}, |
|
{ |
|
"epoch": 0.6684571769621008, |
|
"grad_norm": 2.083499198931165, |
|
"learning_rate": 9.204972366946546e-06, |
|
"loss": 0.4586004734039307, |
|
"memory(GiB)": 37.06, |
|
"step": 560, |
|
"token_acc": 0.8503009027081244, |
|
"train_speed(iter/s)": 0.16188 |
|
}, |
|
{ |
|
"epoch": 0.6744255446135482, |
|
"grad_norm": 2.475812664409812, |
|
"learning_rate": 9.187064325754838e-06, |
|
"loss": 0.4561641693115234, |
|
"memory(GiB)": 37.06, |
|
"step": 565, |
|
"token_acc": 0.8384485031067596, |
|
"train_speed(iter/s)": 0.162054 |
|
}, |
|
{ |
|
"epoch": 0.6803939122649956, |
|
"grad_norm": 2.4413316196832984, |
|
"learning_rate": 9.168974661477206e-06, |
|
"loss": 0.43843851089477537, |
|
"memory(GiB)": 37.06, |
|
"step": 570, |
|
"token_acc": 0.839965019676432, |
|
"train_speed(iter/s)": 0.162185 |
|
}, |
|
{ |
|
"epoch": 0.6863622799164428, |
|
"grad_norm": 2.1737549301105075, |
|
"learning_rate": 9.150704158792456e-06, |
|
"loss": 0.4771718502044678, |
|
"memory(GiB)": 37.06, |
|
"step": 575, |
|
"token_acc": 0.8196035642844154, |
|
"train_speed(iter/s)": 0.162359 |
|
}, |
|
{ |
|
"epoch": 0.6923306475678902, |
|
"grad_norm": 2.1356874443108342, |
|
"learning_rate": 9.13225361022366e-06, |
|
"loss": 0.48221721649169924, |
|
"memory(GiB)": 37.06, |
|
"step": 580, |
|
"token_acc": 0.8299897993879632, |
|
"train_speed(iter/s)": 0.162445 |
|
}, |
|
{ |
|
"epoch": 0.6982990152193375, |
|
"grad_norm": 2.3220256859553077, |
|
"learning_rate": 9.113623816103775e-06, |
|
"loss": 0.4806779384613037, |
|
"memory(GiB)": 37.06, |
|
"step": 585, |
|
"token_acc": 0.8411007545494895, |
|
"train_speed(iter/s)": 0.162682 |
|
}, |
|
{ |
|
"epoch": 0.7042673828707848, |
|
"grad_norm": 2.069813477739464, |
|
"learning_rate": 9.094815584540922e-06, |
|
"loss": 0.4947704792022705, |
|
"memory(GiB)": 37.06, |
|
"step": 590, |
|
"token_acc": 0.862796833773087, |
|
"train_speed(iter/s)": 0.162845 |
|
}, |
|
{ |
|
"epoch": 0.7102357505222322, |
|
"grad_norm": 2.252802103709778, |
|
"learning_rate": 9.075829731383342e-06, |
|
"loss": 0.4306300163269043, |
|
"memory(GiB)": 37.06, |
|
"step": 595, |
|
"token_acc": 0.8425353797089894, |
|
"train_speed(iter/s)": 0.163154 |
|
}, |
|
{ |
|
"epoch": 0.7162041181736795, |
|
"grad_norm": 2.241419478853809, |
|
"learning_rate": 9.056667080184004e-06, |
|
"loss": 0.4567378520965576, |
|
"memory(GiB)": 37.06, |
|
"step": 600, |
|
"token_acc": 0.8388354561996361, |
|
"train_speed(iter/s)": 0.163286 |
|
}, |
|
{ |
|
"epoch": 0.7162041181736795, |
|
"eval_loss": 0.41334930062294006, |
|
"eval_runtime": 10.9312, |
|
"eval_samples_per_second": 24.608, |
|
"eval_steps_per_second": 3.11, |
|
"eval_token_acc": 0.8542591476224403, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7221724858251268, |
|
"grad_norm": 2.1208660287310384, |
|
"learning_rate": 9.037328462164866e-06, |
|
"loss": 0.44713678359985354, |
|
"memory(GiB)": 37.06, |
|
"step": 605, |
|
"token_acc": 0.8356246777796872, |
|
"train_speed(iter/s)": 0.151305 |
|
}, |
|
{ |
|
"epoch": 0.7281408534765742, |
|
"grad_norm": 1.9420061515865858, |
|
"learning_rate": 9.01781471618085e-06, |
|
"loss": 0.45147147178649905, |
|
"memory(GiB)": 37.06, |
|
"step": 610, |
|
"token_acc": 0.8882771277816013, |
|
"train_speed(iter/s)": 0.151579 |
|
}, |
|
{ |
|
"epoch": 0.7341092211280215, |
|
"grad_norm": 2.370549361627338, |
|
"learning_rate": 8.998126688683423e-06, |
|
"loss": 0.4287998199462891, |
|
"memory(GiB)": 37.06, |
|
"step": 615, |
|
"token_acc": 0.8318122555410691, |
|
"train_speed(iter/s)": 0.15183 |
|
}, |
|
{ |
|
"epoch": 0.7400775887794688, |
|
"grad_norm": 2.003208951467392, |
|
"learning_rate": 8.978265233683903e-06, |
|
"loss": 0.4494300842285156, |
|
"memory(GiB)": 37.06, |
|
"step": 620, |
|
"token_acc": 0.8252328878088295, |
|
"train_speed(iter/s)": 0.15205 |
|
}, |
|
{ |
|
"epoch": 0.7460459564309161, |
|
"grad_norm": 2.602367805333985, |
|
"learning_rate": 8.9582312127164e-06, |
|
"loss": 0.46652889251708984, |
|
"memory(GiB)": 37.06, |
|
"step": 625, |
|
"token_acc": 0.8474077428118633, |
|
"train_speed(iter/s)": 0.152311 |
|
}, |
|
{ |
|
"epoch": 0.7520143240823635, |
|
"grad_norm": 2.3007477614457765, |
|
"learning_rate": 8.938025494800454e-06, |
|
"loss": 0.46235361099243166, |
|
"memory(GiB)": 37.06, |
|
"step": 630, |
|
"token_acc": 0.8234998744664825, |
|
"train_speed(iter/s)": 0.152632 |
|
}, |
|
{ |
|
"epoch": 0.7579826917338108, |
|
"grad_norm": 2.403260011722763, |
|
"learning_rate": 8.917648956403338e-06, |
|
"loss": 0.4329329490661621, |
|
"memory(GiB)": 37.06, |
|
"step": 635, |
|
"token_acc": 0.8512756689483509, |
|
"train_speed(iter/s)": 0.152969 |
|
}, |
|
{ |
|
"epoch": 0.7639510593852581, |
|
"grad_norm": 1.8459463363591184, |
|
"learning_rate": 8.897102481402031e-06, |
|
"loss": 0.45981664657592775, |
|
"memory(GiB)": 37.06, |
|
"step": 640, |
|
"token_acc": 0.8598321614878657, |
|
"train_speed(iter/s)": 0.153182 |
|
}, |
|
{ |
|
"epoch": 0.7699194270367055, |
|
"grad_norm": 2.0204814112895044, |
|
"learning_rate": 8.876386961044892e-06, |
|
"loss": 0.46657752990722656, |
|
"memory(GiB)": 37.06, |
|
"step": 645, |
|
"token_acc": 0.8745874587458746, |
|
"train_speed(iter/s)": 0.153345 |
|
}, |
|
{ |
|
"epoch": 0.7758877946881528, |
|
"grad_norm": 1.8481808083298177, |
|
"learning_rate": 8.855503293912987e-06, |
|
"loss": 0.4649078369140625, |
|
"memory(GiB)": 37.06, |
|
"step": 650, |
|
"token_acc": 0.8592820512820513, |
|
"train_speed(iter/s)": 0.153498 |
|
}, |
|
{ |
|
"epoch": 0.7818561623396001, |
|
"grad_norm": 2.2884914044841698, |
|
"learning_rate": 8.834452385881121e-06, |
|
"loss": 0.4653633117675781, |
|
"memory(GiB)": 37.06, |
|
"step": 655, |
|
"token_acc": 0.8515602216389618, |
|
"train_speed(iter/s)": 0.153659 |
|
}, |
|
{ |
|
"epoch": 0.7878245299910475, |
|
"grad_norm": 2.173340273942357, |
|
"learning_rate": 8.813235150078532e-06, |
|
"loss": 0.46648712158203126, |
|
"memory(GiB)": 37.06, |
|
"step": 660, |
|
"token_acc": 0.8156269959548648, |
|
"train_speed(iter/s)": 0.153953 |
|
}, |
|
{ |
|
"epoch": 0.7937928976424948, |
|
"grad_norm": 2.2191296614587563, |
|
"learning_rate": 8.791852506849301e-06, |
|
"loss": 0.45751609802246096, |
|
"memory(GiB)": 37.06, |
|
"step": 665, |
|
"token_acc": 0.8260312580066616, |
|
"train_speed(iter/s)": 0.154161 |
|
}, |
|
{ |
|
"epoch": 0.7997612652939421, |
|
"grad_norm": 2.2870388856485335, |
|
"learning_rate": 8.770305383712407e-06, |
|
"loss": 0.4709470748901367, |
|
"memory(GiB)": 37.06, |
|
"step": 670, |
|
"token_acc": 0.842337607735968, |
|
"train_speed(iter/s)": 0.154453 |
|
}, |
|
{ |
|
"epoch": 0.8057296329453895, |
|
"grad_norm": 2.3046312751781866, |
|
"learning_rate": 8.748594715321512e-06, |
|
"loss": 0.44265017509460447, |
|
"memory(GiB)": 37.06, |
|
"step": 675, |
|
"token_acc": 0.8602195071443363, |
|
"train_speed(iter/s)": 0.154677 |
|
}, |
|
{ |
|
"epoch": 0.8116980005968367, |
|
"grad_norm": 2.2464744707673985, |
|
"learning_rate": 8.726721443424409e-06, |
|
"loss": 0.4592324733734131, |
|
"memory(GiB)": 37.06, |
|
"step": 680, |
|
"token_acc": 0.8654945054945055, |
|
"train_speed(iter/s)": 0.154905 |
|
}, |
|
{ |
|
"epoch": 0.8176663682482841, |
|
"grad_norm": 2.194092144648434, |
|
"learning_rate": 8.704686516822177e-06, |
|
"loss": 0.43160429000854494, |
|
"memory(GiB)": 37.06, |
|
"step": 685, |
|
"token_acc": 0.8649193548387096, |
|
"train_speed(iter/s)": 0.155078 |
|
}, |
|
{ |
|
"epoch": 0.8236347358997315, |
|
"grad_norm": 2.247411516392796, |
|
"learning_rate": 8.682490891328016e-06, |
|
"loss": 0.45626983642578123, |
|
"memory(GiB)": 37.06, |
|
"step": 690, |
|
"token_acc": 0.8643364928909952, |
|
"train_speed(iter/s)": 0.155279 |
|
}, |
|
{ |
|
"epoch": 0.8296031035511787, |
|
"grad_norm": 2.035754411138357, |
|
"learning_rate": 8.660135529725799e-06, |
|
"loss": 0.4315452575683594, |
|
"memory(GiB)": 37.06, |
|
"step": 695, |
|
"token_acc": 0.8554044380816035, |
|
"train_speed(iter/s)": 0.155502 |
|
}, |
|
{ |
|
"epoch": 0.8355714712026261, |
|
"grad_norm": 2.292286762424394, |
|
"learning_rate": 8.6376214017283e-06, |
|
"loss": 0.4535685539245605, |
|
"memory(GiB)": 37.06, |
|
"step": 700, |
|
"token_acc": 0.833079268292683, |
|
"train_speed(iter/s)": 0.155636 |
|
}, |
|
{ |
|
"epoch": 0.8355714712026261, |
|
"eval_loss": 0.4100053906440735, |
|
"eval_runtime": 10.9163, |
|
"eval_samples_per_second": 24.642, |
|
"eval_steps_per_second": 3.115, |
|
"eval_token_acc": 0.8548802542883762, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8415398388540735, |
|
"grad_norm": 2.6314360636405714, |
|
"learning_rate": 8.61494948393513e-06, |
|
"loss": 0.4539949417114258, |
|
"memory(GiB)": 37.06, |
|
"step": 705, |
|
"token_acc": 0.8583042973286876, |
|
"train_speed(iter/s)": 0.146478 |
|
}, |
|
{ |
|
"epoch": 0.8475082065055207, |
|
"grad_norm": 2.1848010999728715, |
|
"learning_rate": 8.592120759790383e-06, |
|
"loss": 0.46171207427978517, |
|
"memory(GiB)": 37.06, |
|
"step": 710, |
|
"token_acc": 0.8417105263157895, |
|
"train_speed(iter/s)": 0.146671 |
|
}, |
|
{ |
|
"epoch": 0.8534765741569681, |
|
"grad_norm": 2.447774461275868, |
|
"learning_rate": 8.56913621953997e-06, |
|
"loss": 0.4798592567443848, |
|
"memory(GiB)": 37.06, |
|
"step": 715, |
|
"token_acc": 0.8562048588312541, |
|
"train_speed(iter/s)": 0.146953 |
|
}, |
|
{ |
|
"epoch": 0.8594449418084154, |
|
"grad_norm": 2.596951485691162, |
|
"learning_rate": 8.545996860188668e-06, |
|
"loss": 0.4231537342071533, |
|
"memory(GiB)": 37.06, |
|
"step": 720, |
|
"token_acc": 0.831799700406591, |
|
"train_speed(iter/s)": 0.147232 |
|
}, |
|
{ |
|
"epoch": 0.8654133094598627, |
|
"grad_norm": 2.0232163854750027, |
|
"learning_rate": 8.522703685456866e-06, |
|
"loss": 0.44301156997680663, |
|
"memory(GiB)": 37.06, |
|
"step": 725, |
|
"token_acc": 0.8794139744552968, |
|
"train_speed(iter/s)": 0.1475 |
|
}, |
|
{ |
|
"epoch": 0.8713816771113101, |
|
"grad_norm": 2.281907577430269, |
|
"learning_rate": 8.49925770573704e-06, |
|
"loss": 0.46319947242736814, |
|
"memory(GiB)": 37.06, |
|
"step": 730, |
|
"token_acc": 0.8430570505920344, |
|
"train_speed(iter/s)": 0.147765 |
|
}, |
|
{ |
|
"epoch": 0.8773500447627574, |
|
"grad_norm": 2.190179810988922, |
|
"learning_rate": 8.475659938049912e-06, |
|
"loss": 0.4825079917907715, |
|
"memory(GiB)": 37.06, |
|
"step": 735, |
|
"token_acc": 0.839588377723971, |
|
"train_speed(iter/s)": 0.147996 |
|
}, |
|
{ |
|
"epoch": 0.8833184124142047, |
|
"grad_norm": 2.014804370593861, |
|
"learning_rate": 8.45191140600034e-06, |
|
"loss": 0.454302978515625, |
|
"memory(GiB)": 37.06, |
|
"step": 740, |
|
"token_acc": 0.8007774538386784, |
|
"train_speed(iter/s)": 0.148279 |
|
}, |
|
{ |
|
"epoch": 0.8892867800656521, |
|
"grad_norm": 2.1256355584342077, |
|
"learning_rate": 8.42801313973292e-06, |
|
"loss": 0.4445801258087158, |
|
"memory(GiB)": 37.06, |
|
"step": 745, |
|
"token_acc": 0.846286205907657, |
|
"train_speed(iter/s)": 0.148536 |
|
}, |
|
{ |
|
"epoch": 0.8952551477170994, |
|
"grad_norm": 2.6544295779283575, |
|
"learning_rate": 8.403966175887293e-06, |
|
"loss": 0.4630784511566162, |
|
"memory(GiB)": 37.06, |
|
"step": 750, |
|
"token_acc": 0.8537764350453172, |
|
"train_speed(iter/s)": 0.148704 |
|
}, |
|
{ |
|
"epoch": 0.9012235153685467, |
|
"grad_norm": 2.4745309667627255, |
|
"learning_rate": 8.379771557553184e-06, |
|
"loss": 0.43903446197509766, |
|
"memory(GiB)": 37.06, |
|
"step": 755, |
|
"token_acc": 0.8682237600922722, |
|
"train_speed(iter/s)": 0.148945 |
|
}, |
|
{ |
|
"epoch": 0.907191883019994, |
|
"grad_norm": 2.167884085714607, |
|
"learning_rate": 8.355430334225159e-06, |
|
"loss": 0.445455265045166, |
|
"memory(GiB)": 37.06, |
|
"step": 760, |
|
"token_acc": 0.852589641434263, |
|
"train_speed(iter/s)": 0.149189 |
|
}, |
|
{ |
|
"epoch": 0.9131602506714414, |
|
"grad_norm": 2.3516013470748116, |
|
"learning_rate": 8.330943561757092e-06, |
|
"loss": 0.44769630432128904, |
|
"memory(GiB)": 37.06, |
|
"step": 765, |
|
"token_acc": 0.8217955651703623, |
|
"train_speed(iter/s)": 0.149338 |
|
}, |
|
{ |
|
"epoch": 0.9191286183228887, |
|
"grad_norm": 2.0619205640970506, |
|
"learning_rate": 8.30631230231637e-06, |
|
"loss": 0.46817874908447266, |
|
"memory(GiB)": 37.06, |
|
"step": 770, |
|
"token_acc": 0.8363870967741935, |
|
"train_speed(iter/s)": 0.149487 |
|
}, |
|
{ |
|
"epoch": 0.925096985974336, |
|
"grad_norm": 2.3440589362137993, |
|
"learning_rate": 8.281537624337823e-06, |
|
"loss": 0.4982964038848877, |
|
"memory(GiB)": 37.06, |
|
"step": 775, |
|
"token_acc": 0.8594432314410481, |
|
"train_speed(iter/s)": 0.149779 |
|
}, |
|
{ |
|
"epoch": 0.9310653536257834, |
|
"grad_norm": 2.0757541904974097, |
|
"learning_rate": 8.256620602477372e-06, |
|
"loss": 0.4509378433227539, |
|
"memory(GiB)": 37.06, |
|
"step": 780, |
|
"token_acc": 0.8259721555448872, |
|
"train_speed(iter/s)": 0.149971 |
|
}, |
|
{ |
|
"epoch": 0.9370337212772307, |
|
"grad_norm": 2.086378932611534, |
|
"learning_rate": 8.231562317565412e-06, |
|
"loss": 0.43694629669189455, |
|
"memory(GiB)": 37.06, |
|
"step": 785, |
|
"token_acc": 0.856384262611634, |
|
"train_speed(iter/s)": 0.150204 |
|
}, |
|
{ |
|
"epoch": 0.943002088928678, |
|
"grad_norm": 2.308538899901496, |
|
"learning_rate": 8.206363856559935e-06, |
|
"loss": 0.4430408477783203, |
|
"memory(GiB)": 37.06, |
|
"step": 790, |
|
"token_acc": 0.8422222222222222, |
|
"train_speed(iter/s)": 0.15035 |
|
}, |
|
{ |
|
"epoch": 0.9489704565801254, |
|
"grad_norm": 1.8314796079076852, |
|
"learning_rate": 8.181026312499383e-06, |
|
"loss": 0.44437146186828613, |
|
"memory(GiB)": 37.06, |
|
"step": 795, |
|
"token_acc": 0.8529804865009356, |
|
"train_speed(iter/s)": 0.150549 |
|
}, |
|
{ |
|
"epoch": 0.9549388242315726, |
|
"grad_norm": 2.2397424826021792, |
|
"learning_rate": 8.155550784455224e-06, |
|
"loss": 0.4815809726715088, |
|
"memory(GiB)": 37.06, |
|
"step": 800, |
|
"token_acc": 0.8588266107909901, |
|
"train_speed(iter/s)": 0.150753 |
|
}, |
|
{ |
|
"epoch": 0.9549388242315726, |
|
"eval_loss": 0.4058806300163269, |
|
"eval_runtime": 11.0737, |
|
"eval_samples_per_second": 24.292, |
|
"eval_steps_per_second": 3.07, |
|
"eval_token_acc": 0.8572368060503096, |
|
"step": 800 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2511, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 88119181914112.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|