deepseekJanusPro1bms-swift123 / trainer_state.json
gauravparajuli's picture
update files
902563b verified
{
"best_global_step": 43,
"best_metric": 3.79597425,
"best_model_checkpoint": "/workspace/output/v0-20250510-202602/checkpoint-43",
"epoch": 0.9842632331902719,
"eval_steps": 200,
"global_step": 43,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022889842632331903,
"grad_norm": 0.5022401213645935,
"learning_rate": 2.5e-05,
"loss": 5.9138689041137695,
"memory(GiB)": 22.25,
"step": 1,
"token_acc": 0.2735191637630662,
"train_speed(iter/s)": 0.017618
},
{
"epoch": 0.045779685264663805,
"grad_norm": 0.4973876178264618,
"learning_rate": 5e-05,
"loss": 6.206646919250488,
"memory(GiB)": 22.25,
"step": 2,
"token_acc": 0.25411334552102377,
"train_speed(iter/s)": 0.024222
},
{
"epoch": 0.06866952789699571,
"grad_norm": 0.520351767539978,
"learning_rate": 4.992664502959351e-05,
"loss": 5.884594917297363,
"memory(GiB)": 22.25,
"step": 3,
"token_acc": 0.26119402985074625,
"train_speed(iter/s)": 0.027671
},
{
"epoch": 0.09155937052932761,
"grad_norm": 0.6917837262153625,
"learning_rate": 4.970701059450872e-05,
"loss": 5.813294887542725,
"memory(GiB)": 22.25,
"step": 4,
"token_acc": 0.2789115646258503,
"train_speed(iter/s)": 0.02975
},
{
"epoch": 0.11444921316165951,
"grad_norm": 0.8174898028373718,
"learning_rate": 4.934238559694448e-05,
"loss": 6.142425537109375,
"memory(GiB)": 22.25,
"step": 5,
"token_acc": 0.20984455958549222,
"train_speed(iter/s)": 0.031187
},
{
"epoch": 0.13733905579399142,
"grad_norm": 0.5081659555435181,
"learning_rate": 4.8834909801373264e-05,
"loss": 5.509262561798096,
"memory(GiB)": 22.25,
"step": 6,
"token_acc": 0.29264214046822745,
"train_speed(iter/s)": 0.032136
},
{
"epoch": 0.16022889842632332,
"grad_norm": 0.5285544395446777,
"learning_rate": 4.8187561277552374e-05,
"loss": 5.453015327453613,
"memory(GiB)": 22.25,
"step": 7,
"token_acc": 0.33134328358208953,
"train_speed(iter/s)": 0.032853
},
{
"epoch": 0.18311874105865522,
"grad_norm": 0.6126793026924133,
"learning_rate": 4.740413892402639e-05,
"loss": 5.514800071716309,
"memory(GiB)": 22.25,
"step": 8,
"token_acc": 0.24347826086956523,
"train_speed(iter/s)": 0.033481
},
{
"epoch": 0.20600858369098712,
"grad_norm": 0.5079677104949951,
"learning_rate": 4.648924017468003e-05,
"loss": 5.397139549255371,
"memory(GiB)": 22.25,
"step": 9,
"token_acc": 0.2693069306930693,
"train_speed(iter/s)": 0.033962
},
{
"epoch": 0.22889842632331903,
"grad_norm": 0.5848721861839294,
"learning_rate": 4.5448234019167945e-05,
"loss": 5.021652698516846,
"memory(GiB)": 22.25,
"step": 10,
"token_acc": 0.32229965156794427,
"train_speed(iter/s)": 0.034354
},
{
"epoch": 0.25178826895565093,
"grad_norm": 0.4369657635688782,
"learning_rate": 4.428722949554857e-05,
"loss": 5.207980155944824,
"memory(GiB)": 22.25,
"step": 11,
"token_acc": 0.34467455621301774,
"train_speed(iter/s)": 0.034653
},
{
"epoch": 0.27467811158798283,
"grad_norm": 0.7269682884216309,
"learning_rate": 4.301303984001967e-05,
"loss": 5.160121917724609,
"memory(GiB)": 22.25,
"step": 12,
"token_acc": 0.34941763727121466,
"train_speed(iter/s)": 0.034904
},
{
"epoch": 0.29756795422031473,
"grad_norm": 0.829106867313385,
"learning_rate": 4.163314250413913e-05,
"loss": 4.662051200866699,
"memory(GiB)": 22.25,
"step": 13,
"token_acc": 0.32751091703056767,
"train_speed(iter/s)": 0.035138
},
{
"epoch": 0.32045779685264664,
"grad_norm": 1.1529988050460815,
"learning_rate": 4.015563527416595e-05,
"loss": 5.173630237579346,
"memory(GiB)": 22.25,
"step": 14,
"token_acc": 0.28865979381443296,
"train_speed(iter/s)": 0.035337
},
{
"epoch": 0.34334763948497854,
"grad_norm": 0.7239392995834351,
"learning_rate": 3.858918875003053e-05,
"loss": 4.88520622253418,
"memory(GiB)": 22.25,
"step": 15,
"token_acc": 0.332089552238806,
"train_speed(iter/s)": 0.035497
},
{
"epoch": 0.36623748211731044,
"grad_norm": 0.5255656838417053,
"learning_rate": 3.694299546280657e-05,
"loss": 4.789463043212891,
"memory(GiB)": 22.25,
"step": 16,
"token_acc": 0.36082474226804123,
"train_speed(iter/s)": 0.035644
},
{
"epoch": 0.38912732474964234,
"grad_norm": 0.527284562587738,
"learning_rate": 3.5226715929283506e-05,
"loss": 5.008277416229248,
"memory(GiB)": 22.25,
"step": 17,
"token_acc": 0.3034188034188034,
"train_speed(iter/s)": 0.035798
},
{
"epoch": 0.41201716738197425,
"grad_norm": 0.6423527002334595,
"learning_rate": 3.3450421960212566e-05,
"loss": 4.778470039367676,
"memory(GiB)": 22.25,
"step": 18,
"token_acc": 0.3488372093023256,
"train_speed(iter/s)": 0.035907
},
{
"epoch": 0.43490701001430615,
"grad_norm": 0.4906652867794037,
"learning_rate": 3.162453755491655e-05,
"loss": 4.682660102844238,
"memory(GiB)": 22.25,
"step": 19,
"token_acc": 0.35555555555555557,
"train_speed(iter/s)": 0.036025
},
{
"epoch": 0.45779685264663805,
"grad_norm": 0.9560534358024597,
"learning_rate": 2.975977772911671e-05,
"loss": 4.940546989440918,
"memory(GiB)": 22.25,
"step": 20,
"token_acc": 0.36923076923076925,
"train_speed(iter/s)": 0.036093
},
{
"epoch": 0.48068669527896996,
"grad_norm": 0.5544789433479309,
"learning_rate": 2.7867085634960016e-05,
"loss": 4.366146087646484,
"memory(GiB)": 22.25,
"step": 21,
"token_acc": 0.3649906890130354,
"train_speed(iter/s)": 0.036168
},
{
"epoch": 0.5035765379113019,
"grad_norm": 0.4951302111148834,
"learning_rate": 2.595756834225089e-05,
"loss": 4.866259574890137,
"memory(GiB)": 22.25,
"step": 22,
"token_acc": 0.34402852049910876,
"train_speed(iter/s)": 0.036268
},
{
"epoch": 0.5264663805436338,
"grad_norm": 1.56654953956604,
"learning_rate": 2.4042431657749117e-05,
"loss": 4.790994644165039,
"memory(GiB)": 22.25,
"step": 23,
"token_acc": 0.3361522198731501,
"train_speed(iter/s)": 0.03635
},
{
"epoch": 0.5493562231759657,
"grad_norm": 0.529353678226471,
"learning_rate": 2.2132914365039993e-05,
"loss": 4.498373985290527,
"memory(GiB)": 22.25,
"step": 24,
"token_acc": 0.38278388278388276,
"train_speed(iter/s)": 0.036412
},
{
"epoch": 0.5722460658082976,
"grad_norm": 0.5923216342926025,
"learning_rate": 2.0240222270883288e-05,
"loss": 4.431886672973633,
"memory(GiB)": 22.25,
"step": 25,
"token_acc": 0.3901345291479821,
"train_speed(iter/s)": 0.036468
},
{
"epoch": 0.5951359084406295,
"grad_norm": 0.5044678449630737,
"learning_rate": 1.8375462445083464e-05,
"loss": 4.577709674835205,
"memory(GiB)": 22.25,
"step": 26,
"token_acc": 0.3509803921568627,
"train_speed(iter/s)": 0.036523
},
{
"epoch": 0.6180257510729614,
"grad_norm": 0.8515617251396179,
"learning_rate": 1.6549578039787436e-05,
"loss": 3.797635555267334,
"memory(GiB)": 22.25,
"step": 27,
"token_acc": 0.40134907251264756,
"train_speed(iter/s)": 0.036566
},
{
"epoch": 0.6409155937052933,
"grad_norm": 0.9012308120727539,
"learning_rate": 1.4773284070716503e-05,
"loss": 4.415590286254883,
"memory(GiB)": 22.25,
"step": 28,
"token_acc": 0.38589981447124305,
"train_speed(iter/s)": 0.036597
},
{
"epoch": 0.6638054363376252,
"grad_norm": 0.5051128268241882,
"learning_rate": 1.3057004537193423e-05,
"loss": 4.514218330383301,
"memory(GiB)": 22.25,
"step": 29,
"token_acc": 0.3765541740674956,
"train_speed(iter/s)": 0.036643
},
{
"epoch": 0.6866952789699571,
"grad_norm": 0.8118892908096313,
"learning_rate": 1.1410811249969475e-05,
"loss": 4.161840915679932,
"memory(GiB)": 22.25,
"step": 30,
"token_acc": 0.35412474849094566,
"train_speed(iter/s)": 0.036683
},
{
"epoch": 0.709585121602289,
"grad_norm": 0.7509729266166687,
"learning_rate": 9.844364725834057e-06,
"loss": 4.108524799346924,
"memory(GiB)": 22.25,
"step": 31,
"token_acc": 0.4240924092409241,
"train_speed(iter/s)": 0.036725
},
{
"epoch": 0.7324749642346209,
"grad_norm": 0.6745265126228333,
"learning_rate": 8.36685749586087e-06,
"loss": 4.507699489593506,
"memory(GiB)": 22.25,
"step": 32,
"token_acc": 0.35660377358490564,
"train_speed(iter/s)": 0.036768
},
{
"epoch": 0.7553648068669528,
"grad_norm": 0.5046018958091736,
"learning_rate": 6.986960159980327e-06,
"loss": 4.469419479370117,
"memory(GiB)": 22.25,
"step": 33,
"token_acc": 0.41550387596899224,
"train_speed(iter/s)": 0.036795
},
{
"epoch": 0.7782546494992847,
"grad_norm": 0.6278886198997498,
"learning_rate": 5.712770504451426e-06,
"loss": 4.4875640869140625,
"memory(GiB)": 22.25,
"step": 34,
"token_acc": 0.386411889596603,
"train_speed(iter/s)": 0.036831
},
{
"epoch": 0.8011444921316166,
"grad_norm": 1.2817845344543457,
"learning_rate": 4.551765980832059e-06,
"loss": 4.035043239593506,
"memory(GiB)": 22.25,
"step": 35,
"token_acc": 0.39222042139384117,
"train_speed(iter/s)": 0.036858
},
{
"epoch": 0.8240343347639485,
"grad_norm": 0.6294739246368408,
"learning_rate": 3.5107598253199758e-06,
"loss": 3.905367612838745,
"memory(GiB)": 22.25,
"step": 36,
"token_acc": 0.4039301310043668,
"train_speed(iter/s)": 0.036893
},
{
"epoch": 0.8469241773962805,
"grad_norm": 0.5308797359466553,
"learning_rate": 2.595861075973613e-06,
"loss": 3.832357883453369,
"memory(GiB)": 22.25,
"step": 37,
"token_acc": 0.37555555555555553,
"train_speed(iter/s)": 0.03692
},
{
"epoch": 0.8698140200286123,
"grad_norm": 0.613280177116394,
"learning_rate": 1.8124387224476347e-06,
"loss": 3.510023832321167,
"memory(GiB)": 22.25,
"step": 38,
"token_acc": 0.41849529780564265,
"train_speed(iter/s)": 0.036949
},
{
"epoch": 0.8927038626609443,
"grad_norm": 0.5897545218467712,
"learning_rate": 1.1650901986267365e-06,
"loss": 4.297924041748047,
"memory(GiB)": 22.25,
"step": 39,
"token_acc": 0.38562091503267976,
"train_speed(iter/s)": 0.036961
},
{
"epoch": 0.9155937052932761,
"grad_norm": 0.5033223032951355,
"learning_rate": 6.576144030555259e-07,
"loss": 3.912318229675293,
"memory(GiB)": 22.25,
"step": 40,
"token_acc": 0.3920792079207921,
"train_speed(iter/s)": 0.036991
},
{
"epoch": 0.9384835479256081,
"grad_norm": 0.44826453924179077,
"learning_rate": 2.9298940549128964e-07,
"loss": 3.7790920734405518,
"memory(GiB)": 22.25,
"step": 41,
"token_acc": 0.4283464566929134,
"train_speed(iter/s)": 0.03701
},
{
"epoch": 0.9613733905579399,
"grad_norm": 0.8731946349143982,
"learning_rate": 7.335497040648898e-08,
"loss": 4.0045576095581055,
"memory(GiB)": 22.25,
"step": 42,
"token_acc": 0.3923076923076923,
"train_speed(iter/s)": 0.03704
},
{
"epoch": 0.9842632331902719,
"grad_norm": 1.097395420074463,
"learning_rate": 0.0,
"loss": 4.415482521057129,
"memory(GiB)": 22.25,
"step": 43,
"token_acc": 0.4146341463414634,
"train_speed(iter/s)": 0.037059
},
{
"epoch": 0.9842632331902719,
"eval_loss": 3.7959742546081543,
"eval_runtime": 29.3121,
"eval_samples_per_second": 9.996,
"eval_steps_per_second": 1.262,
"eval_token_acc": 0.4216255442670537,
"step": 43
}
],
"logging_steps": 1,
"max_steps": 43,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.042062092776243e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}