|
{ |
|
"best_global_step": 43, |
|
"best_metric": 3.79597425, |
|
"best_model_checkpoint": "/workspace/output/v0-20250510-202602/checkpoint-43", |
|
"epoch": 0.9842632331902719, |
|
"eval_steps": 200, |
|
"global_step": 43, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.022889842632331903, |
|
"grad_norm": 0.5022401213645935, |
|
"learning_rate": 2.5e-05, |
|
"loss": 5.9138689041137695, |
|
"memory(GiB)": 22.25, |
|
"step": 1, |
|
"token_acc": 0.2735191637630662, |
|
"train_speed(iter/s)": 0.017618 |
|
}, |
|
{ |
|
"epoch": 0.045779685264663805, |
|
"grad_norm": 0.4973876178264618, |
|
"learning_rate": 5e-05, |
|
"loss": 6.206646919250488, |
|
"memory(GiB)": 22.25, |
|
"step": 2, |
|
"token_acc": 0.25411334552102377, |
|
"train_speed(iter/s)": 0.024222 |
|
}, |
|
{ |
|
"epoch": 0.06866952789699571, |
|
"grad_norm": 0.520351767539978, |
|
"learning_rate": 4.992664502959351e-05, |
|
"loss": 5.884594917297363, |
|
"memory(GiB)": 22.25, |
|
"step": 3, |
|
"token_acc": 0.26119402985074625, |
|
"train_speed(iter/s)": 0.027671 |
|
}, |
|
{ |
|
"epoch": 0.09155937052932761, |
|
"grad_norm": 0.6917837262153625, |
|
"learning_rate": 4.970701059450872e-05, |
|
"loss": 5.813294887542725, |
|
"memory(GiB)": 22.25, |
|
"step": 4, |
|
"token_acc": 0.2789115646258503, |
|
"train_speed(iter/s)": 0.02975 |
|
}, |
|
{ |
|
"epoch": 0.11444921316165951, |
|
"grad_norm": 0.8174898028373718, |
|
"learning_rate": 4.934238559694448e-05, |
|
"loss": 6.142425537109375, |
|
"memory(GiB)": 22.25, |
|
"step": 5, |
|
"token_acc": 0.20984455958549222, |
|
"train_speed(iter/s)": 0.031187 |
|
}, |
|
{ |
|
"epoch": 0.13733905579399142, |
|
"grad_norm": 0.5081659555435181, |
|
"learning_rate": 4.8834909801373264e-05, |
|
"loss": 5.509262561798096, |
|
"memory(GiB)": 22.25, |
|
"step": 6, |
|
"token_acc": 0.29264214046822745, |
|
"train_speed(iter/s)": 0.032136 |
|
}, |
|
{ |
|
"epoch": 0.16022889842632332, |
|
"grad_norm": 0.5285544395446777, |
|
"learning_rate": 4.8187561277552374e-05, |
|
"loss": 5.453015327453613, |
|
"memory(GiB)": 22.25, |
|
"step": 7, |
|
"token_acc": 0.33134328358208953, |
|
"train_speed(iter/s)": 0.032853 |
|
}, |
|
{ |
|
"epoch": 0.18311874105865522, |
|
"grad_norm": 0.6126793026924133, |
|
"learning_rate": 4.740413892402639e-05, |
|
"loss": 5.514800071716309, |
|
"memory(GiB)": 22.25, |
|
"step": 8, |
|
"token_acc": 0.24347826086956523, |
|
"train_speed(iter/s)": 0.033481 |
|
}, |
|
{ |
|
"epoch": 0.20600858369098712, |
|
"grad_norm": 0.5079677104949951, |
|
"learning_rate": 4.648924017468003e-05, |
|
"loss": 5.397139549255371, |
|
"memory(GiB)": 22.25, |
|
"step": 9, |
|
"token_acc": 0.2693069306930693, |
|
"train_speed(iter/s)": 0.033962 |
|
}, |
|
{ |
|
"epoch": 0.22889842632331903, |
|
"grad_norm": 0.5848721861839294, |
|
"learning_rate": 4.5448234019167945e-05, |
|
"loss": 5.021652698516846, |
|
"memory(GiB)": 22.25, |
|
"step": 10, |
|
"token_acc": 0.32229965156794427, |
|
"train_speed(iter/s)": 0.034354 |
|
}, |
|
{ |
|
"epoch": 0.25178826895565093, |
|
"grad_norm": 0.4369657635688782, |
|
"learning_rate": 4.428722949554857e-05, |
|
"loss": 5.207980155944824, |
|
"memory(GiB)": 22.25, |
|
"step": 11, |
|
"token_acc": 0.34467455621301774, |
|
"train_speed(iter/s)": 0.034653 |
|
}, |
|
{ |
|
"epoch": 0.27467811158798283, |
|
"grad_norm": 0.7269682884216309, |
|
"learning_rate": 4.301303984001967e-05, |
|
"loss": 5.160121917724609, |
|
"memory(GiB)": 22.25, |
|
"step": 12, |
|
"token_acc": 0.34941763727121466, |
|
"train_speed(iter/s)": 0.034904 |
|
}, |
|
{ |
|
"epoch": 0.29756795422031473, |
|
"grad_norm": 0.829106867313385, |
|
"learning_rate": 4.163314250413913e-05, |
|
"loss": 4.662051200866699, |
|
"memory(GiB)": 22.25, |
|
"step": 13, |
|
"token_acc": 0.32751091703056767, |
|
"train_speed(iter/s)": 0.035138 |
|
}, |
|
{ |
|
"epoch": 0.32045779685264664, |
|
"grad_norm": 1.1529988050460815, |
|
"learning_rate": 4.015563527416595e-05, |
|
"loss": 5.173630237579346, |
|
"memory(GiB)": 22.25, |
|
"step": 14, |
|
"token_acc": 0.28865979381443296, |
|
"train_speed(iter/s)": 0.035337 |
|
}, |
|
{ |
|
"epoch": 0.34334763948497854, |
|
"grad_norm": 0.7239392995834351, |
|
"learning_rate": 3.858918875003053e-05, |
|
"loss": 4.88520622253418, |
|
"memory(GiB)": 22.25, |
|
"step": 15, |
|
"token_acc": 0.332089552238806, |
|
"train_speed(iter/s)": 0.035497 |
|
}, |
|
{ |
|
"epoch": 0.36623748211731044, |
|
"grad_norm": 0.5255656838417053, |
|
"learning_rate": 3.694299546280657e-05, |
|
"loss": 4.789463043212891, |
|
"memory(GiB)": 22.25, |
|
"step": 16, |
|
"token_acc": 0.36082474226804123, |
|
"train_speed(iter/s)": 0.035644 |
|
}, |
|
{ |
|
"epoch": 0.38912732474964234, |
|
"grad_norm": 0.527284562587738, |
|
"learning_rate": 3.5226715929283506e-05, |
|
"loss": 5.008277416229248, |
|
"memory(GiB)": 22.25, |
|
"step": 17, |
|
"token_acc": 0.3034188034188034, |
|
"train_speed(iter/s)": 0.035798 |
|
}, |
|
{ |
|
"epoch": 0.41201716738197425, |
|
"grad_norm": 0.6423527002334595, |
|
"learning_rate": 3.3450421960212566e-05, |
|
"loss": 4.778470039367676, |
|
"memory(GiB)": 22.25, |
|
"step": 18, |
|
"token_acc": 0.3488372093023256, |
|
"train_speed(iter/s)": 0.035907 |
|
}, |
|
{ |
|
"epoch": 0.43490701001430615, |
|
"grad_norm": 0.4906652867794037, |
|
"learning_rate": 3.162453755491655e-05, |
|
"loss": 4.682660102844238, |
|
"memory(GiB)": 22.25, |
|
"step": 19, |
|
"token_acc": 0.35555555555555557, |
|
"train_speed(iter/s)": 0.036025 |
|
}, |
|
{ |
|
"epoch": 0.45779685264663805, |
|
"grad_norm": 0.9560534358024597, |
|
"learning_rate": 2.975977772911671e-05, |
|
"loss": 4.940546989440918, |
|
"memory(GiB)": 22.25, |
|
"step": 20, |
|
"token_acc": 0.36923076923076925, |
|
"train_speed(iter/s)": 0.036093 |
|
}, |
|
{ |
|
"epoch": 0.48068669527896996, |
|
"grad_norm": 0.5544789433479309, |
|
"learning_rate": 2.7867085634960016e-05, |
|
"loss": 4.366146087646484, |
|
"memory(GiB)": 22.25, |
|
"step": 21, |
|
"token_acc": 0.3649906890130354, |
|
"train_speed(iter/s)": 0.036168 |
|
}, |
|
{ |
|
"epoch": 0.5035765379113019, |
|
"grad_norm": 0.4951302111148834, |
|
"learning_rate": 2.595756834225089e-05, |
|
"loss": 4.866259574890137, |
|
"memory(GiB)": 22.25, |
|
"step": 22, |
|
"token_acc": 0.34402852049910876, |
|
"train_speed(iter/s)": 0.036268 |
|
}, |
|
{ |
|
"epoch": 0.5264663805436338, |
|
"grad_norm": 1.56654953956604, |
|
"learning_rate": 2.4042431657749117e-05, |
|
"loss": 4.790994644165039, |
|
"memory(GiB)": 22.25, |
|
"step": 23, |
|
"token_acc": 0.3361522198731501, |
|
"train_speed(iter/s)": 0.03635 |
|
}, |
|
{ |
|
"epoch": 0.5493562231759657, |
|
"grad_norm": 0.529353678226471, |
|
"learning_rate": 2.2132914365039993e-05, |
|
"loss": 4.498373985290527, |
|
"memory(GiB)": 22.25, |
|
"step": 24, |
|
"token_acc": 0.38278388278388276, |
|
"train_speed(iter/s)": 0.036412 |
|
}, |
|
{ |
|
"epoch": 0.5722460658082976, |
|
"grad_norm": 0.5923216342926025, |
|
"learning_rate": 2.0240222270883288e-05, |
|
"loss": 4.431886672973633, |
|
"memory(GiB)": 22.25, |
|
"step": 25, |
|
"token_acc": 0.3901345291479821, |
|
"train_speed(iter/s)": 0.036468 |
|
}, |
|
{ |
|
"epoch": 0.5951359084406295, |
|
"grad_norm": 0.5044678449630737, |
|
"learning_rate": 1.8375462445083464e-05, |
|
"loss": 4.577709674835205, |
|
"memory(GiB)": 22.25, |
|
"step": 26, |
|
"token_acc": 0.3509803921568627, |
|
"train_speed(iter/s)": 0.036523 |
|
}, |
|
{ |
|
"epoch": 0.6180257510729614, |
|
"grad_norm": 0.8515617251396179, |
|
"learning_rate": 1.6549578039787436e-05, |
|
"loss": 3.797635555267334, |
|
"memory(GiB)": 22.25, |
|
"step": 27, |
|
"token_acc": 0.40134907251264756, |
|
"train_speed(iter/s)": 0.036566 |
|
}, |
|
{ |
|
"epoch": 0.6409155937052933, |
|
"grad_norm": 0.9012308120727539, |
|
"learning_rate": 1.4773284070716503e-05, |
|
"loss": 4.415590286254883, |
|
"memory(GiB)": 22.25, |
|
"step": 28, |
|
"token_acc": 0.38589981447124305, |
|
"train_speed(iter/s)": 0.036597 |
|
}, |
|
{ |
|
"epoch": 0.6638054363376252, |
|
"grad_norm": 0.5051128268241882, |
|
"learning_rate": 1.3057004537193423e-05, |
|
"loss": 4.514218330383301, |
|
"memory(GiB)": 22.25, |
|
"step": 29, |
|
"token_acc": 0.3765541740674956, |
|
"train_speed(iter/s)": 0.036643 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 0.8118892908096313, |
|
"learning_rate": 1.1410811249969475e-05, |
|
"loss": 4.161840915679932, |
|
"memory(GiB)": 22.25, |
|
"step": 30, |
|
"token_acc": 0.35412474849094566, |
|
"train_speed(iter/s)": 0.036683 |
|
}, |
|
{ |
|
"epoch": 0.709585121602289, |
|
"grad_norm": 0.7509729266166687, |
|
"learning_rate": 9.844364725834057e-06, |
|
"loss": 4.108524799346924, |
|
"memory(GiB)": 22.25, |
|
"step": 31, |
|
"token_acc": 0.4240924092409241, |
|
"train_speed(iter/s)": 0.036725 |
|
}, |
|
{ |
|
"epoch": 0.7324749642346209, |
|
"grad_norm": 0.6745265126228333, |
|
"learning_rate": 8.36685749586087e-06, |
|
"loss": 4.507699489593506, |
|
"memory(GiB)": 22.25, |
|
"step": 32, |
|
"token_acc": 0.35660377358490564, |
|
"train_speed(iter/s)": 0.036768 |
|
}, |
|
{ |
|
"epoch": 0.7553648068669528, |
|
"grad_norm": 0.5046018958091736, |
|
"learning_rate": 6.986960159980327e-06, |
|
"loss": 4.469419479370117, |
|
"memory(GiB)": 22.25, |
|
"step": 33, |
|
"token_acc": 0.41550387596899224, |
|
"train_speed(iter/s)": 0.036795 |
|
}, |
|
{ |
|
"epoch": 0.7782546494992847, |
|
"grad_norm": 0.6278886198997498, |
|
"learning_rate": 5.712770504451426e-06, |
|
"loss": 4.4875640869140625, |
|
"memory(GiB)": 22.25, |
|
"step": 34, |
|
"token_acc": 0.386411889596603, |
|
"train_speed(iter/s)": 0.036831 |
|
}, |
|
{ |
|
"epoch": 0.8011444921316166, |
|
"grad_norm": 1.2817845344543457, |
|
"learning_rate": 4.551765980832059e-06, |
|
"loss": 4.035043239593506, |
|
"memory(GiB)": 22.25, |
|
"step": 35, |
|
"token_acc": 0.39222042139384117, |
|
"train_speed(iter/s)": 0.036858 |
|
}, |
|
{ |
|
"epoch": 0.8240343347639485, |
|
"grad_norm": 0.6294739246368408, |
|
"learning_rate": 3.5107598253199758e-06, |
|
"loss": 3.905367612838745, |
|
"memory(GiB)": 22.25, |
|
"step": 36, |
|
"token_acc": 0.4039301310043668, |
|
"train_speed(iter/s)": 0.036893 |
|
}, |
|
{ |
|
"epoch": 0.8469241773962805, |
|
"grad_norm": 0.5308797359466553, |
|
"learning_rate": 2.595861075973613e-06, |
|
"loss": 3.832357883453369, |
|
"memory(GiB)": 22.25, |
|
"step": 37, |
|
"token_acc": 0.37555555555555553, |
|
"train_speed(iter/s)": 0.03692 |
|
}, |
|
{ |
|
"epoch": 0.8698140200286123, |
|
"grad_norm": 0.613280177116394, |
|
"learning_rate": 1.8124387224476347e-06, |
|
"loss": 3.510023832321167, |
|
"memory(GiB)": 22.25, |
|
"step": 38, |
|
"token_acc": 0.41849529780564265, |
|
"train_speed(iter/s)": 0.036949 |
|
}, |
|
{ |
|
"epoch": 0.8927038626609443, |
|
"grad_norm": 0.5897545218467712, |
|
"learning_rate": 1.1650901986267365e-06, |
|
"loss": 4.297924041748047, |
|
"memory(GiB)": 22.25, |
|
"step": 39, |
|
"token_acc": 0.38562091503267976, |
|
"train_speed(iter/s)": 0.036961 |
|
}, |
|
{ |
|
"epoch": 0.9155937052932761, |
|
"grad_norm": 0.5033223032951355, |
|
"learning_rate": 6.576144030555259e-07, |
|
"loss": 3.912318229675293, |
|
"memory(GiB)": 22.25, |
|
"step": 40, |
|
"token_acc": 0.3920792079207921, |
|
"train_speed(iter/s)": 0.036991 |
|
}, |
|
{ |
|
"epoch": 0.9384835479256081, |
|
"grad_norm": 0.44826453924179077, |
|
"learning_rate": 2.9298940549128964e-07, |
|
"loss": 3.7790920734405518, |
|
"memory(GiB)": 22.25, |
|
"step": 41, |
|
"token_acc": 0.4283464566929134, |
|
"train_speed(iter/s)": 0.03701 |
|
}, |
|
{ |
|
"epoch": 0.9613733905579399, |
|
"grad_norm": 0.8731946349143982, |
|
"learning_rate": 7.335497040648898e-08, |
|
"loss": 4.0045576095581055, |
|
"memory(GiB)": 22.25, |
|
"step": 42, |
|
"token_acc": 0.3923076923076923, |
|
"train_speed(iter/s)": 0.03704 |
|
}, |
|
{ |
|
"epoch": 0.9842632331902719, |
|
"grad_norm": 1.097395420074463, |
|
"learning_rate": 0.0, |
|
"loss": 4.415482521057129, |
|
"memory(GiB)": 22.25, |
|
"step": 43, |
|
"token_acc": 0.4146341463414634, |
|
"train_speed(iter/s)": 0.037059 |
|
}, |
|
{ |
|
"epoch": 0.9842632331902719, |
|
"eval_loss": 3.7959742546081543, |
|
"eval_runtime": 29.3121, |
|
"eval_samples_per_second": 9.996, |
|
"eval_steps_per_second": 1.262, |
|
"eval_token_acc": 0.4216255442670537, |
|
"step": 43 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 43, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.042062092776243e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|