qwen25_1p5mimo_r32 / checkpoint-200 /trainer_state.json
moyixiao's picture
Training in progress, step 200, checkpoint
b4cb1be verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.23494860499265785,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011747430249632892,
"grad_norm": 1.5699902772903442,
"learning_rate": 0.00027,
"loss": 3.0983,
"step": 10
},
{
"epoch": 0.023494860499265784,
"grad_norm": 1.6029695272445679,
"learning_rate": 0.00029991523567092526,
"loss": 2.062,
"step": 20
},
{
"epoch": 0.03524229074889868,
"grad_norm": 1.593436360359192,
"learning_rate": 0.00029962234616583063,
"loss": 1.2074,
"step": 30
},
{
"epoch": 0.04698972099853157,
"grad_norm": 0.5851414799690247,
"learning_rate": 0.00029912069357315393,
"loss": 0.888,
"step": 40
},
{
"epoch": 0.05873715124816446,
"grad_norm": 0.25992292165756226,
"learning_rate": 0.0002984109778320875,
"loss": 0.7685,
"step": 50
},
{
"epoch": 0.07048458149779736,
"grad_norm": 0.21082307398319244,
"learning_rate": 0.00029749418918542057,
"loss": 0.7096,
"step": 60
},
{
"epoch": 0.08223201174743025,
"grad_norm": 0.16843102872371674,
"learning_rate": 0.0002963716067978866,
"loss": 0.6901,
"step": 70
},
{
"epoch": 0.09397944199706314,
"grad_norm": 0.12076722830533981,
"learning_rate": 0.000295044796971387,
"loss": 0.6702,
"step": 80
},
{
"epoch": 0.10572687224669604,
"grad_norm": 0.21371866762638092,
"learning_rate": 0.000293515610959582,
"loss": 0.6353,
"step": 90
},
{
"epoch": 0.11747430249632893,
"grad_norm": 0.13458965718746185,
"learning_rate": 0.0002917861823848985,
"loss": 0.6479,
"step": 100
},
{
"epoch": 0.12922173274596183,
"grad_norm": 0.265765517950058,
"learning_rate": 0.0002898589242615568,
"loss": 0.6244,
"step": 110
},
{
"epoch": 0.14096916299559473,
"grad_norm": 0.1473032385110855,
"learning_rate": 0.0002877365256287728,
"loss": 0.6217,
"step": 120
},
{
"epoch": 0.1527165932452276,
"grad_norm": 0.1591167151927948,
"learning_rate": 0.00028542194779883047,
"loss": 0.6022,
"step": 130
},
{
"epoch": 0.1644640234948605,
"grad_norm": 0.13270772993564606,
"learning_rate": 0.00028291842022526133,
"loss": 0.6098,
"step": 140
},
{
"epoch": 0.1762114537444934,
"grad_norm": 0.1444919854402542,
"learning_rate": 0.0002802294359968954,
"loss": 0.5971,
"step": 150
},
{
"epoch": 0.18795888399412627,
"grad_norm": 0.1571902334690094,
"learning_rate": 0.0002773587469640702,
"loss": 0.5937,
"step": 160
},
{
"epoch": 0.19970631424375918,
"grad_norm": 0.11585285514593124,
"learning_rate": 0.0002743103585037989,
"loss": 0.6054,
"step": 170
},
{
"epoch": 0.21145374449339208,
"grad_norm": 0.10303252190351486,
"learning_rate": 0.0002710885239312008,
"loss": 0.5708,
"step": 180
},
{
"epoch": 0.22320117474302498,
"grad_norm": 0.09355439245700836,
"learning_rate": 0.00026769773856499167,
"loss": 0.5806,
"step": 190
},
{
"epoch": 0.23494860499265785,
"grad_norm": 0.09288550913333893,
"learning_rate": 0.0002641427334553158,
"loss": 0.5747,
"step": 200
}
],
"logging_steps": 10,
"max_steps": 851,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.2381453081706496e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}