qwen25_1p5mimo_r32 / checkpoint-200 /trainer_state.json

Training in progress, step 200, checkpoint

b4cb1be verified about 2 months ago

4.26 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.23494860499265785,
	"eval_steps": 500,
	"global_step": 200,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.011747430249632892,
	"grad_norm": 1.5699902772903442,
	"learning_rate": 0.00027,
	"loss": 3.0983,
	"step": 10
	},
	{
	"epoch": 0.023494860499265784,
	"grad_norm": 1.6029695272445679,
	"learning_rate": 0.00029991523567092526,
	"loss": 2.062,
	"step": 20
	},
	{
	"epoch": 0.03524229074889868,
	"grad_norm": 1.593436360359192,
	"learning_rate": 0.00029962234616583063,
	"loss": 1.2074,
	"step": 30
	},
	{
	"epoch": 0.04698972099853157,
	"grad_norm": 0.5851414799690247,
	"learning_rate": 0.00029912069357315393,
	"loss": 0.888,
	"step": 40
	},
	{
	"epoch": 0.05873715124816446,
	"grad_norm": 0.25992292165756226,
	"learning_rate": 0.0002984109778320875,
	"loss": 0.7685,
	"step": 50
	},
	{
	"epoch": 0.07048458149779736,
	"grad_norm": 0.21082307398319244,
	"learning_rate": 0.00029749418918542057,
	"loss": 0.7096,
	"step": 60
	},
	{
	"epoch": 0.08223201174743025,
	"grad_norm": 0.16843102872371674,
	"learning_rate": 0.0002963716067978866,
	"loss": 0.6901,
	"step": 70
	},
	{
	"epoch": 0.09397944199706314,
	"grad_norm": 0.12076722830533981,
	"learning_rate": 0.000295044796971387,
	"loss": 0.6702,
	"step": 80
	},
	{
	"epoch": 0.10572687224669604,
	"grad_norm": 0.21371866762638092,
	"learning_rate": 0.000293515610959582,
	"loss": 0.6353,
	"step": 90
	},
	{
	"epoch": 0.11747430249632893,
	"grad_norm": 0.13458965718746185,
	"learning_rate": 0.0002917861823848985,
	"loss": 0.6479,
	"step": 100
	},
	{
	"epoch": 0.12922173274596183,
	"grad_norm": 0.265765517950058,
	"learning_rate": 0.0002898589242615568,
	"loss": 0.6244,
	"step": 110
	},
	{
	"epoch": 0.14096916299559473,
	"grad_norm": 0.1473032385110855,
	"learning_rate": 0.0002877365256287728,
	"loss": 0.6217,
	"step": 120
	},
	{
	"epoch": 0.1527165932452276,
	"grad_norm": 0.1591167151927948,
	"learning_rate": 0.00028542194779883047,
	"loss": 0.6022,
	"step": 130
	},
	{
	"epoch": 0.1644640234948605,
	"grad_norm": 0.13270772993564606,
	"learning_rate": 0.00028291842022526133,
	"loss": 0.6098,
	"step": 140
	},
	{
	"epoch": 0.1762114537444934,
	"grad_norm": 0.1444919854402542,
	"learning_rate": 0.0002802294359968954,
	"loss": 0.5971,
	"step": 150
	},
	{
	"epoch": 0.18795888399412627,
	"grad_norm": 0.1571902334690094,
	"learning_rate": 0.0002773587469640702,
	"loss": 0.5937,
	"step": 160
	},
	{
	"epoch": 0.19970631424375918,
	"grad_norm": 0.11585285514593124,
	"learning_rate": 0.0002743103585037989,
	"loss": 0.6054,
	"step": 170
	},
	{
	"epoch": 0.21145374449339208,
	"grad_norm": 0.10303252190351486,
	"learning_rate": 0.0002710885239312008,
	"loss": 0.5708,
	"step": 180
	},
	{
	"epoch": 0.22320117474302498,
	"grad_norm": 0.09355439245700836,
	"learning_rate": 0.00026769773856499167,
	"loss": 0.5806,
	"step": 190
	},
	{
	"epoch": 0.23494860499265785,
	"grad_norm": 0.09288550913333893,
	"learning_rate": 0.0002641427334553158,
	"loss": 0.5747,
	"step": 200
	}
	],
	"logging_steps": 10,
	"max_steps": 851,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 1,
	"save_steps": 40,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 4.2381453081706496e+17,
	"train_batch_size": 2,
	"trial_name": null,
	"trial_params": null
	}