Qwen-2.5-7B-Simple-RL / trainer_state.json
dadadar's picture
Model save
c0c3d48 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.997867803837953,
"eval_steps": 100,
"global_step": 117,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 628.3828125,
"epoch": 0.008528784648187633,
"grad_norm": 0.3978726325351024,
"kl": 0.0,
"learning_rate": 2.5e-07,
"loss": -0.008,
"reward": 0.611328125,
"reward_std": 0.3421183191239834,
"rewards/accuracy_reward": 0.611328125,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 591.9208984375,
"epoch": 0.042643923240938165,
"grad_norm": 0.4850032275141134,
"kl": 0.00016227364540100098,
"learning_rate": 1.25e-06,
"loss": 0.0201,
"reward": 0.60205078125,
"reward_std": 0.368276241235435,
"rewards/accuracy_reward": 0.60205078125,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 609.651953125,
"epoch": 0.08528784648187633,
"grad_norm": 287.18666398380157,
"kl": 0.016454243659973146,
"learning_rate": 2.5e-06,
"loss": 0.0199,
"reward": 0.63359375,
"reward_std": 0.3275037372484803,
"rewards/accuracy_reward": 0.63359375,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 611.729296875,
"epoch": 0.1279317697228145,
"grad_norm": 0.8855366004356563,
"kl": 0.013251018524169923,
"learning_rate": 2.993961440992859e-06,
"loss": 0.0662,
"reward": 0.7046875,
"reward_std": 0.2706967916339636,
"rewards/accuracy_reward": 0.7046875,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 615.81640625,
"epoch": 0.17057569296375266,
"grad_norm": 0.15305826426466904,
"kl": 0.012549591064453126,
"learning_rate": 2.957235057439301e-06,
"loss": 0.075,
"reward": 0.748046875,
"reward_std": 0.23348850551992656,
"rewards/accuracy_reward": 0.748046875,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 591.190234375,
"epoch": 0.21321961620469082,
"grad_norm": 0.1572182728264828,
"kl": 0.008654594421386719,
"learning_rate": 2.887956450710995e-06,
"loss": 0.0746,
"reward": 0.78515625,
"reward_std": 0.20421773064881563,
"rewards/accuracy_reward": 0.78515625,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 582.583984375,
"epoch": 0.255863539445629,
"grad_norm": 0.18846851459877467,
"kl": 0.004192924499511719,
"learning_rate": 2.7876731904027993e-06,
"loss": 0.0461,
"reward": 0.766796875,
"reward_std": 0.20100836837664246,
"rewards/accuracy_reward": 0.766796875,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 581.9828125,
"epoch": 0.29850746268656714,
"grad_norm": 0.1633828721045992,
"kl": 0.012949752807617187,
"learning_rate": 2.6586254388368995e-06,
"loss": 0.044,
"reward": 0.785546875,
"reward_std": 0.17904459508135914,
"rewards/accuracy_reward": 0.785546875,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 577.331640625,
"epoch": 0.3411513859275053,
"grad_norm": 0.8927385078453667,
"kl": 0.005597305297851562,
"learning_rate": 2.5036959095382875e-06,
"loss": 0.0463,
"reward": 0.771875,
"reward_std": 0.18639590675011278,
"rewards/accuracy_reward": 0.771875,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 589.18203125,
"epoch": 0.3837953091684435,
"grad_norm": 0.9202370908691567,
"kl": 0.005675506591796875,
"learning_rate": 2.3263454721781537e-06,
"loss": 0.0348,
"reward": 0.79609375,
"reward_std": 0.16009506704285742,
"rewards/accuracy_reward": 0.79609375,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 582.716796875,
"epoch": 0.42643923240938164,
"grad_norm": 0.15051191313130605,
"kl": 0.005776214599609375,
"learning_rate": 2.1305358424643485e-06,
"loss": 0.0396,
"reward": 0.744140625,
"reward_std": 0.1848529415205121,
"rewards/accuracy_reward": 0.744140625,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 573.162109375,
"epoch": 0.4690831556503198,
"grad_norm": 0.13007092776387721,
"kl": 0.005338287353515625,
"learning_rate": 1.9206410839590043e-06,
"loss": 0.03,
"reward": 0.770703125,
"reward_std": 0.18297699503600598,
"rewards/accuracy_reward": 0.770703125,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 606.7,
"epoch": 0.511727078891258,
"grad_norm": 0.11547483395811298,
"kl": 0.005091094970703125,
"learning_rate": 1.7013498987264833e-06,
"loss": 0.0414,
"reward": 0.754296875,
"reward_std": 0.1787008465267718,
"rewards/accuracy_reward": 0.754296875,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 571.59609375,
"epoch": 0.5543710021321961,
"grad_norm": 0.2214356197716542,
"kl": 0.0063533782958984375,
"learning_rate": 1.4775608894771048e-06,
"loss": 0.0404,
"reward": 0.756640625,
"reward_std": 0.18082559341564775,
"rewards/accuracy_reward": 0.756640625,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 589.0390625,
"epoch": 0.5970149253731343,
"grad_norm": 0.13046863657480887,
"kl": 0.0055328369140625,
"learning_rate": 1.2542731328772936e-06,
"loss": 0.0271,
"reward": 0.737109375,
"reward_std": 0.1798726196400821,
"rewards/accuracy_reward": 0.737109375,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 560.494921875,
"epoch": 0.6396588486140725,
"grad_norm": 0.18140547445466393,
"kl": 0.007068634033203125,
"learning_rate": 1.036474508437579e-06,
"loss": 0.0432,
"reward": 0.783984375,
"reward_std": 0.182032142393291,
"rewards/accuracy_reward": 0.783984375,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 573.178515625,
"epoch": 0.6823027718550106,
"grad_norm": 0.1291947925621071,
"kl": 0.00672454833984375,
"learning_rate": 8.290302775265509e-07,
"loss": 0.0302,
"reward": 0.76875,
"reward_std": 0.16823082510381937,
"rewards/accuracy_reward": 0.76875,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 590.775,
"epoch": 0.7249466950959488,
"grad_norm": 0.21716845594114212,
"kl": 0.009334564208984375,
"learning_rate": 6.3657440147149e-07,
"loss": 0.0297,
"reward": 0.77421875,
"reward_std": 0.18809175668284298,
"rewards/accuracy_reward": 0.77421875,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 580.678515625,
"epoch": 0.767590618336887,
"grad_norm": 0.15013354686025335,
"kl": 0.009665679931640626,
"learning_rate": 4.63406026519703e-07,
"loss": 0.03,
"reward": 0.752734375,
"reward_std": 0.17157533336430789,
"rewards/accuracy_reward": 0.752734375,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 587.61015625,
"epoch": 0.8102345415778252,
"grad_norm": 0.17535012938422753,
"kl": 0.00673065185546875,
"learning_rate": 3.133934480154885e-07,
"loss": 0.0327,
"reward": 0.748046875,
"reward_std": 0.18622339582070707,
"rewards/accuracy_reward": 0.748046875,
"rewards/format_reward": 0.0,
"step": 95
},
{
"epoch": 0.8528784648187633,
"grad_norm": 0.1415194016262269,
"learning_rate": 1.8988769907430552e-07,
"loss": 0.0315,
"step": 100
},
{
"epoch": 0.8528784648187633,
"eval_clip_ratio": 0.0,
"eval_completion_length": 564.361775,
"eval_kl": 0.0873845947265625,
"eval_loss": 0.01933957263827324,
"eval_reward": 0.67925,
"eval_reward_std": 0.21540253313183785,
"eval_rewards/accuracy_reward": 0.67925,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 12790.1771,
"eval_samples_per_second": 0.391,
"eval_steps_per_second": 0.006,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 580.8361328125,
"epoch": 0.8955223880597015,
"grad_norm": 0.12258269168276933,
"kl": 0.007758140563964844,
"learning_rate": 9.564769404039419e-08,
"loss": 0.0376,
"reward": 0.77265625,
"reward_std": 0.18973822570405902,
"rewards/accuracy_reward": 0.77265625,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 579.112890625,
"epoch": 0.9381663113006397,
"grad_norm": 0.145162218617579,
"kl": 0.007707977294921875,
"learning_rate": 3.277859889929147e-08,
"loss": 0.0332,
"reward": 0.79609375,
"reward_std": 0.1877390337176621,
"rewards/accuracy_reward": 0.79609375,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 563.658203125,
"epoch": 0.9808102345415778,
"grad_norm": 0.14501622687734866,
"kl": 0.007173919677734375,
"learning_rate": 2.684805348397268e-09,
"loss": 0.0326,
"reward": 0.77421875,
"reward_std": 0.18014944810420275,
"rewards/accuracy_reward": 0.77421875,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 604.4720134735107,
"epoch": 0.997867803837953,
"kl": 0.0069580078125,
"reward": 0.74609375,
"reward_std": 0.19644855032674968,
"rewards/accuracy_reward": 0.74609375,
"rewards/format_reward": 0.0,
"step": 117,
"total_flos": 0.0,
"train_loss": 0.03916507870213598,
"train_runtime": 36868.5034,
"train_samples_per_second": 0.203,
"train_steps_per_second": 0.003
}
],
"logging_steps": 5,
"max_steps": 117,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}