Qwen3-4b-gsm8k-Qlora-GRPO / trainer_state.json
tahamajs's picture
add files
973c01d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.008565310492505354,
"eval_steps": 1,
"global_step": 16,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 327.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 422.0,
"completions/max_terminated_length": 422.0,
"completions/mean_length": 327.875,
"completions/mean_terminated_length": 327.875,
"completions/min_length": 256.0,
"completions/min_terminated_length": 256.0,
"epoch": 0.0005353319057815846,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"num_tokens": 3479.0,
"reward": 1.3822917938232422,
"reward_std": 0.001679160282947123,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.01770833320915699,
"rewards/r_shaping/std": 0.00442595174536109,
"rewards/r_soft/mean": 0.30000001192092896,
"rewards/r_soft/std": 0.0,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.10000000149011612,
"rewards/r_xmlcount/std": 0.0,
"step": 1
},
{
"completion_length": 382.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 486.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 382.25,
"completions/mean_terminated_length": 382.25,
"completions/min_length": 313.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.0010706638115631692,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0,
"learning_rate": 0.0005,
"loss": 0.0001,
"num_tokens": 7457.0,
"reward": 2.382591724395752,
"reward_std": 0.0023273415863513947,
"rewards/r_correctness/mean": 2.0,
"rewards/r_correctness/std": 0.0,
"rewards/r_shaping/mean": -0.017408333718776703,
"rewards/r_shaping/std": 0.005641527008265257,
"rewards/r_soft/mean": 0.30000001192092896,
"rewards/r_soft/std": 0.0,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.10000000149011612,
"rewards/r_xmlcount/std": 0.0,
"step": 2
},
{
"completion_length": 555.125,
"completions/clipped_ratio": 0.5,
"completions/max_length": 700.0,
"completions/max_terminated_length": 534.0,
"completions/mean_length": 555.125,
"completions/mean_terminated_length": 410.25,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.0016059957173447537,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0,
"learning_rate": 0.0004998023493068255,
"loss": 0.0,
"num_tokens": 12818.0,
"reward": 1.117954134941101,
"reward_std": 0.009724103845655918,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.028920834884047508,
"rewards/r_shaping/std": 0.008277526125311852,
"rewards/r_soft/mean": 0.15000000596046448,
"rewards/r_soft/std": 0.16035674512386322,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": -0.0031250007450580597,
"rewards/r_xmlcount/std": 0.11054855585098267,
"step": 3
},
{
"completion_length": 633.375,
"completions/clipped_ratio": 0.125,
"completions/max_length": 700.0,
"completions/max_terminated_length": 676.0,
"completions/mean_length": 633.375,
"completions/mean_terminated_length": 623.857177734375,
"completions/min_length": 507.0,
"completions/min_terminated_length": 507.0,
"epoch": 0.0021413276231263384,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0,
"learning_rate": 0.0004992097097536739,
"loss": 0.0,
"num_tokens": 18821.0,
"reward": 1.3096479177474976,
"reward_std": 0.12074954062700272,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.03097708337008953,
"rewards/r_shaping/std": 0.004845558665692806,
"rewards/r_soft/mean": 0.26250001788139343,
"rewards/r_soft/std": 0.1060660257935524,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.078125,
"rewards/r_xmlcount/std": 0.06187184527516365,
"step": 4
},
{
"completion_length": 510.875,
"completions/clipped_ratio": 0.5,
"completions/max_length": 700.0,
"completions/max_terminated_length": 415.0,
"completions/mean_length": 510.875,
"completions/mean_terminated_length": 321.75,
"completions/min_length": 272.0,
"completions/min_terminated_length": 272.0,
"epoch": 0.0026766595289079227,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0,
"learning_rate": 0.0004982230184254933,
"loss": 0.0,
"num_tokens": 23780.0,
"reward": 1.125322937965393,
"reward_std": 0.004006261937320232,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.024677084758877754,
"rewards/r_shaping/std": 0.011852074414491653,
"rewards/r_soft/mean": 0.15000000596046448,
"rewards/r_soft/std": 0.16035674512386322,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.0,
"rewards/r_xmlcount/std": 0.10690450668334961,
"step": 5
},
{
"completion_length": 578.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 700.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 578.0,
"completions/mean_terminated_length": 456.0,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.0032119914346895075,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.11385109275579453,
"kl": 0.0,
"learning_rate": 0.0004968438354840834,
"loss": 0.0,
"num_tokens": 29236.0,
"reward": 1.1172062158584595,
"reward_std": 0.001010744832456112,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.03279374912381172,
"rewards/r_shaping/std": 0.007816643454134464,
"rewards/r_soft/mean": 0.15000000596046448,
"rewards/r_soft/std": 0.16035674512386322,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.0,
"rewards/r_xmlcount/std": 0.10690450668334961,
"step": 6
},
{
"completion_length": 565.125,
"completions/clipped_ratio": 0.5,
"completions/max_length": 700.0,
"completions/max_terminated_length": 448.0,
"completions/mean_length": 565.125,
"completions/mean_terminated_length": 430.25,
"completions/min_length": 417.0,
"completions/min_terminated_length": 417.0,
"epoch": 0.003747323340471092,
"frac_reward_zero_std": 0.5,
"grad_norm": NaN,
"kl": 0.002949223853647709,
"learning_rate": 0.0004950743417011591,
"loss": 0.0001,
"num_tokens": 34769.0,
"reward": 1.1194353103637695,
"reward_std": 0.00033387652365490794,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.030564583837985992,
"rewards/r_shaping/std": 0.010096355341374874,
"rewards/r_soft/mean": 0.15000000596046448,
"rewards/r_soft/std": 0.16035674512386322,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.0,
"rewards/r_xmlcount/std": 0.10690450668334961,
"step": 7
},
{
"completion_length": 612.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 700.0,
"completions/max_terminated_length": 577.0,
"completions/mean_length": 612.0,
"completions/mean_terminated_length": 524.0,
"completions/min_length": 427.0,
"completions/min_terminated_length": 427.0,
"epoch": 0.004282655246252677,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.005263926927000284,
"learning_rate": 0.0004929173350101025,
"loss": 0.0003,
"num_tokens": 40597.0,
"reward": 1.1142561435699463,
"reward_std": 0.007277170196175575,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.032618746161460876,
"rewards/r_shaping/std": 0.00728320237249136,
"rewards/r_soft/mean": 0.15000000596046448,
"rewards/r_soft/std": 0.16035674512386322,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": -0.0031250007450580597,
"rewards/r_xmlcount/std": 0.11054855585098267,
"step": 8
},
{
"completion_length": 540.75,
"completions/clipped_ratio": 0.125,
"completions/max_length": 700.0,
"completions/max_terminated_length": 666.0,
"completions/mean_length": 540.75,
"completions/mean_terminated_length": 518.0,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.004817987152034261,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.003534165909513831,
"learning_rate": 0.0004903762260818551,
"loss": 0.0002,
"num_tokens": 45663.0,
"reward": 2.0576353073120117,
"reward_std": 0.6352876424789429,
"rewards/r_correctness/mean": 1.75,
"rewards/r_correctness/std": 0.7071067690849304,
"rewards/r_shaping/mean": -0.026739582419395447,
"rewards/r_shaping/std": 0.007363913580775261,
"rewards/r_soft/mean": 0.26250001788139343,
"rewards/r_soft/std": 0.1060660257935524,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.07187500596046448,
"rewards/r_xmlcount/std": 0.07954951375722885,
"step": 9
},
{
"completion_length": 471.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 593.0,
"completions/max_terminated_length": 593.0,
"completions/mean_length": 471.375,
"completions/mean_terminated_length": 471.375,
"completions/min_length": 353.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.0053533190578158455,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.007448031101375818,
"learning_rate": 0.0004874550329319457,
"loss": 0.0004,
"num_tokens": 50302.0,
"reward": 2.3762893676757812,
"reward_std": 0.002844305010512471,
"rewards/r_correctness/mean": 2.0,
"rewards/r_correctness/std": 0.0,
"rewards/r_shaping/mean": -0.02371041476726532,
"rewards/r_shaping/std": 0.0028945477679371834,
"rewards/r_soft/mean": 0.30000001192092896,
"rewards/r_soft/std": 0.0,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.10000000149011612,
"rewards/r_xmlcount/std": 0.0,
"step": 10
},
{
"completion_length": 486.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 586.0,
"completions/max_terminated_length": 586.0,
"completions/mean_length": 486.875,
"completions/mean_terminated_length": 486.875,
"completions/min_length": 404.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.005888650963597431,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0019934047013521194,
"learning_rate": 0.00048415837456718195,
"loss": 0.0001,
"num_tokens": 55077.0,
"reward": 2.3770666122436523,
"reward_std": 0.0032932735048234463,
"rewards/r_correctness/mean": 2.0,
"rewards/r_correctness/std": 0.0,
"rewards/r_shaping/mean": -0.02293333411216736,
"rewards/r_shaping/std": 0.0030542060267180204,
"rewards/r_soft/mean": 0.30000001192092896,
"rewards/r_soft/std": 0.0,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.10000000149011612,
"rewards/r_xmlcount/std": 0.0,
"step": 11
},
{
"completion_length": 520.625,
"completions/clipped_ratio": 0.125,
"completions/max_length": 700.0,
"completions/max_terminated_length": 604.0,
"completions/mean_length": 520.625,
"completions/mean_terminated_length": 495.0000305175781,
"completions/min_length": 386.0,
"completions/min_terminated_length": 386.0,
"epoch": 0.006423982869379015,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0034809389617294073,
"learning_rate": 0.0004804914636820517,
"loss": 0.0002,
"num_tokens": 60182.0,
"reward": 2.0551228523254395,
"reward_std": 0.6347294449806213,
"rewards/r_correctness/mean": 1.75,
"rewards/r_correctness/std": 0.7071067690849304,
"rewards/r_shaping/mean": -0.029252082109451294,
"rewards/r_shaping/std": 0.005904427729547024,
"rewards/r_soft/mean": 0.26250001788139343,
"rewards/r_soft/std": 0.1060660257935524,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.07187500596046448,
"rewards/r_xmlcount/std": 0.07954951375722885,
"step": 12
},
{
"completion_length": 601.25,
"completions/clipped_ratio": 0.375,
"completions/max_length": 700.0,
"completions/max_terminated_length": 698.0,
"completions/mean_length": 601.25,
"completions/mean_terminated_length": 542.0,
"completions/min_length": 400.0,
"completions/min_terminated_length": 400.0,
"epoch": 0.006959314775160599,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.0022265929728746414,
"learning_rate": 0.00047646009841638084,
"loss": 0.0001,
"num_tokens": 66032.0,
"reward": 1.6835541725158691,
"reward_std": 0.6659948229789734,
"rewards/r_correctness/mean": 1.5,
"rewards/r_correctness/std": 0.9258201122283936,
"rewards/r_shaping/mean": -0.0320708304643631,
"rewards/r_shaping/std": 0.006900239735841751,
"rewards/r_soft/mean": 0.1875,
"rewards/r_soft/std": 0.15526476502418518,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.02812499925494194,
"rewards/r_xmlcount/std": 0.0994965061545372,
"step": 13
},
{
"completion_length": 478.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 600.0,
"completions/max_terminated_length": 600.0,
"completions/mean_length": 478.375,
"completions/mean_terminated_length": 478.375,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"epoch": 0.007494646680942184,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.002636353485286236,
"learning_rate": 0.00047207065318728296,
"loss": 0.0001,
"num_tokens": 70759.0,
"reward": 2.3731250762939453,
"reward_std": 0.0061251213774085045,
"rewards/r_correctness/mean": 2.0,
"rewards/r_correctness/std": 0.0,
"rewards/r_shaping/mean": -0.026875000447034836,
"rewards/r_shaping/std": 0.006678614765405655,
"rewards/r_soft/mean": 0.30000001192092896,
"rewards/r_soft/std": 0.0,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.10000000149011612,
"rewards/r_xmlcount/std": 0.0,
"step": 14
},
{
"completion_length": 367.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 524.0,
"completions/max_terminated_length": 524.0,
"completions/mean_length": 367.125,
"completions/mean_terminated_length": 367.125,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.008029978586723769,
"frac_reward_zero_std": 0.0,
"grad_norm": NaN,
"kl": 0.005754380952566862,
"learning_rate": 0.00046733006860989566,
"loss": 0.0003,
"num_tokens": 74608.0,
"reward": 2.3812456130981445,
"reward_std": 0.0021071869414299726,
"rewards/r_correctness/mean": 2.0,
"rewards/r_correctness/std": 0.0,
"rewards/r_shaping/mean": -0.01875416561961174,
"rewards/r_shaping/std": 0.005434602499008179,
"rewards/r_soft/mean": 0.30000001192092896,
"rewards/r_soft/std": 0.0,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.10000000149011612,
"rewards/r_xmlcount/std": 0.0,
"step": 15
},
{
"completion_length": 515.875,
"completions/clipped_ratio": 0.5,
"completions/max_length": 700.0,
"completions/max_terminated_length": 366.0,
"completions/mean_length": 515.875,
"completions/mean_terminated_length": 331.75,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.008565310492505354,
"frac_reward_zero_std": 0.5,
"grad_norm": NaN,
"kl": 0.0027227874379605055,
"learning_rate": 0.000462245840522841,
"loss": 0.0001,
"num_tokens": 79783.0,
"reward": 1.1220020055770874,
"reward_std": 0.0010650103213265538,
"rewards/r_correctness/mean": 1.0,
"rewards/r_correctness/std": 1.0690449476242065,
"rewards/r_shaping/mean": -0.02799791656434536,
"rewards/r_shaping/std": 0.012906314805150032,
"rewards/r_soft/mean": 0.15000000596046448,
"rewards/r_soft/std": 0.16035674512386322,
"rewards/r_strict/mean": 0.0,
"rewards/r_strict/std": 0.0,
"rewards/r_xmlcount/mean": 0.0,
"rewards/r_xmlcount/std": 0.10690450668334961,
"step": 16
}
],
"logging_steps": 1,
"max_steps": 80,
"num_input_tokens_seen": 79783,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}