|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.03345376689415228, |
|
"eval_steps": 500, |
|
"global_step": 250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.00013381506757660912, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 168.6666717529297, |
|
"epoch": 0.00026763013515321824, |
|
"grad_norm": 0.6520445942878723, |
|
"kl": 0.0, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": -0.0, |
|
"reward": -0.046833336353302, |
|
"reward_std": 0.1817607581615448, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.046833336353302, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 128.1666717529297, |
|
"epoch": 0.0004014452027298274, |
|
"grad_norm": 0.7553274631500244, |
|
"kl": 0.00023205635079648346, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.7700001001358032, |
|
"reward_std": 1.30967378616333, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.06333333253860474, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 181.83334350585938, |
|
"epoch": 0.0005352602703064365, |
|
"grad_norm": 0.0009925751946866512, |
|
"kl": 0.0003016882692463696, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 198.5, |
|
"epoch": 0.0006690753378830456, |
|
"grad_norm": 0.257866770029068, |
|
"kl": 0.00034359516575932503, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0, |
|
"reward": 0.4375, |
|
"reward_std": 1.0116509199142456, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 123.83333587646484, |
|
"epoch": 0.0008028904054596548, |
|
"grad_norm": 0.5083842277526855, |
|
"kl": 0.00048388767754659057, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.0, |
|
"reward": -0.09333333373069763, |
|
"reward_std": 0.112388014793396, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09333333373069763, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 121.5, |
|
"epoch": 0.0009367054730362638, |
|
"grad_norm": 0.6899563670158386, |
|
"kl": 0.0006710832240059972, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.4321666955947876, |
|
"reward_std": 1.0167949199676514, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.015500001609325409, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 155.1666717529297, |
|
"epoch": 0.001070520540612873, |
|
"grad_norm": 0.4054330587387085, |
|
"kl": 0.0002861042448785156, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.8665000200271606, |
|
"reward_std": 1.2465369701385498, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.03316666558384895, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 146.6666717529297, |
|
"epoch": 0.0012043356081894822, |
|
"grad_norm": 0.3640693128108978, |
|
"kl": 0.0003890148364007473, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": -0.09716667234897614, |
|
"reward_std": 0.2380087822675705, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09716667234897614, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 152.5, |
|
"epoch": 0.0013381506757660913, |
|
"grad_norm": 0.8989537358283997, |
|
"kl": 0.0005690781399607658, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": 0.33650001883506775, |
|
"reward_std": 1.0737173557281494, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08016666769981384, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 163.5, |
|
"epoch": 0.0014719657433427003, |
|
"grad_norm": 0.4434801936149597, |
|
"kl": 0.0005024907295592129, |
|
"learning_rate": 2.2e-06, |
|
"loss": 0.0, |
|
"reward": 0.3734999895095825, |
|
"reward_std": 0.9148844480514526, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.043166667222976685, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 166.33334350585938, |
|
"epoch": 0.0016057808109193096, |
|
"grad_norm": 0.2695975601673126, |
|
"kl": 0.0002385280531598255, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": -0.03350000083446503, |
|
"reward_std": 0.0820579081773758, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03350000083446503, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 162.0, |
|
"epoch": 0.0017395958784959186, |
|
"grad_norm": 0.3843610882759094, |
|
"kl": 0.0003402562579140067, |
|
"learning_rate": 2.6e-06, |
|
"loss": 0.0, |
|
"reward": -0.1420000046491623, |
|
"reward_std": 0.15615761280059814, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1420000046491623, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 139.0, |
|
"epoch": 0.0018734109460725277, |
|
"grad_norm": 0.31161829829216003, |
|
"kl": 0.00037740400875918567, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": -0.05516666918992996, |
|
"reward_std": 0.10250934958457947, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.05516666918992996, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 198.5, |
|
"epoch": 0.002007226013649137, |
|
"grad_norm": 0.001213455805554986, |
|
"kl": 0.0005896758520975709, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 142.5, |
|
"epoch": 0.002141041081225746, |
|
"grad_norm": 0.40445369482040405, |
|
"kl": 0.00034244018024764955, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": 0.746666669845581, |
|
"reward_std": 1.3729774951934814, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08666666597127914, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 193.1666717529297, |
|
"epoch": 0.002274856148802355, |
|
"grad_norm": 0.0018463014857843518, |
|
"kl": 0.0003552571579348296, |
|
"learning_rate": 3.4000000000000005e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 165.6666717529297, |
|
"epoch": 0.0024086712163789645, |
|
"grad_norm": 0.390596479177475, |
|
"kl": 0.00029498100047931075, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": -0.09133332967758179, |
|
"reward_std": 0.14864006638526917, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09133332967758179, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 175.33334350585938, |
|
"epoch": 0.0025424862839555735, |
|
"grad_norm": 0.29836106300354004, |
|
"kl": 0.00035133626079186797, |
|
"learning_rate": 3.8000000000000005e-06, |
|
"loss": 0.0, |
|
"reward": 0.044999998062849045, |
|
"reward_std": 0.11022703349590302, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.038333334028720856, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 174.33334350585938, |
|
"epoch": 0.0026763013515321826, |
|
"grad_norm": 0.4651794135570526, |
|
"kl": 0.000449147482868284, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.011333334259688854, |
|
"reward_std": 0.060777194797992706, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.011333334259688854, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 188.5, |
|
"epoch": 0.0028101164191087916, |
|
"grad_norm": 0.45565637946128845, |
|
"kl": 0.00044950511073693633, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 0.0, |
|
"reward": 0.04633333534002304, |
|
"reward_std": 0.11349302530288696, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03700000047683716, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 168.33334350585938, |
|
"epoch": 0.0029439314866854006, |
|
"grad_norm": 0.36476781964302063, |
|
"kl": 0.0003528912493493408, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.0, |
|
"reward": 0.3645000159740448, |
|
"reward_std": 0.8928390741348267, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.052166666835546494, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 118.83333587646484, |
|
"epoch": 0.00307774655426201, |
|
"grad_norm": 0.6629476547241211, |
|
"kl": 0.00042950891656801105, |
|
"learning_rate": 4.600000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.013500000350177288, |
|
"reward_std": 0.03306811302900314, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.013500000350177288, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 112.83333587646484, |
|
"epoch": 0.003211561621838619, |
|
"grad_norm": 0.49353620409965515, |
|
"kl": 0.0005617816932499409, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.3318333327770233, |
|
"reward_std": 1.0452561378479004, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08483333885669708, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 131.1666717529297, |
|
"epoch": 0.003345376689415228, |
|
"grad_norm": 0.38862261176109314, |
|
"kl": 0.0004901737556792796, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.3465000092983246, |
|
"reward_std": 0.9767013788223267, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07016666978597641, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 128.5, |
|
"epoch": 0.0034791917569918372, |
|
"grad_norm": 0.41505903005599976, |
|
"kl": 0.00022011167311575264, |
|
"learning_rate": 4.999756310023261e-06, |
|
"loss": 0.0, |
|
"reward": 0.41233333945274353, |
|
"reward_std": 1.010006308555603, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.00433333357796073, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 169.0, |
|
"epoch": 0.0036130068245684463, |
|
"grad_norm": 0.4222584664821625, |
|
"kl": 0.0006107889348641038, |
|
"learning_rate": 4.999025287600886e-06, |
|
"loss": 0.0, |
|
"reward": 0.021000001579523087, |
|
"reward_std": 0.05143928527832031, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.021000001579523087, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 128.0, |
|
"epoch": 0.0037468218921450553, |
|
"grad_norm": 0.4782092273235321, |
|
"kl": 0.000323088257573545, |
|
"learning_rate": 4.997807075247147e-06, |
|
"loss": 0.0, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.003880636959721665, |
|
"grad_norm": 0.0008450484601780772, |
|
"kl": 0.0003519197925925255, |
|
"learning_rate": 4.996101910454953e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 175.33334350585938, |
|
"epoch": 0.004014452027298274, |
|
"grad_norm": 0.33603161573410034, |
|
"kl": 0.0004287260235287249, |
|
"learning_rate": 4.993910125649561e-06, |
|
"loss": 0.0, |
|
"reward": -0.0560000017285347, |
|
"reward_std": 0.1371714174747467, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0560000017285347, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 168.83334350585938, |
|
"epoch": 0.004148267094874883, |
|
"grad_norm": 0.29343071579933167, |
|
"kl": 0.0004830901452805847, |
|
"learning_rate": 4.9912321481237616e-06, |
|
"loss": 0.0, |
|
"reward": 0.04183333367109299, |
|
"reward_std": 0.064808689057827, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.04183333367109299, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 149.0, |
|
"epoch": 0.004282082162451492, |
|
"grad_norm": 0.4075845181941986, |
|
"kl": 0.0004737289564218372, |
|
"learning_rate": 4.988068499954578e-06, |
|
"loss": 0.0, |
|
"reward": 0.36633333563804626, |
|
"reward_std": 0.8973297476768494, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.050333332270383835, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 145.6666717529297, |
|
"epoch": 0.004415897230028101, |
|
"grad_norm": 0.5869320631027222, |
|
"kl": 0.00045360170770436525, |
|
"learning_rate": 4.984419797901491e-06, |
|
"loss": 0.0, |
|
"reward": -0.058666668832302094, |
|
"reward_std": 0.12906846404075623, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.058666668832302094, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 188.0, |
|
"epoch": 0.00454971229760471, |
|
"grad_norm": 0.44135627150535583, |
|
"kl": 0.0004000771732535213, |
|
"learning_rate": 4.980286753286196e-06, |
|
"loss": 0.0, |
|
"reward": 0.040666669607162476, |
|
"reward_std": 0.06300053000450134, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.040666669607162476, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 186.1666717529297, |
|
"epoch": 0.004683527365181319, |
|
"grad_norm": 0.29517051577568054, |
|
"kl": 0.0003852550871670246, |
|
"learning_rate": 4.975670171853926e-06, |
|
"loss": 0.0, |
|
"reward": 0.0416666679084301, |
|
"reward_std": 0.10206206887960434, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0416666679084301, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 198.1666717529297, |
|
"epoch": 0.004817342432757929, |
|
"grad_norm": 0.004241094458848238, |
|
"kl": 0.000525132636539638, |
|
"learning_rate": 4.970570953616383e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 111.16667175292969, |
|
"epoch": 0.004951157500334538, |
|
"grad_norm": 0.9546847343444824, |
|
"kl": 0.0008722383063286543, |
|
"learning_rate": 4.964990092676263e-06, |
|
"loss": 0.0, |
|
"reward": 0.00433333357796073, |
|
"reward_std": 0.07178208231925964, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.00433333357796073, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 116.33333587646484, |
|
"epoch": 0.005084972567911147, |
|
"grad_norm": 0.4917941093444824, |
|
"kl": 0.0006737456424161792, |
|
"learning_rate": 4.958928677033465e-06, |
|
"loss": 0.0, |
|
"reward": 0.4699999988079071, |
|
"reward_std": 0.9978088140487671, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0533333346247673, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 136.33334350585938, |
|
"epoch": 0.005218787635487756, |
|
"grad_norm": 0.4850587248802185, |
|
"kl": 0.0006846442702226341, |
|
"learning_rate": 4.9523878883729794e-06, |
|
"loss": 0.0, |
|
"reward": 1.321166753768921, |
|
"reward_std": 1.2952899932861328, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.01216666679829359, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 170.5, |
|
"epoch": 0.005352602703064365, |
|
"grad_norm": 0.0013024378567934036, |
|
"kl": 0.00040273740887641907, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 173.6666717529297, |
|
"epoch": 0.005486417770640974, |
|
"grad_norm": 0.5376860499382019, |
|
"kl": 0.0011046230792999268, |
|
"learning_rate": 4.937873385763909e-06, |
|
"loss": 0.0, |
|
"reward": 0.7595000267028809, |
|
"reward_std": 1.2757749557495117, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07383333891630173, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 172.33334350585938, |
|
"epoch": 0.005620232838217583, |
|
"grad_norm": 0.48040255904197693, |
|
"kl": 0.001196104334667325, |
|
"learning_rate": 4.9299025014463665e-06, |
|
"loss": 0.0, |
|
"reward": 0.7818333506584167, |
|
"reward_std": 1.171539306640625, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.05150000378489494, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.005754047905794192, |
|
"grad_norm": 0.0011990669881924987, |
|
"kl": 0.00046603806549683213, |
|
"learning_rate": 4.921457902821578e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 167.1666717529297, |
|
"epoch": 0.005887862973370801, |
|
"grad_norm": 0.002651863731443882, |
|
"kl": 0.0007552001625299454, |
|
"learning_rate": 4.912541236180779e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 180.0, |
|
"epoch": 0.00602167804094741, |
|
"grad_norm": 0.3888886868953705, |
|
"kl": 0.000817882944829762, |
|
"learning_rate": 4.903154239845798e-06, |
|
"loss": 0.0, |
|
"reward": 1.2708333730697632, |
|
"reward_std": 1.3472579717636108, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 108.66667175292969, |
|
"epoch": 0.00615549310852402, |
|
"grad_norm": 0.0016210161847993731, |
|
"kl": 0.00039254926377907395, |
|
"learning_rate": 4.893298743830168e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 154.6666717529297, |
|
"epoch": 0.006289308176100629, |
|
"grad_norm": 0.341744601726532, |
|
"kl": 0.0009102068725042045, |
|
"learning_rate": 4.882976669482368e-06, |
|
"loss": 0.0, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 126.16667175292969, |
|
"epoch": 0.006423123243677238, |
|
"grad_norm": 0.012836282141506672, |
|
"kl": 0.0018564595375210047, |
|
"learning_rate": 4.8721900291112415e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 101.83333587646484, |
|
"epoch": 0.006556938311253847, |
|
"grad_norm": 0.6613649725914001, |
|
"kl": 0.0017794461455196142, |
|
"learning_rate": 4.860940925593703e-06, |
|
"loss": 0.0001, |
|
"reward": 0.04200000315904617, |
|
"reward_std": 0.06506612151861191, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.04200000315904617, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 196.5, |
|
"epoch": 0.006690753378830456, |
|
"grad_norm": 0.004236978013068438, |
|
"kl": 0.0008604646427556872, |
|
"learning_rate": 4.849231551964771e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 87.33333587646484, |
|
"epoch": 0.006824568446407065, |
|
"grad_norm": 2.5036542415618896, |
|
"kl": 0.002497585490345955, |
|
"learning_rate": 4.837064190990036e-06, |
|
"loss": 0.0001, |
|
"reward": 0.020333334803581238, |
|
"reward_std": 0.04980628937482834, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.020333334803581238, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 130.6666717529297, |
|
"epoch": 0.0069583835139836745, |
|
"grad_norm": 0.4602515697479248, |
|
"kl": 0.0022823391482234, |
|
"learning_rate": 4.824441214720629e-06, |
|
"loss": 0.0001, |
|
"reward": 0.8040000200271606, |
|
"reward_std": 1.2467942237854004, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.029333334416151047, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 113.0, |
|
"epoch": 0.0070921985815602835, |
|
"grad_norm": 0.5198715329170227, |
|
"kl": 0.005426853429526091, |
|
"learning_rate": 4.811365084030784e-06, |
|
"loss": 0.0002, |
|
"reward": 1.254666805267334, |
|
"reward_std": 1.3316380977630615, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.004666668828576803, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 154.6666717529297, |
|
"epoch": 0.0072260136491368926, |
|
"grad_norm": 0.34585776925086975, |
|
"kl": 0.002139053540304303, |
|
"learning_rate": 4.7978383481380865e-06, |
|
"loss": 0.0001, |
|
"reward": 1.2051666975021362, |
|
"reward_std": 1.323843002319336, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04483333230018616, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 187.33334350585938, |
|
"epoch": 0.007359828716713502, |
|
"grad_norm": 0.4027135670185089, |
|
"kl": 0.0035339975729584694, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 0.0001, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 1.2416388988494873, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 195.6666717529297, |
|
"epoch": 0.007493643784290111, |
|
"grad_norm": 0.00830562599003315, |
|
"kl": 0.00114604108966887, |
|
"learning_rate": 4.769443696332272e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 129.5, |
|
"epoch": 0.0076274588518667205, |
|
"grad_norm": 0.5255561470985413, |
|
"kl": 0.0031108385883271694, |
|
"learning_rate": 4.754581316012785e-06, |
|
"loss": 0.0001, |
|
"reward": 0.4346666932106018, |
|
"reward_std": 1.0144504308700562, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.017999999225139618, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 159.83334350585938, |
|
"epoch": 0.00776127391944333, |
|
"grad_norm": 0.46986404061317444, |
|
"kl": 0.00284760445356369, |
|
"learning_rate": 4.7392794005985324e-06, |
|
"loss": 0.0001, |
|
"reward": 0.04800000041723251, |
|
"reward_std": 0.07532595098018646, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.035333335399627686, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 121.66667175292969, |
|
"epoch": 0.007895088987019938, |
|
"grad_norm": 1.0778398513793945, |
|
"kl": 0.003858643351122737, |
|
"learning_rate": 4.723540933228245e-06, |
|
"loss": 0.0002, |
|
"reward": 0.0338333360850811, |
|
"reward_std": 0.053443118929862976, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0338333360850811, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 169.83334350585938, |
|
"epoch": 0.008028904054596548, |
|
"grad_norm": 0.3702806830406189, |
|
"kl": 0.003246339038014412, |
|
"learning_rate": 4.707368982147318e-06, |
|
"loss": 0.0001, |
|
"reward": -0.0560000017285347, |
|
"reward_std": 0.1371714174747467, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0560000017285347, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 162.0, |
|
"epoch": 0.008162719122173156, |
|
"grad_norm": 0.004582512192428112, |
|
"kl": 0.0006104764179326594, |
|
"learning_rate": 4.690766700109659e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 175.1666717529297, |
|
"epoch": 0.008296534189749766, |
|
"grad_norm": 0.4721982181072235, |
|
"kl": 0.0031741862185299397, |
|
"learning_rate": 4.673737323763048e-06, |
|
"loss": 0.0001, |
|
"reward": 0.8009999990463257, |
|
"reward_std": 1.2424194812774658, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.032333336770534515, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 122.5, |
|
"epoch": 0.008430349257326376, |
|
"grad_norm": 0.022069407626986504, |
|
"kl": 0.007255158387124538, |
|
"learning_rate": 4.656284173018144e-06, |
|
"loss": 0.0003, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 150.5, |
|
"epoch": 0.008564164324902984, |
|
"grad_norm": 0.0063232239335775375, |
|
"kl": 0.005966612603515387, |
|
"learning_rate": 4.638410650401267e-06, |
|
"loss": 0.0002, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.008697979392479594, |
|
"grad_norm": 0.0014898076187819242, |
|
"kl": 0.0004787092038895935, |
|
"learning_rate": 4.620120240391065e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 172.5, |
|
"epoch": 0.008831794460056202, |
|
"grad_norm": 0.2924825847148895, |
|
"kl": 0.0044395048171281815, |
|
"learning_rate": 4.601416508739211e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 162.0, |
|
"epoch": 0.008965609527632812, |
|
"grad_norm": 0.008148525841534138, |
|
"kl": 0.0034142467193305492, |
|
"learning_rate": 4.582303101775249e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 165.83334350585938, |
|
"epoch": 0.00909942459520942, |
|
"grad_norm": 0.38308584690093994, |
|
"kl": 0.00867555383592844, |
|
"learning_rate": 4.562783745695738e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 134.6666717529297, |
|
"epoch": 0.00923323966278603, |
|
"grad_norm": 0.450452595949173, |
|
"kl": 0.005564470775425434, |
|
"learning_rate": 4.542862245837821e-06, |
|
"loss": 0.0002, |
|
"reward": -0.03816666826605797, |
|
"reward_std": 0.09348885715007782, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03816666826605797, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 166.33334350585938, |
|
"epoch": 0.009367054730362638, |
|
"grad_norm": 0.4226210415363312, |
|
"kl": 0.0031330427154898643, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.0001, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 196.33334350585938, |
|
"epoch": 0.009500869797939248, |
|
"grad_norm": 0.2609606683254242, |
|
"kl": 0.00573092233389616, |
|
"learning_rate": 4.501828427371834e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4375, |
|
"reward_std": 1.0116509199142456, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 183.33334350585938, |
|
"epoch": 0.009634684865515858, |
|
"grad_norm": 0.001673938357271254, |
|
"kl": 0.0006895886617712677, |
|
"learning_rate": 4.4807241083879774e-06, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 183.33334350585938, |
|
"epoch": 0.009768499933092466, |
|
"grad_norm": 0.41474249958992004, |
|
"kl": 0.003182145766913891, |
|
"learning_rate": 4.4592336433146e-06, |
|
"loss": 0.0001, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 198.5, |
|
"epoch": 0.009902315000669076, |
|
"grad_norm": 0.016794368624687195, |
|
"kl": 0.0026337592862546444, |
|
"learning_rate": 4.437361221760449e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 180.5, |
|
"epoch": 0.010036130068245684, |
|
"grad_norm": 0.2793622314929962, |
|
"kl": 0.008324583992362022, |
|
"learning_rate": 4.415111107797445e-06, |
|
"loss": 0.0003, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 106.83333587646484, |
|
"epoch": 0.010169945135822294, |
|
"grad_norm": 0.6471434831619263, |
|
"kl": 0.016430877149105072, |
|
"learning_rate": 4.3924876391293915e-06, |
|
"loss": 0.0007, |
|
"reward": 0.021000001579523087, |
|
"reward_std": 0.05143928527832031, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.021000001579523087, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 127.33333587646484, |
|
"epoch": 0.010303760203398902, |
|
"grad_norm": 0.3994431793689728, |
|
"kl": 0.006109229288995266, |
|
"learning_rate": 4.36949522624633e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 133.1666717529297, |
|
"epoch": 0.010437575270975512, |
|
"grad_norm": 0.5078148245811462, |
|
"kl": 0.012061258777976036, |
|
"learning_rate": 4.346138351564711e-06, |
|
"loss": 0.0005, |
|
"reward": 1.628499984741211, |
|
"reward_std": 1.352062702178955, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03816666826605797, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 102.83333587646484, |
|
"epoch": 0.01057139033855212, |
|
"grad_norm": 0.029442179948091507, |
|
"kl": 0.010023966431617737, |
|
"learning_rate": 4.322421568553529e-06, |
|
"loss": 0.0004, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 171.5, |
|
"epoch": 0.01070520540612873, |
|
"grad_norm": 0.608511745929718, |
|
"kl": 0.02281329035758972, |
|
"learning_rate": 4.2983495008466285e-06, |
|
"loss": 0.0009, |
|
"reward": -0.07016666978597641, |
|
"reward_std": 0.10962922871112823, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07016666978597641, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 191.5, |
|
"epoch": 0.010839020473705338, |
|
"grad_norm": 0.30423974990844727, |
|
"kl": 0.008041778579354286, |
|
"learning_rate": 4.273926841341303e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 114.83333587646484, |
|
"epoch": 0.010972835541281948, |
|
"grad_norm": 0.010348974727094173, |
|
"kl": 0.01861591637134552, |
|
"learning_rate": 4.249158351283414e-06, |
|
"loss": 0.0007, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 163.5, |
|
"epoch": 0.011106650608858558, |
|
"grad_norm": 0.38949424028396606, |
|
"kl": 0.014888926409184933, |
|
"learning_rate": 4.224048859339175e-06, |
|
"loss": 0.0006, |
|
"reward": 1.185666799545288, |
|
"reward_std": 1.302016019821167, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.06433333456516266, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 180.5, |
|
"epoch": 0.011240465676435166, |
|
"grad_norm": 4.860539436340332, |
|
"kl": 0.018028978258371353, |
|
"learning_rate": 4.198603260653792e-06, |
|
"loss": 0.0007, |
|
"reward": 0.875, |
|
"reward_std": 1.259960412979126, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0416666679084301, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 160.6666717529297, |
|
"epoch": 0.011374280744011776, |
|
"grad_norm": 0.45955681800842285, |
|
"kl": 0.014035634696483612, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 0.0006, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 146.33334350585938, |
|
"epoch": 0.011508095811588384, |
|
"grad_norm": 0.40156418085098267, |
|
"kl": 0.011429629288613796, |
|
"learning_rate": 4.146723650296701e-06, |
|
"loss": 0.0005, |
|
"reward": -0.03099999576807022, |
|
"reward_std": 0.208724707365036, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03099999949336052, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 120.0, |
|
"epoch": 0.011641910879164994, |
|
"grad_norm": 0.5004826784133911, |
|
"kl": 0.021759724244475365, |
|
"learning_rate": 4.120299752657828e-06, |
|
"loss": 0.0009, |
|
"reward": 1.6751668453216553, |
|
"reward_std": 1.2977288961410522, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.008500000461935997, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 188.0, |
|
"epoch": 0.011775725946741603, |
|
"grad_norm": 0.446821391582489, |
|
"kl": 0.009519928134977818, |
|
"learning_rate": 4.093559974371725e-06, |
|
"loss": 0.0004, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 171.6666717529297, |
|
"epoch": 0.011909541014318212, |
|
"grad_norm": 0.003901235293596983, |
|
"kl": 0.010121981613337994, |
|
"learning_rate": 4.066509528411151e-06, |
|
"loss": 0.0004, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 154.5, |
|
"epoch": 0.01204335608189482, |
|
"grad_norm": 0.5476510524749756, |
|
"kl": 0.022881466895341873, |
|
"learning_rate": 4.039153688314146e-06, |
|
"loss": 0.0009, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 176.6666717529297, |
|
"epoch": 0.01217717114947143, |
|
"grad_norm": 0.8005715608596802, |
|
"kl": 0.02562887594103813, |
|
"learning_rate": 4.011497787155938e-06, |
|
"loss": 0.001, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 157.83334350585938, |
|
"epoch": 0.01231098621704804, |
|
"grad_norm": 0.572854220867157, |
|
"kl": 0.010252588428556919, |
|
"learning_rate": 3.983547216509254e-06, |
|
"loss": 0.0004, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.20412415266036987, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 168.0, |
|
"epoch": 0.012444801284624649, |
|
"grad_norm": 0.3673333525657654, |
|
"kl": 0.008399921469390392, |
|
"learning_rate": 3.955307425393224e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 167.0, |
|
"epoch": 0.012578616352201259, |
|
"grad_norm": 0.028444070369005203, |
|
"kl": 0.011543246917426586, |
|
"learning_rate": 3.92678391921108e-06, |
|
"loss": 0.0005, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 98.33333587646484, |
|
"epoch": 0.012712431419777867, |
|
"grad_norm": 0.452796995639801, |
|
"kl": 0.020905088633298874, |
|
"learning_rate": 3.897982258676867e-06, |
|
"loss": 0.0008, |
|
"reward": 0.75, |
|
"reward_std": 0.8803408145904541, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 166.6666717529297, |
|
"epoch": 0.012846246487354477, |
|
"grad_norm": 0.322458416223526, |
|
"kl": 0.013159887865185738, |
|
"learning_rate": 3.868908058731376e-06, |
|
"loss": 0.0005, |
|
"reward": 2.06000018119812, |
|
"reward_std": 1.010742425918579, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.023333333432674408, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 177.33334350585938, |
|
"epoch": 0.012980061554931085, |
|
"grad_norm": 0.3679336905479431, |
|
"kl": 0.009648090228438377, |
|
"learning_rate": 3.839566987447492e-06, |
|
"loss": 0.0004, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 188.6666717529297, |
|
"epoch": 0.013113876622507695, |
|
"grad_norm": 0.004214553628116846, |
|
"kl": 0.0036848734598606825, |
|
"learning_rate": 3.8099647649251984e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 145.83334350585938, |
|
"epoch": 0.013247691690084303, |
|
"grad_norm": 0.5636357069015503, |
|
"kl": 0.01897261291742325, |
|
"learning_rate": 3.780107162176429e-06, |
|
"loss": 0.0008, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 1.2416388988494873, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 126.0, |
|
"epoch": 0.013381506757660913, |
|
"grad_norm": 0.5204256772994995, |
|
"kl": 0.026616668328642845, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.0011, |
|
"reward": 1.2710000276565552, |
|
"reward_std": 1.347088098526001, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.021000001579523087, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 199.1666717529297, |
|
"epoch": 0.013515321825237521, |
|
"grad_norm": 0.4379250407218933, |
|
"kl": 0.004337076563388109, |
|
"learning_rate": 3.7196491478468322e-06, |
|
"loss": 0.0002, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.01364913689281413, |
|
"grad_norm": 0.464220255613327, |
|
"kl": 0.002902195556089282, |
|
"learning_rate": 3.689060522675689e-06, |
|
"loss": 0.0001, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 162.5, |
|
"epoch": 0.01378295196039074, |
|
"grad_norm": 0.4563305079936981, |
|
"kl": 0.031100619584321976, |
|
"learning_rate": 3.658240087799655e-06, |
|
"loss": 0.0012, |
|
"reward": 1.6421668529510498, |
|
"reward_std": 1.2732903957366943, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.024500001221895218, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 148.6666717529297, |
|
"epoch": 0.013916767027967349, |
|
"grad_norm": 0.03375934436917305, |
|
"kl": 0.02203848585486412, |
|
"learning_rate": 3.627193851723577e-06, |
|
"loss": 0.0009, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 181.6666717529297, |
|
"epoch": 0.014050582095543959, |
|
"grad_norm": 0.34021925926208496, |
|
"kl": 0.013872651383280754, |
|
"learning_rate": 3.595927866972694e-06, |
|
"loss": 0.0006, |
|
"reward": 0.9375, |
|
"reward_std": 1.224106788635254, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 148.33334350585938, |
|
"epoch": 0.014184397163120567, |
|
"grad_norm": 0.4161204695701599, |
|
"kl": 0.011628372594714165, |
|
"learning_rate": 3.564448228912682e-06, |
|
"loss": 0.0005, |
|
"reward": 1.75, |
|
"reward_std": 1.172603964805603, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 199.83334350585938, |
|
"epoch": 0.014318212230697177, |
|
"grad_norm": 0.2740929126739502, |
|
"kl": 0.0027336678467690945, |
|
"learning_rate": 3.532761074561355e-06, |
|
"loss": 0.0001, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 199.6666717529297, |
|
"epoch": 0.014452027298273785, |
|
"grad_norm": 0.34659114480018616, |
|
"kl": 0.003705874551087618, |
|
"learning_rate": 3.5008725813922383e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 181.83334350585938, |
|
"epoch": 0.014585842365850395, |
|
"grad_norm": 0.03158266097307205, |
|
"kl": 0.016718031838536263, |
|
"learning_rate": 3.4687889661302577e-06, |
|
"loss": 0.0007, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 125.0, |
|
"epoch": 0.014719657433427003, |
|
"grad_norm": 0.48598426580429077, |
|
"kl": 0.05068827420473099, |
|
"learning_rate": 3.436516483539781e-06, |
|
"loss": 0.002, |
|
"reward": 1.7691667079925537, |
|
"reward_std": 1.1387290954589844, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.019166667014360428, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 181.6666717529297, |
|
"epoch": 0.014853472501003613, |
|
"grad_norm": 0.33522072434425354, |
|
"kl": 0.007758219726383686, |
|
"learning_rate": 3.4040614252052305e-06, |
|
"loss": 0.0003, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 185.83334350585938, |
|
"epoch": 0.014987287568580221, |
|
"grad_norm": 0.35056257247924805, |
|
"kl": 0.015054799616336823, |
|
"learning_rate": 3.3714301183045382e-06, |
|
"loss": 0.0006, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 183.33334350585938, |
|
"epoch": 0.015121102636156831, |
|
"grad_norm": 0.32826197147369385, |
|
"kl": 0.04551512748003006, |
|
"learning_rate": 3.338628924375638e-06, |
|
"loss": 0.0018, |
|
"reward": 0.41050001978874207, |
|
"reward_std": 1.034417986869812, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.006166668143123388, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 174.83334350585938, |
|
"epoch": 0.015254917703733441, |
|
"grad_norm": 0.3377326726913452, |
|
"kl": 0.04113561660051346, |
|
"learning_rate": 3.3056642380762783e-06, |
|
"loss": 0.0016, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 150.6666717529297, |
|
"epoch": 0.01538873277131005, |
|
"grad_norm": 0.6040213108062744, |
|
"kl": 0.021824371069669724, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.0009, |
|
"reward": 0.8681666851043701, |
|
"reward_std": 1.2657527923583984, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.034833334386348724, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.01552254783888666, |
|
"grad_norm": 0.008158405311405659, |
|
"kl": 0.0036414554342627525, |
|
"learning_rate": 3.2392701251101172e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.01565636290646327, |
|
"grad_norm": 0.003316780785098672, |
|
"kl": 0.0018421653658151627, |
|
"learning_rate": 3.205853642107192e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 156.6666717529297, |
|
"epoch": 0.015790177974039876, |
|
"grad_norm": 0.5106171369552612, |
|
"kl": 0.028099259361624718, |
|
"learning_rate": 3.1722995515381644e-06, |
|
"loss": 0.0011, |
|
"reward": 1.2643333673477173, |
|
"reward_std": 1.354689359664917, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.014333332888782024, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 139.33334350585938, |
|
"epoch": 0.015923993041616485, |
|
"grad_norm": 0.3868845999240875, |
|
"kl": 0.05956920236349106, |
|
"learning_rate": 3.1386143948394764e-06, |
|
"loss": 0.0024, |
|
"reward": 1.687666654586792, |
|
"reward_std": 1.259092092514038, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.021000001579523087, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 197.0, |
|
"epoch": 0.016057808109193095, |
|
"grad_norm": 0.3134411573410034, |
|
"kl": 0.021013274788856506, |
|
"learning_rate": 3.1048047389991693e-06, |
|
"loss": 0.0008, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 179.1666717529297, |
|
"epoch": 0.016191623176769705, |
|
"grad_norm": 0.636957585811615, |
|
"kl": 0.024236779659986496, |
|
"learning_rate": 3.0708771752766397e-06, |
|
"loss": 0.001, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 115.83333587646484, |
|
"epoch": 0.01632543824434631, |
|
"grad_norm": 0.01371024176478386, |
|
"kl": 0.042789995670318604, |
|
"learning_rate": 3.0368383179176584e-06, |
|
"loss": 0.0017, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 122.5, |
|
"epoch": 0.01645925331192292, |
|
"grad_norm": 0.005612315144389868, |
|
"kl": 0.020341983065009117, |
|
"learning_rate": 3.002694802864912e-06, |
|
"loss": 0.0008, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 194.6666717529297, |
|
"epoch": 0.01659306837949953, |
|
"grad_norm": 0.00693311495706439, |
|
"kl": 0.007592849433422089, |
|
"learning_rate": 2.9684532864643123e-06, |
|
"loss": 0.0003, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 106.5, |
|
"epoch": 0.01672688344707614, |
|
"grad_norm": 0.013540594838559628, |
|
"kl": 0.027746539562940598, |
|
"learning_rate": 2.9341204441673267e-06, |
|
"loss": 0.0011, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 194.1666717529297, |
|
"epoch": 0.01686069851465275, |
|
"grad_norm": 0.38014551997184753, |
|
"kl": 0.017290499061346054, |
|
"learning_rate": 2.8997029692295875e-06, |
|
"loss": 0.0007, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 1.2416388988494873, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 175.5, |
|
"epoch": 0.016994513582229358, |
|
"grad_norm": 0.4393642842769623, |
|
"kl": 0.009731654077768326, |
|
"learning_rate": 2.8652075714060296e-06, |
|
"loss": 0.0004, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 1.2416388988494873, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 108.5, |
|
"epoch": 0.017128328649805968, |
|
"grad_norm": 0.016502853482961655, |
|
"kl": 0.045004379004240036, |
|
"learning_rate": 2.8306409756428067e-06, |
|
"loss": 0.0018, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 126.16667175292969, |
|
"epoch": 0.017262143717382578, |
|
"grad_norm": 0.34395888447761536, |
|
"kl": 0.016212839633226395, |
|
"learning_rate": 2.7960099207662535e-06, |
|
"loss": 0.0006, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 154.6666717529297, |
|
"epoch": 0.017395958784959187, |
|
"grad_norm": 0.4351252615451813, |
|
"kl": 0.02109598182141781, |
|
"learning_rate": 2.761321158169134e-06, |
|
"loss": 0.0008, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 91.16667175292969, |
|
"epoch": 0.017529773852535794, |
|
"grad_norm": 0.5746130347251892, |
|
"kl": 0.0653064101934433, |
|
"learning_rate": 2.726581450494451e-06, |
|
"loss": 0.0026, |
|
"reward": 2.454500198364258, |
|
"reward_std": 0.11145174503326416, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.045500002801418304, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 121.66667175292969, |
|
"epoch": 0.017663588920112404, |
|
"grad_norm": 0.007989317178726196, |
|
"kl": 0.03470905125141144, |
|
"learning_rate": 2.6917975703170466e-06, |
|
"loss": 0.0014, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 158.33334350585938, |
|
"epoch": 0.017797403987689014, |
|
"grad_norm": 0.3317304849624634, |
|
"kl": 0.014697610400617123, |
|
"learning_rate": 2.6569762988232838e-06, |
|
"loss": 0.0006, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 185.5, |
|
"epoch": 0.017931219055265624, |
|
"grad_norm": 0.4038233458995819, |
|
"kl": 0.007475182414054871, |
|
"learning_rate": 2.6221244244890336e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 147.33334350585938, |
|
"epoch": 0.018065034122842234, |
|
"grad_norm": 0.4337051510810852, |
|
"kl": 0.03628496080636978, |
|
"learning_rate": 2.587248741756253e-06, |
|
"loss": 0.0015, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 186.33334350585938, |
|
"epoch": 0.01819884919041884, |
|
"grad_norm": 0.430644690990448, |
|
"kl": 0.029759211465716362, |
|
"learning_rate": 2.5523560497083927e-06, |
|
"loss": 0.0012, |
|
"reward": 0.5, |
|
"reward_std": 1.0, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 121.66667175292969, |
|
"epoch": 0.01833266425799545, |
|
"grad_norm": 0.5082526803016663, |
|
"kl": 0.050083570182323456, |
|
"learning_rate": 2.517453150744904e-06, |
|
"loss": 0.002, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 189.33334350585938, |
|
"epoch": 0.01846647932557206, |
|
"grad_norm": 0.2880929112434387, |
|
"kl": 0.010486051440238953, |
|
"learning_rate": 2.482546849255096e-06, |
|
"loss": 0.0004, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 161.83334350585938, |
|
"epoch": 0.01860029439314867, |
|
"grad_norm": 0.012269456870853901, |
|
"kl": 0.016117241233587265, |
|
"learning_rate": 2.447643950291608e-06, |
|
"loss": 0.0006, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 146.6666717529297, |
|
"epoch": 0.018734109460725276, |
|
"grad_norm": 0.46656334400177, |
|
"kl": 0.04306882619857788, |
|
"learning_rate": 2.4127512582437486e-06, |
|
"loss": 0.0017, |
|
"reward": 0.7596666812896729, |
|
"reward_std": 1.358882188796997, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07366666942834854, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 183.1666717529297, |
|
"epoch": 0.018867924528301886, |
|
"grad_norm": 0.31509730219841003, |
|
"kl": 0.018859118223190308, |
|
"learning_rate": 2.377875575510967e-06, |
|
"loss": 0.0008, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 122.0, |
|
"epoch": 0.019001739595878496, |
|
"grad_norm": 0.01961562968790531, |
|
"kl": 0.04471735656261444, |
|
"learning_rate": 2.3430237011767166e-06, |
|
"loss": 0.0018, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 135.5, |
|
"epoch": 0.019135554663455106, |
|
"grad_norm": 0.41898852586746216, |
|
"kl": 0.04450133442878723, |
|
"learning_rate": 2.3082024296829538e-06, |
|
"loss": 0.0018, |
|
"reward": 0.4583333432674408, |
|
"reward_std": 1.005194902420044, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0416666679084301, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 190.1666717529297, |
|
"epoch": 0.019269369731031716, |
|
"grad_norm": 0.6943901777267456, |
|
"kl": 0.031737446784973145, |
|
"learning_rate": 2.2734185495055503e-06, |
|
"loss": 0.0013, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 177.83334350585938, |
|
"epoch": 0.019403184798608322, |
|
"grad_norm": 0.4527323544025421, |
|
"kl": 0.011997250840067863, |
|
"learning_rate": 2.238678841830867e-06, |
|
"loss": 0.0005, |
|
"reward": 0.25, |
|
"reward_std": 0.273861289024353, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 94.33333587646484, |
|
"epoch": 0.019536999866184932, |
|
"grad_norm": 0.9129260182380676, |
|
"kl": 0.03500860929489136, |
|
"learning_rate": 2.2039900792337477e-06, |
|
"loss": 0.0014, |
|
"reward": 2.5158333778381348, |
|
"reward_std": 0.038783639669418335, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.015833333134651184, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 151.0, |
|
"epoch": 0.019670814933761542, |
|
"grad_norm": 0.4710156321525574, |
|
"kl": 0.040015410631895065, |
|
"learning_rate": 2.1693590243571937e-06, |
|
"loss": 0.0016, |
|
"reward": 1.75, |
|
"reward_std": 1.172603964805603, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 109.5, |
|
"epoch": 0.019804630001338152, |
|
"grad_norm": 0.4614563584327698, |
|
"kl": 0.046104662120342255, |
|
"learning_rate": 2.134792428593971e-06, |
|
"loss": 0.0018, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 170.83334350585938, |
|
"epoch": 0.01993844506891476, |
|
"grad_norm": 0.37295976281166077, |
|
"kl": 0.026096370071172714, |
|
"learning_rate": 2.1002970307704134e-06, |
|
"loss": 0.001, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 141.5, |
|
"epoch": 0.02007226013649137, |
|
"grad_norm": 0.46510520577430725, |
|
"kl": 0.04087626934051514, |
|
"learning_rate": 2.0658795558326745e-06, |
|
"loss": 0.0016, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 196.0, |
|
"epoch": 0.020206075204067978, |
|
"grad_norm": 0.12236814945936203, |
|
"kl": 0.03315367549657822, |
|
"learning_rate": 2.031546713535688e-06, |
|
"loss": 0.0013, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 124.33333587646484, |
|
"epoch": 0.020339890271644588, |
|
"grad_norm": 0.4959777891635895, |
|
"kl": 0.06505782902240753, |
|
"learning_rate": 1.997305197135089e-06, |
|
"loss": 0.0026, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 98.33333587646484, |
|
"epoch": 0.020473705339221198, |
|
"grad_norm": 0.5583345293998718, |
|
"kl": 0.06674438714981079, |
|
"learning_rate": 1.963161682082342e-06, |
|
"loss": 0.0027, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 185.33334350585938, |
|
"epoch": 0.020607520406797804, |
|
"grad_norm": 0.4062897264957428, |
|
"kl": 0.008387739770114422, |
|
"learning_rate": 1.9291228247233607e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 162.5, |
|
"epoch": 0.020741335474374414, |
|
"grad_norm": 0.483214795589447, |
|
"kl": 0.04108166694641113, |
|
"learning_rate": 1.895195261000831e-06, |
|
"loss": 0.0016, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 1.2416388988494873, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 111.33333587646484, |
|
"epoch": 0.020875150541951024, |
|
"grad_norm": 1.3660439252853394, |
|
"kl": 0.14076313376426697, |
|
"learning_rate": 1.8613856051605242e-06, |
|
"loss": 0.0056, |
|
"reward": 2.5288333892822266, |
|
"reward_std": 0.07062703371047974, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.028833333402872086, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 118.16667175292969, |
|
"epoch": 0.021008965609527634, |
|
"grad_norm": 0.5607126355171204, |
|
"kl": 0.04791057109832764, |
|
"learning_rate": 1.827700448461836e-06, |
|
"loss": 0.0019, |
|
"reward": 0.905666708946228, |
|
"reward_std": 1.251636028289795, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.010999999940395355, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 93.5, |
|
"epoch": 0.02114278067710424, |
|
"grad_norm": 0.013210687786340714, |
|
"kl": 0.02975599095225334, |
|
"learning_rate": 1.7941463578928088e-06, |
|
"loss": 0.0012, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 184.83334350585938, |
|
"epoch": 0.02127659574468085, |
|
"grad_norm": 0.5084704160690308, |
|
"kl": 0.01870289444923401, |
|
"learning_rate": 1.7607298748898844e-06, |
|
"loss": 0.0007, |
|
"reward": 0.25, |
|
"reward_std": 0.273861289024353, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 148.1666717529297, |
|
"epoch": 0.02141041081225746, |
|
"grad_norm": 0.577641487121582, |
|
"kl": 0.10752715915441513, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 0.0043, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 125.83333587646484, |
|
"epoch": 0.02154422587983407, |
|
"grad_norm": 0.39314910769462585, |
|
"kl": 0.04669243097305298, |
|
"learning_rate": 1.6943357619237227e-06, |
|
"loss": 0.0019, |
|
"reward": 1.4166667461395264, |
|
"reward_std": 1.2006943225860596, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 129.0, |
|
"epoch": 0.021678040947410677, |
|
"grad_norm": 0.6266517639160156, |
|
"kl": 0.07034413516521454, |
|
"learning_rate": 1.661371075624363e-06, |
|
"loss": 0.0028, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.2909945249557495, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 157.33334350585938, |
|
"epoch": 0.021811856014987287, |
|
"grad_norm": 0.6227591037750244, |
|
"kl": 0.020313257351517677, |
|
"learning_rate": 1.6285698816954626e-06, |
|
"loss": 0.0008, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 100.5, |
|
"epoch": 0.021945671082563897, |
|
"grad_norm": 0.724163830280304, |
|
"kl": 0.07246337085962296, |
|
"learning_rate": 1.5959385747947697e-06, |
|
"loss": 0.0029, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 195.83334350585938, |
|
"epoch": 0.022079486150140507, |
|
"grad_norm": 0.6425427198410034, |
|
"kl": 0.009818270802497864, |
|
"learning_rate": 1.56348351646022e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5, |
|
"reward_std": 1.0, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 173.1666717529297, |
|
"epoch": 0.022213301217717116, |
|
"grad_norm": 0.31658393144607544, |
|
"kl": 0.02298016846179962, |
|
"learning_rate": 1.5312110338697427e-06, |
|
"loss": 0.0009, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 144.33334350585938, |
|
"epoch": 0.022347116285293723, |
|
"grad_norm": 0.7370114922523499, |
|
"kl": 0.038862839341163635, |
|
"learning_rate": 1.4991274186077632e-06, |
|
"loss": 0.0016, |
|
"reward": 1.1519999504089355, |
|
"reward_std": 1.4199146032333374, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09800000488758087, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 108.16667175292969, |
|
"epoch": 0.022480931352870333, |
|
"grad_norm": 1.0748318433761597, |
|
"kl": 0.05299171060323715, |
|
"learning_rate": 1.467238925438646e-06, |
|
"loss": 0.0021, |
|
"reward": 2.5209999084472656, |
|
"reward_std": 0.051439255475997925, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.021000001579523087, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 91.83333587646484, |
|
"epoch": 0.022614746420446943, |
|
"grad_norm": 1.3673151731491089, |
|
"kl": 0.07639364898204803, |
|
"learning_rate": 1.4355517710873184e-06, |
|
"loss": 0.0031, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 135.5, |
|
"epoch": 0.022748561488023553, |
|
"grad_norm": 0.4482136368751526, |
|
"kl": 0.02127678319811821, |
|
"learning_rate": 1.4040721330273063e-06, |
|
"loss": 0.0009, |
|
"reward": 2.1666667461395264, |
|
"reward_std": 0.8164965510368347, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 165.33334350585938, |
|
"epoch": 0.02288237655560016, |
|
"grad_norm": 0.27974173426628113, |
|
"kl": 0.01391599141061306, |
|
"learning_rate": 1.3728061482764238e-06, |
|
"loss": 0.0006, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 176.33334350585938, |
|
"epoch": 0.02301619162317677, |
|
"grad_norm": 0.4582180082798004, |
|
"kl": 0.016940653324127197, |
|
"learning_rate": 1.3417599122003464e-06, |
|
"loss": 0.0007, |
|
"reward": 1.4166667461395264, |
|
"reward_std": 1.2006943225860596, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.02315000669075338, |
|
"grad_norm": 0.0036287466064095497, |
|
"kl": 0.0012827565660700202, |
|
"learning_rate": 1.3109394773243117e-06, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 85.83333587646484, |
|
"epoch": 0.02328382175832999, |
|
"grad_norm": 0.021944554522633553, |
|
"kl": 0.04209326207637787, |
|
"learning_rate": 1.280350852153168e-06, |
|
"loss": 0.0017, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 108.0, |
|
"epoch": 0.0234176368259066, |
|
"grad_norm": 0.01814507693052292, |
|
"kl": 0.04845254868268967, |
|
"learning_rate": 1.2500000000000007e-06, |
|
"loss": 0.0019, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 169.0, |
|
"epoch": 0.023551451893483205, |
|
"grad_norm": 0.418618768453598, |
|
"kl": 0.01748467981815338, |
|
"learning_rate": 1.2198928378235717e-06, |
|
"loss": 0.0007, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 172.0, |
|
"epoch": 0.023685266961059815, |
|
"grad_norm": 0.33497563004493713, |
|
"kl": 0.023299388587474823, |
|
"learning_rate": 1.1900352350748026e-06, |
|
"loss": 0.0009, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 176.6666717529297, |
|
"epoch": 0.023819082028636425, |
|
"grad_norm": 0.629429817199707, |
|
"kl": 0.022387957200407982, |
|
"learning_rate": 1.160433012552508e-06, |
|
"loss": 0.0009, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 1.2757759094238281, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 105.0, |
|
"epoch": 0.023952897096213035, |
|
"grad_norm": 0.6294716596603394, |
|
"kl": 0.019054202362895012, |
|
"learning_rate": 1.1310919412686248e-06, |
|
"loss": 0.0008, |
|
"reward": 2.1666667461395264, |
|
"reward_std": 0.8164965510368347, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 172.5, |
|
"epoch": 0.02408671216378964, |
|
"grad_norm": 0.318983793258667, |
|
"kl": 0.022515472024679184, |
|
"learning_rate": 1.1020177413231334e-06, |
|
"loss": 0.0009, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 126.33333587646484, |
|
"epoch": 0.02422052723136625, |
|
"grad_norm": 0.007083941251039505, |
|
"kl": 0.026608020067214966, |
|
"learning_rate": 1.073216080788921e-06, |
|
"loss": 0.0011, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 101.83333587646484, |
|
"epoch": 0.02435434229894286, |
|
"grad_norm": 0.01637137122452259, |
|
"kl": 0.05336300656199455, |
|
"learning_rate": 1.0446925746067768e-06, |
|
"loss": 0.0021, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 133.6666717529297, |
|
"epoch": 0.02448815736651947, |
|
"grad_norm": 0.010956374928355217, |
|
"kl": 0.015068410895764828, |
|
"learning_rate": 1.0164527834907468e-06, |
|
"loss": 0.0006, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 152.83334350585938, |
|
"epoch": 0.02462197243409608, |
|
"grad_norm": 0.059698257595300674, |
|
"kl": 0.026588236913084984, |
|
"learning_rate": 9.88502212844063e-07, |
|
"loss": 0.0011, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 104.5, |
|
"epoch": 0.024755787501672687, |
|
"grad_norm": 0.8568313121795654, |
|
"kl": 0.1081070601940155, |
|
"learning_rate": 9.608463116858544e-07, |
|
"loss": 0.0043, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 188.0, |
|
"epoch": 0.024889602569249297, |
|
"grad_norm": 0.4310242533683777, |
|
"kl": 0.009734027087688446, |
|
"learning_rate": 9.334904715888496e-07, |
|
"loss": 0.0004, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 134.1666717529297, |
|
"epoch": 0.025023417636825907, |
|
"grad_norm": 0.43237951397895813, |
|
"kl": 0.11007413268089294, |
|
"learning_rate": 9.064400256282757e-07, |
|
"loss": 0.0044, |
|
"reward": 2.4739999771118164, |
|
"reward_std": 0.063686802983284, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.026000000536441803, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 120.83333587646484, |
|
"epoch": 0.025157232704402517, |
|
"grad_norm": 0.5578576922416687, |
|
"kl": 0.06421395391225815, |
|
"learning_rate": 8.797002473421729e-07, |
|
"loss": 0.0026, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 129.33334350585938, |
|
"epoch": 0.025291047771979124, |
|
"grad_norm": 0.010872965678572655, |
|
"kl": 0.04434513673186302, |
|
"learning_rate": 8.532763497032987e-07, |
|
"loss": 0.0018, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 82.33333587646484, |
|
"epoch": 0.025424862839555733, |
|
"grad_norm": 0.8208261728286743, |
|
"kl": 0.12054745852947235, |
|
"learning_rate": 8.271734841028553e-07, |
|
"loss": 0.0048, |
|
"reward": 2.125, |
|
"reward_std": 1.0458250045776367, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0416666679084301, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 157.6666717529297, |
|
"epoch": 0.025558677907132343, |
|
"grad_norm": 0.007430327590554953, |
|
"kl": 0.0287347212433815, |
|
"learning_rate": 8.013967393462094e-07, |
|
"loss": 0.0011, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 105.33333587646484, |
|
"epoch": 0.025692492974708953, |
|
"grad_norm": 0.5002444386482239, |
|
"kl": 0.05566655844449997, |
|
"learning_rate": 7.759511406608255e-07, |
|
"loss": 0.0022, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 135.83334350585938, |
|
"epoch": 0.02582630804228556, |
|
"grad_norm": 0.663666844367981, |
|
"kl": 0.07299192249774933, |
|
"learning_rate": 7.508416487165862e-07, |
|
"loss": 0.0029, |
|
"reward": 1.68149995803833, |
|
"reward_std": 1.2683271169662476, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.014833333902060986, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 153.5, |
|
"epoch": 0.02596012310986217, |
|
"grad_norm": 0.48523613810539246, |
|
"kl": 0.019618486985564232, |
|
"learning_rate": 7.260731586586983e-07, |
|
"loss": 0.0008, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 185.1666717529297, |
|
"epoch": 0.02609393817743878, |
|
"grad_norm": 0.48822808265686035, |
|
"kl": 0.01835816167294979, |
|
"learning_rate": 7.016504991533727e-07, |
|
"loss": 0.0007, |
|
"reward": 0.1666666716337204, |
|
"reward_std": 0.25819891691207886, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 98.16667175292969, |
|
"epoch": 0.02622775324501539, |
|
"grad_norm": 0.8225710391998291, |
|
"kl": 0.05286232382059097, |
|
"learning_rate": 6.775784314464717e-07, |
|
"loss": 0.0021, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 95.0, |
|
"epoch": 0.026361568312592, |
|
"grad_norm": 0.02045668661594391, |
|
"kl": 0.11426003277301788, |
|
"learning_rate": 6.538616484352902e-07, |
|
"loss": 0.0046, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 161.1666717529297, |
|
"epoch": 0.026495383380168606, |
|
"grad_norm": 0.5276882648468018, |
|
"kl": 0.02128939889371395, |
|
"learning_rate": 6.305047737536707e-07, |
|
"loss": 0.0009, |
|
"reward": 0.6041666865348816, |
|
"reward_std": 0.9566108584403992, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 109.33333587646484, |
|
"epoch": 0.026629198447745216, |
|
"grad_norm": 0.009646751917898655, |
|
"kl": 0.032662972807884216, |
|
"learning_rate": 6.075123608706093e-07, |
|
"loss": 0.0013, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 183.5, |
|
"epoch": 0.026763013515321826, |
|
"grad_norm": 0.4662231504917145, |
|
"kl": 0.018144948408007622, |
|
"learning_rate": 5.848888922025553e-07, |
|
"loss": 0.0007, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 1.2757759094238281, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02083333395421505, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 134.33334350585938, |
|
"epoch": 0.026896828582898435, |
|
"grad_norm": 0.3459940552711487, |
|
"kl": 0.02849223092198372, |
|
"learning_rate": 5.626387782395512e-07, |
|
"loss": 0.0011, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 144.5, |
|
"epoch": 0.027030643650475042, |
|
"grad_norm": 0.4389075040817261, |
|
"kl": 0.10118179768323898, |
|
"learning_rate": 5.407663566854008e-07, |
|
"loss": 0.004, |
|
"reward": 0.8068333864212036, |
|
"reward_std": 1.312966227531433, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.026499999687075615, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 135.1666717529297, |
|
"epoch": 0.027164458718051652, |
|
"grad_norm": 0.3375273644924164, |
|
"kl": 0.03832804784178734, |
|
"learning_rate": 5.192758916120236e-07, |
|
"loss": 0.0015, |
|
"reward": 1.5, |
|
"reward_std": 1.095445156097412, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 149.6666717529297, |
|
"epoch": 0.02729827378562826, |
|
"grad_norm": 0.018328191712498665, |
|
"kl": 0.03014025092124939, |
|
"learning_rate": 4.981715726281666e-07, |
|
"loss": 0.0012, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 160.1666717529297, |
|
"epoch": 0.02743208885320487, |
|
"grad_norm": 0.08511482924222946, |
|
"kl": 0.06747082620859146, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 0.0027, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 82.66667175292969, |
|
"epoch": 0.02756590392078148, |
|
"grad_norm": 4.189925193786621, |
|
"kl": 0.9347108602523804, |
|
"learning_rate": 4.5713775416217884e-07, |
|
"loss": 0.0374, |
|
"reward": 2.1038334369659424, |
|
"reward_std": 0.9704062342643738, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.020500000566244125, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 195.83334350585938, |
|
"epoch": 0.027699718988358088, |
|
"grad_norm": 0.019951414316892624, |
|
"kl": 0.005952201783657074, |
|
"learning_rate": 4.372162543042624e-07, |
|
"loss": 0.0002, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 81.33333587646484, |
|
"epoch": 0.027833534055934698, |
|
"grad_norm": 0.6730319261550903, |
|
"kl": 0.09711631387472153, |
|
"learning_rate": 4.1769689822475147e-07, |
|
"loss": 0.0039, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 148.83334350585938, |
|
"epoch": 0.027967349123511308, |
|
"grad_norm": 0.012037621811032295, |
|
"kl": 0.03848704323172569, |
|
"learning_rate": 3.9858349126078945e-07, |
|
"loss": 0.0015, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 140.0, |
|
"epoch": 0.028101164191087918, |
|
"grad_norm": 0.9327221512794495, |
|
"kl": 0.03975226730108261, |
|
"learning_rate": 3.798797596089351e-07, |
|
"loss": 0.0016, |
|
"reward": 2.1533334255218506, |
|
"reward_std": 0.8105965852737427, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.013333333656191826, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 138.5, |
|
"epoch": 0.028234979258664524, |
|
"grad_norm": 0.3639069199562073, |
|
"kl": 0.019840382039546967, |
|
"learning_rate": 3.615893495987335e-07, |
|
"loss": 0.0008, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 127.5, |
|
"epoch": 0.028368794326241134, |
|
"grad_norm": 1.152321457862854, |
|
"kl": 0.12709830701351166, |
|
"learning_rate": 3.4371582698185636e-07, |
|
"loss": 0.0051, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 111.66667175292969, |
|
"epoch": 0.028502609393817744, |
|
"grad_norm": 0.6623407602310181, |
|
"kl": 0.03994090482592583, |
|
"learning_rate": 3.262626762369525e-07, |
|
"loss": 0.0016, |
|
"reward": 2.5416667461395264, |
|
"reward_std": 0.10206206887960434, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0416666679084301, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.028636424461394354, |
|
"grad_norm": 0.004210959654301405, |
|
"kl": 0.0024131210520863533, |
|
"learning_rate": 3.092332998903416e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 99.16667175292969, |
|
"epoch": 0.028770239528970964, |
|
"grad_norm": 0.014651302248239517, |
|
"kl": 0.09786570072174072, |
|
"learning_rate": 2.9263101785268253e-07, |
|
"loss": 0.0039, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 123.66667175292969, |
|
"epoch": 0.02890405459654757, |
|
"grad_norm": 0.006985802669078112, |
|
"kl": 0.024645794183015823, |
|
"learning_rate": 2.764590667717562e-07, |
|
"loss": 0.001, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 142.83334350585938, |
|
"epoch": 0.02903786966412418, |
|
"grad_norm": 0.5996547341346741, |
|
"kl": 0.037068672478199005, |
|
"learning_rate": 2.6072059940146775e-07, |
|
"loss": 0.0015, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 123.16667175292969, |
|
"epoch": 0.02917168473170079, |
|
"grad_norm": 0.005444398615509272, |
|
"kl": 0.0173953827470541, |
|
"learning_rate": 2.454186839872158e-07, |
|
"loss": 0.0007, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 119.33333587646484, |
|
"epoch": 0.0293054997992774, |
|
"grad_norm": 0.4850221574306488, |
|
"kl": 0.04719596356153488, |
|
"learning_rate": 2.3055630366772857e-07, |
|
"loss": 0.0019, |
|
"reward": 2.1666667461395264, |
|
"reward_std": 0.8164965510368347, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 194.83334350585938, |
|
"epoch": 0.029439314866854006, |
|
"grad_norm": 0.2741755247116089, |
|
"kl": 0.005098981317132711, |
|
"learning_rate": 2.1613635589349756e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 180.0, |
|
"epoch": 0.029573129934430616, |
|
"grad_norm": 0.4658772051334381, |
|
"kl": 0.02872592769563198, |
|
"learning_rate": 2.0216165186191406e-07, |
|
"loss": 0.0011, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 1.2416388988494873, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 180.5, |
|
"epoch": 0.029706945002007226, |
|
"grad_norm": 0.5886812210083008, |
|
"kl": 0.026107415556907654, |
|
"learning_rate": 1.8863491596921745e-07, |
|
"loss": 0.001, |
|
"reward": 0.8080000281333923, |
|
"reward_std": 1.317206859588623, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.025333335623145103, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 137.33334350585938, |
|
"epoch": 0.029840760069583836, |
|
"grad_norm": 0.013461791910231113, |
|
"kl": 0.020822227001190186, |
|
"learning_rate": 1.7555878527937164e-07, |
|
"loss": 0.0008, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 88.83333587646484, |
|
"epoch": 0.029974575137160443, |
|
"grad_norm": 0.9285796284675598, |
|
"kl": 0.061391860246658325, |
|
"learning_rate": 1.629358090099639e-07, |
|
"loss": 0.0025, |
|
"reward": 2.1008334159851074, |
|
"reward_std": 1.0300506353378296, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.017500000074505806, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 111.83333587646484, |
|
"epoch": 0.030108390204737052, |
|
"grad_norm": 0.47369858622550964, |
|
"kl": 0.05458936095237732, |
|
"learning_rate": 1.507684480352292e-07, |
|
"loss": 0.0022, |
|
"reward": 2.0833334922790527, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 102.5, |
|
"epoch": 0.030242205272313662, |
|
"grad_norm": 0.5270379185676575, |
|
"kl": 0.04939878731966019, |
|
"learning_rate": 1.3905907440629752e-07, |
|
"loss": 0.002, |
|
"reward": 2.484499931335449, |
|
"reward_std": 0.03796706721186638, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.015500000678002834, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 122.0, |
|
"epoch": 0.030376020339890272, |
|
"grad_norm": 1.011409878730774, |
|
"kl": 0.07699307054281235, |
|
"learning_rate": 1.278099708887587e-07, |
|
"loss": 0.0031, |
|
"reward": 2.0989999771118164, |
|
"reward_std": 0.9822453856468201, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.015666667371988297, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 177.0, |
|
"epoch": 0.030509835407466882, |
|
"grad_norm": 0.24471516907215118, |
|
"kl": 0.013278186321258545, |
|
"learning_rate": 1.1702333051763271e-07, |
|
"loss": 0.0005, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 125.66667175292969, |
|
"epoch": 0.03064365047504349, |
|
"grad_norm": 0.4674018323421478, |
|
"kl": 0.03539639338850975, |
|
"learning_rate": 1.067012561698319e-07, |
|
"loss": 0.0014, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 136.1666717529297, |
|
"epoch": 0.0307774655426201, |
|
"grad_norm": 0.5469502806663513, |
|
"kl": 0.013292806223034859, |
|
"learning_rate": 9.684576015420277e-08, |
|
"loss": 0.0005, |
|
"reward": 0.75, |
|
"reward_std": 0.8803408145904541, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 188.1666717529297, |
|
"epoch": 0.03091128061019671, |
|
"grad_norm": 0.48473238945007324, |
|
"kl": 0.009322606027126312, |
|
"learning_rate": 8.745876381922147e-08, |
|
"loss": 0.0004, |
|
"reward": 0.4166666865348816, |
|
"reward_std": 1.0206208229064941, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 162.0, |
|
"epoch": 0.03104509567777332, |
|
"grad_norm": 0.008338610641658306, |
|
"kl": 0.05397571250796318, |
|
"learning_rate": 7.854209717842231e-08, |
|
"loss": 0.0022, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 109.16667175292969, |
|
"epoch": 0.031178910745349925, |
|
"grad_norm": 0.05844065919518471, |
|
"kl": 0.04213399440050125, |
|
"learning_rate": 7.009749855363457e-08, |
|
"loss": 0.0017, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 172.6666717529297, |
|
"epoch": 0.03131272581292654, |
|
"grad_norm": 0.36787256598472595, |
|
"kl": 0.02669469267129898, |
|
"learning_rate": 6.212661423609184e-08, |
|
"loss": 0.0011, |
|
"reward": 1.75, |
|
"reward_std": 1.172603964805603, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 137.1666717529297, |
|
"epoch": 0.031446540880503145, |
|
"grad_norm": 0.9061893820762634, |
|
"kl": 0.04043171927332878, |
|
"learning_rate": 5.463099816548578e-08, |
|
"loss": 0.0016, |
|
"reward": 1.6258333921432495, |
|
"reward_std": 1.356467604637146, |
|
"rewards/correctness_reward_func": 1.3333333730697632, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04083333536982536, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 128.5, |
|
"epoch": 0.03158035594807975, |
|
"grad_norm": 0.8342092037200928, |
|
"kl": 0.051375072449445724, |
|
"learning_rate": 4.761211162702117e-08, |
|
"loss": 0.0021, |
|
"reward": 1.25, |
|
"reward_std": 1.3693064451217651, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 192.1666717529297, |
|
"epoch": 0.031714171015656364, |
|
"grad_norm": 0.004815606400370598, |
|
"kl": 0.005360649898648262, |
|
"learning_rate": 4.1071322966535487e-08, |
|
"loss": 0.0002, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 162.33334350585938, |
|
"epoch": 0.03184798608323297, |
|
"grad_norm": 0.40393784642219543, |
|
"kl": 0.02690042555332184, |
|
"learning_rate": 3.5009907323737826e-08, |
|
"loss": 0.0011, |
|
"reward": 2.052999973297119, |
|
"reward_std": 1.0083918571472168, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03033333271741867, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 150.33334350585938, |
|
"epoch": 0.031981801150809584, |
|
"grad_norm": 0.5924004912376404, |
|
"kl": 0.0438881441950798, |
|
"learning_rate": 2.9429046383618042e-08, |
|
"loss": 0.0018, |
|
"reward": 1.9803333282470703, |
|
"reward_std": 0.9830150008201599, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.4166666865348816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10300000011920929, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 185.5, |
|
"epoch": 0.03211561621838619, |
|
"grad_norm": 0.4823758602142334, |
|
"kl": 0.01092308945953846, |
|
"learning_rate": 2.4329828146074096e-08, |
|
"loss": 0.0004, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 199.33334350585938, |
|
"epoch": 0.0322494312859628, |
|
"grad_norm": 0.9771707057952881, |
|
"kl": 0.11909093707799911, |
|
"learning_rate": 1.9713246713805588e-08, |
|
"loss": 0.0048, |
|
"reward": 0.4271666705608368, |
|
"reward_std": 0.8259710073471069, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0728333368897438, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 142.83334350585938, |
|
"epoch": 0.03238324635353941, |
|
"grad_norm": 0.013215508311986923, |
|
"kl": 0.019352126866579056, |
|
"learning_rate": 1.5580202098509078e-08, |
|
"loss": 0.0008, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 159.33334350585938, |
|
"epoch": 0.03251706142111602, |
|
"grad_norm": 0.7379459738731384, |
|
"kl": 0.055626656860113144, |
|
"learning_rate": 1.193150004542204e-08, |
|
"loss": 0.0022, |
|
"reward": -0.10683333873748779, |
|
"reward_std": 0.13728278875350952, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10683333873748779, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 123.16667175292969, |
|
"epoch": 0.03265087648869262, |
|
"grad_norm": 0.008916381746530533, |
|
"kl": 0.02974400669336319, |
|
"learning_rate": 8.767851876239075e-09, |
|
"loss": 0.0012, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 126.16667175292969, |
|
"epoch": 0.03278469155626924, |
|
"grad_norm": 0.8518889546394348, |
|
"kl": 0.02666613459587097, |
|
"learning_rate": 6.089874350439507e-09, |
|
"loss": 0.0011, |
|
"reward": 1.245500087738037, |
|
"reward_std": 1.3644126653671265, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.004500000271946192, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 112.5, |
|
"epoch": 0.03291850662384584, |
|
"grad_norm": 0.6925764083862305, |
|
"kl": 0.07085268944501877, |
|
"learning_rate": 3.8980895450474455e-09, |
|
"loss": 0.0028, |
|
"reward": 2.454166889190674, |
|
"reward_std": 0.11226829886436462, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04583333432674408, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 106.83333587646484, |
|
"epoch": 0.03305232169142246, |
|
"grad_norm": 0.5025346279144287, |
|
"kl": 0.05391158163547516, |
|
"learning_rate": 2.192924752854042e-09, |
|
"loss": 0.0022, |
|
"reward": 2.4696667194366455, |
|
"reward_std": 0.07430125772953033, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03033333271741867, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 103.0, |
|
"epoch": 0.03318613675899906, |
|
"grad_norm": 0.6465145945549011, |
|
"kl": 0.08165942132472992, |
|
"learning_rate": 9.747123991141193e-10, |
|
"loss": 0.0033, |
|
"reward": 2.1666667461395264, |
|
"reward_std": 0.8164965510368347, |
|
"rewards/correctness_reward_func": 1.6666667461395264, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 181.6666717529297, |
|
"epoch": 0.03331995182657567, |
|
"grad_norm": 0.398413747549057, |
|
"kl": 0.016869667917490005, |
|
"learning_rate": 2.43689976739403e-10, |
|
"loss": 0.0007, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 176.33334350585938, |
|
"epoch": 0.03345376689415228, |
|
"grad_norm": 0.4862636625766754, |
|
"kl": 0.020642109215259552, |
|
"learning_rate": 0.0, |
|
"loss": 0.0008, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 1.29099440574646, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0, |
|
"step": 250 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|