{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9994666666666666, |
|
"eval_steps": 100, |
|
"global_step": 937, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 644.2009124755859, |
|
"epoch": 0.0010666666666666667, |
|
"grad_norm": 0.7038388582582233, |
|
"kl": 0.0, |
|
"learning_rate": 3.191489361702128e-08, |
|
"loss": 0.0066, |
|
"reward": 0.3364444375038147, |
|
"reward_std": 0.4127754420042038, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/cosine_scaled_reward": 0.33644443191587925, |
|
"rewards/format_reward": 0.0, |
|
"rewards/len_reward": -0.04627744515892118, |
|
"rewards/reasoning_steps_reward": 0.2901785895228386, |
|
"rewards/tag_count_reward": 0.01562500069849193, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 646.2991361618042, |
|
"epoch": 0.005333333333333333, |
|
"grad_norm": 1.165178025484845, |
|
"kl": 0.00015673041343688965, |
|
"learning_rate": 1.5957446808510638e-07, |
|
"loss": 0.047, |
|
"reward": 0.3832849070895463, |
|
"reward_std": 0.4201222136616707, |
|
"rewards/accuracy_reward": 0.007812500465661287, |
|
"rewards/cosine_scaled_reward": 0.37212418764829636, |
|
"rewards/format_reward": 0.003348214435391128, |
|
"rewards/len_reward": -0.04301465207754518, |
|
"rewards/reasoning_steps_reward": 0.3128720454405993, |
|
"rewards/tag_count_reward": 0.02008928661234677, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.7919937133789, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.9583817953361469, |
|
"kl": 0.0002131938934326172, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0412, |
|
"reward": 0.30802508229389786, |
|
"reward_std": 0.39543123468756675, |
|
"rewards/accuracy_reward": 0.0062500002793967726, |
|
"rewards/cosine_scaled_reward": 0.2964179317001253, |
|
"rewards/format_reward": 0.0053571430966258046, |
|
"rewards/len_reward": -0.05133374946017284, |
|
"rewards/reasoning_steps_reward": 0.30892858691513536, |
|
"rewards/tag_count_reward": 0.017410715389996768, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 634.5687789916992, |
|
"epoch": 0.016, |
|
"grad_norm": 1.2665941426985083, |
|
"kl": 0.0002197742462158203, |
|
"learning_rate": 4.787234042553192e-07, |
|
"loss": 0.03, |
|
"reward": 0.3102074888069183, |
|
"reward_std": 0.41705508157610893, |
|
"rewards/accuracy_reward": 0.00714285746216774, |
|
"rewards/cosine_scaled_reward": 0.2977074964437634, |
|
"rewards/format_reward": 0.0053571430966258046, |
|
"rewards/len_reward": -0.04337599159625825, |
|
"rewards/reasoning_steps_reward": 0.28660716451704504, |
|
"rewards/tag_count_reward": 0.02656250132713467, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 639.2134269714355, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 0.5388664872711878, |
|
"kl": 0.00028989315032958987, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0341, |
|
"reward": 0.33970867421012374, |
|
"reward_std": 0.3754332659766078, |
|
"rewards/accuracy_reward": 0.008035714738070965, |
|
"rewards/cosine_scaled_reward": 0.3289943867130205, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/len_reward": -0.0342805469466839, |
|
"rewards/reasoning_steps_reward": 0.32232144884765146, |
|
"rewards/tag_count_reward": 0.02187500111758709, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.0866355895996, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 1.33339338523773, |
|
"kl": 0.00041189193725585935, |
|
"learning_rate": 7.978723404255319e-07, |
|
"loss": 0.059, |
|
"reward": 0.3427321955561638, |
|
"reward_std": 0.4108327502384782, |
|
"rewards/accuracy_reward": 0.004464285913854837, |
|
"rewards/cosine_scaled_reward": 0.3364822005853057, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/len_reward": -0.023951610503718256, |
|
"rewards/reasoning_steps_reward": 0.30625002086162567, |
|
"rewards/tag_count_reward": 0.018750000605359675, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.2518119812012, |
|
"epoch": 0.032, |
|
"grad_norm": 0.5777275749493307, |
|
"kl": 0.0007986545562744141, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0497, |
|
"reward": 0.3393188311718404, |
|
"reward_std": 0.39051519297063353, |
|
"rewards/accuracy_reward": 0.00714285746216774, |
|
"rewards/cosine_scaled_reward": 0.32503311512991784, |
|
"rewards/format_reward": 0.00714285746216774, |
|
"rewards/len_reward": -0.029087586002424358, |
|
"rewards/reasoning_steps_reward": 0.30952382907271386, |
|
"rewards/tag_count_reward": 0.025446430174633862, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 636.4821701049805, |
|
"epoch": 0.037333333333333336, |
|
"grad_norm": 1.1015594817672492, |
|
"kl": 0.053539371490478514, |
|
"learning_rate": 1.1170212765957447e-06, |
|
"loss": 0.0637, |
|
"reward": 0.36175463497638705, |
|
"reward_std": 0.3966035820543766, |
|
"rewards/accuracy_reward": 0.0062500002793967726, |
|
"rewards/cosine_scaled_reward": 0.34925462789833545, |
|
"rewards/format_reward": 0.0062500002793967726, |
|
"rewards/len_reward": -0.030977025411993965, |
|
"rewards/reasoning_steps_reward": 0.3026785932481289, |
|
"rewards/tag_count_reward": 0.02031250107102096, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 662.1411056518555, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 0.6754960541104272, |
|
"kl": 0.0025452613830566407, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0271, |
|
"reward": 0.3522125052288175, |
|
"reward_std": 0.4222590584307909, |
|
"rewards/accuracy_reward": 0.010714286193251609, |
|
"rewards/cosine_scaled_reward": 0.3307839307235554, |
|
"rewards/format_reward": 0.010714286193251609, |
|
"rewards/len_reward": -0.03555333581316518, |
|
"rewards/reasoning_steps_reward": 0.3464285958558321, |
|
"rewards/tag_count_reward": 0.031919644214212894, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 634.5562736511231, |
|
"epoch": 0.048, |
|
"grad_norm": 0.5612643620767712, |
|
"kl": 0.003268718719482422, |
|
"learning_rate": 1.4361702127659576e-06, |
|
"loss": 0.0534, |
|
"reward": 0.3792000488378108, |
|
"reward_std": 0.40308301597833635, |
|
"rewards/accuracy_reward": 0.0053571430966258046, |
|
"rewards/cosine_scaled_reward": 0.3649143366143107, |
|
"rewards/format_reward": 0.008928571920841932, |
|
"rewards/len_reward": -0.029042343396577054, |
|
"rewards/reasoning_steps_reward": 0.2964285874739289, |
|
"rewards/tag_count_reward": 0.02991071599535644, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 622.6723480224609, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.7108858441676024, |
|
"kl": 0.003692150115966797, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0455, |
|
"reward": 0.371215118841792, |
|
"reward_std": 0.3538612272590399, |
|
"rewards/accuracy_reward": 0.004464285913854837, |
|
"rewards/cosine_scaled_reward": 0.3605008259153692, |
|
"rewards/format_reward": 0.0062500002793967726, |
|
"rewards/len_reward": -0.02482663995178882, |
|
"rewards/reasoning_steps_reward": 0.2991071559488773, |
|
"rewards/tag_count_reward": 0.023883929662406444, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 611.9214553833008, |
|
"epoch": 0.058666666666666666, |
|
"grad_norm": 0.43136348282970804, |
|
"kl": 0.004044151306152344, |
|
"learning_rate": 1.7553191489361702e-06, |
|
"loss": 0.0401, |
|
"reward": 0.4254450174048543, |
|
"reward_std": 0.3616549573838711, |
|
"rewards/accuracy_reward": 0.01339285783469677, |
|
"rewards/cosine_scaled_reward": 0.3995521478354931, |
|
"rewards/format_reward": 0.012500000558793545, |
|
"rewards/len_reward": -0.015584439341910183, |
|
"rewards/reasoning_steps_reward": 0.30208335518836976, |
|
"rewards/tag_count_reward": 0.030133930058218537, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 625.6402084350586, |
|
"epoch": 0.064, |
|
"grad_norm": 0.8297564908222154, |
|
"kl": 0.0055370330810546875, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0352, |
|
"reward": 0.43232634887099264, |
|
"reward_std": 0.3845306318253279, |
|
"rewards/accuracy_reward": 0.011607143376022578, |
|
"rewards/cosine_scaled_reward": 0.3948263553902507, |
|
"rewards/format_reward": 0.025892858393490314, |
|
"rewards/len_reward": -0.009139302224502899, |
|
"rewards/reasoning_steps_reward": 0.2845238268375397, |
|
"rewards/tag_count_reward": 0.052455359627492724, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 596.3741302490234, |
|
"epoch": 0.06933333333333333, |
|
"grad_norm": 0.7228866431898863, |
|
"kl": 0.012695503234863282, |
|
"learning_rate": 2.074468085106383e-06, |
|
"loss": 0.0365, |
|
"reward": 0.5030742045491934, |
|
"reward_std": 0.43186259232461455, |
|
"rewards/accuracy_reward": 0.02767857275903225, |
|
"rewards/cosine_scaled_reward": 0.4128956302069128, |
|
"rewards/format_reward": 0.0625000030733645, |
|
"rewards/len_reward": -0.015213955337821971, |
|
"rewards/reasoning_steps_reward": 0.3366071626543999, |
|
"rewards/tag_count_reward": 0.10022321869619191, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 585.4160987854004, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.6682633979026431, |
|
"kl": 0.01843681335449219, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 0.0505, |
|
"reward": 0.5593858446925879, |
|
"reward_std": 0.4915604364126921, |
|
"rewards/accuracy_reward": 0.0419642879627645, |
|
"rewards/cosine_scaled_reward": 0.3995644087903202, |
|
"rewards/format_reward": 0.11785714896395802, |
|
"rewards/len_reward": -0.01337289962102659, |
|
"rewards/reasoning_steps_reward": 0.3351190686225891, |
|
"rewards/tag_count_reward": 0.17343750719446688, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 557.2544898986816, |
|
"epoch": 0.08, |
|
"grad_norm": 0.8532958065384724, |
|
"kl": 0.014874649047851563, |
|
"learning_rate": 2.3936170212765957e-06, |
|
"loss": 0.0318, |
|
"reward": 0.5684177622199058, |
|
"reward_std": 0.41847779490053655, |
|
"rewards/accuracy_reward": 0.025892858393490314, |
|
"rewards/cosine_scaled_reward": 0.4523463241755962, |
|
"rewards/format_reward": 0.09017857620492578, |
|
"rewards/len_reward": -0.018315281078685075, |
|
"rewards/reasoning_steps_reward": 0.3172619273886085, |
|
"rewards/tag_count_reward": 0.12142857732251286, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.3152076721192, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.9448142819988257, |
|
"kl": 0.009454345703125, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 0.0125, |
|
"reward": 0.5472255704924465, |
|
"reward_std": 0.4222383305430412, |
|
"rewards/accuracy_reward": 0.0455357164144516, |
|
"rewards/cosine_scaled_reward": 0.4150827143341303, |
|
"rewards/format_reward": 0.08660714719444514, |
|
"rewards/len_reward": -0.03305647437373409, |
|
"rewards/reasoning_steps_reward": 0.29196430519223215, |
|
"rewards/tag_count_reward": 0.10915179166477174, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.1973541259765, |
|
"epoch": 0.09066666666666667, |
|
"grad_norm": 1.2685441941621476, |
|
"kl": 0.027700042724609374, |
|
"learning_rate": 2.7127659574468088e-06, |
|
"loss": 0.0222, |
|
"reward": 0.711650275811553, |
|
"reward_std": 0.6027548030018807, |
|
"rewards/accuracy_reward": 0.07857143254950642, |
|
"rewards/cosine_scaled_reward": 0.3482574049849063, |
|
"rewards/format_reward": 0.2848214406520128, |
|
"rewards/len_reward": -0.04190118510159664, |
|
"rewards/reasoning_steps_reward": 0.32886906489729884, |
|
"rewards/tag_count_reward": 0.26964286863803866, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 652.3964576721191, |
|
"epoch": 0.096, |
|
"grad_norm": 0.7164425205527156, |
|
"kl": 0.0451019287109375, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0152, |
|
"reward": 0.8936719134449959, |
|
"reward_std": 0.6759964823722839, |
|
"rewards/accuracy_reward": 0.13303572190925478, |
|
"rewards/cosine_scaled_reward": 0.29992188327014446, |
|
"rewards/format_reward": 0.46071430779993533, |
|
"rewards/len_reward": -0.044884511404961816, |
|
"rewards/reasoning_steps_reward": 0.3279762100428343, |
|
"rewards/tag_count_reward": 0.38973215855658055, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.2643104553223, |
|
"epoch": 0.10133333333333333, |
|
"grad_norm": 0.8742676458030147, |
|
"kl": 0.0368682861328125, |
|
"learning_rate": 2.9999895838948146e-06, |
|
"loss": 0.0333, |
|
"reward": 1.0263997331261634, |
|
"reward_std": 0.7044595286250115, |
|
"rewards/accuracy_reward": 0.18839286491274834, |
|
"rewards/cosine_scaled_reward": 0.3335425513330847, |
|
"rewards/format_reward": 0.5044643081724643, |
|
"rewards/len_reward": -0.031246750250284096, |
|
"rewards/reasoning_steps_reward": 0.33482144922018053, |
|
"rewards/tag_count_reward": 0.4535714492201805, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 4.100207664946175, |
|
"learning_rate": 2.9996250354024346e-06, |
|
"loss": 0.0501, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10666666666666667, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 572.2930786132813, |
|
"eval_kl": 0.056494140625, |
|
"eval_loss": 0.056255146861076355, |
|
"eval_reward": 1.2980437994003295, |
|
"eval_reward_std": 0.7676759362220764, |
|
"eval_rewards/accuracy_reward": 0.325000011920929, |
|
"eval_rewards/cosine_scaled_reward": 0.3194723308086395, |
|
"eval_rewards/format_reward": 0.6535714626312256, |
|
"eval_rewards/len_reward": 0.0353432796895504, |
|
"eval_rewards/reasoning_steps_reward": 0.3666666865348816, |
|
"eval_rewards/tag_count_reward": 0.620089304447174, |
|
"eval_runtime": 113.2606, |
|
"eval_samples_per_second": 0.671, |
|
"eval_steps_per_second": 0.009, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.3339561462402, |
|
"epoch": 0.112, |
|
"grad_norm": 1.024357541492345, |
|
"kl": 0.14632110595703124, |
|
"learning_rate": 2.9987398263020837e-06, |
|
"loss": 0.0522, |
|
"reward": 1.2215609500184654, |
|
"reward_std": 0.7516257010400296, |
|
"rewards/accuracy_reward": 0.2843750137835741, |
|
"rewards/cosine_scaled_reward": 0.30638233484933153, |
|
"rewards/format_reward": 0.6308036033064127, |
|
"rewards/len_reward": -0.03830806077457964, |
|
"rewards/reasoning_steps_reward": 0.3824405036866665, |
|
"rewards/tag_count_reward": 0.6280134215950965, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 613.3071662902832, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.73740128371405, |
|
"kl": 0.07979736328125, |
|
"learning_rate": 2.997334263932927e-06, |
|
"loss": 0.0507, |
|
"reward": 1.2840619757771492, |
|
"reward_std": 0.7633979134261608, |
|
"rewards/accuracy_reward": 0.3741071574389935, |
|
"rewards/cosine_scaled_reward": 0.31888334788382056, |
|
"rewards/format_reward": 0.5910714514553547, |
|
"rewards/len_reward": -0.03648645522480365, |
|
"rewards/reasoning_steps_reward": 0.4241071671247482, |
|
"rewards/tag_count_reward": 0.6294643133878708, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 623.3973457336426, |
|
"epoch": 0.12266666666666666, |
|
"grad_norm": 0.5315564057866974, |
|
"kl": 0.0554351806640625, |
|
"learning_rate": 2.9954088362975936e-06, |
|
"loss": 0.0422, |
|
"reward": 1.4692801490426064, |
|
"reward_std": 0.7539573520421982, |
|
"rewards/accuracy_reward": 0.4035714466124773, |
|
"rewards/cosine_scaled_reward": 0.3612444051541388, |
|
"rewards/format_reward": 0.7044643200933933, |
|
"rewards/len_reward": -0.017655941796692786, |
|
"rewards/reasoning_steps_reward": 0.47142860367894174, |
|
"rewards/tag_count_reward": 0.6227678850293159, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 605.333065032959, |
|
"epoch": 0.128, |
|
"grad_norm": 0.5958118040109989, |
|
"kl": 0.074505615234375, |
|
"learning_rate": 2.99296421189274e-06, |
|
"loss": 0.0551, |
|
"reward": 1.4197789639234544, |
|
"reward_std": 0.7952461540699005, |
|
"rewards/accuracy_reward": 0.4223214466124773, |
|
"rewards/cosine_scaled_reward": 0.32067176890559496, |
|
"rewards/format_reward": 0.6767857477068902, |
|
"rewards/len_reward": -0.03406066980678588, |
|
"rewards/reasoning_steps_reward": 0.4580357432365417, |
|
"rewards/tag_count_reward": 0.6189732424914837, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.7732460021973, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.8607419693848777, |
|
"kl": 0.065643310546875, |
|
"learning_rate": 2.9900012394769546e-06, |
|
"loss": 0.0506, |
|
"reward": 1.4135100565850736, |
|
"reward_std": 0.7826988212764263, |
|
"rewards/accuracy_reward": 0.42232144698500634, |
|
"rewards/cosine_scaled_reward": 0.2822600125335157, |
|
"rewards/format_reward": 0.7089286029338837, |
|
"rewards/len_reward": -0.04302762195584364, |
|
"rewards/reasoning_steps_reward": 0.42529764398932457, |
|
"rewards/tag_count_reward": 0.6520089574158192, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 577.2678840637207, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 1.7581222298002241, |
|
"kl": 0.0793212890625, |
|
"learning_rate": 2.986520947776075e-06, |
|
"loss": 0.0432, |
|
"reward": 1.540949085354805, |
|
"reward_std": 0.7308154627680779, |
|
"rewards/accuracy_reward": 0.45892859138548375, |
|
"rewards/cosine_scaled_reward": 0.29630617263028397, |
|
"rewards/format_reward": 0.7857143327593803, |
|
"rewards/len_reward": -0.040497589367441834, |
|
"rewards/reasoning_steps_reward": 0.40059526450932026, |
|
"rewards/tag_count_reward": 0.7185268178582191, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 618.2786010742187, |
|
"epoch": 0.144, |
|
"grad_norm": 1.2147641283215929, |
|
"kl": 0.076068115234375, |
|
"learning_rate": 2.982524545126018e-06, |
|
"loss": 0.03, |
|
"reward": 1.4926121950149536, |
|
"reward_std": 0.7447091139853, |
|
"rewards/accuracy_reward": 0.46875002086162565, |
|
"rewards/cosine_scaled_reward": 0.28100501019507645, |
|
"rewards/format_reward": 0.7428571835160256, |
|
"rewards/len_reward": -0.04923985062050633, |
|
"rewards/reasoning_steps_reward": 0.3982143126428127, |
|
"rewards/tag_count_reward": 0.660714315623045, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 631.2741371154785, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.9629578835791102, |
|
"kl": 0.102850341796875, |
|
"learning_rate": 2.9780134190532553e-06, |
|
"loss": 0.0561, |
|
"reward": 1.442286056280136, |
|
"reward_std": 0.8065447643399238, |
|
"rewards/accuracy_reward": 0.423214303702116, |
|
"rewards/cosine_scaled_reward": 0.25300029437057675, |
|
"rewards/format_reward": 0.7660714693367481, |
|
"rewards/len_reward": -0.05252085686661303, |
|
"rewards/reasoning_steps_reward": 0.43720240965485574, |
|
"rewards/tag_count_reward": 0.6805803880095482, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 664.377709197998, |
|
"epoch": 0.15466666666666667, |
|
"grad_norm": 1.5387538457241525, |
|
"kl": 0.1677978515625, |
|
"learning_rate": 2.972989135793071e-06, |
|
"loss": 0.1038, |
|
"reward": 0.9583968121558428, |
|
"reward_std": 0.9008305951952934, |
|
"rewards/accuracy_reward": 0.29732144232839347, |
|
"rewards/cosine_scaled_reward": 0.11553962156176567, |
|
"rewards/format_reward": 0.5455357372760773, |
|
"rewards/len_reward": -0.08262603848124854, |
|
"rewards/reasoning_steps_reward": 0.43005954995751383, |
|
"rewards/tag_count_reward": 0.6725446745753288, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 669.9571769714355, |
|
"epoch": 0.16, |
|
"grad_norm": 8.232445507998719, |
|
"kl": 1.080517578125, |
|
"learning_rate": 2.967453439745775e-06, |
|
"loss": 0.1908, |
|
"reward": 0.8955060914158821, |
|
"reward_std": 0.9506111726164818, |
|
"rewards/accuracy_reward": 0.28571429643779994, |
|
"rewards/cosine_scaled_reward": 0.10711320142145268, |
|
"rewards/format_reward": 0.502678594738245, |
|
"rewards/len_reward": -0.06221853480674326, |
|
"rewards/reasoning_steps_reward": 0.3529762107878923, |
|
"rewards/tag_count_reward": 0.7274553924798965, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.216098022461, |
|
"epoch": 0.16533333333333333, |
|
"grad_norm": 5.902897016758618, |
|
"kl": 7.4263671875, |
|
"learning_rate": 2.961408252871058e-06, |
|
"loss": 0.3684, |
|
"reward": 1.1710864089429378, |
|
"reward_std": 0.969765692949295, |
|
"rewards/accuracy_reward": 0.38035716153681276, |
|
"rewards/cosine_scaled_reward": 0.17108636341872624, |
|
"rewards/format_reward": 0.6196428865194321, |
|
"rewards/len_reward": -0.03628237677039579, |
|
"rewards/reasoning_steps_reward": 0.3818452678620815, |
|
"rewards/tag_count_reward": 0.8296875432133675, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 623.2044944763184, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 15.230557110031734, |
|
"kl": 2.2271484375, |
|
"learning_rate": 2.9548556740207e-06, |
|
"loss": 0.2607, |
|
"reward": 1.023178616911173, |
|
"reward_std": 0.9279091581702232, |
|
"rewards/accuracy_reward": 0.31607144232839346, |
|
"rewards/cosine_scaled_reward": 0.11335714326705784, |
|
"rewards/format_reward": 0.5937500290572644, |
|
"rewards/len_reward": -0.04749395059770904, |
|
"rewards/reasoning_steps_reward": 0.33750002086162567, |
|
"rewards/tag_count_reward": 0.8383928969502449, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 735.4348541259766, |
|
"epoch": 0.176, |
|
"grad_norm": 5.053812153888101, |
|
"kl": 1.266015625, |
|
"learning_rate": 2.9477979782098592e-06, |
|
"loss": 0.1493, |
|
"reward": 1.0551368452608585, |
|
"reward_std": 0.8986638769507408, |
|
"rewards/accuracy_reward": 0.3321428726427257, |
|
"rewards/cosine_scaled_reward": 0.1435296577285044, |
|
"rewards/format_reward": 0.5794643104076386, |
|
"rewards/len_reward": -0.049430397151081704, |
|
"rewards/reasoning_steps_reward": 0.33333336040377615, |
|
"rewards/tag_count_reward": 0.8238839626312255, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 845.265217590332, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.6919959400923131, |
|
"kl": 0.52850341796875, |
|
"learning_rate": 2.9402376158272022e-06, |
|
"loss": 0.0567, |
|
"reward": 1.0284505672752857, |
|
"reward_std": 0.873851603269577, |
|
"rewards/accuracy_reward": 0.32767858579754827, |
|
"rewards/cosine_scaled_reward": 0.1400576631671356, |
|
"rewards/format_reward": 0.5607143126428127, |
|
"rewards/len_reward": -0.07766101571614854, |
|
"rewards/reasoning_steps_reward": 0.3125000227242708, |
|
"rewards/tag_count_reward": 0.7578125342726707, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 948.5634368896484, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.9504642967041296, |
|
"kl": 0.3015380859375, |
|
"learning_rate": 2.9321772117841463e-06, |
|
"loss": 0.019, |
|
"reward": 0.8819758415222168, |
|
"reward_std": 0.7948870480060577, |
|
"rewards/accuracy_reward": 0.27946429857984184, |
|
"rewards/cosine_scaled_reward": 0.048047243334440284, |
|
"rewards/format_reward": 0.5544643133878708, |
|
"rewards/len_reward": -0.12528894301503896, |
|
"rewards/reasoning_steps_reward": 0.31071430891752244, |
|
"rewards/tag_count_reward": 0.7254464566707611, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 990.125033569336, |
|
"epoch": 0.192, |
|
"grad_norm": 0.8747320513862124, |
|
"kl": 0.17032470703125, |
|
"learning_rate": 2.923619564603501e-06, |
|
"loss": -0.0046, |
|
"reward": 0.9295428976416588, |
|
"reward_std": 0.7955453962087631, |
|
"rewards/accuracy_reward": 0.3446428715251386, |
|
"rewards/cosine_scaled_reward": 0.0063285754295066, |
|
"rewards/format_reward": 0.5785714522004127, |
|
"rewards/len_reward": -0.14820685870945455, |
|
"rewards/reasoning_steps_reward": 0.3136904966086149, |
|
"rewards/tag_count_reward": 0.7122768208384513, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 988.6259307861328, |
|
"epoch": 0.19733333333333333, |
|
"grad_norm": 1.7966045934893466, |
|
"kl": 0.16611328125, |
|
"learning_rate": 2.9145676454478435e-06, |
|
"loss": -0.0042, |
|
"reward": 0.9886309564113617, |
|
"reward_std": 0.7453033030033112, |
|
"rewards/accuracy_reward": 0.3205357275903225, |
|
"rewards/cosine_scaled_reward": -0.0006547938079165761, |
|
"rewards/format_reward": 0.6687500283122063, |
|
"rewards/len_reward": -0.1490553854033351, |
|
"rewards/reasoning_steps_reward": 0.4044643186032772, |
|
"rewards/tag_count_reward": 0.7602678939700127, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 762.9036071777343, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 100.15512012017349, |
|
"kl": 7.372412109375, |
|
"learning_rate": 2.9050245970879456e-06, |
|
"loss": 0.3495, |
|
"reward": 0.7870586156845093, |
|
"reward_std": 0.7903555080294609, |
|
"rewards/accuracy_reward": 0.20714286556467415, |
|
"rewards/cosine_scaled_reward": -0.001334278640570119, |
|
"rewards/format_reward": 0.5812500268220901, |
|
"rewards/len_reward": -0.1288936346769333, |
|
"rewards/reasoning_steps_reward": 0.35505954883992674, |
|
"rewards/tag_count_reward": 0.6861607432365417, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.100023651123, |
|
"epoch": 0.208, |
|
"grad_norm": 3.905714073659836, |
|
"kl": 1.8900390625, |
|
"learning_rate": 2.8949937328116252e-06, |
|
"loss": 0.1685, |
|
"reward": 0.5571490220725537, |
|
"reward_std": 0.8787636801600456, |
|
"rewards/accuracy_reward": 0.10357143403962255, |
|
"rewards/cosine_scaled_reward": -0.02320814849808812, |
|
"rewards/format_reward": 0.4767857387661934, |
|
"rewards/len_reward": -0.09527752730937208, |
|
"rewards/reasoning_steps_reward": 0.3270833570510149, |
|
"rewards/tag_count_reward": 0.6017857432365418, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 3.9416410412461524, |
|
"learning_rate": 2.884478535273393e-06, |
|
"loss": 0.3992, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 499.3030212402344, |
|
"eval_kl": 3.175, |
|
"eval_loss": 0.4126989245414734, |
|
"eval_reward": 0.8562156677246093, |
|
"eval_reward_std": 0.9723240971565247, |
|
"eval_rewards/accuracy_reward": 0.21607144176959991, |
|
"eval_rewards/cosine_scaled_reward": 0.10621565729379653, |
|
"eval_rewards/format_reward": 0.5339285790920257, |
|
"eval_rewards/len_reward": 0.021631143055856228, |
|
"eval_rewards/reasoning_steps_reward": 0.30654761791229246, |
|
"eval_rewards/tag_count_reward": 0.6245536088943482, |
|
"eval_runtime": 110.8118, |
|
"eval_samples_per_second": 0.686, |
|
"eval_steps_per_second": 0.009, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.50091552734375, |
|
"epoch": 0.21866666666666668, |
|
"grad_norm": 7.328057495766339, |
|
"kl": 4.2474609375, |
|
"learning_rate": 2.8734826552852934e-06, |
|
"loss": 0.4105, |
|
"reward": 0.8123717293143272, |
|
"reward_std": 0.9503199592232704, |
|
"rewards/accuracy_reward": 0.19732143627479673, |
|
"rewards/cosine_scaled_reward": 0.07755026518425438, |
|
"rewards/format_reward": 0.5375000230967999, |
|
"rewards/len_reward": -0.06171439889585599, |
|
"rewards/reasoning_steps_reward": 0.3178571631200612, |
|
"rewards/tag_count_reward": 0.635267884656787, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 461.1893020629883, |
|
"epoch": 0.224, |
|
"grad_norm": 6.661356281614044, |
|
"kl": 4.8693359375, |
|
"learning_rate": 2.86200991054937e-06, |
|
"loss": 0.5026, |
|
"reward": 1.0250703942030668, |
|
"reward_std": 0.9945615231990814, |
|
"rewards/accuracy_reward": 0.26696429792791604, |
|
"rewards/cosine_scaled_reward": 0.1456060634693131, |
|
"rewards/format_reward": 0.6125000283122063, |
|
"rewards/len_reward": -0.038271602327586154, |
|
"rewards/reasoning_steps_reward": 0.30952383354306223, |
|
"rewards/tag_count_reward": 0.7133928909897804, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.26252365112305, |
|
"epoch": 0.22933333333333333, |
|
"grad_norm": 29.073733687853476, |
|
"kl": 5.935546875, |
|
"learning_rate": 2.850064284332176e-06, |
|
"loss": 0.4959, |
|
"reward": 0.9025619432330132, |
|
"reward_std": 0.9635577172040939, |
|
"rewards/accuracy_reward": 0.23928572610020638, |
|
"rewards/cosine_scaled_reward": 0.12399048676597886, |
|
"rewards/format_reward": 0.5392857387661933, |
|
"rewards/len_reward": -0.04608473673724802, |
|
"rewards/reasoning_steps_reward": 0.2455357311293483, |
|
"rewards/tag_count_reward": 0.6573661014437675, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 431.4607322692871, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 32.81745130495549, |
|
"kl": 4.9755859375, |
|
"learning_rate": 2.8376499240818166e-06, |
|
"loss": 0.4603, |
|
"reward": 1.0474480047822, |
|
"reward_std": 1.0046013176441193, |
|
"rewards/accuracy_reward": 0.30267858877778053, |
|
"rewards/cosine_scaled_reward": 0.1724479661497753, |
|
"rewards/format_reward": 0.5723214603960514, |
|
"rewards/len_reward": -0.03554261325771222, |
|
"rewards/reasoning_steps_reward": 0.27380954213440417, |
|
"rewards/tag_count_reward": 0.6997768178582191, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 404.6401988983154, |
|
"epoch": 0.24, |
|
"grad_norm": 28.1722788422819, |
|
"kl": 3.94912109375, |
|
"learning_rate": 2.8247711399879734e-06, |
|
"loss": 0.3951, |
|
"reward": 1.0693716026842595, |
|
"reward_std": 0.9888196289539337, |
|
"rewards/accuracy_reward": 0.2991071553900838, |
|
"rewards/cosine_scaled_reward": 0.16133585062343628, |
|
"rewards/format_reward": 0.6089285999536515, |
|
"rewards/len_reward": -0.048874559343676086, |
|
"rewards/reasoning_steps_reward": 0.3071428768336773, |
|
"rewards/tag_count_reward": 0.722098246216774, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 387.7723388671875, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 13.352200008648655, |
|
"kl": 4.7109375, |
|
"learning_rate": 2.8114324034854378e-06, |
|
"loss": 0.4193, |
|
"reward": 1.0878922022879123, |
|
"reward_std": 1.0054454013705254, |
|
"rewards/accuracy_reward": 0.2955357299186289, |
|
"rewards/cosine_scaled_reward": 0.15128503099549562, |
|
"rewards/format_reward": 0.6410714581608772, |
|
"rewards/len_reward": -0.043122516572475435, |
|
"rewards/reasoning_steps_reward": 0.28511906825006006, |
|
"rewards/tag_count_reward": 0.7535714656114578, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 410.9437679290771, |
|
"epoch": 0.25066666666666665, |
|
"grad_norm": 24.250800007195018, |
|
"kl": 3.6314453125, |
|
"learning_rate": 2.7976383457016535e-06, |
|
"loss": 0.3386, |
|
"reward": 1.1002146422863006, |
|
"reward_std": 0.9713477358222008, |
|
"rewards/accuracy_reward": 0.30625001564621923, |
|
"rewards/cosine_scaled_reward": 0.1180717434406688, |
|
"rewards/format_reward": 0.6758928917348385, |
|
"rewards/len_reward": -0.04972980402235407, |
|
"rewards/reasoning_steps_reward": 0.31785716116428375, |
|
"rewards/tag_count_reward": 0.770982176065445, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 433.2035934448242, |
|
"epoch": 0.256, |
|
"grad_norm": 19.63417414565418, |
|
"kl": 9.439453125, |
|
"learning_rate": 2.7833937558488187e-06, |
|
"loss": 0.7573, |
|
"reward": 1.1284980118274688, |
|
"reward_std": 1.0023478761315345, |
|
"rewards/accuracy_reward": 0.29821430118754505, |
|
"rewards/cosine_scaled_reward": 0.14189081880031154, |
|
"rewards/format_reward": 0.6883928954601288, |
|
"rewards/len_reward": -0.04797074495872948, |
|
"rewards/reasoning_steps_reward": 0.3354166891425848, |
|
"rewards/tag_count_reward": 0.7850446790456772, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 406.71162338256835, |
|
"epoch": 0.2613333333333333, |
|
"grad_norm": 24.59188831849529, |
|
"kl": 4.787109375, |
|
"learning_rate": 2.7687035795611003e-06, |
|
"loss": 0.3719, |
|
"reward": 1.2017367050051688, |
|
"reward_std": 0.9599395081400871, |
|
"rewards/accuracy_reward": 0.32857144586741927, |
|
"rewards/cosine_scaled_reward": 0.15620094180339947, |
|
"rewards/format_reward": 0.7169643253087997, |
|
"rewards/len_reward": -0.04119561462430284, |
|
"rewards/reasoning_steps_reward": 0.32886907272040844, |
|
"rewards/tag_count_reward": 0.7966518267989159, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 408.95180358886716, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 9.752015710296007, |
|
"kl": 3.60859375, |
|
"learning_rate": 2.7535729171775408e-06, |
|
"loss": 0.4066, |
|
"reward": 1.3339441634714604, |
|
"reward_std": 0.9489350289106369, |
|
"rewards/accuracy_reward": 0.35714287580922244, |
|
"rewards/cosine_scaled_reward": 0.22412267459294527, |
|
"rewards/format_reward": 0.7526786118745804, |
|
"rewards/len_reward": -0.01727719540358521, |
|
"rewards/reasoning_steps_reward": 0.34851192869246006, |
|
"rewards/tag_count_reward": 0.8140625387430191, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 411.16162490844727, |
|
"epoch": 0.272, |
|
"grad_norm": 3.3832375486237614, |
|
"kl": 3.37255859375, |
|
"learning_rate": 2.7380070219712514e-06, |
|
"loss": 0.3354, |
|
"reward": 1.3407041341066361, |
|
"reward_std": 0.9241366110742092, |
|
"rewards/accuracy_reward": 0.3598214445635676, |
|
"rewards/cosine_scaled_reward": 0.20677551386179402, |
|
"rewards/format_reward": 0.7741071850061416, |
|
"rewards/len_reward": -0.027734974736813457, |
|
"rewards/reasoning_steps_reward": 0.3889881126582623, |
|
"rewards/tag_count_reward": 0.8216518253087998, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 395.68305587768555, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 5.777425418025543, |
|
"kl": 2.6078125, |
|
"learning_rate": 2.722011298325509e-06, |
|
"loss": 0.2668, |
|
"reward": 1.363752231001854, |
|
"reward_std": 0.9073125638067723, |
|
"rewards/accuracy_reward": 0.37321430230513214, |
|
"rewards/cosine_scaled_reward": 0.20660932243335992, |
|
"rewards/format_reward": 0.7839286118745804, |
|
"rewards/len_reward": -0.01969163056710386, |
|
"rewards/reasoning_steps_reward": 0.3535714544355869, |
|
"rewards/tag_count_reward": 0.8475446835160255, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 383.4839458465576, |
|
"epoch": 0.2826666666666667, |
|
"grad_norm": 6.935633340822481, |
|
"kl": 3.378515625, |
|
"learning_rate": 2.705591299857385e-06, |
|
"loss": 0.285, |
|
"reward": 1.3742309853434562, |
|
"reward_std": 0.9106731534004211, |
|
"rewards/accuracy_reward": 0.39821430314332246, |
|
"rewards/cosine_scaled_reward": 0.20994521365500987, |
|
"rewards/format_reward": 0.7660714671015739, |
|
"rewards/len_reward": -0.03229370064800605, |
|
"rewards/reasoning_steps_reward": 0.40327384024858476, |
|
"rewards/tag_count_reward": 0.8417411088943482, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 403.61609268188477, |
|
"epoch": 0.288, |
|
"grad_norm": 3.039135573350634, |
|
"kl": 2.783984375, |
|
"learning_rate": 2.6887527274895657e-06, |
|
"loss": 0.1881, |
|
"reward": 1.4360268995165826, |
|
"reward_std": 0.8577364102005959, |
|
"rewards/accuracy_reward": 0.4116071589291096, |
|
"rewards/cosine_scaled_reward": 0.2128125553485006, |
|
"rewards/format_reward": 0.8116071850061417, |
|
"rewards/len_reward": -0.044078399677528066, |
|
"rewards/reasoning_steps_reward": 0.4425595536828041, |
|
"rewards/tag_count_reward": 0.8636161103844643, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 400.08126678466795, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 2.4388095548591324, |
|
"kl": 2.45634765625, |
|
"learning_rate": 2.6715014274710265e-06, |
|
"loss": 0.1916, |
|
"reward": 1.5192030668258667, |
|
"reward_std": 0.8684033416211605, |
|
"rewards/accuracy_reward": 0.4464285934343934, |
|
"rewards/cosine_scaled_reward": 0.25313157981727274, |
|
"rewards/format_reward": 0.8196428954601288, |
|
"rewards/len_reward": -0.026938176073599607, |
|
"rewards/reasoning_steps_reward": 0.4386905148625374, |
|
"rewards/tag_count_reward": 0.8888393223285675, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.9669845581055, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 6.321239319821566, |
|
"kl": 2.86875, |
|
"learning_rate": 2.65384338934725e-06, |
|
"loss": 0.2027, |
|
"reward": 1.4269624769687652, |
|
"reward_std": 0.8590005040168762, |
|
"rewards/accuracy_reward": 0.4142857328057289, |
|
"rewards/cosine_scaled_reward": 0.21446242899401113, |
|
"rewards/format_reward": 0.7982143282890319, |
|
"rewards/len_reward": -0.040290398540673775, |
|
"rewards/reasoning_steps_reward": 0.46130955815315244, |
|
"rewards/tag_count_reward": 0.8482143253087997, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 393.6366256713867, |
|
"epoch": 0.304, |
|
"grad_norm": 4.771107753553364, |
|
"kl": 1.920849609375, |
|
"learning_rate": 2.6357847438806916e-06, |
|
"loss": 0.1957, |
|
"reward": 1.607939650118351, |
|
"reward_std": 0.8728329673409462, |
|
"rewards/accuracy_reward": 0.4839285944588482, |
|
"rewards/cosine_scaled_reward": 0.28115387591533364, |
|
"rewards/format_reward": 0.84285718947649, |
|
"rewards/len_reward": -0.020568763601477257, |
|
"rewards/reasoning_steps_reward": 0.49107146337628366, |
|
"rewards/tag_count_reward": 0.9120536118745803, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.46252059936523, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 2.222415417228654, |
|
"kl": 1.5540283203125, |
|
"learning_rate": 2.617331760922218e-06, |
|
"loss": 0.1929, |
|
"reward": 1.7294473230838776, |
|
"reward_std": 0.8087163001298905, |
|
"rewards/accuracy_reward": 0.5616071686148644, |
|
"rewards/cosine_scaled_reward": 0.307125816680491, |
|
"rewards/format_reward": 0.8607143267989159, |
|
"rewards/len_reward": 0.0014614581130445003, |
|
"rewards/reasoning_steps_reward": 0.5306547991931438, |
|
"rewards/tag_count_reward": 0.910267898440361, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.6964530944824, |
|
"epoch": 0.31466666666666665, |
|
"grad_norm": 3.044011469162571, |
|
"kl": 2.042333984375, |
|
"learning_rate": 2.598490847234253e-06, |
|
"loss": 0.1994, |
|
"reward": 1.5344257444143294, |
|
"reward_std": 0.8376345105469227, |
|
"rewards/accuracy_reward": 0.490178593993187, |
|
"rewards/cosine_scaled_reward": 0.23710425589233636, |
|
"rewards/format_reward": 0.8071428954601287, |
|
"rewards/len_reward": -0.023603520775213837, |
|
"rewards/reasoning_steps_reward": 0.5127976588904858, |
|
"rewards/tag_count_reward": 0.8689732521772384, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.4088716556569, |
|
"learning_rate": 2.5792685442663883e-06, |
|
"loss": 0.1325, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 569.0503173828125, |
|
"eval_kl": 1.77578125, |
|
"eval_loss": 0.19349658489227295, |
|
"eval_reward": 1.5551659345626831, |
|
"eval_reward_std": 0.8867796778678894, |
|
"eval_rewards/accuracy_reward": 0.48214287161827085, |
|
"eval_rewards/cosine_scaled_reward": 0.258737313747406, |
|
"eval_rewards/format_reward": 0.8142857432365418, |
|
"eval_rewards/len_reward": 0.03932540193200111, |
|
"eval_rewards/reasoning_steps_reward": 0.5386904895305633, |
|
"eval_rewards/tag_count_reward": 0.8683036088943481, |
|
"eval_runtime": 115.2712, |
|
"eval_samples_per_second": 0.659, |
|
"eval_steps_per_second": 0.009, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 557.4629703521729, |
|
"epoch": 0.3253333333333333, |
|
"grad_norm": 2.3974125622229288, |
|
"kl": 2.141943359375, |
|
"learning_rate": 2.559671525884232e-06, |
|
"loss": 0.1392, |
|
"reward": 1.4303073339164256, |
|
"reward_std": 0.930858300626278, |
|
"rewards/accuracy_reward": 0.4379464481957257, |
|
"rewards/cosine_scaled_reward": 0.21914655352011322, |
|
"rewards/format_reward": 0.7732143267989159, |
|
"rewards/len_reward": -0.04112708342508995, |
|
"rewards/reasoning_steps_reward": 0.5296131338924169, |
|
"rewards/tag_count_reward": 0.8465402163565159, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.7919937133789, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 2.478264362396331, |
|
"kl": 2.63065185546875, |
|
"learning_rate": 2.539706596052286e-06, |
|
"loss": 0.116, |
|
"reward": 1.3845888167619704, |
|
"reward_std": 0.9000825509428978, |
|
"rewards/accuracy_reward": 0.41964287469163536, |
|
"rewards/cosine_scaled_reward": 0.19262447137152777, |
|
"rewards/format_reward": 0.7723214656114579, |
|
"rewards/len_reward": -0.06281980487401598, |
|
"rewards/reasoning_steps_reward": 0.5708333693444729, |
|
"rewards/tag_count_reward": 0.8419643223285675, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 646.2580604553223, |
|
"epoch": 0.336, |
|
"grad_norm": 6.9435065986378, |
|
"kl": 2.74915771484375, |
|
"learning_rate": 2.5193806864716466e-06, |
|
"loss": 0.1113, |
|
"reward": 1.655515044927597, |
|
"reward_std": 0.7905310012400151, |
|
"rewards/accuracy_reward": 0.512500025331974, |
|
"rewards/cosine_scaled_reward": 0.26265784676652404, |
|
"rewards/format_reward": 0.8803571864962578, |
|
"rewards/len_reward": -0.04766237259027548, |
|
"rewards/reasoning_steps_reward": 0.6678571827709675, |
|
"rewards/tag_count_reward": 0.9017857626080513, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.9205612182617, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 7.668425562182992, |
|
"kl": 2.090625, |
|
"learning_rate": 2.4987008541733663e-06, |
|
"loss": 0.1401, |
|
"reward": 1.6221125468611717, |
|
"reward_std": 0.8562563940882683, |
|
"rewards/accuracy_reward": 0.5142857370898127, |
|
"rewards/cosine_scaled_reward": 0.27300534858368336, |
|
"rewards/format_reward": 0.8348214700818062, |
|
"rewards/len_reward": -0.033610415155999364, |
|
"rewards/reasoning_steps_reward": 0.6363095626235008, |
|
"rewards/tag_count_reward": 0.8723214656114578, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 555.5401992797852, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 18.400557478777237, |
|
"kl": 3.0746337890625, |
|
"learning_rate": 2.477674279068291e-06, |
|
"loss": 0.187, |
|
"reward": 1.575891900062561, |
|
"reward_std": 0.9395873740315437, |
|
"rewards/accuracy_reward": 0.5107143113389612, |
|
"rewards/cosine_scaled_reward": 0.27589183195959777, |
|
"rewards/format_reward": 0.7892857536673545, |
|
"rewards/len_reward": -0.03333336425275775, |
|
"rewards/reasoning_steps_reward": 0.600000037252903, |
|
"rewards/tag_count_reward": 0.8345982521772385, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 562.6785972595214, |
|
"epoch": 0.352, |
|
"grad_norm": 8.132015615402397, |
|
"kl": 2.1640625, |
|
"learning_rate": 2.4563082614542412e-06, |
|
"loss": 0.1433, |
|
"reward": 1.4302320718765258, |
|
"reward_std": 0.9858111351728439, |
|
"rewards/accuracy_reward": 0.4705357376486063, |
|
"rewards/cosine_scaled_reward": 0.2239820205140859, |
|
"rewards/format_reward": 0.7357143267989159, |
|
"rewards/len_reward": -0.05076549501682166, |
|
"rewards/reasoning_steps_reward": 0.5842262268066406, |
|
"rewards/tag_count_reward": 0.7995536103844643, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.0491333007812, |
|
"epoch": 0.35733333333333334, |
|
"grad_norm": 6.7174895292302805, |
|
"kl": 2.395703125, |
|
"learning_rate": 2.4346102194813937e-06, |
|
"loss": 0.1352, |
|
"reward": 1.4722676426172256, |
|
"reward_std": 0.9315723203122616, |
|
"rewards/accuracy_reward": 0.48303573820739987, |
|
"rewards/cosine_scaled_reward": 0.2249461407540366, |
|
"rewards/format_reward": 0.7642857551574707, |
|
"rewards/len_reward": -0.04090934251144063, |
|
"rewards/reasoning_steps_reward": 0.5836309917271137, |
|
"rewards/tag_count_reward": 0.8129464685916901, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.7286003112793, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 14.327208381956924, |
|
"kl": 2.17421875, |
|
"learning_rate": 2.4125876865767443e-06, |
|
"loss": 0.1168, |
|
"reward": 1.435131347179413, |
|
"reward_std": 0.9724505253136158, |
|
"rewards/accuracy_reward": 0.46964288130402565, |
|
"rewards/cosine_scaled_reward": 0.21459556268528104, |
|
"rewards/format_reward": 0.7508928999304771, |
|
"rewards/len_reward": -0.05366924590198323, |
|
"rewards/reasoning_steps_reward": 0.5889881312847137, |
|
"rewards/tag_count_reward": 0.8158482506871223, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.0196670532226, |
|
"epoch": 0.368, |
|
"grad_norm": 12.351783358249422, |
|
"kl": 2.0623779296875, |
|
"learning_rate": 2.390248308828548e-06, |
|
"loss": 0.1202, |
|
"reward": 1.3827464163303376, |
|
"reward_std": 0.9574041977524758, |
|
"rewards/accuracy_reward": 0.45535716265439985, |
|
"rewards/cosine_scaled_reward": 0.19256778084672987, |
|
"rewards/format_reward": 0.7348214671015739, |
|
"rewards/len_reward": -0.050123395124683154, |
|
"rewards/reasoning_steps_reward": 0.6059524111449719, |
|
"rewards/tag_count_reward": 0.8122768223285675, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 600.7643127441406, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 44.225679044967485, |
|
"kl": 2.75009765625, |
|
"learning_rate": 2.367599842331646e-06, |
|
"loss": 0.1392, |
|
"reward": 1.3846063792705536, |
|
"reward_std": 0.9288265883922577, |
|
"rewards/accuracy_reward": 0.45625002551823857, |
|
"rewards/cosine_scaled_reward": 0.184606320628518, |
|
"rewards/format_reward": 0.743750037252903, |
|
"rewards/len_reward": -0.0557435173453996, |
|
"rewards/reasoning_steps_reward": 0.597916704416275, |
|
"rewards/tag_count_reward": 0.8087053939700126, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.9241317749023, |
|
"epoch": 0.37866666666666665, |
|
"grad_norm": 2.5071157826124266, |
|
"kl": 2.10361328125, |
|
"learning_rate": 2.344650150494596e-06, |
|
"loss": 0.1433, |
|
"reward": 1.4640185952186584, |
|
"reward_std": 0.9381368611007929, |
|
"rewards/accuracy_reward": 0.49196430630981924, |
|
"rewards/cosine_scaled_reward": 0.21937568094581367, |
|
"rewards/format_reward": 0.7526786088943481, |
|
"rewards/len_reward": -0.05196416466642404, |
|
"rewards/reasoning_steps_reward": 0.5964286036789417, |
|
"rewards/tag_count_reward": 0.8064732551574707, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.58126907348634, |
|
"epoch": 0.384, |
|
"grad_norm": 3.557634724061136, |
|
"kl": 2.18310546875, |
|
"learning_rate": 2.3214072013095436e-06, |
|
"loss": 0.0958, |
|
"reward": 1.468079997599125, |
|
"reward_std": 0.909414467215538, |
|
"rewards/accuracy_reward": 0.4651785932481289, |
|
"rewards/cosine_scaled_reward": 0.22075851147892536, |
|
"rewards/format_reward": 0.7821428999304771, |
|
"rewards/len_reward": -0.054533878527581695, |
|
"rewards/reasoning_steps_reward": 0.577380983531475, |
|
"rewards/tag_count_reward": 0.8406250387430191, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 542.5785934448243, |
|
"epoch": 0.3893333333333333, |
|
"grad_norm": 3.8879973791198053, |
|
"kl": 3.174609375, |
|
"learning_rate": 2.2978790645857867e-06, |
|
"loss": 0.2324, |
|
"reward": 1.0243786200881004, |
|
"reward_std": 0.8560890629887581, |
|
"rewards/accuracy_reward": 0.28839287189766766, |
|
"rewards/cosine_scaled_reward": 0.08598572693299503, |
|
"rewards/format_reward": 0.6500000312924386, |
|
"rewards/len_reward": -0.08332871415186674, |
|
"rewards/reasoning_steps_reward": 0.5497024253010749, |
|
"rewards/tag_count_reward": 0.7598214596509933, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.62145309448243, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 3.506874575960076, |
|
"kl": 1.89091796875, |
|
"learning_rate": 2.274073909147986e-06, |
|
"loss": 0.1493, |
|
"reward": 1.2348149195313454, |
|
"reward_std": 0.9403831019997597, |
|
"rewards/accuracy_reward": 0.3812500167638063, |
|
"rewards/cosine_scaled_reward": 0.1330291626858525, |
|
"rewards/format_reward": 0.7205357491970062, |
|
"rewards/len_reward": -0.07397339158924296, |
|
"rewards/reasoning_steps_reward": 0.5413690857589245, |
|
"rewards/tag_count_reward": 0.7933036088943481, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 516.9330604553222, |
|
"epoch": 0.4, |
|
"grad_norm": 2.9176316838758143, |
|
"kl": 3.01181640625, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.2186, |
|
"reward": 1.2647677563130855, |
|
"reward_std": 0.951485538482666, |
|
"rewards/accuracy_reward": 0.3955357325263321, |
|
"rewards/cosine_scaled_reward": 0.1781605551484972, |
|
"rewards/format_reward": 0.6910714544355869, |
|
"rewards/len_reward": -0.058914582757279274, |
|
"rewards/reasoning_steps_reward": 0.5461309857666492, |
|
"rewards/tag_count_reward": 0.7575893193483353, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 511.66430816650393, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 2.696075434837073, |
|
"kl": 2.28759765625, |
|
"learning_rate": 2.225665695455325e-06, |
|
"loss": 0.1772, |
|
"reward": 1.3005409233272076, |
|
"reward_std": 0.9499507501721383, |
|
"rewards/accuracy_reward": 0.4178571649827063, |
|
"rewards/cosine_scaled_reward": 0.15232657552696766, |
|
"rewards/format_reward": 0.7303571812808514, |
|
"rewards/len_reward": -0.05240586331638042, |
|
"rewards/reasoning_steps_reward": 0.5726190879940987, |
|
"rewards/tag_count_reward": 0.8000000312924385, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 538.5303825378418, |
|
"epoch": 0.4106666666666667, |
|
"grad_norm": 2.894911697402106, |
|
"kl": 2.70791015625, |
|
"learning_rate": 2.20107944423514e-06, |
|
"loss": 0.1687, |
|
"reward": 1.2143270827829837, |
|
"reward_std": 0.9277090534567833, |
|
"rewards/accuracy_reward": 0.39285716190934183, |
|
"rewards/cosine_scaled_reward": 0.1196841929873699, |
|
"rewards/format_reward": 0.701785746961832, |
|
"rewards/len_reward": -0.08161826087161898, |
|
"rewards/reasoning_steps_reward": 0.6038690775632858, |
|
"rewards/tag_count_reward": 0.7930803939700126, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 530.4901977539063, |
|
"epoch": 0.416, |
|
"grad_norm": 2.1893043577623335, |
|
"kl": 1.98134765625, |
|
"learning_rate": 2.1762497825349665e-06, |
|
"loss": 0.1139, |
|
"reward": 1.451311932504177, |
|
"reward_std": 0.8446490153670311, |
|
"rewards/accuracy_reward": 0.4669643074274063, |
|
"rewards/cosine_scaled_reward": 0.16202617147937418, |
|
"rewards/format_reward": 0.8223214730620384, |
|
"rewards/len_reward": -0.049353477614931765, |
|
"rewards/reasoning_steps_reward": 0.6681548073887825, |
|
"rewards/tag_count_reward": 0.8658482521772385, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 515.6937721252441, |
|
"epoch": 0.42133333333333334, |
|
"grad_norm": 2.108843718460042, |
|
"kl": 1.975244140625, |
|
"learning_rate": 2.1511853310609467e-06, |
|
"loss": 0.1375, |
|
"reward": 1.566932386159897, |
|
"reward_std": 0.796729838848114, |
|
"rewards/accuracy_reward": 0.5142857372760773, |
|
"rewards/cosine_scaled_reward": 0.20353946691611782, |
|
"rewards/format_reward": 0.8491071820259094, |
|
"rewards/len_reward": -0.04666417014668696, |
|
"rewards/reasoning_steps_reward": 0.6708333812654018, |
|
"rewards/tag_count_reward": 0.894196467101574, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 2.6528834721585386, |
|
"learning_rate": 2.1258947920367943e-06, |
|
"loss": 0.142, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 549.1377075195312, |
|
"eval_kl": 2.1953125, |
|
"eval_loss": 0.14695748686790466, |
|
"eval_reward": 1.5166082382202148, |
|
"eval_reward_std": 0.9115615248680115, |
|
"eval_rewards/accuracy_reward": 0.5214285969734191, |
|
"eval_rewards/cosine_scaled_reward": 0.19160813689231873, |
|
"eval_rewards/format_reward": 0.8035714626312256, |
|
"eval_rewards/len_reward": 0.03922285344451666, |
|
"eval_rewards/reasoning_steps_reward": 0.6208333611488343, |
|
"eval_rewards/tag_count_reward": 0.8433036088943482, |
|
"eval_runtime": 119.8458, |
|
"eval_samples_per_second": 0.634, |
|
"eval_steps_per_second": 0.008, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.7768123626709, |
|
"epoch": 0.432, |
|
"grad_norm": 1.8155192088908143, |
|
"kl": 2.64248046875, |
|
"learning_rate": 2.100386946182431e-06, |
|
"loss": 0.1025, |
|
"reward": 1.4179199129343032, |
|
"reward_std": 0.8790018357336521, |
|
"rewards/accuracy_reward": 0.4616071647964418, |
|
"rewards/cosine_scaled_reward": 0.18131270299782046, |
|
"rewards/format_reward": 0.7750000402331352, |
|
"rewards/len_reward": -0.05932185428849053, |
|
"rewards/reasoning_steps_reward": 0.6251488503068685, |
|
"rewards/tag_count_reward": 0.8315848618745804, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 577.8580657958985, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 6.06476868467511, |
|
"kl": 2.2408203125, |
|
"learning_rate": 2.0746706496653765e-06, |
|
"loss": 0.1245, |
|
"reward": 1.3582762971520423, |
|
"reward_std": 0.8962713375687599, |
|
"rewards/accuracy_reward": 0.43214288000017403, |
|
"rewards/cosine_scaled_reward": 0.1823833931237459, |
|
"rewards/format_reward": 0.743750037252903, |
|
"rewards/len_reward": -0.06492970008366683, |
|
"rewards/reasoning_steps_reward": 0.6160714574158191, |
|
"rewards/tag_count_reward": 0.8098214656114578, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 607.6991371154785, |
|
"epoch": 0.44266666666666665, |
|
"grad_norm": 3.369320135705493, |
|
"kl": 2.0812744140625, |
|
"learning_rate": 2.048754831025942e-06, |
|
"loss": 0.1126, |
|
"reward": 1.353794051706791, |
|
"reward_std": 0.8936590984463691, |
|
"rewards/accuracy_reward": 0.4241071607917547, |
|
"rewards/cosine_scaled_reward": 0.16986540658399463, |
|
"rewards/format_reward": 0.7598214641213417, |
|
"rewards/len_reward": -0.07249546247767284, |
|
"rewards/reasoning_steps_reward": 0.6014881297945976, |
|
"rewards/tag_count_reward": 0.8178571820259094, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 649.4482482910156, |
|
"epoch": 0.448, |
|
"grad_norm": 3.4251131757872786, |
|
"kl": 1.88330078125, |
|
"learning_rate": 2.0226484880772943e-06, |
|
"loss": 0.0423, |
|
"reward": 1.4434038519859314, |
|
"reward_std": 0.8622494846582412, |
|
"rewards/accuracy_reward": 0.4553571665659547, |
|
"rewards/cosine_scaled_reward": 0.18536810133846301, |
|
"rewards/format_reward": 0.8026786103844643, |
|
"rewards/len_reward": -0.0708045683815726, |
|
"rewards/reasoning_steps_reward": 0.615773843973875, |
|
"rewards/tag_count_reward": 0.8500000402331352, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 646.4428848266601, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 3.9373036793479965, |
|
"kl": 1.9150634765625, |
|
"learning_rate": 1.9963606847814702e-06, |
|
"loss": 0.0719, |
|
"reward": 1.4559296056628228, |
|
"reward_std": 0.8618416309356689, |
|
"rewards/accuracy_reward": 0.4446428772062063, |
|
"rewards/cosine_scaled_reward": 0.20235811360180378, |
|
"rewards/format_reward": 0.8089286148548126, |
|
"rewards/len_reward": -0.0718243672628887, |
|
"rewards/reasoning_steps_reward": 0.648511940985918, |
|
"rewards/tag_count_reward": 0.861830398440361, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 625.8178810119629, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 1.5199869338814753, |
|
"kl": 1.849365234375, |
|
"learning_rate": 1.9699005481024273e-06, |
|
"loss": 0.0543, |
|
"reward": 1.5006039649248124, |
|
"reward_std": 0.8628779232501984, |
|
"rewards/accuracy_reward": 0.4571428752504289, |
|
"rewards/cosine_scaled_reward": 0.25953247884754094, |
|
"rewards/format_reward": 0.7839286133646965, |
|
"rewards/len_reward": -0.06937549803406, |
|
"rewards/reasoning_steps_reward": 0.6339286088943481, |
|
"rewards/tag_count_reward": 0.8537946835160255, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.6714561462402, |
|
"epoch": 0.464, |
|
"grad_norm": 3.1795113531999535, |
|
"kl": 1.884423828125, |
|
"learning_rate": 1.943277264837214e-06, |
|
"loss": 0.0435, |
|
"reward": 1.4665423482656479, |
|
"reward_std": 0.8692625299096107, |
|
"rewards/accuracy_reward": 0.45803573401644826, |
|
"rewards/cosine_scaled_reward": 0.22100658505223691, |
|
"rewards/format_reward": 0.7875000417232514, |
|
"rewards/len_reward": -0.08461722731226473, |
|
"rewards/reasoning_steps_reward": 0.6083333678543568, |
|
"rewards/tag_count_reward": 0.859375037252903, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 650.9866348266602, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 2.3194432630180817, |
|
"kl": 2.84091796875, |
|
"learning_rate": 1.9165000784263734e-06, |
|
"loss": 0.0896, |
|
"reward": 1.4741526886820793, |
|
"reward_std": 0.9216743379831314, |
|
"rewards/accuracy_reward": 0.4535714493133128, |
|
"rewards/cosine_scaled_reward": 0.2571883347816765, |
|
"rewards/format_reward": 0.7633929014205932, |
|
"rewards/len_reward": -0.06349783511832356, |
|
"rewards/reasoning_steps_reward": 0.5604166954755783, |
|
"rewards/tag_count_reward": 0.822991105914116, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 643.0214553833008, |
|
"epoch": 0.4746666666666667, |
|
"grad_norm": 3.2555576351928543, |
|
"kl": 1.411083984375, |
|
"learning_rate": 1.8895782857446754e-06, |
|
"loss": 0.0528, |
|
"reward": 1.5955371528863906, |
|
"reward_std": 0.8531403854489327, |
|
"rewards/accuracy_reward": 0.5044643089175225, |
|
"rewards/cosine_scaled_reward": 0.26964423903264106, |
|
"rewards/format_reward": 0.8214286103844642, |
|
"rewards/len_reward": -0.05830043089517858, |
|
"rewards/reasoning_steps_reward": 0.5845238514244556, |
|
"rewards/tag_count_reward": 0.8455357491970062, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 645.4437805175781, |
|
"epoch": 0.48, |
|
"grad_norm": 3.0639176041989487, |
|
"kl": 1.577734375, |
|
"learning_rate": 1.8625212338733005e-06, |
|
"loss": 0.0851, |
|
"reward": 1.6924126744270325, |
|
"reward_std": 0.8420305147767067, |
|
"rewards/accuracy_reward": 0.5589285969734192, |
|
"rewards/cosine_scaled_reward": 0.3040197363588959, |
|
"rewards/format_reward": 0.8294643297791481, |
|
"rewards/len_reward": -0.04013834063371178, |
|
"rewards/reasoning_steps_reward": 0.549404788017273, |
|
"rewards/tag_count_reward": 0.8446428969502449, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 629.6714530944824, |
|
"epoch": 0.48533333333333334, |
|
"grad_norm": 5.415079728456791, |
|
"kl": 1.687841796875, |
|
"learning_rate": 1.835338316854588e-06, |
|
"loss": 0.1255, |
|
"reward": 1.7355066567659378, |
|
"reward_std": 0.8314376153051853, |
|
"rewards/accuracy_reward": 0.5687500245869159, |
|
"rewards/cosine_scaled_reward": 0.32300658551976086, |
|
"rewards/format_reward": 0.8437500402331353, |
|
"rewards/len_reward": -0.03221394362917636, |
|
"rewards/reasoning_steps_reward": 0.5708333767950535, |
|
"rewards/tag_count_reward": 0.8620536133646965, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 620.2669906616211, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 3.3718675514560483, |
|
"kl": 1.591455078125, |
|
"learning_rate": 1.8080389724304863e-06, |
|
"loss": 0.0807, |
|
"reward": 1.7503222301602364, |
|
"reward_std": 0.8645094059407711, |
|
"rewards/accuracy_reward": 0.5830357387661934, |
|
"rewards/cosine_scaled_reward": 0.3387150165159255, |
|
"rewards/format_reward": 0.8285714715719223, |
|
"rewards/len_reward": -0.03689943939389195, |
|
"rewards/reasoning_steps_reward": 0.551190510392189, |
|
"rewards/tag_count_reward": 0.8562500432133675, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 678.2982467651367, |
|
"epoch": 0.496, |
|
"grad_norm": 1.4238603128331906, |
|
"kl": 1.46229248046875, |
|
"learning_rate": 1.7806326787658219e-06, |
|
"loss": 0.0696, |
|
"reward": 1.7878277271986007, |
|
"reward_std": 0.8226681739091873, |
|
"rewards/accuracy_reward": 0.6151786014437676, |
|
"rewards/cosine_scaled_reward": 0.33604192789644005, |
|
"rewards/format_reward": 0.8366071790456772, |
|
"rewards/len_reward": -0.04087298092490528, |
|
"rewards/reasoning_steps_reward": 0.551190511137247, |
|
"rewards/tag_count_reward": 0.8636161163449287, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 713.3116371154786, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 2.222503543302329, |
|
"kl": 1.5125, |
|
"learning_rate": 1.7531289511575427e-06, |
|
"loss": 0.0596, |
|
"reward": 1.6392380952835084, |
|
"reward_std": 0.8160237230360508, |
|
"rewards/accuracy_reward": 0.5241071671247483, |
|
"rewards/cosine_scaled_reward": 0.2669166113249958, |
|
"rewards/format_reward": 0.8482143312692643, |
|
"rewards/len_reward": -0.06106900977320038, |
|
"rewards/reasoning_steps_reward": 0.5258928924798966, |
|
"rewards/tag_count_reward": 0.875669677555561, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 672.4080635070801, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 3.1430403238439366, |
|
"kl": 2.708740234375, |
|
"learning_rate": 1.7255373387310633e-06, |
|
"loss": 0.0805, |
|
"reward": 1.5507970213890077, |
|
"reward_std": 0.8910624407231807, |
|
"rewards/accuracy_reward": 0.49107144922018053, |
|
"rewards/cosine_scaled_reward": 0.26954693003499414, |
|
"rewards/format_reward": 0.7901786148548127, |
|
"rewards/len_reward": -0.05758061000378802, |
|
"rewards/reasoning_steps_reward": 0.5047619365155697, |
|
"rewards/tag_count_reward": 0.83482146859169, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 680.8928863525391, |
|
"epoch": 0.512, |
|
"grad_norm": 1.2963031388346116, |
|
"kl": 1.945947265625, |
|
"learning_rate": 1.6978674211248676e-06, |
|
"loss": 0.0875, |
|
"reward": 1.604718978703022, |
|
"reward_std": 0.8831877142190934, |
|
"rewards/accuracy_reward": 0.49553573653101923, |
|
"rewards/cosine_scaled_reward": 0.2832903404778335, |
|
"rewards/format_reward": 0.8258928969502449, |
|
"rewards/len_reward": -0.06320558707229793, |
|
"rewards/reasoning_steps_reward": 0.48422621935606003, |
|
"rewards/tag_count_reward": 0.8361607521772385, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 677.7125305175781, |
|
"epoch": 0.5173333333333333, |
|
"grad_norm": 1.0370713662213589, |
|
"kl": 1.35882568359375, |
|
"learning_rate": 1.6701288051645182e-06, |
|
"loss": 0.0922, |
|
"reward": 1.792074230313301, |
|
"reward_std": 0.79517278149724, |
|
"rewards/accuracy_reward": 0.5866071701049804, |
|
"rewards/cosine_scaled_reward": 0.3420741647016257, |
|
"rewards/format_reward": 0.8633928969502449, |
|
"rewards/len_reward": -0.03288009662137483, |
|
"rewards/reasoning_steps_reward": 0.4857143238186836, |
|
"rewards/tag_count_reward": 0.8662946850061417, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 676.1411018371582, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 1.2794408915109077, |
|
"kl": 1.83466796875, |
|
"learning_rate": 1.642331121527223e-06, |
|
"loss": 0.0709, |
|
"reward": 1.6959030866622924, |
|
"reward_std": 0.8808967053890229, |
|
"rewards/accuracy_reward": 0.5696428790688515, |
|
"rewards/cosine_scaled_reward": 0.29679586752317844, |
|
"rewards/format_reward": 0.8294643342494965, |
|
"rewards/len_reward": -0.05175146399415098, |
|
"rewards/reasoning_steps_reward": 0.47678574323654177, |
|
"rewards/tag_count_reward": 0.8544643208384514, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 685.1571792602539, |
|
"epoch": 0.528, |
|
"grad_norm": 2.014915380678631, |
|
"kl": 1.9997802734375, |
|
"learning_rate": 1.6144840213981257e-06, |
|
"loss": 0.0355, |
|
"reward": 1.5696246579289437, |
|
"reward_std": 0.8558191373944283, |
|
"rewards/accuracy_reward": 0.508928595483303, |
|
"rewards/cosine_scaled_reward": 0.2562317634001374, |
|
"rewards/format_reward": 0.8044643253087997, |
|
"rewards/len_reward": -0.07657566228881478, |
|
"rewards/reasoning_steps_reward": 0.48779764398932457, |
|
"rewards/tag_count_reward": 0.846651828289032, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 1.4151878603780967, |
|
"learning_rate": 1.5865971731194738e-06, |
|
"loss": 0.0949, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 705.310546875, |
|
"eval_kl": 3.2875, |
|
"eval_loss": 0.10477960109710693, |
|
"eval_reward": 1.4186731815338134, |
|
"eval_reward_std": 0.9728380680084229, |
|
"eval_rewards/accuracy_reward": 0.47500001788139345, |
|
"eval_rewards/cosine_scaled_reward": 0.2133159816265106, |
|
"eval_rewards/format_reward": 0.7303571701049805, |
|
"eval_rewards/len_reward": -0.04307665973901749, |
|
"eval_rewards/reasoning_steps_reward": 0.46190478801727297, |
|
"eval_rewards/tag_count_reward": 0.7973214626312256, |
|
"eval_runtime": 123.234, |
|
"eval_samples_per_second": 0.617, |
|
"eval_steps_per_second": 0.008, |
|
"step": 500 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 709.8607475280762, |
|
"epoch": 0.5386666666666666, |
|
"grad_norm": 64.5985203796494, |
|
"kl": 2.5929443359375, |
|
"learning_rate": 1.5586802588338262e-06, |
|
"loss": 0.094, |
|
"reward": 1.4848765015602112, |
|
"reward_std": 0.9199745565652847, |
|
"rewards/accuracy_reward": 0.4910714510828257, |
|
"rewards/cosine_scaled_reward": 0.22773357991827653, |
|
"rewards/format_reward": 0.7660714708268642, |
|
"rewards/len_reward": -0.07428886514389887, |
|
"rewards/reasoning_steps_reward": 0.4977678868919611, |
|
"rewards/tag_count_reward": 0.8183036133646965, |
|
"step": 505 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 710.7768203735352, |
|
"epoch": 0.544, |
|
"grad_norm": 1.0652888427475398, |
|
"kl": 1.95126953125, |
|
"learning_rate": 1.5307429711224756e-06, |
|
"loss": 0.0823, |
|
"reward": 1.6024441167712211, |
|
"reward_std": 0.9890345253050328, |
|
"rewards/accuracy_reward": 0.550000024959445, |
|
"rewards/cosine_scaled_reward": 0.26494405455887315, |
|
"rewards/format_reward": 0.7875000387430191, |
|
"rewards/len_reward": -0.0605491196794901, |
|
"rewards/reasoning_steps_reward": 0.5348214656114578, |
|
"rewards/tag_count_reward": 0.8069196820259095, |
|
"step": 510 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 693.4411056518554, |
|
"epoch": 0.5493333333333333, |
|
"grad_norm": 3.159918602831137, |
|
"kl": 2.3521484375, |
|
"learning_rate": 1.5027950096402447e-06, |
|
"loss": 0.0542, |
|
"reward": 1.5701138600707054, |
|
"reward_std": 0.8735249437391758, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/cosine_scaled_reward": 0.27189951852487865, |
|
"rewards/format_reward": 0.7892857536673545, |
|
"rewards/len_reward": -0.07036974684160668, |
|
"rewards/reasoning_steps_reward": 0.5255952708423137, |
|
"rewards/tag_count_reward": 0.793303607404232, |
|
"step": 515 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 716.8518157958985, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 1.361987334147322, |
|
"kl": 2.1235595703125, |
|
"learning_rate": 1.474846077747821e-06, |
|
"loss": 0.0603, |
|
"reward": 1.6362058356404305, |
|
"reward_std": 0.8912258110940456, |
|
"rewards/accuracy_reward": 0.5241071671247483, |
|
"rewards/cosine_scaled_reward": 0.28977720933035017, |
|
"rewards/format_reward": 0.8223214656114578, |
|
"rewards/len_reward": -0.06673234046902507, |
|
"rewards/reasoning_steps_reward": 0.5232143193483353, |
|
"rewards/tag_count_reward": 0.8129464626312256, |
|
"step": 520 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 696.7134223937989, |
|
"epoch": 0.56, |
|
"grad_norm": 2.896870550734319, |
|
"kl": 2.0576904296875, |
|
"learning_rate": 1.4469058791428154e-06, |
|
"loss": 0.0856, |
|
"reward": 1.648724715411663, |
|
"reward_std": 0.8711829155683517, |
|
"rewards/accuracy_reward": 0.5375000245869159, |
|
"rewards/cosine_scaled_reward": 0.2808674838131992, |
|
"rewards/format_reward": 0.8303571864962578, |
|
"rewards/len_reward": -0.05326956428325502, |
|
"rewards/reasoning_steps_reward": 0.48779765255749225, |
|
"rewards/tag_count_reward": 0.8171875372529029, |
|
"step": 525 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 729.4321807861328, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 2.0390779532818843, |
|
"kl": 1.3226318359375, |
|
"learning_rate": 1.4189841144906928e-06, |
|
"loss": 0.0481, |
|
"reward": 1.725620111823082, |
|
"reward_std": 0.8606425553560257, |
|
"rewards/accuracy_reward": 0.5705357432365418, |
|
"rewards/cosine_scaled_reward": 0.3131200488656759, |
|
"rewards/format_reward": 0.8419643253087997, |
|
"rewards/len_reward": -0.06525407417793758, |
|
"rewards/reasoning_steps_reward": 0.4889881335198879, |
|
"rewards/tag_count_reward": 0.8185268193483353, |
|
"step": 530 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 694.4687828063965, |
|
"epoch": 0.5706666666666667, |
|
"grad_norm": 3.2275030286756503, |
|
"kl": 1.80302734375, |
|
"learning_rate": 1.3910904780567642e-06, |
|
"loss": 0.0862, |
|
"reward": 1.5835659071803092, |
|
"reward_std": 0.9309476897120476, |
|
"rewards/accuracy_reward": 0.5133928820490837, |
|
"rewards/cosine_scaled_reward": 0.2826729838096071, |
|
"rewards/format_reward": 0.7875000402331352, |
|
"rewards/len_reward": -0.05334139431070071, |
|
"rewards/reasoning_steps_reward": 0.4339285997673869, |
|
"rewards/tag_count_reward": 0.7857143238186837, |
|
"step": 535 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 674.718782043457, |
|
"epoch": 0.576, |
|
"grad_norm": 2.445582670371733, |
|
"kl": 2.04580078125, |
|
"learning_rate": 1.3632346543403946e-06, |
|
"loss": 0.0442, |
|
"reward": 1.7607878595590591, |
|
"reward_std": 0.8948764257133007, |
|
"rewards/accuracy_reward": 0.5901785979047418, |
|
"rewards/cosine_scaled_reward": 0.3402521046809852, |
|
"rewards/format_reward": 0.8303571909666061, |
|
"rewards/len_reward": -0.0463905618264107, |
|
"rewards/reasoning_steps_reward": 0.47976194024086, |
|
"rewards/tag_count_reward": 0.8104911088943482, |
|
"step": 540 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 672.7214630126953, |
|
"epoch": 0.5813333333333334, |
|
"grad_norm": 2.105824534589838, |
|
"kl": 1.8478271484375, |
|
"learning_rate": 1.335426314712607e-06, |
|
"loss": 0.0705, |
|
"reward": 1.643703442811966, |
|
"reward_std": 0.8904561165720224, |
|
"rewards/accuracy_reward": 0.541071455925703, |
|
"rewards/cosine_scaled_reward": 0.3079891032539308, |
|
"rewards/format_reward": 0.7946428984403611, |
|
"rewards/len_reward": -0.05454629746964201, |
|
"rewards/reasoning_steps_reward": 0.4824405059218407, |
|
"rewards/tag_count_reward": 0.78794646859169, |
|
"step": 545 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 661.1991333007812, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 2.7373033343485553, |
|
"kl": 2.600390625, |
|
"learning_rate": 1.3076751140582396e-06, |
|
"loss": 0.0498, |
|
"reward": 1.5583188071846963, |
|
"reward_std": 0.9974759891629219, |
|
"rewards/accuracy_reward": 0.5366071701049805, |
|
"rewards/cosine_scaled_reward": 0.2967116108164191, |
|
"rewards/format_reward": 0.725000036507845, |
|
"rewards/len_reward": -0.06344892352935858, |
|
"rewards/reasoning_steps_reward": 0.46160717457532885, |
|
"rewards/tag_count_reward": 0.7638393208384514, |
|
"step": 550 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 665.4652076721192, |
|
"epoch": 0.592, |
|
"grad_norm": 1.1852476234067224, |
|
"kl": 1.458251953125, |
|
"learning_rate": 1.2799906874238297e-06, |
|
"loss": 0.0173, |
|
"reward": 1.7244828373193741, |
|
"reward_std": 0.8695188030600548, |
|
"rewards/accuracy_reward": 0.5767857413738966, |
|
"rewards/cosine_scaled_reward": 0.3334113570395857, |
|
"rewards/format_reward": 0.8142857536673546, |
|
"rewards/len_reward": -0.060599689168157056, |
|
"rewards/reasoning_steps_reward": 0.5214286014437676, |
|
"rewards/tag_count_reward": 0.7852678894996643, |
|
"step": 555 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 669.0857482910156, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 2.914690295232921, |
|
"kl": 1.308984375, |
|
"learning_rate": 1.2523826466723843e-06, |
|
"loss": 0.0526, |
|
"reward": 1.798653519153595, |
|
"reward_std": 0.7890060037374497, |
|
"rewards/accuracy_reward": 0.5723214579746128, |
|
"rewards/cosine_scaled_reward": 0.35133201819844545, |
|
"rewards/format_reward": 0.8750000417232513, |
|
"rewards/len_reward": -0.050365234701894225, |
|
"rewards/reasoning_steps_reward": 0.561607176065445, |
|
"rewards/tag_count_reward": 0.8348214671015739, |
|
"step": 560 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 663.8428840637207, |
|
"epoch": 0.6026666666666667, |
|
"grad_norm": 3.266880748687459, |
|
"kl": 3.2771484375, |
|
"learning_rate": 1.2248605771462016e-06, |
|
"loss": 0.0748, |
|
"reward": 1.5902897894382477, |
|
"reward_std": 0.9371627844870091, |
|
"rewards/accuracy_reward": 0.5294643044471741, |
|
"rewards/cosine_scaled_reward": 0.2885039990535006, |
|
"rewards/format_reward": 0.7723214700818062, |
|
"rewards/len_reward": -0.07391626983880997, |
|
"rewards/reasoning_steps_reward": 0.5761905089020729, |
|
"rewards/tag_count_reward": 0.7772321790456772, |
|
"step": 565 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 649.5839538574219, |
|
"epoch": 0.608, |
|
"grad_norm": 1.1323574314355938, |
|
"kl": 1.91318359375, |
|
"learning_rate": 1.1974340343388974e-06, |
|
"loss": 0.0716, |
|
"reward": 1.7778286993503571, |
|
"reward_std": 0.8691867522895336, |
|
"rewards/accuracy_reward": 0.6008928859606385, |
|
"rewards/cosine_scaled_reward": 0.3430071874521673, |
|
"rewards/format_reward": 0.8339286118745803, |
|
"rewards/len_reward": -0.05089021619896812, |
|
"rewards/reasoning_steps_reward": 0.5464286044239998, |
|
"rewards/tag_count_reward": 0.8194196790456771, |
|
"step": 570 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 664.774137878418, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 1.4388282162154258, |
|
"kl": 1.21708984375, |
|
"learning_rate": 1.1701125405777965e-06, |
|
"loss": 0.0413, |
|
"reward": 1.888391876220703, |
|
"reward_std": 0.8107489451766015, |
|
"rewards/accuracy_reward": 0.6410714574158192, |
|
"rewards/cosine_scaled_reward": 0.37678469233214856, |
|
"rewards/format_reward": 0.8705357581377029, |
|
"rewards/len_reward": -0.03641782412887551, |
|
"rewards/reasoning_steps_reward": 0.6273809865117073, |
|
"rewards/tag_count_reward": 0.8319196745753288, |
|
"step": 575 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 668.4446685791015, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 1.3341419974163, |
|
"kl": 1.6546875, |
|
"learning_rate": 1.142905581717841e-06, |
|
"loss": 0.0385, |
|
"reward": 1.6953963339328766, |
|
"reward_std": 0.8444600582122803, |
|
"rewards/accuracy_reward": 0.5392857410013676, |
|
"rewards/cosine_scaled_reward": 0.31414626743644475, |
|
"rewards/format_reward": 0.841964328289032, |
|
"rewards/len_reward": -0.052902453497517855, |
|
"rewards/reasoning_steps_reward": 0.5791667021811009, |
|
"rewards/tag_count_reward": 0.80357146859169, |
|
"step": 580 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 676.8196731567383, |
|
"epoch": 0.624, |
|
"grad_norm": 4.640349038075715, |
|
"kl": 2.029638671875, |
|
"learning_rate": 1.1158226038481584e-06, |
|
"loss": 0.09, |
|
"reward": 1.795636734366417, |
|
"reward_std": 0.8532808139920235, |
|
"rewards/accuracy_reward": 0.6071428872644902, |
|
"rewards/cosine_scaled_reward": 0.3384937860071659, |
|
"rewards/format_reward": 0.8500000461935997, |
|
"rewards/len_reward": -0.044303331701667045, |
|
"rewards/reasoning_steps_reward": 0.6011905141174794, |
|
"rewards/tag_count_reward": 0.7720982536673546, |
|
"step": 585 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.8062805175781, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 1.798885837943298, |
|
"kl": 2.5541015625, |
|
"learning_rate": 1.0888730100124355e-06, |
|
"loss": 0.0852, |
|
"reward": 1.640977455675602, |
|
"reward_std": 0.9245925977826118, |
|
"rewards/accuracy_reward": 0.550892885029316, |
|
"rewards/cosine_scaled_reward": 0.2829416638240218, |
|
"rewards/format_reward": 0.8071428954601287, |
|
"rewards/len_reward": -0.058377658866811545, |
|
"rewards/reasoning_steps_reward": 0.5770833715796471, |
|
"rewards/tag_count_reward": 0.7515625357627869, |
|
"step": 590 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.8830703735351, |
|
"epoch": 0.6346666666666667, |
|
"grad_norm": 1.4180247445536809, |
|
"kl": 2.21064453125, |
|
"learning_rate": 1.062066156944242e-06, |
|
"loss": 0.0715, |
|
"reward": 1.5858480513095856, |
|
"reward_std": 0.9440520867705345, |
|
"rewards/accuracy_reward": 0.5116071663796902, |
|
"rewards/cosine_scaled_reward": 0.28941939915530385, |
|
"rewards/format_reward": 0.7848214745521546, |
|
"rewards/len_reward": -0.06778505290567409, |
|
"rewards/reasoning_steps_reward": 0.5654762282967567, |
|
"rewards/tag_count_reward": 0.7723214596509933, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.5382009317793046, |
|
"learning_rate": 1.0354113518184304e-06, |
|
"loss": 0.0362, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 692.8052856445313, |
|
"eval_kl": 2.0015625, |
|
"eval_loss": 0.02467462234199047, |
|
"eval_reward": 1.579683256149292, |
|
"eval_reward_std": 0.9282516837120056, |
|
"eval_rewards/accuracy_reward": 0.5410714447498322, |
|
"eval_rewards/cosine_scaled_reward": 0.28146889209747317, |
|
"eval_rewards/format_reward": 0.7571428894996644, |
|
"eval_rewards/len_reward": -0.033683009818196295, |
|
"eval_rewards/reasoning_steps_reward": 0.5428571820259094, |
|
"eval_rewards/tag_count_reward": 0.7959821939468383, |
|
"eval_runtime": 123.4097, |
|
"eval_samples_per_second": 0.616, |
|
"eval_steps_per_second": 0.008, |
|
"step": 600 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 698.9357452392578, |
|
"epoch": 0.6453333333333333, |
|
"grad_norm": 0.8999801432514335, |
|
"kl": 2.011328125, |
|
"learning_rate": 1.008917849019739e-06, |
|
"loss": 0.0246, |
|
"reward": 1.6242469638586043, |
|
"reward_std": 0.9202400345355273, |
|
"rewards/accuracy_reward": 0.5218750248197466, |
|
"rewards/cosine_scaled_reward": 0.3135326229268685, |
|
"rewards/format_reward": 0.7888393275439739, |
|
"rewards/len_reward": -0.06526912819986137, |
|
"rewards/reasoning_steps_reward": 0.5854167021811009, |
|
"rewards/tag_count_reward": 0.8075893238186836, |
|
"step": 605 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 750.725926208496, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 2.1187107981918802, |
|
"kl": 1.882958984375, |
|
"learning_rate": 9.825948469297303e-07, |
|
"loss": 0.0314, |
|
"reward": 1.64074095338583, |
|
"reward_std": 0.9081449903547764, |
|
"rewards/accuracy_reward": 0.5330357388593256, |
|
"rewards/cosine_scaled_reward": 0.3068123304285109, |
|
"rewards/format_reward": 0.8008928969502449, |
|
"rewards/len_reward": -0.06153649939224124, |
|
"rewards/reasoning_steps_reward": 0.6250000432133674, |
|
"rewards/tag_count_reward": 0.8250000357627869, |
|
"step": 610 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 748.2152130126954, |
|
"epoch": 0.656, |
|
"grad_norm": 2.131590013783802, |
|
"kl": 1.9369140625, |
|
"learning_rate": 9.564514847331647e-07, |
|
"loss": 0.0395, |
|
"reward": 1.6662242144346238, |
|
"reward_std": 0.8884774453938007, |
|
"rewards/accuracy_reward": 0.5437500268220902, |
|
"rewards/cosine_scaled_reward": 0.30372417061589657, |
|
"rewards/format_reward": 0.8187500432133674, |
|
"rewards/len_reward": -0.07084276499663247, |
|
"rewards/reasoning_steps_reward": 0.586309564858675, |
|
"rewards/tag_count_reward": 0.8140625357627869, |
|
"step": 615 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 722.0428909301758, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 0.947846170528357, |
|
"kl": 1.95106201171875, |
|
"learning_rate": 9.304968392449361e-07, |
|
"loss": 0.0166, |
|
"reward": 1.6543820381164551, |
|
"reward_std": 0.8385958023369312, |
|
"rewards/accuracy_reward": 0.5223214563913643, |
|
"rewards/cosine_scaled_reward": 0.3061676881741732, |
|
"rewards/format_reward": 0.8258928969502449, |
|
"rewards/len_reward": -0.07246997265610844, |
|
"rewards/reasoning_steps_reward": 0.5738095574080944, |
|
"rewards/tag_count_reward": 0.8107143238186836, |
|
"step": 620 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 716.7339599609375, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 3.0578518576987914, |
|
"kl": 1.99375, |
|
"learning_rate": 9.047399217586552e-07, |
|
"loss": 0.0515, |
|
"reward": 1.6329590931534768, |
|
"reward_std": 0.8712823033332825, |
|
"rewards/accuracy_reward": 0.4964285962283611, |
|
"rewards/cosine_scaled_reward": 0.32670902004465463, |
|
"rewards/format_reward": 0.8098214730620384, |
|
"rewards/len_reward": -0.05736397755099461, |
|
"rewards/reasoning_steps_reward": 0.5723214700818062, |
|
"rewards/tag_count_reward": 0.7941964656114578, |
|
"step": 625 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 768.658071899414, |
|
"epoch": 0.672, |
|
"grad_norm": 1.9045908740529087, |
|
"kl": 1.85390625, |
|
"learning_rate": 8.791896749179831e-07, |
|
"loss": 0.0319, |
|
"reward": 1.545591439306736, |
|
"reward_std": 0.9249462381005287, |
|
"rewards/accuracy_reward": 0.49910716637969016, |
|
"rewards/cosine_scaled_reward": 0.2473770938348025, |
|
"rewards/format_reward": 0.7991071864962578, |
|
"rewards/len_reward": -0.09455443265615031, |
|
"rewards/reasoning_steps_reward": 0.5520833678543567, |
|
"rewards/tag_count_reward": 0.8082589685916901, |
|
"step": 630 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 764.3089599609375, |
|
"epoch": 0.6773333333333333, |
|
"grad_norm": 1.489830070872437, |
|
"kl": 1.666015625, |
|
"learning_rate": 8.538549696118023e-07, |
|
"loss": 0.035, |
|
"reward": 1.6950147479772568, |
|
"reward_std": 0.8494538977742195, |
|
"rewards/accuracy_reward": 0.5785714577883482, |
|
"rewards/cosine_scaled_reward": 0.28787180441431703, |
|
"rewards/format_reward": 0.8285714700818062, |
|
"rewards/len_reward": -0.07843075728160329, |
|
"rewards/reasoning_steps_reward": 0.5327381260693074, |
|
"rewards/tag_count_reward": 0.825446467101574, |
|
"step": 635 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 771.7393203735352, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 1.408555863280269, |
|
"kl": 1.8015625, |
|
"learning_rate": 8.287446018942973e-07, |
|
"loss": 0.0166, |
|
"reward": 1.6653958648443221, |
|
"reward_std": 0.8103255234658718, |
|
"rewards/accuracy_reward": 0.5616071740165353, |
|
"rewards/cosine_scaled_reward": 0.2725386595353484, |
|
"rewards/format_reward": 0.8312500461935997, |
|
"rewards/len_reward": -0.08096674757543951, |
|
"rewards/reasoning_steps_reward": 0.5892857603728772, |
|
"rewards/tag_count_reward": 0.8274553939700127, |
|
"step": 640 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 744.9634246826172, |
|
"epoch": 0.688, |
|
"grad_norm": 3.237395884870554, |
|
"kl": 1.91787109375, |
|
"learning_rate": 8.038672899310176e-07, |
|
"loss": 0.0597, |
|
"reward": 1.6257297798991204, |
|
"reward_std": 0.8151006668806076, |
|
"rewards/accuracy_reward": 0.5276785969734192, |
|
"rewards/cosine_scaled_reward": 0.24983684375183657, |
|
"rewards/format_reward": 0.8482143342494964, |
|
"rewards/len_reward": -0.07792667767498643, |
|
"rewards/reasoning_steps_reward": 0.5824405170977116, |
|
"rewards/tag_count_reward": 0.8095982536673546, |
|
"step": 645 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 734.9598541259766, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 2.539952667963002, |
|
"kl": 1.850830078125, |
|
"learning_rate": 7.792316709719875e-07, |
|
"loss": 0.0642, |
|
"reward": 1.649840420484543, |
|
"reward_std": 0.8533309623599052, |
|
"rewards/accuracy_reward": 0.5375000268220902, |
|
"rewards/cosine_scaled_reward": 0.28555465580429884, |
|
"rewards/format_reward": 0.8267857581377029, |
|
"rewards/len_reward": -0.06069966709183063, |
|
"rewards/reasoning_steps_reward": 0.5857143230736256, |
|
"rewards/tag_count_reward": 0.7982143208384513, |
|
"step": 650 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 723.6411041259765, |
|
"epoch": 0.6986666666666667, |
|
"grad_norm": 1.8599505779512118, |
|
"kl": 1.41396484375, |
|
"learning_rate": 7.548462983529016e-07, |
|
"loss": 0.0594, |
|
"reward": 1.759677691757679, |
|
"reward_std": 0.8695821583271026, |
|
"rewards/accuracy_reward": 0.5821428894996643, |
|
"rewards/cosine_scaled_reward": 0.3382490785326809, |
|
"rewards/format_reward": 0.839285759627819, |
|
"rewards/len_reward": -0.05585277818609029, |
|
"rewards/reasoning_steps_reward": 0.5860119491815567, |
|
"rewards/tag_count_reward": 0.8180803880095482, |
|
"step": 655 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 694.9009216308593, |
|
"epoch": 0.704, |
|
"grad_norm": 1.356477272292917, |
|
"kl": 1.7875, |
|
"learning_rate": 7.307196385254621e-07, |
|
"loss": 0.0349, |
|
"reward": 1.6008909553289414, |
|
"reward_std": 0.9093584820628167, |
|
"rewards/accuracy_reward": 0.5258928779512644, |
|
"rewards/cosine_scaled_reward": 0.2687480329768732, |
|
"rewards/format_reward": 0.8062500432133675, |
|
"rewards/len_reward": -0.07140515584032983, |
|
"rewards/reasoning_steps_reward": 0.5979167088866234, |
|
"rewards/tag_count_reward": 0.7738839656114578, |
|
"step": 660 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 728.4169921875, |
|
"epoch": 0.7093333333333334, |
|
"grad_norm": 2.3455305274577944, |
|
"kl": 1.65068359375, |
|
"learning_rate": 7.068600681178772e-07, |
|
"loss": 0.0147, |
|
"reward": 1.760453712940216, |
|
"reward_std": 0.869930399954319, |
|
"rewards/accuracy_reward": 0.5937500312924385, |
|
"rewards/cosine_scaled_reward": 0.32741790837608276, |
|
"rewards/format_reward": 0.8392857626080513, |
|
"rewards/len_reward": -0.05782133540487848, |
|
"rewards/reasoning_steps_reward": 0.5970238469541073, |
|
"rewards/tag_count_reward": 0.8209821820259094, |
|
"step": 665 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 724.6794952392578, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 1.4838696193874328, |
|
"kl": 1.65224609375, |
|
"learning_rate": 6.832758710265492e-07, |
|
"loss": 0.0393, |
|
"reward": 1.6467845141887665, |
|
"reward_std": 0.8451258420944214, |
|
"rewards/accuracy_reward": 0.5276785969734192, |
|
"rewards/cosine_scaled_reward": 0.29232015907764436, |
|
"rewards/format_reward": 0.8267857551574707, |
|
"rewards/len_reward": -0.05971484867623076, |
|
"rewards/reasoning_steps_reward": 0.5928571835160256, |
|
"rewards/tag_count_reward": 0.799330398440361, |
|
"step": 670 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 730.0205703735352, |
|
"epoch": 0.72, |
|
"grad_norm": 1.7671955478088914, |
|
"kl": 1.96552734375, |
|
"learning_rate": 6.599752355399538e-07, |
|
"loss": 0.0535, |
|
"reward": 1.6977156594395637, |
|
"reward_std": 0.8604207873344422, |
|
"rewards/accuracy_reward": 0.559821455180645, |
|
"rewards/cosine_scaled_reward": 0.31289418824017046, |
|
"rewards/format_reward": 0.8250000461935997, |
|
"rewards/len_reward": -0.06840839698270429, |
|
"rewards/reasoning_steps_reward": 0.5937500350177288, |
|
"rewards/tag_count_reward": 0.806919677555561, |
|
"step": 675 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 701.5803939819336, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 2.711356467895934, |
|
"kl": 2.33935546875, |
|
"learning_rate": 6.369662514957191e-07, |
|
"loss": 0.0471, |
|
"reward": 1.6881353616714478, |
|
"reward_std": 0.9093373231589794, |
|
"rewards/accuracy_reward": 0.5616071753203868, |
|
"rewards/cosine_scaled_reward": 0.3033138638362288, |
|
"rewards/format_reward": 0.8232143267989158, |
|
"rewards/len_reward": -0.06628427920097693, |
|
"rewards/reasoning_steps_reward": 0.5904762275516987, |
|
"rewards/tag_count_reward": 0.7837053939700127, |
|
"step": 680 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 717.6080688476562, |
|
"epoch": 0.7306666666666667, |
|
"grad_norm": 2.3553290067273727, |
|
"kl": 1.72353515625, |
|
"learning_rate": 6.142569074718818e-07, |
|
"loss": 0.0314, |
|
"reward": 1.6499603599309922, |
|
"reward_std": 0.8875117138028145, |
|
"rewards/accuracy_reward": 0.5419643139466643, |
|
"rewards/cosine_scaled_reward": 0.27763886260800064, |
|
"rewards/format_reward": 0.8303571879863739, |
|
"rewards/len_reward": -0.06650189743377269, |
|
"rewards/reasoning_steps_reward": 0.6178571805357933, |
|
"rewards/tag_count_reward": 0.7953125342726708, |
|
"step": 685 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 701.9911003112793, |
|
"epoch": 0.736, |
|
"grad_norm": 1.4466892795867712, |
|
"kl": 1.6891357421875, |
|
"learning_rate": 5.918550880133018e-07, |
|
"loss": 0.022, |
|
"reward": 1.8273341655731201, |
|
"reward_std": 0.8836588777601719, |
|
"rewards/accuracy_reward": 0.6133928835391999, |
|
"rewards/cosine_scaled_reward": 0.37644122838974, |
|
"rewards/format_reward": 0.8375000417232513, |
|
"rewards/len_reward": -0.05486658178269863, |
|
"rewards/reasoning_steps_reward": 0.6154762282967567, |
|
"rewards/tag_count_reward": 0.7799107506871223, |
|
"step": 690 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 709.267894744873, |
|
"epoch": 0.7413333333333333, |
|
"grad_norm": 1.8058065674655572, |
|
"kl": 1.5920654296875, |
|
"learning_rate": 5.697685708941996e-07, |
|
"loss": 0.0249, |
|
"reward": 1.6933651700615884, |
|
"reward_std": 0.8785169780254364, |
|
"rewards/accuracy_reward": 0.5642857432365418, |
|
"rewards/cosine_scaled_reward": 0.3067579740891233, |
|
"rewards/format_reward": 0.8223214775323868, |
|
"rewards/len_reward": -0.06730828973231837, |
|
"rewards/reasoning_steps_reward": 0.5997024156153202, |
|
"rewards/tag_count_reward": 0.7810268208384514, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 2.0040117357136964, |
|
"learning_rate": 5.480050244177573e-07, |
|
"loss": 0.0404, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 689.7265380859375, |
|
"eval_kl": 1.8734375, |
|
"eval_loss": 0.05176318436861038, |
|
"eval_reward": 1.7034537076950074, |
|
"eval_reward_std": 0.9170334577560425, |
|
"eval_rewards/accuracy_reward": 0.5625000417232513, |
|
"eval_rewards/cosine_scaled_reward": 0.3195250898599625, |
|
"eval_rewards/format_reward": 0.8214286088943481, |
|
"eval_rewards/len_reward": -0.014184250682592391, |
|
"eval_rewards/reasoning_steps_reward": 0.598809564113617, |
|
"eval_rewards/tag_count_reward": 0.7892857432365418, |
|
"eval_runtime": 121.9627, |
|
"eval_samples_per_second": 0.623, |
|
"eval_steps_per_second": 0.008, |
|
"step": 700 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 684.2286022186279, |
|
"epoch": 0.752, |
|
"grad_norm": 1.7320693073402242, |
|
"kl": 2.126708984375, |
|
"learning_rate": 5.265720047537318e-07, |
|
"loss": 0.0423, |
|
"reward": 1.5781065180897713, |
|
"reward_std": 0.9164291121065616, |
|
"rewards/accuracy_reward": 0.529017882142216, |
|
"rewards/cosine_scaled_reward": 0.274981448915787, |
|
"rewards/format_reward": 0.7741071842610836, |
|
"rewards/len_reward": -0.0748695431771921, |
|
"rewards/reasoning_steps_reward": 0.6022321883589029, |
|
"rewards/tag_count_reward": 0.761272357404232, |
|
"step": 705 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 668.6464576721191, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 2.851783458060785, |
|
"kl": 1.795166015625, |
|
"learning_rate": 5.054769533149999e-07, |
|
"loss": 0.0354, |
|
"reward": 1.7714153915643691, |
|
"reward_std": 0.9144653856754303, |
|
"rewards/accuracy_reward": 0.5973214589059352, |
|
"rewards/cosine_scaled_reward": 0.3535581724718213, |
|
"rewards/format_reward": 0.8205357566475868, |
|
"rewards/len_reward": -0.057637330750003456, |
|
"rewards/reasoning_steps_reward": 0.621130996197462, |
|
"rewards/tag_count_reward": 0.7683036014437675, |
|
"step": 710 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 708.1009216308594, |
|
"epoch": 0.7626666666666667, |
|
"grad_norm": 2.260299938099763, |
|
"kl": 1.759228515625, |
|
"learning_rate": 4.847271941739458e-07, |
|
"loss": 0.0182, |
|
"reward": 1.6718120127916336, |
|
"reward_std": 0.8970876529812812, |
|
"rewards/accuracy_reward": 0.5437500262632966, |
|
"rewards/cosine_scaled_reward": 0.32538338992744686, |
|
"rewards/format_reward": 0.8026786133646965, |
|
"rewards/len_reward": -0.062959678506013, |
|
"rewards/reasoning_steps_reward": 0.6113095685839653, |
|
"rewards/tag_count_reward": 0.7883928954601288, |
|
"step": 715 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 683.4464584350586, |
|
"epoch": 0.768, |
|
"grad_norm": 2.074272987776867, |
|
"kl": 1.8629638671875, |
|
"learning_rate": 4.643299315195855e-07, |
|
"loss": 0.0045, |
|
"reward": 1.6655956655740738, |
|
"reward_std": 0.9097626186907292, |
|
"rewards/accuracy_reward": 0.5223214536905288, |
|
"rewards/cosine_scaled_reward": 0.3227384569123387, |
|
"rewards/format_reward": 0.820535758137703, |
|
"rewards/len_reward": -0.06934529592399485, |
|
"rewards/reasoning_steps_reward": 0.6032738454639912, |
|
"rewards/tag_count_reward": 0.7993303939700127, |
|
"step": 720 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 678.5500289916993, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 1.6540501406095172, |
|
"kl": 1.699609375, |
|
"learning_rate": 4.442922471563205e-07, |
|
"loss": 0.0157, |
|
"reward": 1.7752037733793258, |
|
"reward_std": 0.8400040708482266, |
|
"rewards/accuracy_reward": 0.5580357387661934, |
|
"rewards/cosine_scaled_reward": 0.3662751629948616, |
|
"rewards/format_reward": 0.8508929058909416, |
|
"rewards/len_reward": -0.05521610935102217, |
|
"rewards/reasoning_steps_reward": 0.6130952782928943, |
|
"rewards/tag_count_reward": 0.8223214641213417, |
|
"step": 725 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 725.4411071777344, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 8.410594372712595, |
|
"kl": 1.941552734375, |
|
"learning_rate": 4.24621098045175e-07, |
|
"loss": 0.0076, |
|
"reward": 1.7029503554105758, |
|
"reward_std": 0.8638798981904984, |
|
"rewards/accuracy_reward": 0.5294643077999354, |
|
"rewards/cosine_scaled_reward": 0.3404503061901778, |
|
"rewards/format_reward": 0.8330357566475868, |
|
"rewards/len_reward": -0.06691470365040005, |
|
"rewards/reasoning_steps_reward": 0.5958333618938922, |
|
"rewards/tag_count_reward": 0.8234375402331352, |
|
"step": 730 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 713.8982528686523, |
|
"epoch": 0.784, |
|
"grad_norm": 1.9391448788727794, |
|
"kl": 1.941015625, |
|
"learning_rate": 4.053233138883835e-07, |
|
"loss": 0.0489, |
|
"reward": 1.77894726395607, |
|
"reward_std": 0.9000619798898697, |
|
"rewards/accuracy_reward": 0.5982143111526966, |
|
"rewards/cosine_scaled_reward": 0.3539472218602896, |
|
"rewards/format_reward": 0.8267857581377029, |
|
"rewards/len_reward": -0.05374447105568834, |
|
"rewards/reasoning_steps_reward": 0.6297619432210922, |
|
"rewards/tag_count_reward": 0.8029018208384514, |
|
"step": 735 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 702.009854888916, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 1.355632286643051, |
|
"kl": 2.266845703125, |
|
"learning_rate": 3.864055947581605e-07, |
|
"loss": 0.0316, |
|
"reward": 1.6201725795865058, |
|
"reward_std": 0.8785072147846222, |
|
"rewards/accuracy_reward": 0.5321428820490837, |
|
"rewards/cosine_scaled_reward": 0.29070827066898347, |
|
"rewards/format_reward": 0.7973214685916901, |
|
"rewards/len_reward": -0.07668145237257704, |
|
"rewards/reasoning_steps_reward": 0.5699405044317245, |
|
"rewards/tag_count_reward": 0.8022321805357933, |
|
"step": 740 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 708.0571716308593, |
|
"epoch": 0.7946666666666666, |
|
"grad_norm": 3.3069120908956227, |
|
"kl": 2.06044921875, |
|
"learning_rate": 3.6787450877047543e-07, |
|
"loss": 0.0349, |
|
"reward": 1.6467604607343673, |
|
"reward_std": 0.8980094000697136, |
|
"rewards/accuracy_reward": 0.543750024586916, |
|
"rewards/cosine_scaled_reward": 0.2985461330041289, |
|
"rewards/format_reward": 0.8044643297791481, |
|
"rewards/len_reward": -0.06843005996015564, |
|
"rewards/reasoning_steps_reward": 0.598214327543974, |
|
"rewards/tag_count_reward": 0.8011161029338837, |
|
"step": 745 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 693.400032043457, |
|
"epoch": 0.8, |
|
"grad_norm": 1.7291219089700667, |
|
"kl": 2.141796875, |
|
"learning_rate": 3.4973648980464454e-07, |
|
"loss": 0.0398, |
|
"reward": 1.7774270474910736, |
|
"reward_std": 0.9566665157675743, |
|
"rewards/accuracy_reward": 0.6017857499420642, |
|
"rewards/cosine_scaled_reward": 0.3622484166175127, |
|
"rewards/format_reward": 0.813392898440361, |
|
"rewards/len_reward": -0.06579462469671853, |
|
"rewards/reasoning_steps_reward": 0.6119048051536083, |
|
"rewards/tag_count_reward": 0.7859375327825546, |
|
"step": 750 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 697.6071731567383, |
|
"epoch": 0.8053333333333333, |
|
"grad_norm": 2.1537011975481626, |
|
"kl": 2.369970703125, |
|
"learning_rate": 3.3199783526952656e-07, |
|
"loss": 0.0285, |
|
"reward": 1.626606747508049, |
|
"reward_std": 0.8798683725297451, |
|
"rewards/accuracy_reward": 0.546428595483303, |
|
"rewards/cosine_scaled_reward": 0.2810709737765137, |
|
"rewards/format_reward": 0.7991071850061416, |
|
"rewards/len_reward": -0.08888059532619082, |
|
"rewards/reasoning_steps_reward": 0.6297619499266147, |
|
"rewards/tag_count_reward": 0.7700893223285675, |
|
"step": 755 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 669.0143157958985, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 2.365610096938751, |
|
"kl": 2.06806640625, |
|
"learning_rate": 3.146647039171002e-07, |
|
"loss": 0.0252, |
|
"reward": 1.7266648024320603, |
|
"reward_std": 0.8433036901056766, |
|
"rewards/accuracy_reward": 0.5705357395112515, |
|
"rewards/cosine_scaled_reward": 0.33559329714626074, |
|
"rewards/format_reward": 0.8205357536673545, |
|
"rewards/len_reward": -0.058183514599659245, |
|
"rewards/reasoning_steps_reward": 0.6181547954678536, |
|
"rewards/tag_count_reward": 0.7935268253087997, |
|
"step": 760 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 718.244677734375, |
|
"epoch": 0.816, |
|
"grad_norm": 2.229073726941259, |
|
"kl": 1.7294921875, |
|
"learning_rate": 2.977431137041848e-07, |
|
"loss": 0.0401, |
|
"reward": 1.6797782227396965, |
|
"reward_std": 0.8497840896248817, |
|
"rewards/accuracy_reward": 0.5312500277534127, |
|
"rewards/cosine_scaled_reward": 0.32263530092313886, |
|
"rewards/format_reward": 0.8258928999304771, |
|
"rewards/len_reward": -0.060991953429766, |
|
"rewards/reasoning_steps_reward": 0.6047619454562664, |
|
"rewards/tag_count_reward": 0.8078125342726707, |
|
"step": 765 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 698.1232482910157, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 1.619053512727917, |
|
"kl": 1.646826171875, |
|
"learning_rate": 2.8123893970304154e-07, |
|
"loss": 0.0266, |
|
"reward": 1.7365109011530877, |
|
"reward_std": 0.8401576727628708, |
|
"rewards/accuracy_reward": 0.5803571723401546, |
|
"rewards/cosine_scaled_reward": 0.3257965755648911, |
|
"rewards/format_reward": 0.8303571820259095, |
|
"rewards/len_reward": -0.05876578897587024, |
|
"rewards/reasoning_steps_reward": 0.6479167118668556, |
|
"rewards/tag_count_reward": 0.8062500417232513, |
|
"step": 770 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 693.6268112182618, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 1.9920542202647236, |
|
"kl": 1.9521728515625, |
|
"learning_rate": 2.651579120615855e-07, |
|
"loss": 0.043, |
|
"reward": 1.746559591591358, |
|
"reward_std": 0.8663071312010289, |
|
"rewards/accuracy_reward": 0.584821455925703, |
|
"rewards/cosine_scaled_reward": 0.3394166727666743, |
|
"rewards/format_reward": 0.8223214671015739, |
|
"rewards/len_reward": -0.05387566906865686, |
|
"rewards/reasoning_steps_reward": 0.6288690894842148, |
|
"rewards/tag_count_reward": 0.7970982506871224, |
|
"step": 775 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 703.2910995483398, |
|
"epoch": 0.832, |
|
"grad_norm": 1.999916137698683, |
|
"kl": 1.91943359375, |
|
"learning_rate": 2.495056140139119e-07, |
|
"loss": 0.0445, |
|
"reward": 1.650632557272911, |
|
"reward_std": 0.812701889872551, |
|
"rewards/accuracy_reward": 0.5187500238418579, |
|
"rewards/cosine_scaled_reward": 0.30152534758672117, |
|
"rewards/format_reward": 0.8303571879863739, |
|
"rewards/len_reward": -0.0650079416227527, |
|
"rewards/reasoning_steps_reward": 0.6523810014128685, |
|
"rewards/tag_count_reward": 0.7991071790456772, |
|
"step": 780 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 675.2893173217774, |
|
"epoch": 0.8373333333333334, |
|
"grad_norm": 1.4300723881782424, |
|
"kl": 2.218310546875, |
|
"learning_rate": 2.3428747994183364e-07, |
|
"loss": 0.0414, |
|
"reward": 1.7487051695585252, |
|
"reward_std": 0.8682012394070625, |
|
"rewards/accuracy_reward": 0.5741071701049805, |
|
"rewards/cosine_scaled_reward": 0.34424083852209153, |
|
"rewards/format_reward": 0.8303571879863739, |
|
"rewards/len_reward": -0.05801073173061013, |
|
"rewards/reasoning_steps_reward": 0.6276786103844643, |
|
"rewards/tag_count_reward": 0.8100446790456772, |
|
"step": 785 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 681.7848541259766, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 3.1714728291900087, |
|
"kl": 2.08974609375, |
|
"learning_rate": 2.1950879348809548e-07, |
|
"loss": 0.0119, |
|
"reward": 1.751221266388893, |
|
"reward_std": 0.8795267082750797, |
|
"rewards/accuracy_reward": 0.5803571723401546, |
|
"rewards/cosine_scaled_reward": 0.34943547430448235, |
|
"rewards/format_reward": 0.8214286148548127, |
|
"rewards/len_reward": -0.07035618057707324, |
|
"rewards/reasoning_steps_reward": 0.635119092464447, |
|
"rewards/tag_count_reward": 0.8131696820259094, |
|
"step": 790 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 680.6027069091797, |
|
"epoch": 0.848, |
|
"grad_norm": 1.5849755112353832, |
|
"kl": 1.878271484375, |
|
"learning_rate": 2.0517468572192632e-07, |
|
"loss": 0.035, |
|
"reward": 1.7385686576366424, |
|
"reward_std": 0.892936672270298, |
|
"rewards/accuracy_reward": 0.5535714536905288, |
|
"rewards/cosine_scaled_reward": 0.35374716023216024, |
|
"rewards/format_reward": 0.8312500447034836, |
|
"rewards/len_reward": -0.06039702805610432, |
|
"rewards/reasoning_steps_reward": 0.6625000402331352, |
|
"rewards/tag_count_reward": 0.8140625387430191, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 1.9274338875069446, |
|
"learning_rate": 1.9129013335756317e-07, |
|
"loss": 0.0496, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 684.6899658203125, |
|
"eval_kl": 2.2203125, |
|
"eval_loss": 0.019189750775694847, |
|
"eval_reward": 1.7292620897293092, |
|
"eval_reward_std": 0.872122836112976, |
|
"eval_rewards/accuracy_reward": 0.573214304447174, |
|
"eval_rewards/cosine_scaled_reward": 0.34890486896038053, |
|
"eval_rewards/format_reward": 0.8071428775787354, |
|
"eval_rewards/len_reward": -0.008494636416435242, |
|
"eval_rewards/reasoning_steps_reward": 0.6005952835083008, |
|
"eval_rewards/tag_count_reward": 0.8008928894996643, |
|
"eval_runtime": 122.1397, |
|
"eval_samples_per_second": 0.622, |
|
"eval_steps_per_second": 0.008, |
|
"step": 800 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 691.9308334350586, |
|
"epoch": 0.8586666666666667, |
|
"grad_norm": 3.224793392092436, |
|
"kl": 1.940557861328125, |
|
"learning_rate": 1.7785995702636698e-07, |
|
"loss": 0.0121, |
|
"reward": 1.7232433706521988, |
|
"reward_std": 0.8734276548027993, |
|
"rewards/accuracy_reward": 0.5656250283122063, |
|
"rewards/cosine_scaled_reward": 0.3406540274620056, |
|
"rewards/format_reward": 0.816964328289032, |
|
"rewards/len_reward": -0.05547755345760379, |
|
"rewards/reasoning_steps_reward": 0.6305059880018234, |
|
"rewards/tag_count_reward": 0.8243303947150707, |
|
"step": 805 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 708.755387878418, |
|
"epoch": 0.864, |
|
"grad_norm": 3.5260608802965816, |
|
"kl": 1.724560546875, |
|
"learning_rate": 1.64888819603129e-07, |
|
"loss": 0.0148, |
|
"reward": 1.6694684252142906, |
|
"reward_std": 0.8598788410425187, |
|
"rewards/accuracy_reward": 0.5410714522004128, |
|
"rewards/cosine_scaled_reward": 0.31500408379361033, |
|
"rewards/format_reward": 0.813392898440361, |
|
"rewards/len_reward": -0.06669973691023187, |
|
"rewards/reasoning_steps_reward": 0.6187500357627869, |
|
"rewards/tag_count_reward": 0.8223214700818062, |
|
"step": 810 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 704.1750305175781, |
|
"epoch": 0.8693333333333333, |
|
"grad_norm": 1.8205557040783624, |
|
"kl": 1.6877685546875, |
|
"learning_rate": 1.5238122458714925e-07, |
|
"loss": -0.0007, |
|
"reward": 1.778202649950981, |
|
"reward_std": 0.8024675074964762, |
|
"rewards/accuracy_reward": 0.5928571715950965, |
|
"rewards/cosine_scaled_reward": 0.3424883014522493, |
|
"rewards/format_reward": 0.8428571850061417, |
|
"rewards/len_reward": -0.07196439338222263, |
|
"rewards/reasoning_steps_reward": 0.6529762379825115, |
|
"rewards/tag_count_reward": 0.8439732566475868, |
|
"step": 815 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 703.4661041259766, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 1.1390547789955021, |
|
"kl": 1.820703125, |
|
"learning_rate": 1.4034151453864846e-07, |
|
"loss": 0.0058, |
|
"reward": 1.7741546720266341, |
|
"reward_std": 0.887740996479988, |
|
"rewards/accuracy_reward": 0.5687500283122062, |
|
"rewards/cosine_scaled_reward": 0.3643331742845476, |
|
"rewards/format_reward": 0.84107146859169, |
|
"rewards/len_reward": -0.06238816555123776, |
|
"rewards/reasoning_steps_reward": 0.6565476641058922, |
|
"rewards/tag_count_reward": 0.810044676065445, |
|
"step": 820 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 711.9839569091797, |
|
"epoch": 0.88, |
|
"grad_norm": 3.201601066169082, |
|
"kl": 2.0060546875, |
|
"learning_rate": 1.287738695710592e-07, |
|
"loss": 0.0445, |
|
"reward": 1.741848286986351, |
|
"reward_std": 0.8842388309538365, |
|
"rewards/accuracy_reward": 0.5696428827941418, |
|
"rewards/cosine_scaled_reward": 0.3463124948553741, |
|
"rewards/format_reward": 0.8258928999304771, |
|
"rewards/len_reward": -0.061798423925574754, |
|
"rewards/reasoning_steps_reward": 0.6389881439507008, |
|
"rewards/tag_count_reward": 0.8033482506871223, |
|
"step": 825 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.556282043457, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 2.047674415321257, |
|
"kl": 1.7311279296875, |
|
"learning_rate": 1.1768230589971457e-07, |
|
"loss": 0.0151, |
|
"reward": 1.7691504821181296, |
|
"reward_std": 0.8467303015291691, |
|
"rewards/accuracy_reward": 0.581250025331974, |
|
"rewards/cosine_scaled_reward": 0.3566504124552011, |
|
"rewards/format_reward": 0.8312500342726707, |
|
"rewards/len_reward": -0.05797590847651009, |
|
"rewards/reasoning_steps_reward": 0.6645833745598793, |
|
"rewards/tag_count_reward": 0.813169677555561, |
|
"step": 830 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 718.4741348266601, |
|
"epoch": 0.8906666666666667, |
|
"grad_norm": 1.2710201599686817, |
|
"kl": 1.734326171875, |
|
"learning_rate": 1.0707067444744439e-07, |
|
"loss": 0.0118, |
|
"reward": 1.7898413628339767, |
|
"reward_std": 0.8654466308653355, |
|
"rewards/accuracy_reward": 0.5803571701049804, |
|
"rewards/cosine_scaled_reward": 0.36394843012094497, |
|
"rewards/format_reward": 0.845535758137703, |
|
"rewards/len_reward": -0.06752787398800138, |
|
"rewards/reasoning_steps_reward": 0.6630952812731266, |
|
"rewards/tag_count_reward": 0.8261161103844643, |
|
"step": 835 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.3634231567382, |
|
"epoch": 0.896, |
|
"grad_norm": 1.9737307274220788, |
|
"kl": 1.7364990234375, |
|
"learning_rate": 9.69426595075566e-08, |
|
"loss": -0.0073, |
|
"reward": 1.728901642560959, |
|
"reward_std": 0.8724588222801686, |
|
"rewards/accuracy_reward": 0.5598214510828257, |
|
"rewards/cosine_scaled_reward": 0.33693731487728656, |
|
"rewards/format_reward": 0.8321429014205932, |
|
"rewards/len_reward": -0.06747927672695368, |
|
"rewards/reasoning_steps_reward": 0.6422619573771954, |
|
"rewards/tag_count_reward": 0.8299107551574707, |
|
"step": 840 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 737.0732482910156, |
|
"epoch": 0.9013333333333333, |
|
"grad_norm": 1.9784493712749323, |
|
"kl": 1.6184814453125, |
|
"learning_rate": 8.730177746467616e-08, |
|
"loss": 0.0245, |
|
"reward": 1.6512254253029823, |
|
"reward_std": 0.8224760733544827, |
|
"rewards/accuracy_reward": 0.5133928835391999, |
|
"rewards/cosine_scaled_reward": 0.2949753848835826, |
|
"rewards/format_reward": 0.84285718947649, |
|
"rewards/len_reward": -0.06837809896096587, |
|
"rewards/reasoning_steps_reward": 0.6458333730697632, |
|
"rewards/tag_count_reward": 0.8607143267989159, |
|
"step": 845 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 717.0750335693359, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 1.1451369234582136, |
|
"kl": 1.67718505859375, |
|
"learning_rate": 7.81513755738742e-08, |
|
"loss": 0.0011, |
|
"reward": 1.742706833779812, |
|
"reward_std": 0.8420647040009499, |
|
"rewards/accuracy_reward": 0.5696428827941418, |
|
"rewards/cosine_scaled_reward": 0.3373496507178061, |
|
"rewards/format_reward": 0.8357143253087997, |
|
"rewards/len_reward": -0.06677207143511624, |
|
"rewards/reasoning_steps_reward": 0.6532738506793976, |
|
"rewards/tag_count_reward": 0.8330357551574707, |
|
"step": 850 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 739.9223541259765, |
|
"epoch": 0.912, |
|
"grad_norm": 2.5094540072798353, |
|
"kl": 1.945263671875, |
|
"learning_rate": 6.949463079852491e-08, |
|
"loss": 0.0302, |
|
"reward": 1.7340825259685517, |
|
"reward_std": 0.8873489238321781, |
|
"rewards/accuracy_reward": 0.5642857439815998, |
|
"rewards/cosine_scaled_reward": 0.33586815614253285, |
|
"rewards/format_reward": 0.8339286178350449, |
|
"rewards/len_reward": -0.05986913426313549, |
|
"rewards/reasoning_steps_reward": 0.6571428991854191, |
|
"rewards/tag_count_reward": 0.8406250417232514, |
|
"step": 855 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 696.5928855895996, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 1.9398788694913773, |
|
"kl": 2.00673828125, |
|
"learning_rate": 6.133454870728111e-08, |
|
"loss": 0.0432, |
|
"reward": 1.7525239706039428, |
|
"reward_std": 0.9240262523293495, |
|
"rewards/accuracy_reward": 0.5803571745753289, |
|
"rewards/cosine_scaled_reward": 0.3480596005916595, |
|
"rewards/format_reward": 0.8241071864962578, |
|
"rewards/len_reward": -0.05318943413440138, |
|
"rewards/reasoning_steps_reward": 0.6342262253165245, |
|
"rewards/tag_count_reward": 0.830803605914116, |
|
"step": 860 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 737.1518157958984, |
|
"epoch": 0.9226666666666666, |
|
"grad_norm": 2.0067649784156574, |
|
"kl": 1.8587890625, |
|
"learning_rate": 5.367396243056022e-08, |
|
"loss": 0.0586, |
|
"reward": 1.5764021649956703, |
|
"reward_std": 0.9064562991261482, |
|
"rewards/accuracy_reward": 0.5071428824216128, |
|
"rewards/cosine_scaled_reward": 0.2781878274225164, |
|
"rewards/format_reward": 0.7910714671015739, |
|
"rewards/len_reward": -0.0710936440504156, |
|
"rewards/reasoning_steps_reward": 0.6214286148548126, |
|
"rewards/tag_count_reward": 0.8029018253087997, |
|
"step": 865 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 702.3964599609375, |
|
"epoch": 0.928, |
|
"grad_norm": 1.3626379779643958, |
|
"kl": 1.9628662109375, |
|
"learning_rate": 4.6515531676899316e-08, |
|
"loss": 0.0235, |
|
"reward": 1.8264706060290337, |
|
"reward_std": 0.896178449690342, |
|
"rewards/accuracy_reward": 0.603571455180645, |
|
"rewards/cosine_scaled_reward": 0.38807767661637627, |
|
"rewards/format_reward": 0.8348214730620385, |
|
"rewards/len_reward": -0.053661848261253906, |
|
"rewards/reasoning_steps_reward": 0.656547662615776, |
|
"rewards/tag_count_reward": 0.8383928894996643, |
|
"step": 870 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 688.1732444763184, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 1.9694362633321498, |
|
"kl": 2.45517578125, |
|
"learning_rate": 3.986174180951896e-08, |
|
"loss": 0.0403, |
|
"reward": 1.7715040892362595, |
|
"reward_std": 0.875535361468792, |
|
"rewards/accuracy_reward": 0.5866071682423353, |
|
"rewards/cosine_scaled_reward": 0.3420397279784083, |
|
"rewards/format_reward": 0.8428571850061417, |
|
"rewards/len_reward": -0.07309366355184466, |
|
"rewards/reasoning_steps_reward": 0.6440476626157761, |
|
"rewards/tag_count_reward": 0.829687537252903, |
|
"step": 875 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 704.2875289916992, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 1.8297011568150405, |
|
"kl": 2.25908203125, |
|
"learning_rate": 3.3714902983421944e-08, |
|
"loss": 0.0123, |
|
"reward": 1.7017309829592704, |
|
"reward_std": 0.9257866092026233, |
|
"rewards/accuracy_reward": 0.5705357387661933, |
|
"rewards/cosine_scaled_reward": 0.32851662803441284, |
|
"rewards/format_reward": 0.8026786118745803, |
|
"rewards/len_reward": -0.06342671204765793, |
|
"rewards/reasoning_steps_reward": 0.6318452827632427, |
|
"rewards/tag_count_reward": 0.8151786103844643, |
|
"step": 880 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 709.7786010742187, |
|
"epoch": 0.944, |
|
"grad_norm": 2.0419278966924916, |
|
"kl": 2.3875, |
|
"learning_rate": 2.807714934332073e-08, |
|
"loss": 0.0495, |
|
"reward": 1.659614458680153, |
|
"reward_std": 0.9322795614600181, |
|
"rewards/accuracy_reward": 0.5294643096625805, |
|
"rewards/cosine_scaled_reward": 0.3292572578415275, |
|
"rewards/format_reward": 0.8008929014205932, |
|
"rewards/len_reward": -0.07157147889083718, |
|
"rewards/reasoning_steps_reward": 0.6122024178504943, |
|
"rewards/tag_count_reward": 0.8060268223285675, |
|
"step": 885 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 715.5553909301758, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 2.8653022841959257, |
|
"kl": 2.29814453125, |
|
"learning_rate": 2.2950438282676455e-08, |
|
"loss": -0.0014, |
|
"reward": 1.6036544814705849, |
|
"reward_std": 0.9239592231810093, |
|
"rewards/accuracy_reward": 0.5241071704775095, |
|
"rewards/cosine_scaled_reward": 0.2813329972326756, |
|
"rewards/format_reward": 0.7982143253087998, |
|
"rewards/len_reward": -0.08350599388359115, |
|
"rewards/reasoning_steps_reward": 0.603571467846632, |
|
"rewards/tag_count_reward": 0.816741107404232, |
|
"step": 890 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 709.7669952392578, |
|
"epoch": 0.9546666666666667, |
|
"grad_norm": 3.1175284258712788, |
|
"kl": 2.15888671875, |
|
"learning_rate": 1.8336549764102594e-08, |
|
"loss": 0.038, |
|
"reward": 1.7244937881827354, |
|
"reward_std": 0.8962258003652096, |
|
"rewards/accuracy_reward": 0.5776786021888256, |
|
"rewards/cosine_scaled_reward": 0.34592231740243734, |
|
"rewards/format_reward": 0.800892898440361, |
|
"rewards/len_reward": -0.05511092038359493, |
|
"rewards/reasoning_steps_reward": 0.6098214618861675, |
|
"rewards/tag_count_reward": 0.8035714671015739, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.9860913993698874, |
|
"learning_rate": 1.4237085701374109e-08, |
|
"loss": 0.0147, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 710.0548217773437, |
|
"eval_kl": 2.0609375, |
|
"eval_loss": 0.08885510265827179, |
|
"eval_reward": 1.7470547199249267, |
|
"eval_reward_std": 0.9193769574165345, |
|
"eval_rewards/accuracy_reward": 0.5785714566707612, |
|
"eval_rewards/cosine_scaled_reward": 0.3541975736618042, |
|
"eval_rewards/format_reward": 0.8142857432365418, |
|
"eval_rewards/len_reward": 0.017873572744429113, |
|
"eval_rewards/reasoning_steps_reward": 0.6309524178504944, |
|
"eval_rewards/tag_count_reward": 0.8183036088943482, |
|
"eval_runtime": 125.1026, |
|
"eval_samples_per_second": 0.608, |
|
"eval_steps_per_second": 0.008, |
|
"step": 900 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 694.5861915588379, |
|
"epoch": 0.9653333333333334, |
|
"grad_norm": 1.6253024075355416, |
|
"kl": 2.132666015625, |
|
"learning_rate": 1.0653469403252015e-08, |
|
"loss": 0.0295, |
|
"reward": 1.7099345169961453, |
|
"reward_std": 0.9086461085826159, |
|
"rewards/accuracy_reward": 0.5669643130153418, |
|
"rewards/cosine_scaled_reward": 0.3335951896267943, |
|
"rewards/format_reward": 0.8093750402331352, |
|
"rewards/len_reward": -0.06031820691714529, |
|
"rewards/reasoning_steps_reward": 0.6342262338846922, |
|
"rewards/tag_count_reward": 0.8136161088943481, |
|
"step": 905 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 711.0339614868165, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 3.0891558552999125, |
|
"kl": 2.23671875, |
|
"learning_rate": 7.586945079319673e-09, |
|
"loss": 0.027, |
|
"reward": 1.7360344290733338, |
|
"reward_std": 0.8696943923830986, |
|
"rewards/accuracy_reward": 0.5678571678698063, |
|
"rewards/cosine_scaled_reward": 0.33514149505645036, |
|
"rewards/format_reward": 0.8330357581377029, |
|
"rewards/len_reward": -0.06922432167921215, |
|
"rewards/reasoning_steps_reward": 0.6446429014205932, |
|
"rewards/tag_count_reward": 0.8270089656114579, |
|
"step": 910 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 702.9910995483399, |
|
"epoch": 0.976, |
|
"grad_norm": 1.4845427370224942, |
|
"kl": 2.0302734375, |
|
"learning_rate": 5.038577408000844e-09, |
|
"loss": 0.0143, |
|
"reward": 1.6590574353933334, |
|
"reward_std": 0.9159045577049255, |
|
"rewards/accuracy_reward": 0.5455357421189546, |
|
"rewards/cosine_scaled_reward": 0.3045931006781757, |
|
"rewards/format_reward": 0.8089286133646965, |
|
"rewards/len_reward": -0.0816710107261315, |
|
"rewards/reasoning_steps_reward": 0.6232143297791481, |
|
"rewards/tag_count_reward": 0.808258967101574, |
|
"step": 915 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.5473541259765, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 1.7313125978534194, |
|
"kl": 2.3384765625, |
|
"learning_rate": 3.009251166909699e-09, |
|
"loss": 0.0087, |
|
"reward": 1.5918783232569695, |
|
"reward_std": 0.9757736340165138, |
|
"rewards/accuracy_reward": 0.5142857357859612, |
|
"rewards/cosine_scaled_reward": 0.2990211246535182, |
|
"rewards/format_reward": 0.778571467101574, |
|
"rewards/len_reward": -0.08776085836580023, |
|
"rewards/reasoning_steps_reward": 0.6345238506793975, |
|
"rewards/tag_count_reward": 0.7993303939700127, |
|
"step": 920 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.5911056518555, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 1.739888955006767, |
|
"kl": 2.113232421875, |
|
"learning_rate": 1.4996709256617225e-09, |
|
"loss": 0.0032, |
|
"reward": 1.8000961601734162, |
|
"reward_std": 0.9336124449968338, |
|
"rewards/accuracy_reward": 0.6107143178582192, |
|
"rewards/cosine_scaled_reward": 0.36973895924165845, |
|
"rewards/format_reward": 0.8196429014205933, |
|
"rewards/len_reward": -0.06013430766906822, |
|
"rewards/reasoning_steps_reward": 0.6375000424683094, |
|
"rewards/tag_count_reward": 0.8087053939700126, |
|
"step": 925 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 704.1303871154785, |
|
"epoch": 0.992, |
|
"grad_norm": 1.0971767587313153, |
|
"kl": 1.8646484375, |
|
"learning_rate": 5.103608012512195e-10, |
|
"loss": 0.025, |
|
"reward": 1.7941276371479034, |
|
"reward_std": 0.9121661514043808, |
|
"rewards/accuracy_reward": 0.6089286014437676, |
|
"rewards/cosine_scaled_reward": 0.36734184846282003, |
|
"rewards/format_reward": 0.8178571909666061, |
|
"rewards/len_reward": -0.04802298827562481, |
|
"rewards/reasoning_steps_reward": 0.6273809909820557, |
|
"rewards/tag_count_reward": 0.8111607506871223, |
|
"step": 930 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 715.4536041259765, |
|
"epoch": 0.9973333333333333, |
|
"grad_norm": 1.472648685003733, |
|
"kl": 2.31025390625, |
|
"learning_rate": 4.1664276081376796e-11, |
|
"loss": 0.0476, |
|
"reward": 1.6112597823143004, |
|
"reward_std": 0.9594747319817543, |
|
"rewards/accuracy_reward": 0.5142857380211353, |
|
"rewards/cosine_scaled_reward": 0.294295444060117, |
|
"rewards/format_reward": 0.8026786193251609, |
|
"rewards/len_reward": -0.06919930868316441, |
|
"rewards/reasoning_steps_reward": 0.62529766112566, |
|
"rewards/tag_count_reward": 0.8127232551574707, |
|
"step": 935 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 724.0409545898438, |
|
"epoch": 0.9994666666666666, |
|
"kl": 2.156005859375, |
|
"reward": 1.7690049931406975, |
|
"reward_std": 0.8843330852687359, |
|
"rewards/accuracy_reward": 0.6093750298023224, |
|
"rewards/cosine_scaled_reward": 0.35382634587585926, |
|
"rewards/format_reward": 0.8058036118745804, |
|
"rewards/len_reward": -0.057325188361573964, |
|
"rewards/reasoning_steps_reward": 0.6279762387275696, |
|
"rewards/tag_count_reward": 0.8113839626312256, |
|
"step": 937, |
|
"total_flos": 0.0, |
|
"train_loss": 0.09447722357977083, |
|
"train_runtime": 119506.5825, |
|
"train_samples_per_second": 0.251, |
|
"train_steps_per_second": 0.008 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 937, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|