OpenRS-RLoRA-LoftQ-R32-5 / trainer_state.json
colinpannikkat's picture
Model save
ee752c6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2857142857142857,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 3068.5000610351562,
"epoch": 0.0005714285714285715,
"grad_norm": 0.013173151761293411,
"kl": 0.0005006790161132812,
"learning_rate": 0.0,
"loss": -0.0242,
"reward": 0.200983926653862,
"reward_std": 0.24425111338496208,
"rewards/cosine_scaled_reward": -0.0453413650393486,
"rewards/format_reward": 0.2916666679084301,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2930.9583740234375,
"epoch": 0.001142857142857143,
"grad_norm": 0.04316634312272072,
"kl": 0.0003731250762939453,
"learning_rate": 2e-08,
"loss": 0.2092,
"reward": -0.28063105791807175,
"reward_std": 0.29903180059045553,
"rewards/cosine_scaled_reward": -0.28614887595176697,
"rewards/format_reward": 0.291666679084301,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2359.2500915527344,
"epoch": 0.0017142857142857142,
"grad_norm": 0.03820963203907013,
"kl": 0.00048732757568359375,
"learning_rate": 4e-08,
"loss": 0.1661,
"reward": 0.3625979460775852,
"reward_std": 0.7691465243697166,
"rewards/cosine_scaled_reward": -0.11036771535873413,
"rewards/format_reward": 0.5833333544433117,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2455.5000610351562,
"epoch": 0.002285714285714286,
"grad_norm": 0.01873675175011158,
"kl": 0.00042629241943359375,
"learning_rate": 6e-08,
"loss": 0.0368,
"reward": 0.42465633153915405,
"reward_std": 0.6839377954602242,
"rewards/cosine_scaled_reward": -0.12100516259670258,
"rewards/format_reward": 0.6666666679084301,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2971.2916870117188,
"epoch": 0.002857142857142857,
"grad_norm": 0.01593288779258728,
"kl": 0.0005702972412109375,
"learning_rate": 8e-08,
"loss": 0.0526,
"reward": -0.45133184641599655,
"reward_std": 0.1987809967249632,
"rewards/cosine_scaled_reward": -0.3506659045815468,
"rewards/format_reward": 0.25,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2659.4583740234375,
"epoch": 0.0034285714285714284,
"grad_norm": 0.016305305063724518,
"kl": 0.0003809928894042969,
"learning_rate": 1e-07,
"loss": 0.0749,
"reward": -0.20897246897220612,
"reward_std": 0.22619805298745632,
"rewards/cosine_scaled_reward": -0.27115290239453316,
"rewards/format_reward": 0.3333333358168602,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 3150.0000610351562,
"epoch": 0.004,
"grad_norm": 0.01786160096526146,
"kl": 0.00041294097900390625,
"learning_rate": 1.2e-07,
"loss": 0.1274,
"reward": 0.03739733062684536,
"reward_std": 0.6221467964351177,
"rewards/cosine_scaled_reward": -0.12713466212153435,
"rewards/format_reward": 0.2916666753590107,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 2930.75,
"epoch": 0.004571428571428572,
"grad_norm": 0.013749959878623486,
"kl": 0.0005197525024414062,
"learning_rate": 1.4e-07,
"loss": -0.0105,
"reward": -0.5632898807525635,
"reward_std": 0.14319632947444916,
"rewards/cosine_scaled_reward": -0.40664494782686234,
"rewards/format_reward": 0.25,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2527.7083435058594,
"epoch": 0.005142857142857143,
"grad_norm": 0.01508411392569542,
"kl": 0.0005040168762207031,
"learning_rate": 1.6e-07,
"loss": -0.0156,
"reward": 0.4341657906770706,
"reward_std": 0.46268418058753014,
"rewards/cosine_scaled_reward": -0.03291710093617439,
"rewards/format_reward": 0.5,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2895.4584350585938,
"epoch": 0.005714285714285714,
"grad_norm": 0.024773668497800827,
"kl": 0.00039124488830566406,
"learning_rate": 1.8e-07,
"loss": 0.1541,
"reward": 0.5330872263293713,
"reward_std": 1.0390121936798096,
"rewards/cosine_scaled_reward": 0.016543611884117126,
"rewards/format_reward": 0.5000000186264515,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2230.2916870117188,
"epoch": 0.006285714285714286,
"grad_norm": 0.02818623185157776,
"kl": 0.0006093978881835938,
"learning_rate": 2e-07,
"loss": 0.1588,
"reward": -0.14730040915310383,
"reward_std": 0.23369846679270267,
"rewards/cosine_scaled_reward": -0.34448356181383133,
"rewards/format_reward": 0.541666679084301,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2591.7083740234375,
"epoch": 0.006857142857142857,
"grad_norm": 0.02190236933529377,
"kl": 0.0006151199340820312,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0329,
"reward": 0.48731680028140545,
"reward_std": 0.8824571967124939,
"rewards/cosine_scaled_reward": -0.0688415989279747,
"rewards/format_reward": 0.6250000074505806,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2778.3334197998047,
"epoch": 0.0074285714285714285,
"grad_norm": 0.015030866488814354,
"kl": 0.0003528594970703125,
"learning_rate": 2.4e-07,
"loss": 0.1056,
"reward": 0.6901360005140305,
"reward_std": 0.84443748742342,
"rewards/cosine_scaled_reward": 0.07423467561602592,
"rewards/format_reward": 0.541666679084301,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2315.8333587646484,
"epoch": 0.008,
"grad_norm": 0.013019426725804806,
"kl": 0.0004930496215820312,
"learning_rate": 2.6e-07,
"loss": 0.001,
"reward": 1.1444866992533207,
"reward_std": 0.720286563038826,
"rewards/cosine_scaled_reward": 0.2180766798555851,
"rewards/format_reward": 0.7083333395421505,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2382.000030517578,
"epoch": 0.008571428571428572,
"grad_norm": 0.02887255884706974,
"kl": 0.00041961669921875,
"learning_rate": 2.8e-07,
"loss": -0.0308,
"reward": 0.2332791006192565,
"reward_std": 0.48609885200858116,
"rewards/cosine_scaled_reward": -0.21669380273669958,
"rewards/format_reward": 0.6666666716337204,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 3069.4583740234375,
"epoch": 0.009142857142857144,
"grad_norm": 0.018421335145831108,
"kl": 0.000476837158203125,
"learning_rate": 3e-07,
"loss": 0.0859,
"reward": 0.2359318658709526,
"reward_std": 0.6105321571230888,
"rewards/cosine_scaled_reward": -0.0695340558886528,
"rewards/format_reward": 0.3750000149011612,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2575.5833435058594,
"epoch": 0.009714285714285713,
"grad_norm": 0.03597598895430565,
"kl": 0.0007686614990234375,
"learning_rate": 3.2e-07,
"loss": 0.2179,
"reward": -0.09196203667670488,
"reward_std": 0.5380833484232426,
"rewards/cosine_scaled_reward": -0.2334810234606266,
"rewards/format_reward": 0.3750000149011612,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3274.9583740234375,
"epoch": 0.010285714285714285,
"grad_norm": 0.012147138826549053,
"kl": 0.0004515647888183594,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0625,
"reward": -0.1761876866221428,
"reward_std": 0.6809441670775414,
"rewards/cosine_scaled_reward": -0.2339271828532219,
"rewards/format_reward": 0.2916666679084301,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2882.2083435058594,
"epoch": 0.010857142857142857,
"grad_norm": 0.026394149288535118,
"kl": 0.0004787445068359375,
"learning_rate": 3.6e-07,
"loss": -0.0461,
"reward": 0.3346722051501274,
"reward_std": 0.3912115804851055,
"rewards/cosine_scaled_reward": -0.02016391232609749,
"rewards/format_reward": 0.3750000037252903,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1633.5833435058594,
"epoch": 0.011428571428571429,
"grad_norm": 0.01967485062777996,
"kl": 0.0003612041473388672,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0766,
"reward": 0.638333223760128,
"reward_std": 0.8739089630544186,
"rewards/cosine_scaled_reward": -0.0975000774487853,
"rewards/format_reward": 0.8333333358168602,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1705.2916717529297,
"epoch": 0.012,
"grad_norm": 0.052289288491010666,
"kl": 0.0007824897766113281,
"learning_rate": 4e-07,
"loss": 0.4594,
"reward": 0.13058341294527054,
"reward_std": 0.39700106158852577,
"rewards/cosine_scaled_reward": -0.2680416405200958,
"rewards/format_reward": 0.6666666865348816,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 3156.9166870117188,
"epoch": 0.012571428571428572,
"grad_norm": 0.016474798321723938,
"kl": 0.00043773651123046875,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0935,
"reward": 1.028728973120451,
"reward_std": 1.6666590571403503,
"rewards/cosine_scaled_reward": 0.26436448842287064,
"rewards/format_reward": 0.5000000149011612,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2439.041748046875,
"epoch": 0.013142857142857144,
"grad_norm": 0.019004346802830696,
"kl": 0.0005197525024414062,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0789,
"reward": 0.5590320900082588,
"reward_std": 1.0339802950620651,
"rewards/cosine_scaled_reward": -0.03298397921025753,
"rewards/format_reward": 0.6250000149011612,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2758.0833740234375,
"epoch": 0.013714285714285714,
"grad_norm": 0.017480166628956795,
"kl": 0.0005626678466796875,
"learning_rate": 4.6e-07,
"loss": -0.1518,
"reward": 0.3504378944635391,
"reward_std": 0.513374675065279,
"rewards/cosine_scaled_reward": -0.09561440348625183,
"rewards/format_reward": 0.5416666679084301,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1633.2083587646484,
"epoch": 0.014285714285714285,
"grad_norm": 0.03595641627907753,
"kl": 0.00029969215393066406,
"learning_rate": 4.8e-07,
"loss": 0.0377,
"reward": 0.6530582755804062,
"reward_std": 0.629613857716322,
"rewards/cosine_scaled_reward": -0.04847088688984513,
"rewards/format_reward": 0.75,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2597.0416870117188,
"epoch": 0.014857142857142857,
"grad_norm": 0.01941561885178089,
"kl": 0.0005736351013183594,
"learning_rate": 5e-07,
"loss": 0.0753,
"reward": -0.07398717105388641,
"reward_std": 0.7024243678897619,
"rewards/cosine_scaled_reward": -0.2453269399702549,
"rewards/format_reward": 0.4166666716337204,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3365.5833740234375,
"epoch": 0.015428571428571429,
"grad_norm": 0.015174021013081074,
"kl": 0.00045490264892578125,
"learning_rate": 5.2e-07,
"loss": 0.1186,
"reward": -0.35242245346307755,
"reward_std": 0.28800780698657036,
"rewards/cosine_scaled_reward": -0.21787790581583977,
"rewards/format_reward": 0.0833333358168602,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2502.5833740234375,
"epoch": 0.016,
"grad_norm": 0.026485657319426537,
"kl": 0.0006246566772460938,
"learning_rate": 5.4e-07,
"loss": 0.0892,
"reward": 0.14434174199413974,
"reward_std": 0.6618244834244251,
"rewards/cosine_scaled_reward": -0.17782913893461227,
"rewards/format_reward": 0.5000000074505806,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2873.0,
"epoch": 0.01657142857142857,
"grad_norm": 0.015148441307246685,
"kl": 0.0004634857177734375,
"learning_rate": 5.6e-07,
"loss": -0.009,
"reward": -0.17739348113536835,
"reward_std": 0.48768409341573715,
"rewards/cosine_scaled_reward": -0.21369674568995833,
"rewards/format_reward": 0.25,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 3555.2083740234375,
"epoch": 0.017142857142857144,
"grad_norm": 0.011238854378461838,
"kl": 0.00047588348388671875,
"learning_rate": 5.8e-07,
"loss": 0.0107,
"reward": -0.07929126173257828,
"reward_std": 0.8475685454905033,
"rewards/cosine_scaled_reward": -0.1021456066519022,
"rewards/format_reward": 0.1250000037252903,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2528.625030517578,
"epoch": 0.017714285714285714,
"grad_norm": 0.06202416494488716,
"kl": 0.0005381107330322266,
"learning_rate": 6e-07,
"loss": 0.2637,
"reward": -0.021039772778749466,
"reward_std": 0.860994964838028,
"rewards/cosine_scaled_reward": -0.21885321522131562,
"rewards/format_reward": 0.4166666679084301,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2219.9166870117188,
"epoch": 0.018285714285714287,
"grad_norm": 0.034875061362981796,
"kl": 0.0004260540008544922,
"learning_rate": 6.2e-07,
"loss": 0.132,
"reward": 0.3424109169282019,
"reward_std": 0.8896235823631287,
"rewards/cosine_scaled_reward": -0.09962787851691246,
"rewards/format_reward": 0.541666679084301,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 3474.4583740234375,
"epoch": 0.018857142857142857,
"grad_norm": 0.016711147502064705,
"kl": 0.0005655288696289062,
"learning_rate": 6.4e-07,
"loss": 0.0639,
"reward": -0.47737591713666916,
"reward_std": 0.1697257850319147,
"rewards/cosine_scaled_reward": -0.2595212906599045,
"rewards/format_reward": 0.0416666679084301,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 3159.6666870117188,
"epoch": 0.019428571428571427,
"grad_norm": 0.014976400882005692,
"kl": 0.0004849433898925781,
"learning_rate": 6.6e-07,
"loss": 0.0609,
"reward": 0.015222817659378052,
"reward_std": 0.701315013691783,
"rewards/cosine_scaled_reward": -0.17988859117031097,
"rewards/format_reward": 0.3750000111758709,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3254.7916870117188,
"epoch": 0.02,
"grad_norm": 0.04339271038770676,
"kl": 0.0004878044128417969,
"learning_rate": 6.800000000000001e-07,
"loss": 0.1246,
"reward": -0.6728095263242722,
"reward_std": 0.290258064866066,
"rewards/cosine_scaled_reward": -0.41973811388015747,
"rewards/format_reward": 0.1666666679084301,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2359.500030517578,
"epoch": 0.02057142857142857,
"grad_norm": 0.031043976545333862,
"kl": 0.0005769729614257812,
"learning_rate": 7e-07,
"loss": 0.1315,
"reward": 0.5186025649309158,
"reward_std": 0.7595919780433178,
"rewards/cosine_scaled_reward": -0.011532071977853775,
"rewards/format_reward": 0.5416666716337204,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3505.2916870117188,
"epoch": 0.021142857142857144,
"grad_norm": 0.011476296000182629,
"kl": 0.00041484832763671875,
"learning_rate": 7.2e-07,
"loss": 0.0309,
"reward": 0.1381697803735733,
"reward_std": 0.9006250277161598,
"rewards/cosine_scaled_reward": -0.014248451218008995,
"rewards/format_reward": 0.1666666679084301,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3085.5001220703125,
"epoch": 0.021714285714285714,
"grad_norm": 0.017438944429159164,
"kl": 0.000644683837890625,
"learning_rate": 7.4e-07,
"loss": 0.0651,
"reward": 0.5748194381594658,
"reward_std": 1.0931934267282486,
"rewards/cosine_scaled_reward": 0.016576368361711502,
"rewards/format_reward": 0.541666679084301,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 2326.2084045410156,
"epoch": 0.022285714285714287,
"grad_norm": 0.01982088014483452,
"kl": 0.0006146430969238281,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0144,
"reward": 0.7864310666918755,
"reward_std": 0.7996397092938423,
"rewards/cosine_scaled_reward": 0.0807155417278409,
"rewards/format_reward": 0.6250000149011612,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 3484.5000610351562,
"epoch": 0.022857142857142857,
"grad_norm": 0.01842389442026615,
"kl": 0.0004572868347167969,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0465,
"reward": -0.408206457272172,
"reward_std": 0.35211328975856304,
"rewards/cosine_scaled_reward": -0.2874365597963333,
"rewards/format_reward": 0.1666666679084301,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2935.875,
"epoch": 0.023428571428571427,
"grad_norm": 0.016613401472568512,
"kl": 0.00041675567626953125,
"learning_rate": 8e-07,
"loss": 0.0661,
"reward": 0.06390659511089325,
"reward_std": 0.4249320328235626,
"rewards/cosine_scaled_reward": -0.11388003081083298,
"rewards/format_reward": 0.2916666679084301,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2428.9583435058594,
"epoch": 0.024,
"grad_norm": 0.021880364045500755,
"kl": 0.00039958953857421875,
"learning_rate": 8.199999999999999e-07,
"loss": -0.1435,
"reward": 0.11024168506264687,
"reward_std": 0.32544056698679924,
"rewards/cosine_scaled_reward": -0.19487916305661201,
"rewards/format_reward": 0.5,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3448.7083740234375,
"epoch": 0.02457142857142857,
"grad_norm": 0.016641981899738312,
"kl": 0.0004696846008300781,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0654,
"reward": -0.2966616526246071,
"reward_std": 0.5851836632937193,
"rewards/cosine_scaled_reward": -0.2108308244496584,
"rewards/format_reward": 0.1250000037252903,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 3186.4584350585938,
"epoch": 0.025142857142857144,
"grad_norm": 0.025406604632735252,
"kl": 0.0004544258117675781,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0373,
"reward": 0.24428799748420715,
"reward_std": 0.6496499851346016,
"rewards/cosine_scaled_reward": -0.08618932589888573,
"rewards/format_reward": 0.4166666679084301,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3035.8750610351562,
"epoch": 0.025714285714285714,
"grad_norm": 0.022738995030522346,
"kl": 0.00051116943359375,
"learning_rate": 8.799999999999999e-07,
"loss": 0.1805,
"reward": -0.13624755293130875,
"reward_std": 0.8101175278425217,
"rewards/cosine_scaled_reward": -0.21395711041986942,
"rewards/format_reward": 0.2916666753590107,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 3030.5416870117188,
"epoch": 0.026285714285714287,
"grad_norm": 0.015439880080521107,
"kl": 0.0004506111145019531,
"learning_rate": 9e-07,
"loss": 0.0927,
"reward": 0.9518533200025558,
"reward_std": 0.8967212848365307,
"rewards/cosine_scaled_reward": 0.20509332790970802,
"rewards/format_reward": 0.5416666716337204,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2392.0833587646484,
"epoch": 0.026857142857142857,
"grad_norm": 0.016110830008983612,
"kl": 0.00034999847412109375,
"learning_rate": 9.2e-07,
"loss": 0.0866,
"reward": 0.8118329793214798,
"reward_std": 0.6570356953889132,
"rewards/cosine_scaled_reward": 0.11424979940056801,
"rewards/format_reward": 0.5833333432674408,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2063.5833435058594,
"epoch": 0.027428571428571427,
"grad_norm": 0.02758549526333809,
"kl": 0.0005931854248046875,
"learning_rate": 9.399999999999999e-07,
"loss": 0.09,
"reward": 0.6152995973825455,
"reward_std": 0.7233676761388779,
"rewards/cosine_scaled_reward": -0.025683537125587463,
"rewards/format_reward": 0.6666666865348816,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2459.9583740234375,
"epoch": 0.028,
"grad_norm": 0.02223382703959942,
"kl": 0.0005383491516113281,
"learning_rate": 9.6e-07,
"loss": 0.0228,
"reward": 0.4767443835735321,
"reward_std": 0.6502058878540993,
"rewards/cosine_scaled_reward": 0.009205527603626251,
"rewards/format_reward": 0.4583333432674408,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 3291.4166870117188,
"epoch": 0.02857142857142857,
"grad_norm": 0.01337195374071598,
"kl": 0.0004801750183105469,
"learning_rate": 9.8e-07,
"loss": -0.0675,
"reward": 0.18172279000282288,
"reward_std": 0.23446273803710938,
"rewards/cosine_scaled_reward": -0.034138597548007965,
"rewards/format_reward": 0.25,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2837.3333435058594,
"epoch": 0.029142857142857144,
"grad_norm": 0.0264517143368721,
"kl": 0.00040531158447265625,
"learning_rate": 1e-06,
"loss": 0.0691,
"reward": 0.0855257548391819,
"reward_std": 0.5013507194817066,
"rewards/cosine_scaled_reward": -0.10307044349610806,
"rewards/format_reward": 0.2916666679084301,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3550.125,
"epoch": 0.029714285714285714,
"grad_norm": 0.01194742787629366,
"kl": 0.0003275871276855469,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0148,
"reward": -0.3245684579014778,
"reward_std": 0.8235821425914764,
"rewards/cosine_scaled_reward": -0.22478425258304924,
"rewards/format_reward": 0.1250000037252903,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2964.2083435058594,
"epoch": 0.030285714285714287,
"grad_norm": 0.01602712646126747,
"kl": 0.00046443939208984375,
"learning_rate": 9.999561358041868e-07,
"loss": -0.0535,
"reward": 0.2959368694573641,
"reward_std": 0.18181271478533745,
"rewards/cosine_scaled_reward": 0.022968419827520847,
"rewards/format_reward": 0.25,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3124.0,
"epoch": 0.030857142857142857,
"grad_norm": 0.017948586493730545,
"kl": 0.0005545616149902344,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0975,
"reward": -0.4570858801016584,
"reward_std": 0.25594667345285416,
"rewards/cosine_scaled_reward": -0.3327096067368984,
"rewards/format_reward": 0.2083333432674408,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3167.3750610351562,
"epoch": 0.03142857142857143,
"grad_norm": 0.014247610233724117,
"kl": 0.00048542022705078125,
"learning_rate": 9.998245517681593e-07,
"loss": -0.0067,
"reward": 0.13806065171957016,
"reward_std": 0.8922518789768219,
"rewards/cosine_scaled_reward": -0.07680301181972027,
"rewards/format_reward": 0.291666679084301,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 3063.3750610351562,
"epoch": 0.032,
"grad_norm": 0.01854483038187027,
"kl": 0.0007352828979492188,
"learning_rate": 9.997258721585931e-07,
"loss": 0.16,
"reward": -0.01539595052599907,
"reward_std": 0.6964320801198483,
"rewards/cosine_scaled_reward": -0.15353131107985973,
"rewards/format_reward": 0.291666679084301,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2957.666717529297,
"epoch": 0.03257142857142857,
"grad_norm": 0.034658752381801605,
"kl": 0.00042319297790527344,
"learning_rate": 9.996052735444862e-07,
"loss": 0.1077,
"reward": -0.11001442139968276,
"reward_std": 0.6499116308987141,
"rewards/cosine_scaled_reward": -0.2216738946735859,
"rewards/format_reward": 0.3333333432674408,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3472.9583740234375,
"epoch": 0.03314285714285714,
"grad_norm": 0.011575725860893726,
"kl": 0.0004363059997558594,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0248,
"reward": -0.1939047873020172,
"reward_std": 0.8678329139947891,
"rewards/cosine_scaled_reward": -0.1802857331931591,
"rewards/format_reward": 0.1666666679084301,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 2450.541748046875,
"epoch": 0.03371428571428572,
"grad_norm": 0.02538195252418518,
"kl": 0.0005178451538085938,
"learning_rate": 9.992983438818915e-07,
"loss": 0.2097,
"reward": 0.2084958329796791,
"reward_std": 0.7872938960790634,
"rewards/cosine_scaled_reward": -0.18741872906684875,
"rewards/format_reward": 0.583333358168602,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3508.0833740234375,
"epoch": 0.03428571428571429,
"grad_norm": 0.018474752083420753,
"kl": 0.00039386749267578125,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0386,
"reward": -0.1972892191261053,
"reward_std": 0.7005422301590443,
"rewards/cosine_scaled_reward": -0.18197794491425157,
"rewards/format_reward": 0.1666666679084301,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2241.916702270508,
"epoch": 0.03485714285714286,
"grad_norm": 0.027353286743164062,
"kl": 0.0003466606140136719,
"learning_rate": 9.989038226169207e-07,
"loss": 0.2232,
"reward": 0.8048897292464972,
"reward_std": 1.2643243670463562,
"rewards/cosine_scaled_reward": 0.0899448562413454,
"rewards/format_reward": 0.6250000111758709,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2604.1666870117188,
"epoch": 0.03542857142857143,
"grad_norm": 0.022513169795274734,
"kl": 0.0006260871887207031,
"learning_rate": 9.98673738502114e-07,
"loss": -0.0144,
"reward": -0.23419425124302506,
"reward_std": 0.3903382420539856,
"rewards/cosine_scaled_reward": -0.3254304677248001,
"rewards/format_reward": 0.4166666716337204,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3552.2500610351562,
"epoch": 0.036,
"grad_norm": 0.01291267666965723,
"kl": 0.0005128383636474609,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0182,
"reward": -0.18873221427202225,
"reward_std": 0.5512382872402668,
"rewards/cosine_scaled_reward": -0.1360327743459493,
"rewards/format_reward": 0.0833333358168602,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 3535.8333740234375,
"epoch": 0.036571428571428574,
"grad_norm": 0.011109953746199608,
"kl": 0.00037360191345214844,
"learning_rate": 9.981479793771866e-07,
"loss": 0.01,
"reward": -0.44818597845733166,
"reward_std": 0.25114433094859123,
"rewards/cosine_scaled_reward": -0.2865929929539561,
"rewards/format_reward": 0.125,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3338.5416870117188,
"epoch": 0.037142857142857144,
"grad_norm": 0.018163377419114113,
"kl": 0.0006074905395507812,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0984,
"reward": -0.24769322806969285,
"reward_std": 0.49852345883846283,
"rewards/cosine_scaled_reward": -0.20717995800077915,
"rewards/format_reward": 0.1666666716337204,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2866.3334045410156,
"epoch": 0.037714285714285714,
"grad_norm": 0.03914084658026695,
"kl": 0.0003857612609863281,
"learning_rate": 9.975348529157229e-07,
"loss": 0.1522,
"reward": 0.5251417439430952,
"reward_std": 1.087289422750473,
"rewards/cosine_scaled_reward": -0.00826246291399002,
"rewards/format_reward": 0.541666679084301,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2491.2917098999023,
"epoch": 0.038285714285714284,
"grad_norm": 0.03254377841949463,
"kl": 0.0004949569702148438,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0968,
"reward": 0.676957952324301,
"reward_std": 0.9317026361823082,
"rewards/cosine_scaled_reward": 0.046812308952212334,
"rewards/format_reward": 0.5833333432674408,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 3545.0416870117188,
"epoch": 0.038857142857142854,
"grad_norm": 0.010356022976338863,
"kl": 0.0002932548522949219,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0228,
"reward": -0.4004784324206412,
"reward_std": 0.4494715537875891,
"rewards/cosine_scaled_reward": -0.22107255086302757,
"rewards/format_reward": 0.0416666679084301,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2902.666748046875,
"epoch": 0.03942857142857143,
"grad_norm": 0.022468766197562218,
"kl": 0.0005145072937011719,
"learning_rate": 9.964516155915151e-07,
"loss": 0.039,
"reward": -0.08480488415807486,
"reward_std": 0.5629407912492752,
"rewards/cosine_scaled_reward": -0.292402446269989,
"rewards/format_reward": 0.5000000149011612,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2750.3334045410156,
"epoch": 0.04,
"grad_norm": 0.04673980176448822,
"kl": 0.0007410049438476562,
"learning_rate": 9.960469931131936e-07,
"loss": 0.2347,
"reward": 0.22281695902347565,
"reward_std": 0.6946056261658669,
"rewards/cosine_scaled_reward": -0.07609154284000397,
"rewards/format_reward": 0.3750000037252903,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2139.7083892822266,
"epoch": 0.04057142857142857,
"grad_norm": 0.022997990250587463,
"kl": 0.00041031837463378906,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0503,
"reward": 1.0238905474543571,
"reward_std": 0.3659038320183754,
"rewards/cosine_scaled_reward": 0.19944527000188828,
"rewards/format_reward": 0.625,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 3038.5833435058594,
"epoch": 0.04114285714285714,
"grad_norm": 0.015527274459600449,
"kl": 0.0004353523254394531,
"learning_rate": 9.951725498333448e-07,
"loss": -0.0403,
"reward": 0.057796329259872437,
"reward_std": 0.43705910444259644,
"rewards/cosine_scaled_reward": -0.09610185027122498,
"rewards/format_reward": 0.25,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2542.416717529297,
"epoch": 0.04171428571428572,
"grad_norm": 0.031132198870182037,
"kl": 0.0004696846008300781,
"learning_rate": 9.947027716509488e-07,
"loss": 0.087,
"reward": 0.4634501487016678,
"reward_std": 0.6224832870066166,
"rewards/cosine_scaled_reward": -0.0807749442756176,
"rewards/format_reward": 0.6250000149011612,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2811.375030517578,
"epoch": 0.04228571428571429,
"grad_norm": 0.017915023490786552,
"kl": 0.0003814697265625,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0416,
"reward": 0.5657302290201187,
"reward_std": 0.4264005981385708,
"rewards/cosine_scaled_reward": 0.03286512568593025,
"rewards/format_reward": 0.5,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 2507.0000610351562,
"epoch": 0.04285714285714286,
"grad_norm": 0.04219174385070801,
"kl": 0.000507354736328125,
"learning_rate": 9.93698216681727e-07,
"loss": 0.1314,
"reward": 0.33457985022687353,
"reward_std": 0.9787873476743698,
"rewards/cosine_scaled_reward": -0.08271008729934692,
"rewards/format_reward": 0.5000000037252903,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3257.2083740234375,
"epoch": 0.04342857142857143,
"grad_norm": 0.016020679846405983,
"kl": 0.0005130767822265625,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0017,
"reward": 0.03311159461736679,
"reward_std": 0.8149497471749783,
"rewards/cosine_scaled_reward": -0.170944195240736,
"rewards/format_reward": 0.3750000111758709,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 2980.9584350585938,
"epoch": 0.044,
"grad_norm": 0.01863541640341282,
"kl": 0.0004639625549316406,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0415,
"reward": 0.25933826714754105,
"reward_std": 0.6607147231698036,
"rewards/cosine_scaled_reward": -0.09949754178524017,
"rewards/format_reward": 0.4583333358168602,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 2690.5000915527344,
"epoch": 0.044571428571428574,
"grad_norm": 0.044503167271614075,
"kl": 0.0007963180541992188,
"learning_rate": 9.9202926282791e-07,
"loss": 0.2539,
"reward": 0.40623846650123596,
"reward_std": 0.999249055981636,
"rewards/cosine_scaled_reward": -0.026047438383102417,
"rewards/format_reward": 0.4583333507180214,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2197.500030517578,
"epoch": 0.045142857142857144,
"grad_norm": 0.03637674078345299,
"kl": 0.0004515647888183594,
"learning_rate": 9.91429819907136e-07,
"loss": 0.245,
"reward": 0.35764368530362844,
"reward_std": 0.7761036828160286,
"rewards/cosine_scaled_reward": -0.13367816805839539,
"rewards/format_reward": 0.6250000074505806,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3167.7500610351562,
"epoch": 0.045714285714285714,
"grad_norm": 0.016760632395744324,
"kl": 0.0005090236663818359,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0648,
"reward": 0.6552756018936634,
"reward_std": 0.888429120182991,
"rewards/cosine_scaled_reward": 0.11930444650352001,
"rewards/format_reward": 0.4166666828095913,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3178.4583740234375,
"epoch": 0.046285714285714284,
"grad_norm": 0.018774185329675674,
"kl": 0.0004324913024902344,
"learning_rate": 9.901664203302124e-07,
"loss": 0.096,
"reward": 0.3727112878113985,
"reward_std": 0.6679159682244062,
"rewards/cosine_scaled_reward": 0.019688975531607866,
"rewards/format_reward": 0.3333333432674408,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 3135.25,
"epoch": 0.046857142857142854,
"grad_norm": 0.014164636842906475,
"kl": 0.0003552436828613281,
"learning_rate": 9.895025252503755e-07,
"loss": -0.0475,
"reward": 0.1692640781402588,
"reward_std": 0.2734921835362911,
"rewards/cosine_scaled_reward": -0.0403679758310318,
"rewards/format_reward": 0.25,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1449.3750762939453,
"epoch": 0.04742857142857143,
"grad_norm": 0.027206717059016228,
"kl": 0.0006706714630126953,
"learning_rate": 9.888172094375033e-07,
"loss": 0.1933,
"reward": 0.888194240629673,
"reward_std": 0.8644632250070572,
"rewards/cosine_scaled_reward": 0.027430432848632336,
"rewards/format_reward": 0.8333333432674408,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2289.875030517578,
"epoch": 0.048,
"grad_norm": 0.018596787005662918,
"kl": 0.0004329681396484375,
"learning_rate": 9.881105062929221e-07,
"loss": -0.0513,
"reward": 0.2941868454217911,
"reward_std": 0.6860168538987637,
"rewards/cosine_scaled_reward": -0.18623991776257753,
"rewards/format_reward": 0.6666666865348816,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2823.2083435058594,
"epoch": 0.04857142857142857,
"grad_norm": 0.01691795513033867,
"kl": 0.0005006790161132812,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0365,
"reward": -0.15082548558712006,
"reward_std": 0.26336194574832916,
"rewards/cosine_scaled_reward": -0.24207941442728043,
"rewards/format_reward": 0.3333333358168602,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2668.7083740234375,
"epoch": 0.04914285714285714,
"grad_norm": 0.014490959234535694,
"kl": 0.0003361701965332031,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0614,
"reward": 1.0604897737503052,
"reward_std": 1.551704853773117,
"rewards/cosine_scaled_reward": 0.19691153056919575,
"rewards/format_reward": 0.6666666865348816,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2975.3750610351562,
"epoch": 0.04971428571428571,
"grad_norm": 0.01852068305015564,
"kl": 0.0004734992980957031,
"learning_rate": 9.85862422507884e-07,
"loss": -0.0494,
"reward": 0.4187099374830723,
"reward_std": 0.7260430231690407,
"rewards/cosine_scaled_reward": 0.021854941733181477,
"rewards/format_reward": 0.3750000037252903,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2801.7501220703125,
"epoch": 0.05028571428571429,
"grad_norm": 0.01830633357167244,
"kl": 0.0005750656127929688,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0263,
"reward": 0.018878452479839325,
"reward_std": 0.6697305515408516,
"rewards/cosine_scaled_reward": -0.2822274398058653,
"rewards/format_reward": 0.5833333395421505,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2697.416717529297,
"epoch": 0.05085714285714286,
"grad_norm": 0.023486582562327385,
"kl": 0.0004010200500488281,
"learning_rate": 9.8425742251254e-07,
"loss": -0.0038,
"reward": 0.29166828095912933,
"reward_std": 0.7786018922924995,
"rewards/cosine_scaled_reward": -0.08333254605531693,
"rewards/format_reward": 0.4583333395421505,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2538.0833740234375,
"epoch": 0.05142857142857143,
"grad_norm": 0.010524451732635498,
"kl": 0.0002994537353515625,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0346,
"reward": 0.7873217761516571,
"reward_std": 0.763067714869976,
"rewards/cosine_scaled_reward": 0.08116088062524796,
"rewards/format_reward": 0.625,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3094.8333740234375,
"epoch": 0.052,
"grad_norm": 0.03540240228176117,
"kl": 0.0004677772521972656,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0673,
"reward": 0.1670377204718534,
"reward_std": 0.6299600079655647,
"rewards/cosine_scaled_reward": -0.0831478089094162,
"rewards/format_reward": 0.3333333432674408,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2744.541717529297,
"epoch": 0.052571428571428575,
"grad_norm": 0.03803950175642967,
"kl": 0.0004138946533203125,
"learning_rate": 9.816912885430258e-07,
"loss": 0.1363,
"reward": 0.2968802750110626,
"reward_std": 0.4654075037688017,
"rewards/cosine_scaled_reward": 0.002606801688671112,
"rewards/format_reward": 0.2916666679084301,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2166.9583435058594,
"epoch": 0.053142857142857144,
"grad_norm": 0.047334711998701096,
"kl": 0.000446319580078125,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0582,
"reward": 0.20845970511436462,
"reward_std": 0.3487181942909956,
"rewards/cosine_scaled_reward": -0.14577015489339828,
"rewards/format_reward": 0.5,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2899.2083740234375,
"epoch": 0.053714285714285714,
"grad_norm": 0.013924806378781796,
"kl": 0.00037860870361328125,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0477,
"reward": 0.4423799216747284,
"reward_std": 0.8836686909198761,
"rewards/cosine_scaled_reward": -0.0913100466132164,
"rewards/format_reward": 0.6250000260770321,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2209.4583740234375,
"epoch": 0.054285714285714284,
"grad_norm": 0.0459674671292305,
"kl": 0.0005049705505371094,
"learning_rate": 9.78935800506826e-07,
"loss": 0.2862,
"reward": 0.9317682832479477,
"reward_std": 0.7728890106081963,
"rewards/cosine_scaled_reward": 0.07005079090595245,
"rewards/format_reward": 0.791666679084301,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3184.416748046875,
"epoch": 0.054857142857142854,
"grad_norm": 0.017193680629134178,
"kl": 0.00043773651123046875,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0105,
"reward": 0.38489115983247757,
"reward_std": 1.058574389666319,
"rewards/cosine_scaled_reward": -0.015887772024143487,
"rewards/format_reward": 0.4166666828095913,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2542.9583740234375,
"epoch": 0.05542857142857143,
"grad_norm": 0.03288557752966881,
"kl": 0.0006394386291503906,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0173,
"reward": -0.20064683258533478,
"reward_std": 0.43674986250698566,
"rewards/cosine_scaled_reward": -0.32949007861316204,
"rewards/format_reward": 0.4583333432674408,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 2007.2500457763672,
"epoch": 0.056,
"grad_norm": 0.020400483161211014,
"kl": 0.0003914833068847656,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0489,
"reward": 1.0779699385166168,
"reward_std": 1.257737785577774,
"rewards/cosine_scaled_reward": 0.18481825292110443,
"rewards/format_reward": 0.7083333358168602,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2389.5833587646484,
"epoch": 0.05657142857142857,
"grad_norm": 0.019844507798552513,
"kl": 0.0003876686096191406,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0556,
"reward": 0.953456562012434,
"reward_std": 0.7421619556844234,
"rewards/cosine_scaled_reward": 0.2058949265629053,
"rewards/format_reward": 0.541666679084301,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2124.0416717529297,
"epoch": 0.05714285714285714,
"grad_norm": 0.030665883794426918,
"kl": 0.0005888938903808594,
"learning_rate": 9.739258537542835e-07,
"loss": -0.0126,
"reward": 0.47896020486950874,
"reward_std": 0.677450954914093,
"rewards/cosine_scaled_reward": -0.03135322220623493,
"rewards/format_reward": 0.5416666679084301,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 3004.041748046875,
"epoch": 0.05771428571428571,
"grad_norm": 0.04486413672566414,
"kl": 0.0004496574401855469,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0735,
"reward": 0.022397130727767944,
"reward_std": 0.7927646785974503,
"rewards/cosine_scaled_reward": -0.19713477417826653,
"rewards/format_reward": 0.416666679084301,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 3206.875,
"epoch": 0.05828571428571429,
"grad_norm": 0.035310786217451096,
"kl": 0.0003910064697265625,
"learning_rate": 9.717768952713511e-07,
"loss": 0.1437,
"reward": -0.08731867372989655,
"reward_std": 0.9576195627450943,
"rewards/cosine_scaled_reward": -0.16865937039256096,
"rewards/format_reward": 0.2500000037252903,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 3014.3750610351562,
"epoch": 0.05885714285714286,
"grad_norm": 0.031477976590394974,
"kl": 0.0004429817199707031,
"learning_rate": 9.706715543782064e-07,
"loss": 0.1671,
"reward": -0.22813038900494576,
"reward_std": 0.688251368701458,
"rewards/cosine_scaled_reward": -0.301565196365118,
"rewards/format_reward": 0.3750000074505806,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3124.25,
"epoch": 0.05942857142857143,
"grad_norm": 0.02040957659482956,
"kl": 0.0004067420959472656,
"learning_rate": 9.695457105469804e-07,
"loss": -0.0155,
"reward": -0.01127801463007927,
"reward_std": 0.39767561107873917,
"rewards/cosine_scaled_reward": -0.1306389942765236,
"rewards/format_reward": 0.25,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2859.416717529297,
"epoch": 0.06,
"grad_norm": 0.014855082146823406,
"kl": 0.00042057037353515625,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0882,
"reward": -0.13864438608288765,
"reward_std": 0.40612044744193554,
"rewards/cosine_scaled_reward": -0.23598887026309967,
"rewards/format_reward": 0.3333333358168602,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2882.0416717529297,
"epoch": 0.060571428571428575,
"grad_norm": 0.01957513391971588,
"kl": 0.00047016143798828125,
"learning_rate": 9.672327345550543e-07,
"loss": -0.0236,
"reward": 0.05208197236061096,
"reward_std": 0.6449095346033573,
"rewards/cosine_scaled_reward": -0.11979235336184502,
"rewards/format_reward": 0.2916666679084301,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2631.000045776367,
"epoch": 0.061142857142857145,
"grad_norm": 0.018131662160158157,
"kl": 0.0004868507385253906,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0708,
"reward": -0.09214365109801292,
"reward_std": 0.40418105013668537,
"rewards/cosine_scaled_reward": -0.2544051744043827,
"rewards/format_reward": 0.4166666716337204,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2191.2916717529297,
"epoch": 0.061714285714285715,
"grad_norm": 0.03655322641134262,
"kl": 0.0003566741943359375,
"learning_rate": 9.648384182148252e-07,
"loss": 0.1124,
"reward": 0.02219559997320175,
"reward_std": 0.20816783979535103,
"rewards/cosine_scaled_reward": -0.23890221491456032,
"rewards/format_reward": 0.5,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 3294.125,
"epoch": 0.062285714285714285,
"grad_norm": 0.02335178665816784,
"kl": 0.000438690185546875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.034,
"reward": -0.42653578519821167,
"reward_std": 0.2618470564484596,
"rewards/cosine_scaled_reward": -0.27576790004968643,
"rewards/format_reward": 0.125,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 3212.7916870117188,
"epoch": 0.06285714285714286,
"grad_norm": 0.013373509049415588,
"kl": 0.0003647804260253906,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0984,
"reward": -0.35255161160603166,
"reward_std": 0.34573741257190704,
"rewards/cosine_scaled_reward": -0.2804424799978733,
"rewards/format_reward": 0.2083333395421505,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2838.5833435058594,
"epoch": 0.06342857142857143,
"grad_norm": 0.02231280505657196,
"kl": 0.000438690185546875,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0041,
"reward": -0.08670877665281296,
"reward_std": 0.4552724286913872,
"rewards/cosine_scaled_reward": -0.16835440043359995,
"rewards/format_reward": 0.25,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 3045.6666870117188,
"epoch": 0.064,
"grad_norm": 0.01650678738951683,
"kl": 0.0003314018249511719,
"learning_rate": 9.598076473627796e-07,
"loss": -0.0241,
"reward": 0.5308302510529757,
"reward_std": 0.7812503390014172,
"rewards/cosine_scaled_reward": 0.07791512738913298,
"rewards/format_reward": 0.3750000037252903,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2850.4166870117188,
"epoch": 0.06457142857142857,
"grad_norm": 0.029017914086580276,
"kl": 0.00040984153747558594,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0898,
"reward": 0.07424558838829398,
"reward_std": 0.701502051204443,
"rewards/cosine_scaled_reward": -0.17121053859591484,
"rewards/format_reward": 0.4166666679084301,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 3156.1666870117188,
"epoch": 0.06514285714285714,
"grad_norm": 0.013486391864717007,
"kl": 0.00036525726318359375,
"learning_rate": 9.571721736097088e-07,
"loss": -0.0383,
"reward": -0.05461219698190689,
"reward_std": 0.5745192095637321,
"rewards/cosine_scaled_reward": -0.1939727613935247,
"rewards/format_reward": 0.3333333358168602,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2846.9583740234375,
"epoch": 0.06571428571428571,
"grad_norm": 0.01851780340075493,
"kl": 0.0004734992980957031,
"learning_rate": 9.55824636882301e-07,
"loss": 0.1023,
"reward": -0.03960709646344185,
"reward_std": 0.7411026880145073,
"rewards/cosine_scaled_reward": -0.18647022545337677,
"rewards/format_reward": 0.3333333358168602,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 3016.7083435058594,
"epoch": 0.06628571428571428,
"grad_norm": 0.012387678027153015,
"kl": 0.0004115104675292969,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0442,
"reward": -0.4454263895750046,
"reward_std": 0.34630244970321655,
"rewards/cosine_scaled_reward": -0.3477131985127926,
"rewards/format_reward": 0.25,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 3214.125,
"epoch": 0.06685714285714285,
"grad_norm": 0.010518810711801052,
"kl": 0.0004563331604003906,
"learning_rate": 9.530702921077358e-07,
"loss": -0.0368,
"reward": 0.24042409658432007,
"reward_std": 0.21273156255483627,
"rewards/cosine_scaled_reward": -0.004787934944033623,
"rewards/format_reward": 0.25,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2701.125045776367,
"epoch": 0.06742857142857143,
"grad_norm": 0.020669570192694664,
"kl": 0.00047397613525390625,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0176,
"reward": 0.029145129024982452,
"reward_std": 0.4890642985701561,
"rewards/cosine_scaled_reward": -0.15209410339593887,
"rewards/format_reward": 0.3333333358168602,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2052.625030517578,
"epoch": 0.068,
"grad_norm": 0.04713466763496399,
"kl": 0.0004968643188476562,
"learning_rate": 9.502373679810839e-07,
"loss": 0.2029,
"reward": 0.8692835792899132,
"reward_std": 0.5231802985072136,
"rewards/cosine_scaled_reward": 0.1429751217365265,
"rewards/format_reward": 0.5833333358168602,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 3555.3333740234375,
"epoch": 0.06857142857142857,
"grad_norm": 0.011495165526866913,
"kl": 0.0003578662872314453,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0078,
"reward": -0.3606860190629959,
"reward_std": 0.5378699135035276,
"rewards/cosine_scaled_reward": -0.24284303188323975,
"rewards/format_reward": 0.125,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 3037.0,
"epoch": 0.06914285714285714,
"grad_norm": 0.040560223162174225,
"kl": 0.00043773651123046875,
"learning_rate": 9.473264167865171e-07,
"loss": 0.1242,
"reward": -0.06026811897754669,
"reward_std": 0.48589139245450497,
"rewards/cosine_scaled_reward": -0.15513405948877335,
"rewards/format_reward": 0.25,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 2017.791748046875,
"epoch": 0.06971428571428571,
"grad_norm": 0.021992556750774384,
"kl": 0.0003094673156738281,
"learning_rate": 9.458418577899774e-07,
"loss": 0.2174,
"reward": 1.1798989064991474,
"reward_std": 0.7208777815103531,
"rewards/cosine_scaled_reward": 0.21494942158460617,
"rewards/format_reward": 0.7500000223517418,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2790.0833740234375,
"epoch": 0.07028571428571428,
"grad_norm": 0.016002673655748367,
"kl": 0.0004038810729980469,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0023,
"reward": 0.33917365968227386,
"reward_std": 1.0576607063412666,
"rewards/cosine_scaled_reward": -0.08041317760944366,
"rewards/format_reward": 0.5000000149011612,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 3338.7500610351562,
"epoch": 0.07085714285714285,
"grad_norm": 0.010752753354609013,
"kl": 0.00029778480529785156,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0352,
"reward": 0.3968821354210377,
"reward_std": 0.5420339666306973,
"rewards/cosine_scaled_reward": -0.009892286732792854,
"rewards/format_reward": 0.4166666865348816,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2333.291717529297,
"epoch": 0.07142857142857142,
"grad_norm": 0.05020369216799736,
"kl": 0.00043487548828125,
"learning_rate": 9.412727182773486e-07,
"loss": 0.1914,
"reward": 0.29357251059263945,
"reward_std": 0.5018073245882988,
"rewards/cosine_scaled_reward": -0.18654709309339523,
"rewards/format_reward": 0.6666666716337204,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1793.6666870117188,
"epoch": 0.072,
"grad_norm": 0.026897089555859566,
"kl": 0.0003814697265625,
"learning_rate": 9.397114317029974e-07,
"loss": 0.1009,
"reward": 0.33217477798461914,
"reward_std": 0.8002122193574905,
"rewards/cosine_scaled_reward": -0.20891261473298073,
"rewards/format_reward": 0.7500000111758709,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2812.0834045410156,
"epoch": 0.07257142857142856,
"grad_norm": 0.013029903173446655,
"kl": 0.0003662109375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0056,
"reward": 0.6684755804017186,
"reward_std": 0.9957198947668076,
"rewards/cosine_scaled_reward": 0.10507109388709068,
"rewards/format_reward": 0.4583333395421505,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 3381.7500610351562,
"epoch": 0.07314285714285715,
"grad_norm": 0.014719157479703426,
"kl": 0.0004086494445800781,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0262,
"reward": 0.14912248402833939,
"reward_std": 0.6112966164946556,
"rewards/cosine_scaled_reward": -0.0504387766122818,
"rewards/format_reward": 0.2500000111758709,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2489.6666870117188,
"epoch": 0.07371428571428572,
"grad_norm": 0.02591477520763874,
"kl": 0.0003113746643066406,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0579,
"reward": 0.2893460839986801,
"reward_std": 0.34850335121154785,
"rewards/cosine_scaled_reward": -0.10532695800065994,
"rewards/format_reward": 0.5,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1840.0000305175781,
"epoch": 0.07428571428571429,
"grad_norm": 0.03489629924297333,
"kl": 0.0003867149353027344,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0713,
"reward": 0.5813919119536877,
"reward_std": 0.6657490562647581,
"rewards/cosine_scaled_reward": -0.06347071845084429,
"rewards/format_reward": 0.7083333432674408,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 3027.5000610351562,
"epoch": 0.07485714285714286,
"grad_norm": 0.031031399965286255,
"kl": 0.00043010711669921875,
"learning_rate": 9.316216432703916e-07,
"loss": -0.0614,
"reward": -0.35745152831077576,
"reward_std": 0.42021336406469345,
"rewards/cosine_scaled_reward": -0.3453924432396889,
"rewards/format_reward": 0.3333333358168602,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 2721.041717529297,
"epoch": 0.07542857142857143,
"grad_norm": 0.019737781956791878,
"kl": 0.0003380775451660156,
"learning_rate": 9.299475664759068e-07,
"loss": -0.1046,
"reward": 0.52672129124403,
"reward_std": 0.818169629201293,
"rewards/cosine_scaled_reward": -0.049139365553855896,
"rewards/format_reward": 0.6250000149011612,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 3006.125,
"epoch": 0.076,
"grad_norm": 0.014350071549415588,
"kl": 0.00035190582275390625,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0195,
"reward": 0.436140738427639,
"reward_std": 0.799125649034977,
"rewards/cosine_scaled_reward": 0.009737027809023857,
"rewards/format_reward": 0.4166666716337204,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 3342.0000610351562,
"epoch": 0.07657142857142857,
"grad_norm": 0.013594014570116997,
"kl": 0.0003528594970703125,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0027,
"reward": 0.4128790497779846,
"reward_std": 0.9192672446370125,
"rewards/cosine_scaled_reward": 0.039772857911884785,
"rewards/format_reward": 0.3333333395421505,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2860.0833435058594,
"epoch": 0.07714285714285714,
"grad_norm": 0.020422151312232018,
"kl": 0.000537872314453125,
"learning_rate": 9.248145583195447e-07,
"loss": -0.0067,
"reward": -0.4891085624694824,
"reward_std": 0.28661480732262135,
"rewards/cosine_scaled_reward": -0.3695542886853218,
"rewards/format_reward": 0.25,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1929.3750610351562,
"epoch": 0.07771428571428571,
"grad_norm": 0.03446084260940552,
"kl": 0.0007333755493164062,
"learning_rate": 9.230669076497687e-07,
"loss": 0.1816,
"reward": 0.3107494944706559,
"reward_std": 0.7272366061806679,
"rewards/cosine_scaled_reward": -0.15712526440620422,
"rewards/format_reward": 0.6250000037252903,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2655.000030517578,
"epoch": 0.07828571428571429,
"grad_norm": 0.01656595803797245,
"kl": 0.0002579689025878906,
"learning_rate": 9.213010742252327e-07,
"loss": -0.0522,
"reward": 0.6488993316888809,
"reward_std": 0.77546676248312,
"rewards/cosine_scaled_reward": 0.011949680745601654,
"rewards/format_reward": 0.625,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2360.5416870117188,
"epoch": 0.07885714285714286,
"grad_norm": 0.01801217719912529,
"kl": 0.0003211498260498047,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0965,
"reward": 0.3774528503417969,
"reward_std": 0.27286792919039726,
"rewards/cosine_scaled_reward": -0.06127360463142395,
"rewards/format_reward": 0.5,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 3113.3750610351562,
"epoch": 0.07942857142857143,
"grad_norm": 0.015139062888920307,
"kl": 0.0002903938293457031,
"learning_rate": 9.177152042508077e-07,
"loss": 0.1314,
"reward": 0.18484138697385788,
"reward_std": 0.8956813514232635,
"rewards/cosine_scaled_reward": -0.11591265327297151,
"rewards/format_reward": 0.416666679084301,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 2925.1666717529297,
"epoch": 0.08,
"grad_norm": 0.011720138601958752,
"kl": 0.0002703666687011719,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0138,
"reward": 0.18188539519906044,
"reward_std": 0.5702639557421207,
"rewards/cosine_scaled_reward": -0.07572397217154503,
"rewards/format_reward": 0.3333333358168602,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 2260.7083740234375,
"epoch": 0.08057142857142857,
"grad_norm": 0.07121221721172333,
"kl": 0.0005216598510742188,
"learning_rate": 9.140576474687263e-07,
"loss": 0.2931,
"reward": 0.15240047996303474,
"reward_std": 0.9472056925296783,
"rewards/cosine_scaled_reward": -0.2154664322733879,
"rewards/format_reward": 0.5833333544433117,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 2265.5000610351562,
"epoch": 0.08114285714285714,
"grad_norm": 0.054452214390039444,
"kl": 0.0006041526794433594,
"learning_rate": 9.122022088101613e-07,
"loss": 0.242,
"reward": 0.7805991023778915,
"reward_std": 0.9608509242534637,
"rewards/cosine_scaled_reward": 0.05696620047092438,
"rewards/format_reward": 0.6666666939854622,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2732.2917098999023,
"epoch": 0.08171428571428571,
"grad_norm": 0.017999855801463127,
"kl": 0.0003368854522705078,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0514,
"reward": 1.025037132203579,
"reward_std": 0.7192419245839119,
"rewards/cosine_scaled_reward": 0.2625185213983059,
"rewards/format_reward": 0.5000000111758709,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 2587.8750915527344,
"epoch": 0.08228571428571428,
"grad_norm": 0.01989753730595112,
"kl": 0.0004143714904785156,
"learning_rate": 9.084384631108882e-07,
"loss": 0.1676,
"reward": 0.8056844659149647,
"reward_std": 0.7985115312039852,
"rewards/cosine_scaled_reward": 0.0695088729262352,
"rewards/format_reward": 0.6666666679084301,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1967.666748046875,
"epoch": 0.08285714285714285,
"grad_norm": 0.06734281778335571,
"kl": 0.0005283355712890625,
"learning_rate": 9.065303395098358e-07,
"loss": 0.2562,
"reward": 0.2510095611214638,
"reward_std": 0.6584825366735458,
"rewards/cosine_scaled_reward": -0.2286618910729885,
"rewards/format_reward": 0.7083333432674408,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2411.416717529297,
"epoch": 0.08342857142857144,
"grad_norm": 0.0633506253361702,
"kl": 0.000423431396484375,
"learning_rate": 9.046048391230247e-07,
"loss": 0.2981,
"reward": 0.670308992266655,
"reward_std": 1.008466713130474,
"rewards/cosine_scaled_reward": 0.0851544663310051,
"rewards/format_reward": 0.5,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 2494.041748046875,
"epoch": 0.084,
"grad_norm": 0.035755012184381485,
"kl": 0.00038242340087890625,
"learning_rate": 9.026620557966279e-07,
"loss": 0.1709,
"reward": 0.34743132442235947,
"reward_std": 0.7957043498754501,
"rewards/cosine_scaled_reward": -0.03461768664419651,
"rewards/format_reward": 0.4166666716337204,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 2072.750045776367,
"epoch": 0.08457142857142858,
"grad_norm": 0.026552215218544006,
"kl": 0.0003905296325683594,
"learning_rate": 9.007020842191634e-07,
"loss": -0.01,
"reward": 0.576688677072525,
"reward_std": 0.5415612012147903,
"rewards/cosine_scaled_reward": -0.08665566146373749,
"rewards/format_reward": 0.7500000111758709,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 2761.7084350585938,
"epoch": 0.08514285714285715,
"grad_norm": 0.08269675821065903,
"kl": 0.0005259513854980469,
"learning_rate": 8.987250199168808e-07,
"loss": 0.2554,
"reward": -0.020979389548301697,
"reward_std": 0.6839594691991806,
"rewards/cosine_scaled_reward": -0.2188230287283659,
"rewards/format_reward": 0.416666679084301,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2854.875030517578,
"epoch": 0.08571428571428572,
"grad_norm": 0.024635691195726395,
"kl": 0.0005636215209960938,
"learning_rate": 8.967309592491052e-07,
"loss": 0.215,
"reward": -0.06861633434891701,
"reward_std": 0.48412579856812954,
"rewards/cosine_scaled_reward": -0.22180816903710365,
"rewards/format_reward": 0.3750000149011612,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2962.0000915527344,
"epoch": 0.08628571428571429,
"grad_norm": 0.015141277574002743,
"kl": 0.00032520294189453125,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0071,
"reward": 0.32280058413743973,
"reward_std": 0.8727270662784576,
"rewards/cosine_scaled_reward": -0.06776639446616173,
"rewards/format_reward": 0.4583333395421505,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2867.7500610351562,
"epoch": 0.08685714285714285,
"grad_norm": 0.016482684761285782,
"kl": 0.00044155120849609375,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0131,
"reward": -0.14668704383075237,
"reward_std": 0.4973195120692253,
"rewards/cosine_scaled_reward": -0.28167686983942986,
"rewards/format_reward": 0.4166666716337204,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2524.2916717529297,
"epoch": 0.08742857142857142,
"grad_norm": 0.01825847662985325,
"kl": 0.0002818107604980469,
"learning_rate": 8.906477750432903e-07,
"loss": -0.0077,
"reward": 0.5729860588908195,
"reward_std": 0.6833535395562649,
"rewards/cosine_scaled_reward": -0.005173638463020325,
"rewards/format_reward": 0.5833333358168602,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 3422.1250610351562,
"epoch": 0.088,
"grad_norm": 0.019472761079669,
"kl": 0.0004367828369140625,
"learning_rate": 8.88586709003076e-07,
"loss": 0.097,
"reward": -0.15317635610699654,
"reward_std": 0.7526141926646233,
"rewards/cosine_scaled_reward": -0.15992150828242302,
"rewards/format_reward": 0.1666666716337204,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3454.375,
"epoch": 0.08857142857142856,
"grad_norm": 0.010583397001028061,
"kl": 0.00032520294189453125,
"learning_rate": 8.865091407243394e-07,
"loss": 0.033,
"reward": -0.4739493057131767,
"reward_std": 0.42491842061281204,
"rewards/cosine_scaled_reward": -0.2786413189023733,
"rewards/format_reward": 0.0833333358168602,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1829.5000610351562,
"epoch": 0.08914285714285715,
"grad_norm": 0.027142049744725227,
"kl": 0.0004248619079589844,
"learning_rate": 8.844151714648274e-07,
"loss": 0.1219,
"reward": 0.6520796567201614,
"reward_std": 0.9076523296535015,
"rewards/cosine_scaled_reward": -0.06979351304471493,
"rewards/format_reward": 0.7916666865348816,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2761.4166870117188,
"epoch": 0.08971428571428572,
"grad_norm": 0.03456060215830803,
"kl": 0.0031185150146484375,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0636,
"reward": -0.12644078209996223,
"reward_std": 0.47517139464616776,
"rewards/cosine_scaled_reward": -0.27155373618006706,
"rewards/format_reward": 0.4166666716337204,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 2967.3333740234375,
"epoch": 0.09028571428571429,
"grad_norm": 0.018295863643288612,
"kl": 0.00030517578125,
"learning_rate": 8.801784390262943e-07,
"loss": 0.1293,
"reward": 0.7170794606208801,
"reward_std": 1.2831790447235107,
"rewards/cosine_scaled_reward": 0.02520638657733798,
"rewards/format_reward": 0.6666666939854622,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3502.5833740234375,
"epoch": 0.09085714285714286,
"grad_norm": 0.013578574173152447,
"kl": 0.00040340423583984375,
"learning_rate": 8.780358823396352e-07,
"loss": 0.032,
"reward": -0.382570318877697,
"reward_std": 0.30739978328347206,
"rewards/cosine_scaled_reward": -0.2537851668894291,
"rewards/format_reward": 0.1250000037252903,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3438.4583740234375,
"epoch": 0.09142857142857143,
"grad_norm": 0.013279566541314125,
"kl": 0.0003542900085449219,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0503,
"reward": 0.07213277881965041,
"reward_std": 0.7591742426156998,
"rewards/cosine_scaled_reward": -0.08893361687660217,
"rewards/format_reward": 0.2500000037252903,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 3217.7083740234375,
"epoch": 0.092,
"grad_norm": 0.020369982346892357,
"kl": 0.00045013427734375,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0123,
"reward": 0.045274198055267334,
"reward_std": 0.8290613554418087,
"rewards/cosine_scaled_reward": -0.16486290469765663,
"rewards/format_reward": 0.3750000111758709,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2882.9583740234375,
"epoch": 0.09257142857142857,
"grad_norm": 0.015230800956487656,
"kl": 0.0003082752227783203,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0195,
"reward": 0.22908502910286188,
"reward_std": 0.6315614283084869,
"rewards/cosine_scaled_reward": -0.093790827319026,
"rewards/format_reward": 0.4166666716337204,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 3374.041748046875,
"epoch": 0.09314285714285714,
"grad_norm": 0.023900238797068596,
"kl": 0.0012507438659667969,
"learning_rate": 8.693068314414344e-07,
"loss": 0.1131,
"reward": -0.4366554766893387,
"reward_std": 0.31389016658067703,
"rewards/cosine_scaled_reward": -0.25999439880251884,
"rewards/format_reward": 0.0833333358168602,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2721.291748046875,
"epoch": 0.09371428571428571,
"grad_norm": 0.012770951725542545,
"kl": 0.00031685829162597656,
"learning_rate": 8.670853944836176e-07,
"loss": -0.0522,
"reward": 0.07291282713413239,
"reward_std": 0.6898190379142761,
"rewards/cosine_scaled_reward": -0.2552102580666542,
"rewards/format_reward": 0.5833333395421505,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 3518.3333740234375,
"epoch": 0.09428571428571429,
"grad_norm": 0.01447351835668087,
"kl": 0.0003514289855957031,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0387,
"reward": -0.5118223652243614,
"reward_std": 0.4116092287003994,
"rewards/cosine_scaled_reward": -0.2767445221543312,
"rewards/format_reward": 0.0416666679084301,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09485714285714286,
"grad_norm": 0.01175595261156559,
"kl": 0.0004019737243652344,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0,
"reward": -0.8086595237255096,
"reward_std": 0.14668290875852108,
"rewards/cosine_scaled_reward": -0.4043297544121742,
"rewards/format_reward": 0.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 2145.7916870117188,
"epoch": 0.09542857142857143,
"grad_norm": 0.033467333763837814,
"kl": 0.0003383159637451172,
"learning_rate": 8.603287946810513e-07,
"loss": 0.1774,
"reward": 1.582944918423891,
"reward_std": 0.9087323695421219,
"rewards/cosine_scaled_reward": 0.3748057931661606,
"rewards/format_reward": 0.833333358168602,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 3014.4583740234375,
"epoch": 0.096,
"grad_norm": 0.015946704894304276,
"kl": 0.00036334991455078125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.1448,
"reward": -0.07501981779932976,
"reward_std": 0.6305944435298443,
"rewards/cosine_scaled_reward": -0.20417658984661102,
"rewards/format_reward": 0.3333333432674408,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 2566.9584197998047,
"epoch": 0.09657142857142857,
"grad_norm": 0.017763182520866394,
"kl": 0.00031757354736328125,
"learning_rate": 8.557485869176825e-07,
"loss": -0.056,
"reward": 0.9450984001159668,
"reward_std": 1.3784255981445312,
"rewards/cosine_scaled_reward": 0.18088253866881132,
"rewards/format_reward": 0.5833333544433117,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 3493.8333740234375,
"epoch": 0.09714285714285714,
"grad_norm": 0.012164680287241936,
"kl": 0.0003337860107421875,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0137,
"reward": -0.02268831804394722,
"reward_std": 0.5062807034701109,
"rewards/cosine_scaled_reward": -0.07384415343403816,
"rewards/format_reward": 0.1250000037252903,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2499.9583740234375,
"epoch": 0.09771428571428571,
"grad_norm": 0.016273025423288345,
"kl": 0.0003085136413574219,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0939,
"reward": 0.18185213347896934,
"reward_std": 0.5238135792315006,
"rewards/cosine_scaled_reward": -0.1382406111806631,
"rewards/format_reward": 0.4583333544433117,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2753.5833740234375,
"epoch": 0.09828571428571428,
"grad_norm": 0.02729148045182228,
"kl": 0.00043773651123046875,
"learning_rate": 8.487667956935087e-07,
"loss": 0.2349,
"reward": -0.1761530265212059,
"reward_std": 0.31285534240305424,
"rewards/cosine_scaled_reward": -0.27557652816176414,
"rewards/format_reward": 0.3750000149011612,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 2625.7916870117188,
"epoch": 0.09885714285714285,
"grad_norm": 0.020237158983945847,
"kl": 0.0004603862762451172,
"learning_rate": 8.464102570534061e-07,
"loss": 0.1532,
"reward": 0.3476742703933269,
"reward_std": 1.086041659116745,
"rewards/cosine_scaled_reward": -0.11782953515648842,
"rewards/format_reward": 0.5833333544433117,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 3466.7083740234375,
"epoch": 0.09942857142857142,
"grad_norm": 0.013751998543739319,
"kl": 0.000400543212890625,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0416,
"reward": -0.5668784528970718,
"reward_std": 0.2918172590434551,
"rewards/cosine_scaled_reward": -0.3459392338991165,
"rewards/format_reward": 0.1250000037252903,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2285.4166870117188,
"epoch": 0.1,
"grad_norm": 0.044584471732378006,
"kl": 0.0002956390380859375,
"learning_rate": 8.416539554784089e-07,
"loss": 0.2817,
"reward": 1.0394483506679535,
"reward_std": 0.9922515600919724,
"rewards/cosine_scaled_reward": 0.20722418278455734,
"rewards/format_reward": 0.6250000149011612,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 2696.000030517578,
"epoch": 0.10057142857142858,
"grad_norm": 0.02319377101957798,
"kl": 0.000415802001953125,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0886,
"reward": 0.3795542363077402,
"reward_std": 0.7756792306900024,
"rewards/cosine_scaled_reward": -0.060222890228033066,
"rewards/format_reward": 0.5000000223517418,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 2909.4166870117188,
"epoch": 0.10114285714285715,
"grad_norm": 0.03342346474528313,
"kl": 0.0003414154052734375,
"learning_rate": 8.368407953869103e-07,
"loss": 0.1555,
"reward": 0.2533244490623474,
"reward_std": 0.6474116146564484,
"rewards/cosine_scaled_reward": -0.04000448310398497,
"rewards/format_reward": 0.3333333432674408,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 3399.6250610351562,
"epoch": 0.10171428571428572,
"grad_norm": 0.01095277164131403,
"kl": 0.0003046989440917969,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0506,
"reward": -0.01267234981060028,
"reward_std": 0.6906884871423244,
"rewards/cosine_scaled_reward": -0.11050283908843994,
"rewards/format_reward": 0.2083333395421505,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 2949.000030517578,
"epoch": 0.10228571428571429,
"grad_norm": 0.012322952039539814,
"kl": 0.0004057884216308594,
"learning_rate": 8.319717151140072e-07,
"loss": 0.1078,
"reward": -0.04895609989762306,
"reward_std": 0.553108676103875,
"rewards/cosine_scaled_reward": -0.17031139694154263,
"rewards/format_reward": 0.2916666679084301,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 2829.7083740234375,
"epoch": 0.10285714285714286,
"grad_norm": 0.029738230630755424,
"kl": 0.0003414154052734375,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0411,
"reward": 0.3339508920907974,
"reward_std": 0.8846062198281288,
"rewards/cosine_scaled_reward": -0.04135786276310682,
"rewards/format_reward": 0.4166666716337204,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 3196.7083740234375,
"epoch": 0.10342857142857143,
"grad_norm": 0.018469586968421936,
"kl": 0.0003407001495361328,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0479,
"reward": 0.06904555298388004,
"reward_std": 0.5129038351587951,
"rewards/cosine_scaled_reward": -0.13214393705129623,
"rewards/format_reward": 0.3333333432674408,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 3226.416748046875,
"epoch": 0.104,
"grad_norm": 0.018400847911834717,
"kl": 0.00044155120849609375,
"learning_rate": 8.245653237555705e-07,
"loss": 0.057,
"reward": -0.3992947228252888,
"reward_std": 0.5363226048648357,
"rewards/cosine_scaled_reward": -0.30381404608488083,
"rewards/format_reward": 0.2083333395421505,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2809.4583740234375,
"epoch": 0.10457142857142857,
"grad_norm": 0.015173462219536304,
"kl": 0.0004189014434814453,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0059,
"reward": -0.08883734792470932,
"reward_std": 0.4826326109468937,
"rewards/cosine_scaled_reward": -0.27358534932136536,
"rewards/format_reward": 0.4583333432674408,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 2285.250030517578,
"epoch": 0.10514285714285715,
"grad_norm": 0.01784459501504898,
"kl": 0.0003428459167480469,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0369,
"reward": 0.36116717755794525,
"reward_std": 0.7178683169186115,
"rewards/cosine_scaled_reward": -0.06941639818251133,
"rewards/format_reward": 0.5000000111758709,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2192.3333740234375,
"epoch": 0.10571428571428572,
"grad_norm": 0.021139826625585556,
"kl": 0.0005130767822265625,
"learning_rate": 8.170384989716657e-07,
"loss": 0.1298,
"reward": -0.057670027017593384,
"reward_std": 0.41697440296411514,
"rewards/cosine_scaled_reward": -0.2996683418750763,
"rewards/format_reward": 0.5416666679084301,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2678.541748046875,
"epoch": 0.10628571428571429,
"grad_norm": 0.016024595126509666,
"kl": 0.0003867149353027344,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0501,
"reward": 0.8263338319957256,
"reward_std": 0.7537258118391037,
"rewards/cosine_scaled_reward": 0.14233355224132538,
"rewards/format_reward": 0.5416666716337204,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 3467.2083740234375,
"epoch": 0.10685714285714286,
"grad_norm": 0.02122497744858265,
"kl": 0.00041675567626953125,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0488,
"reward": -0.3330922797322273,
"reward_std": 0.4953814512118697,
"rewards/cosine_scaled_reward": -0.2290461454540491,
"rewards/format_reward": 0.1250000037252903,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 3197.7083740234375,
"epoch": 0.10742857142857143,
"grad_norm": 0.015642890706658363,
"kl": 0.000370025634765625,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0347,
"reward": 0.1139075756072998,
"reward_std": 1.0698014255613089,
"rewards/cosine_scaled_reward": -0.1097128726541996,
"rewards/format_reward": 0.3333333469927311,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 3420.2083740234375,
"epoch": 0.108,
"grad_norm": 0.016403522342443466,
"kl": 0.0003390312194824219,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0476,
"reward": -0.16637829318642616,
"reward_std": 0.6985251531004906,
"rewards/cosine_scaled_reward": -0.20818914845585823,
"rewards/format_reward": 0.2500000074505806,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2964.5833740234375,
"epoch": 0.10857142857142857,
"grad_norm": 0.015617966651916504,
"kl": 0.0003600120544433594,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0077,
"reward": -0.2868319842964411,
"reward_std": 0.26455265283584595,
"rewards/cosine_scaled_reward": -0.2892493214458227,
"rewards/format_reward": 0.2916666679084301,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2810.2916717529297,
"epoch": 0.10914285714285714,
"grad_norm": 0.019974973052740097,
"kl": 0.0004630088806152344,
"learning_rate": 8.01636806561836e-07,
"loss": 0.066,
"reward": -0.06471758894622326,
"reward_std": 0.3940250463783741,
"rewards/cosine_scaled_reward": -0.26152546517550945,
"rewards/format_reward": 0.4583333358168602,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2830.5416717529297,
"epoch": 0.10971428571428571,
"grad_norm": 0.016707738861441612,
"kl": 0.0004711151123046875,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0331,
"reward": 0.1446765586733818,
"reward_std": 0.1306634098291397,
"rewards/cosine_scaled_reward": -0.052661728113889694,
"rewards/format_reward": 0.25,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 3578.6666870117188,
"epoch": 0.11028571428571429,
"grad_norm": 0.01071973703801632,
"kl": 0.0003151893615722656,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0019,
"reward": -0.34736843407154083,
"reward_std": 0.33792266994714737,
"rewards/cosine_scaled_reward": -0.23618422076106071,
"rewards/format_reward": 0.1250000037252903,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2222.666732788086,
"epoch": 0.11085714285714286,
"grad_norm": 0.05484706535935402,
"kl": 0.0006213188171386719,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0984,
"reward": 0.5483606606721878,
"reward_std": 1.1607089042663574,
"rewards/cosine_scaled_reward": -0.01748633268289268,
"rewards/format_reward": 0.5833333432674408,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 2849.8334045410156,
"epoch": 0.11142857142857143,
"grad_norm": 0.021573448553681374,
"kl": 0.0005321502685546875,
"learning_rate": 7.911220577405484e-07,
"loss": -0.0482,
"reward": -0.2685957998037338,
"reward_std": 0.45445217937231064,
"rewards/cosine_scaled_reward": -0.3426312282681465,
"rewards/format_reward": 0.4166666716337204,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2637.9166870117188,
"epoch": 0.112,
"grad_norm": 0.021516846492886543,
"kl": 0.0003535747528076172,
"learning_rate": 7.884636689049422e-07,
"loss": -0.0182,
"reward": 0.022181347012519836,
"reward_std": 0.4800866097211838,
"rewards/cosine_scaled_reward": -0.15557599812746048,
"rewards/format_reward": 0.3333333358168602,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 3041.875,
"epoch": 0.11257142857142857,
"grad_norm": 0.01381740253418684,
"kl": 0.0003528594970703125,
"learning_rate": 7.857936576865356e-07,
"loss": -0.0073,
"reward": -0.33803367614746094,
"reward_std": 0.4302855357527733,
"rewards/cosine_scaled_reward": -0.29401686880737543,
"rewards/format_reward": 0.25,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2317.8333892822266,
"epoch": 0.11314285714285714,
"grad_norm": 0.03078615479171276,
"kl": 0.0003762245178222656,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0924,
"reward": 0.597826175391674,
"reward_std": 0.8695001602172852,
"rewards/cosine_scaled_reward": 0.007246408611536026,
"rewards/format_reward": 0.5833333469927311,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 3303.9583740234375,
"epoch": 0.11371428571428571,
"grad_norm": 0.019953317940235138,
"kl": 0.0003972053527832031,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0989,
"reward": 0.565569007769227,
"reward_std": 0.5124572534114122,
"rewards/cosine_scaled_reward": 0.07445115875452757,
"rewards/format_reward": 0.4166666716337204,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2750.8333435058594,
"epoch": 0.11428571428571428,
"grad_norm": 0.05455951392650604,
"kl": 0.0007038116455078125,
"learning_rate": 7.777151938545235e-07,
"loss": 0.1193,
"reward": -0.07477065362036228,
"reward_std": 0.8196274563670158,
"rewards/cosine_scaled_reward": -0.24571867287158966,
"rewards/format_reward": 0.4166666716337204,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2873.4166870117188,
"epoch": 0.11485714285714285,
"grad_norm": 0.06070086359977722,
"kl": 0.0005712509155273438,
"learning_rate": 7.75e-07,
"loss": 0.2315,
"reward": -0.16712947003543377,
"reward_std": 0.3942112438380718,
"rewards/cosine_scaled_reward": -0.22939807549118996,
"rewards/format_reward": 0.2916666716337204,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2792.7083740234375,
"epoch": 0.11542857142857142,
"grad_norm": 0.02964051626622677,
"kl": 0.0003209114074707031,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0298,
"reward": 0.44592857360839844,
"reward_std": 0.7889965083450079,
"rewards/cosine_scaled_reward": 0.05629761889576912,
"rewards/format_reward": 0.3333333358168602,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 3046.7500610351562,
"epoch": 0.116,
"grad_norm": 0.018877137452363968,
"kl": 0.0003161430358886719,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0529,
"reward": 0.408734455704689,
"reward_std": 0.6842712201178074,
"rewards/cosine_scaled_reward": -0.0247994652017951,
"rewards/format_reward": 0.4583333544433117,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2634.5833435058594,
"epoch": 0.11657142857142858,
"grad_norm": 0.02069164253771305,
"kl": 0.0003864765167236328,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0833,
"reward": -0.21086983382701874,
"reward_std": 0.22330690175294876,
"rewards/cosine_scaled_reward": -0.2929349225014448,
"rewards/format_reward": 0.375,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2709.7083435058594,
"epoch": 0.11714285714285715,
"grad_norm": 0.023381751030683517,
"kl": 0.000301361083984375,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0675,
"reward": 0.27661415934562683,
"reward_std": 0.6943130940198898,
"rewards/cosine_scaled_reward": -0.09085960499942303,
"rewards/format_reward": 0.4583333432674408,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 3047.625030517578,
"epoch": 0.11771428571428572,
"grad_norm": 0.02151002548635006,
"kl": 0.0004744529724121094,
"learning_rate": 7.612622032536507e-07,
"loss": -0.0196,
"reward": -0.10782860964536667,
"reward_std": 0.5117022879421711,
"rewards/cosine_scaled_reward": -0.19974764343351126,
"rewards/format_reward": 0.2916666679084301,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2933.7916870117188,
"epoch": 0.11828571428571429,
"grad_norm": 0.0185316763818264,
"kl": 0.0003197193145751953,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0325,
"reward": 0.4302789755165577,
"reward_std": 1.0025385729968548,
"rewards/cosine_scaled_reward": 0.006806140765547752,
"rewards/format_reward": 0.4166666679084301,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2270.75,
"epoch": 0.11885714285714286,
"grad_norm": 0.013073590584099293,
"kl": 0.00028967857360839844,
"learning_rate": 7.556940671764124e-07,
"loss": -0.0037,
"reward": 0.8428396731615067,
"reward_std": 0.33612318709492683,
"rewards/cosine_scaled_reward": 0.17141985148191452,
"rewards/format_reward": 0.5,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2534.0834197998047,
"epoch": 0.11942857142857143,
"grad_norm": 0.04636608809232712,
"kl": 0.0002626180648803711,
"learning_rate": 7.528948933102438e-07,
"loss": 0.1077,
"reward": 0.41008343175053596,
"reward_std": 0.38527223095297813,
"rewards/cosine_scaled_reward": -0.003291614353656769,
"rewards/format_reward": 0.4166666716337204,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 2664.375030517578,
"epoch": 0.12,
"grad_norm": 0.06337418407201767,
"kl": 0.0005369186401367188,
"learning_rate": 7.500858306332172e-07,
"loss": 0.1954,
"reward": -0.11931078508496284,
"reward_std": 0.4521569199860096,
"rewards/cosine_scaled_reward": -0.2679887441918254,
"rewards/format_reward": 0.4166666828095913,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 2690.75,
"epoch": 0.12057142857142857,
"grad_norm": 0.02364625595510006,
"kl": 0.0003635883331298828,
"learning_rate": 7.472670160550848e-07,
"loss": 0.1012,
"reward": 0.4544009678065777,
"reward_std": 0.9030434042215347,
"rewards/cosine_scaled_reward": 0.018867140635848045,
"rewards/format_reward": 0.4166666679084301,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 3231.4584350585938,
"epoch": 0.12114285714285715,
"grad_norm": 0.02378404326736927,
"kl": 0.0004596710205078125,
"learning_rate": 7.444385869608921e-07,
"loss": 0.126,
"reward": 0.3596238009631634,
"reward_std": 1.0245484188199043,
"rewards/cosine_scaled_reward": 0.03397855442017317,
"rewards/format_reward": 0.2916666679084301,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2259.166732788086,
"epoch": 0.12171428571428572,
"grad_norm": 0.0168690737336874,
"kl": 0.000362396240234375,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0384,
"reward": 0.7057138048112392,
"reward_std": 1.0122921094298363,
"rewards/cosine_scaled_reward": 0.04035688715521246,
"rewards/format_reward": 0.6250000037252903,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2541.250045776367,
"epoch": 0.12228571428571429,
"grad_norm": 0.029833870008587837,
"kl": 0.0004019737243652344,
"learning_rate": 7.387534371007797e-07,
"loss": 0.1661,
"reward": 0.027080008760094643,
"reward_std": 0.8045615777373314,
"rewards/cosine_scaled_reward": -0.19479333609342575,
"rewards/format_reward": 0.4166666716337204,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2826.3333740234375,
"epoch": 0.12285714285714286,
"grad_norm": 0.01670445129275322,
"kl": 0.0003113746643066406,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0458,
"reward": -0.15282147377729416,
"reward_std": 0.6320948153734207,
"rewards/cosine_scaled_reward": -0.28474406246095896,
"rewards/format_reward": 0.4166666679084301,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1534.750015258789,
"epoch": 0.12342857142857143,
"grad_norm": 0.028196675702929497,
"kl": 0.0003743171691894531,
"learning_rate": 7.330314893841101e-07,
"loss": 0.173,
"reward": 1.9182523787021637,
"reward_std": 0.8897372838109732,
"rewards/cosine_scaled_reward": 0.5007927343249321,
"rewards/format_reward": 0.9166666716337204,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 3460.5416870117188,
"epoch": 0.124,
"grad_norm": 0.012388592585921288,
"kl": 0.00035881996154785156,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0433,
"reward": -0.15055294707417488,
"reward_std": 0.5311995670199394,
"rewards/cosine_scaled_reward": -0.137776467949152,
"rewards/format_reward": 0.1250000037252903,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2624.8334045410156,
"epoch": 0.12457142857142857,
"grad_norm": 0.03056301549077034,
"kl": 0.00043702125549316406,
"learning_rate": 7.27273859315928e-07,
"loss": 0.2376,
"reward": 0.47333015874028206,
"reward_std": 0.7044993788003922,
"rewards/cosine_scaled_reward": -0.034168269485235214,
"rewards/format_reward": 0.5416666865348816,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 2094.5834045410156,
"epoch": 0.12514285714285714,
"grad_norm": 0.05004338175058365,
"kl": 0.0005245208740234375,
"learning_rate": 7.243820139034464e-07,
"loss": 0.1989,
"reward": 0.7468737373128533,
"reward_std": 0.8828459084033966,
"rewards/cosine_scaled_reward": 0.019270192831754684,
"rewards/format_reward": 0.7083333432674408,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2917.791748046875,
"epoch": 0.12571428571428572,
"grad_norm": 0.01651514135301113,
"kl": 0.0003859996795654297,
"learning_rate": 7.214816693576234e-07,
"loss": 0.1265,
"reward": 0.40257575549185276,
"reward_std": 0.8734332285821438,
"rewards/cosine_scaled_reward": -0.04871212877333164,
"rewards/format_reward": 0.5000000149011612,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1825.2500457763672,
"epoch": 0.12628571428571428,
"grad_norm": 0.022186581045389175,
"kl": 0.0004601478576660156,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0246,
"reward": 0.5106580704450607,
"reward_std": 0.6262499652802944,
"rewards/cosine_scaled_reward": -0.11967097967863083,
"rewards/format_reward": 0.75,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1936.2917098999023,
"epoch": 0.12685714285714286,
"grad_norm": 0.025714771822094917,
"kl": 0.0005955696105957031,
"learning_rate": 7.156560487081051e-07,
"loss": -0.0667,
"reward": 0.5173665434122086,
"reward_std": 0.4710990320891142,
"rewards/cosine_scaled_reward": -0.11631673201918602,
"rewards/format_reward": 0.75,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 3248.3333740234375,
"epoch": 0.12742857142857142,
"grad_norm": 0.014303294010460377,
"kl": 0.00043010711669921875,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0878,
"reward": -0.4517449364066124,
"reward_std": 0.20166658982634544,
"rewards/cosine_scaled_reward": -0.3092058040201664,
"rewards/format_reward": 0.1666666716337204,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 3256.8333740234375,
"epoch": 0.128,
"grad_norm": 0.01596376858651638,
"kl": 0.000354766845703125,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0683,
"reward": 0.5998236387968063,
"reward_std": 0.8237827308475971,
"rewards/cosine_scaled_reward": 0.11241178959608078,
"rewards/format_reward": 0.3750000149011612,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2851.2083740234375,
"epoch": 0.12857142857142856,
"grad_norm": 0.017591828480362892,
"kl": 0.0004096031188964844,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0511,
"reward": 0.10140763968229294,
"reward_std": 0.5019327104091644,
"rewards/cosine_scaled_reward": -0.11596284806728363,
"rewards/format_reward": 0.3333333358168602,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2292.000030517578,
"epoch": 0.12914285714285714,
"grad_norm": 0.032970964908599854,
"kl": 0.00045108795166015625,
"learning_rate": 7.039090644965509e-07,
"loss": 0.1386,
"reward": 0.4276478886604309,
"reward_std": 0.7602110169827938,
"rewards/cosine_scaled_reward": -0.057009367272257805,
"rewards/format_reward": 0.541666679084301,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2093.6250610351562,
"epoch": 0.12971428571428573,
"grad_norm": 0.015598502941429615,
"kl": 0.0003306865692138672,
"learning_rate": 7.009532063876148e-07,
"loss": 0.1292,
"reward": 0.6084302365779877,
"reward_std": 0.6128123812377453,
"rewards/cosine_scaled_reward": -0.008284901501610875,
"rewards/format_reward": 0.625,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1409.5833587646484,
"epoch": 0.13028571428571428,
"grad_norm": 0.03161853179335594,
"kl": 0.0005328655242919922,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0405,
"reward": 0.6997113339602947,
"reward_std": 0.7971856854856014,
"rewards/cosine_scaled_reward": -0.04597766697406769,
"rewards/format_reward": 0.7916666679084301,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2726.4166870117188,
"epoch": 0.13085714285714287,
"grad_norm": 0.03210078924894333,
"kl": 0.00038051605224609375,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0564,
"reward": 0.28624797612428665,
"reward_std": 0.706984382122755,
"rewards/cosine_scaled_reward": -0.06520934589207172,
"rewards/format_reward": 0.4166666679084301,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 3351.7083740234375,
"epoch": 0.13142857142857142,
"grad_norm": 0.020253852009773254,
"kl": 0.0004558563232421875,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0537,
"reward": 0.20329816453158855,
"reward_std": 0.920021902769804,
"rewards/cosine_scaled_reward": -0.04418425913900137,
"rewards/format_reward": 0.2916666753590107,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 3248.7916870117188,
"epoch": 0.132,
"grad_norm": 0.027326960116624832,
"kl": 0.0003800392150878906,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0472,
"reward": 0.02445869892835617,
"reward_std": 0.4333410616964102,
"rewards/cosine_scaled_reward": -0.07110398076474667,
"rewards/format_reward": 0.1666666716337204,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2847.8333435058594,
"epoch": 0.13257142857142856,
"grad_norm": 0.017697490751743317,
"kl": 0.0004267692565917969,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0322,
"reward": -0.29599685221910477,
"reward_std": 0.34860160388052464,
"rewards/cosine_scaled_reward": -0.29383176099509,
"rewards/format_reward": 0.2916666679084301,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 3569.7916870117188,
"epoch": 0.13314285714285715,
"grad_norm": 0.011569323018193245,
"kl": 0.0003650188446044922,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0082,
"reward": -0.27050644531846046,
"reward_std": 0.28310155123472214,
"rewards/cosine_scaled_reward": -0.15608655102550983,
"rewards/format_reward": 0.0416666679084301,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 3336.3750610351562,
"epoch": 0.1337142857142857,
"grad_norm": 0.014362377114593983,
"kl": 0.0003769397735595703,
"learning_rate": 6.800643086250121e-07,
"loss": 0.1096,
"reward": 0.16344637423753738,
"reward_std": 1.1342220231890678,
"rewards/cosine_scaled_reward": -0.022443480789661407,
"rewards/format_reward": 0.2083333358168602,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2698.5416717529297,
"epoch": 0.13428571428571429,
"grad_norm": 0.016733834519982338,
"kl": 0.0003490447998046875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0561,
"reward": 0.6713685989379883,
"reward_std": 0.4799320735037327,
"rewards/cosine_scaled_reward": 0.14818426966667175,
"rewards/format_reward": 0.3750000037252903,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1749.2916717529297,
"epoch": 0.13485714285714287,
"grad_norm": 0.032998789101839066,
"kl": 0.00046539306640625,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0971,
"reward": 0.6648036018013954,
"reward_std": 0.8384054228663445,
"rewards/cosine_scaled_reward": -0.10509821772575378,
"rewards/format_reward": 0.8750000149011612,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1968.3333435058594,
"epoch": 0.13542857142857143,
"grad_norm": 0.02814081497490406,
"kl": 0.0006561279296875,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0855,
"reward": 0.15130121633410454,
"reward_std": 0.4565184935927391,
"rewards/cosine_scaled_reward": -0.21601606532931328,
"rewards/format_reward": 0.5833333358168602,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2913.3333435058594,
"epoch": 0.136,
"grad_norm": 0.017682794481515884,
"kl": 0.0003323554992675781,
"learning_rate": 6.679851303883891e-07,
"loss": -0.0443,
"reward": 0.043509919196367264,
"reward_std": 0.8419067375361919,
"rewards/cosine_scaled_reward": -0.18657837435603142,
"rewards/format_reward": 0.4166666679084301,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 3364.416748046875,
"epoch": 0.13657142857142857,
"grad_norm": 0.028049411252141,
"kl": 0.0004544258117675781,
"learning_rate": 6.649505910711058e-07,
"loss": 0.1081,
"reward": 0.21441528294235468,
"reward_std": 1.0792418122291565,
"rewards/cosine_scaled_reward": -0.03862569108605385,
"rewards/format_reward": 0.2916666753590107,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 2725.7083740234375,
"epoch": 0.13714285714285715,
"grad_norm": 0.046624064445495605,
"kl": 0.0005068778991699219,
"learning_rate": 6.619104492241847e-07,
"loss": 0.2407,
"reward": -0.037649777717888355,
"reward_std": 0.7788064442574978,
"rewards/cosine_scaled_reward": -0.22715822607278824,
"rewards/format_reward": 0.4166666716337204,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2704.25,
"epoch": 0.1377142857142857,
"grad_norm": 0.03354218974709511,
"kl": 0.00043392181396484375,
"learning_rate": 6.588648530198504e-07,
"loss": 0.027,
"reward": 0.41115298634395003,
"reward_std": 0.5296461880207062,
"rewards/cosine_scaled_reward": -0.002756841480731964,
"rewards/format_reward": 0.4166666679084301,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 3075.8750610351562,
"epoch": 0.1382857142857143,
"grad_norm": 0.01612684689462185,
"kl": 0.0004563331604003906,
"learning_rate": 6.558139508961654e-07,
"loss": 0.1636,
"reward": -0.1777523159980774,
"reward_std": 0.47497741878032684,
"rewards/cosine_scaled_reward": -0.2138761579990387,
"rewards/format_reward": 0.2500000037252903,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 2829.500030517578,
"epoch": 0.13885714285714285,
"grad_norm": 0.022913776338100433,
"kl": 0.00047969818115234375,
"learning_rate": 6.527578915497951e-07,
"loss": 0.1179,
"reward": -0.3900511562824249,
"reward_std": 0.19519304856657982,
"rewards/cosine_scaled_reward": -0.34085891395807266,
"rewards/format_reward": 0.2916666679084301,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 3008.4166870117188,
"epoch": 0.13942857142857143,
"grad_norm": 0.017667554318904877,
"kl": 0.0005576610565185547,
"learning_rate": 6.496968239287603e-07,
"loss": 0.1161,
"reward": -0.07407113164663315,
"reward_std": 0.9030572213232517,
"rewards/cosine_scaled_reward": -0.22453556954860687,
"rewards/format_reward": 0.3750000111758709,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 2979.1250610351562,
"epoch": 0.14,
"grad_norm": 0.015229357406497002,
"kl": 0.0004878044128417969,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0894,
"reward": 0.2635510638356209,
"reward_std": 1.100995272397995,
"rewards/cosine_scaled_reward": -0.05572447320446372,
"rewards/format_reward": 0.3750000149011612,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2819.0416870117188,
"epoch": 0.14057142857142857,
"grad_norm": 0.044691551476716995,
"kl": 0.00034236907958984375,
"learning_rate": 6.435602608679916e-07,
"loss": -0.046,
"reward": 0.28528738766908646,
"reward_std": 0.8157100006937981,
"rewards/cosine_scaled_reward": -0.04485631617717445,
"rewards/format_reward": 0.375,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 3443.8750610351562,
"epoch": 0.14114285714285715,
"grad_norm": 0.011280644685029984,
"kl": 0.00034046173095703125,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0628,
"reward": 0.2559690326452255,
"reward_std": 1.1299069225788116,
"rewards/cosine_scaled_reward": 0.00298450980335474,
"rewards/format_reward": 0.2500000074505806,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1982.8333435058594,
"epoch": 0.1417142857142857,
"grad_norm": 0.051233626902103424,
"kl": 0.0006239414215087891,
"learning_rate": 6.374054580489873e-07,
"loss": 0.1208,
"reward": 0.47601281851530075,
"reward_std": 0.8072874061763287,
"rewards/cosine_scaled_reward": -0.09532693400979042,
"rewards/format_reward": 0.6666666716337204,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 3090.5000610351562,
"epoch": 0.1422857142857143,
"grad_norm": 0.011510615237057209,
"kl": 0.0003566741943359375,
"learning_rate": 6.343215915635761e-07,
"loss": -0.0075,
"reward": -0.1677398905158043,
"reward_std": 0.7362043038010597,
"rewards/cosine_scaled_reward": -0.271369943395257,
"rewards/format_reward": 0.3750000037252903,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2831.6666870117188,
"epoch": 0.14285714285714285,
"grad_norm": 0.012945250608026981,
"kl": 0.000339508056640625,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0464,
"reward": 0.08572675287723541,
"reward_std": 0.4463801756501198,
"rewards/cosine_scaled_reward": -0.10296999663114548,
"rewards/format_reward": 0.2916666679084301,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2145.7084045410156,
"epoch": 0.14342857142857143,
"grad_norm": 0.02382800169289112,
"kl": 0.00037860870361328125,
"learning_rate": 6.281416799501187e-07,
"loss": 0.1758,
"reward": 0.016892731189727783,
"reward_std": 0.4844237770885229,
"rewards/cosine_scaled_reward": -0.2832203172147274,
"rewards/format_reward": 0.5833333432674408,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 2960.5416870117188,
"epoch": 0.144,
"grad_norm": 0.021183207631111145,
"kl": 0.0004506111145019531,
"learning_rate": 6.25045936022246e-07,
"loss": 0.1659,
"reward": -0.04246724583208561,
"reward_std": 0.6862606927752495,
"rewards/cosine_scaled_reward": -0.18790028244256973,
"rewards/format_reward": 0.3333333395421505,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2839.4166870117188,
"epoch": 0.14457142857142857,
"grad_norm": 0.015497619286179543,
"kl": 0.0003509521484375,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0335,
"reward": -0.19672805070877075,
"reward_std": 0.37255218997597694,
"rewards/cosine_scaled_reward": -0.24419735372066498,
"rewards/format_reward": 0.2916666679084301,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2932.6666717529297,
"epoch": 0.14514285714285713,
"grad_norm": 0.014029218815267086,
"kl": 0.0002608299255371094,
"learning_rate": 6.188436263278172e-07,
"loss": 0.1358,
"reward": 0.564534567296505,
"reward_std": 0.7931031864136457,
"rewards/cosine_scaled_reward": 0.03226728364825249,
"rewards/format_reward": 0.5000000074505806,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2817.3333740234375,
"epoch": 0.1457142857142857,
"grad_norm": 0.02292151190340519,
"kl": 0.0003943443298339844,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0033,
"reward": 0.7924370467662811,
"reward_std": 0.5020520761609077,
"rewards/cosine_scaled_reward": 0.16705189645290375,
"rewards/format_reward": 0.4583333432674408,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 3312.5000610351562,
"epoch": 0.1462857142857143,
"grad_norm": 0.024594873189926147,
"kl": 0.00034236907958984375,
"learning_rate": 6.126278954320294e-07,
"loss": 0.1007,
"reward": -0.39764899387955666,
"reward_std": 0.4253292456269264,
"rewards/cosine_scaled_reward": -0.302991159260273,
"rewards/format_reward": 0.2083333395421505,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2919.0833740234375,
"epoch": 0.14685714285714285,
"grad_norm": 0.026216557249426842,
"kl": 0.0004639625549316406,
"learning_rate": 6.095153756157051e-07,
"loss": 0.2025,
"reward": -0.20567850768566132,
"reward_std": 0.6880289539694786,
"rewards/cosine_scaled_reward": -0.2695059161633253,
"rewards/format_reward": 0.3333333432674408,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2133.8333435058594,
"epoch": 0.14742857142857144,
"grad_norm": 0.03468165546655655,
"kl": 0.0003616809844970703,
"learning_rate": 6.06399955103937e-07,
"loss": 0.1963,
"reward": 0.30517828464508057,
"reward_std": 0.593671128153801,
"rewards/cosine_scaled_reward": -0.1390775376930833,
"rewards/format_reward": 0.5833333358168602,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 3469.2916870117188,
"epoch": 0.148,
"grad_norm": 0.02112772688269615,
"kl": 0.0003986358642578125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0331,
"reward": -0.01782984286546707,
"reward_std": 0.8413332737982273,
"rewards/cosine_scaled_reward": -0.09224824234843254,
"rewards/format_reward": 0.1666666679084301,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 3433.7083740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.014742922969162464,
"kl": 0.000308990478515625,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0383,
"reward": 0.31882617622613907,
"reward_std": 0.15408218186348677,
"rewards/cosine_scaled_reward": 0.03441305831074715,
"rewards/format_reward": 0.25,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 3461.291748046875,
"epoch": 0.14914285714285713,
"grad_norm": 0.014058803208172321,
"kl": 0.00042438507080078125,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0671,
"reward": -0.2942545488476753,
"reward_std": 0.612834420055151,
"rewards/cosine_scaled_reward": -0.2096272725611925,
"rewards/format_reward": 0.1250000037252903,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 3319.7083740234375,
"epoch": 0.14971428571428572,
"grad_norm": 0.013714558444917202,
"kl": 0.0003371238708496094,
"learning_rate": 5.939123048916173e-07,
"loss": 0.104,
"reward": 0.14693566597998142,
"reward_std": 0.5481071509420872,
"rewards/cosine_scaled_reward": -0.11403219401836395,
"rewards/format_reward": 0.3750000149011612,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2869.166717529297,
"epoch": 0.15028571428571427,
"grad_norm": 0.04431964457035065,
"kl": 0.0004782676696777344,
"learning_rate": 5.907846610890011e-07,
"loss": 0.2064,
"reward": -0.06633574888110161,
"reward_std": 0.24844567105174065,
"rewards/cosine_scaled_reward": -0.17900121491402388,
"rewards/format_reward": 0.291666679084301,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 3500.7500610351562,
"epoch": 0.15085714285714286,
"grad_norm": 0.011211477220058441,
"kl": 0.00043582916259765625,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0411,
"reward": -0.3998759835958481,
"reward_std": 0.430798327550292,
"rewards/cosine_scaled_reward": -0.24160464480519295,
"rewards/format_reward": 0.0833333358168602,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3510.6250610351562,
"epoch": 0.15142857142857144,
"grad_norm": 0.015797875821590424,
"kl": 0.0004782676696777344,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0301,
"reward": 0.03237064555287361,
"reward_std": 0.7703893817961216,
"rewards/cosine_scaled_reward": -0.06714800372719765,
"rewards/format_reward": 0.1666666679084301,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2977.3333740234375,
"epoch": 0.152,
"grad_norm": 0.02703353762626648,
"kl": 0.000396728515625,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0458,
"reward": -0.099398122751154,
"reward_std": 0.3674992090091109,
"rewards/cosine_scaled_reward": -0.21636574110016227,
"rewards/format_reward": 0.3333333358168602,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2061.0833740234375,
"epoch": 0.15257142857142858,
"grad_norm": 0.0498061366379261,
"kl": 0.0005326271057128906,
"learning_rate": 5.78255733788191e-07,
"loss": 0.1127,
"reward": 0.3922936078161001,
"reward_std": 0.6864228155463934,
"rewards/cosine_scaled_reward": -0.13718653097748756,
"rewards/format_reward": 0.6666666716337204,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2333.4583435058594,
"epoch": 0.15314285714285714,
"grad_norm": 0.028823453933000565,
"kl": 0.0005085468292236328,
"learning_rate": 5.751196772469237e-07,
"loss": 0.186,
"reward": 0.22259462717920542,
"reward_std": 0.5068696048110723,
"rewards/cosine_scaled_reward": -0.15953603573143482,
"rewards/format_reward": 0.5416666716337204,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 3267.3333740234375,
"epoch": 0.15371428571428572,
"grad_norm": 0.027852864935994148,
"kl": 0.0002601146697998047,
"learning_rate": 5.71982396408026e-07,
"loss": 0.1001,
"reward": -0.2786406707018614,
"reward_std": 0.3324854364618659,
"rewards/cosine_scaled_reward": -0.22265366930514574,
"rewards/format_reward": 0.1666666679084301,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2147.2083740234375,
"epoch": 0.15428571428571428,
"grad_norm": 0.025380106642842293,
"kl": 0.000537872314453125,
"learning_rate": 5.688440441781398e-07,
"loss": 0.1412,
"reward": 0.5648728758096695,
"reward_std": 0.5805850811302662,
"rewards/cosine_scaled_reward": 0.011603094637393951,
"rewards/format_reward": 0.5416666679084301,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 2535.7500610351562,
"epoch": 0.15485714285714286,
"grad_norm": 0.04058884456753731,
"kl": 0.0005526542663574219,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0614,
"reward": 0.19145794212818146,
"reward_std": 0.5669713392853737,
"rewards/cosine_scaled_reward": -0.17510437592864037,
"rewards/format_reward": 0.5416666679084301,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2707.7500610351562,
"epoch": 0.15542857142857142,
"grad_norm": 0.02463456057012081,
"kl": 0.0004875659942626953,
"learning_rate": 5.625647374256061e-07,
"loss": -0.0531,
"reward": 0.32263614796102047,
"reward_std": 0.8555684071034193,
"rewards/cosine_scaled_reward": -0.10951526463031769,
"rewards/format_reward": 0.5416666716337204,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 3203.8750610351562,
"epoch": 0.156,
"grad_norm": 0.021756043657660484,
"kl": 0.0005960464477539062,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0959,
"reward": -0.3803365007042885,
"reward_std": 0.4917794167995453,
"rewards/cosine_scaled_reward": -0.31516825407743454,
"rewards/format_reward": 0.2500000037252903,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 3559.375,
"epoch": 0.15657142857142858,
"grad_norm": 0.014902282506227493,
"kl": 0.0003399848937988281,
"learning_rate": 5.562829811526154e-07,
"loss": 0.014,
"reward": -0.4870794676244259,
"reward_std": 0.32457295805215836,
"rewards/cosine_scaled_reward": -0.26437306217849255,
"rewards/format_reward": 0.0416666679084301,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 2296.2083435058594,
"epoch": 0.15714285714285714,
"grad_norm": 0.019861867651343346,
"kl": 0.0003972053527832031,
"learning_rate": 5.531415671340826e-07,
"loss": -0.0306,
"reward": 0.20605197548866272,
"reward_std": 0.666698768734932,
"rewards/cosine_scaled_reward": -0.14697400853037834,
"rewards/format_reward": 0.5,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 2113.9583892822266,
"epoch": 0.15771428571428572,
"grad_norm": 0.02660396508872509,
"kl": 0.00037980079650878906,
"learning_rate": 5.5e-07,
"loss": 0.189,
"reward": 0.8675991147756577,
"reward_std": 0.5296240104362369,
"rewards/cosine_scaled_reward": 0.12129955366253853,
"rewards/format_reward": 0.625,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 2935.2916870117188,
"epoch": 0.15828571428571428,
"grad_norm": 0.014423711225390434,
"kl": 0.0004062652587890625,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0398,
"reward": 0.7772083282470703,
"reward_std": 0.564259335398674,
"rewards/cosine_scaled_reward": 0.15943749248981476,
"rewards/format_reward": 0.4583333432674408,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2655.375030517578,
"epoch": 0.15885714285714286,
"grad_norm": 0.018322305753827095,
"kl": 0.0004067420959472656,
"learning_rate": 5.437170188473847e-07,
"loss": 0.1151,
"reward": -0.03639080002903938,
"reward_std": 0.4445139616727829,
"rewards/cosine_scaled_reward": -0.24736207351088524,
"rewards/format_reward": 0.4583333432674408,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 2705.1250610351562,
"epoch": 0.15942857142857142,
"grad_norm": 0.03632061555981636,
"kl": 0.000385284423828125,
"learning_rate": 5.405759110524894e-07,
"loss": 0.135,
"reward": 0.6486790850758553,
"reward_std": 0.898019090294838,
"rewards/cosine_scaled_reward": 0.05350620858371258,
"rewards/format_reward": 0.5416666828095913,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2446.2083435058594,
"epoch": 0.16,
"grad_norm": 0.013082285411655903,
"kl": 0.000347137451171875,
"learning_rate": 5.37435262574394e-07,
"loss": -0.0428,
"reward": 0.37989859376102686,
"reward_std": 0.29897986352443695,
"rewards/cosine_scaled_reward": -0.060050718020647764,
"rewards/format_reward": 0.5,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 2319.2083435058594,
"epoch": 0.16057142857142856,
"grad_norm": 0.045566096901893616,
"kl": 0.0005693435668945312,
"learning_rate": 5.342952264838747e-07,
"loss": 0.1853,
"reward": 0.5609744489192963,
"reward_std": 0.6304305791854858,
"rewards/cosine_scaled_reward": 0.05132052768021822,
"rewards/format_reward": 0.4583333395421505,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 3381.8750610351562,
"epoch": 0.16114285714285714,
"grad_norm": 0.02684849686920643,
"kl": 0.0007226467132568359,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0416,
"reward": 0.4639171026647091,
"reward_std": 0.9287250991910696,
"rewards/cosine_scaled_reward": 0.08612522296607494,
"rewards/format_reward": 0.2916666716337204,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2586.375030517578,
"epoch": 0.16171428571428573,
"grad_norm": 0.0842081606388092,
"kl": 0.000537872314453125,
"learning_rate": 5.28017603591974e-07,
"loss": 0.2945,
"reward": -0.23938731849193573,
"reward_std": 0.5152188688516617,
"rewards/cosine_scaled_reward": -0.30719365179538727,
"rewards/format_reward": 0.3750000149011612,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2352.8750610351562,
"epoch": 0.16228571428571428,
"grad_norm": 0.026272239163517952,
"kl": 0.00044083595275878906,
"learning_rate": 5.248803227530763e-07,
"loss": 0.186,
"reward": 1.0214342884719372,
"reward_std": 1.2180421352386475,
"rewards/cosine_scaled_reward": 0.2398838261142373,
"rewards/format_reward": 0.5416666753590107,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 3469.25,
"epoch": 0.16285714285714287,
"grad_norm": 0.018204184249043465,
"kl": 0.0004673004150390625,
"learning_rate": 5.21744266211809e-07,
"loss": 0.063,
"reward": -0.13403620570898056,
"reward_std": 0.6568610742688179,
"rewards/cosine_scaled_reward": -0.12951810285449028,
"rewards/format_reward": 0.1250000037252903,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2795.166748046875,
"epoch": 0.16342857142857142,
"grad_norm": 0.013621930964291096,
"kl": 0.0003037452697753906,
"learning_rate": 5.186095868151436e-07,
"loss": -0.039,
"reward": 1.5945213325321674,
"reward_std": 0.5471338629722595,
"rewards/cosine_scaled_reward": 0.4222606960684061,
"rewards/format_reward": 0.75,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 2879.833465576172,
"epoch": 0.164,
"grad_norm": 0.01253263745456934,
"kl": 0.0004277229309082031,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0411,
"reward": 0.21084657812025398,
"reward_std": 1.322334498167038,
"rewards/cosine_scaled_reward": -0.12374338880181313,
"rewards/format_reward": 0.4583333395421505,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 3213.2916870117188,
"epoch": 0.16457142857142856,
"grad_norm": 0.024232791736721992,
"kl": 0.0003829002380371094,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0237,
"reward": -0.3808862268924713,
"reward_std": 0.41033722274005413,
"rewards/cosine_scaled_reward": -0.294609775301069,
"rewards/format_reward": 0.2083333432674408,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 2690.750045776367,
"epoch": 0.16514285714285715,
"grad_norm": 0.053029682487249374,
"kl": 0.00045490264892578125,
"learning_rate": 5.09215338910999e-07,
"loss": 0.1226,
"reward": -0.21079005533829331,
"reward_std": 0.3236595541238785,
"rewards/cosine_scaled_reward": -0.29289503768086433,
"rewards/format_reward": 0.3750000037252903,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 2208.916717529297,
"epoch": 0.1657142857142857,
"grad_norm": 0.03633257746696472,
"kl": 0.0009098052978515625,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0996,
"reward": 0.3243631422519684,
"reward_std": 0.7893142104148865,
"rewards/cosine_scaled_reward": -0.17115176958031952,
"rewards/format_reward": 0.6666666828095913,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 3469.4166870117188,
"epoch": 0.1662857142857143,
"grad_norm": 0.013034985400736332,
"kl": 0.00041294097900390625,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0095,
"reward": -0.2475043497979641,
"reward_std": 0.5379906464368105,
"rewards/cosine_scaled_reward": -0.227918840944767,
"rewards/format_reward": 0.2083333358168602,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 2733.541748046875,
"epoch": 0.16685714285714287,
"grad_norm": 0.023631447926163673,
"kl": 0.00041866302490234375,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0501,
"reward": -0.05327422299887985,
"reward_std": 0.6902021616697311,
"rewards/cosine_scaled_reward": -0.29747044667601585,
"rewards/format_reward": 0.5416666865348816,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1974.3333587646484,
"epoch": 0.16742857142857143,
"grad_norm": 0.025896325707435608,
"kl": 0.0005052089691162109,
"learning_rate": 4.967182142620745e-07,
"loss": -0.0503,
"reward": 1.0392636209726334,
"reward_std": 0.7821149528026581,
"rewards/cosine_scaled_reward": 0.16546513326466084,
"rewards/format_reward": 0.7083333432674408,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 3210.666748046875,
"epoch": 0.168,
"grad_norm": 0.016182757914066315,
"kl": 0.0005517005920410156,
"learning_rate": 4.93600044896063e-07,
"loss": 0.1227,
"reward": -0.3081187531352043,
"reward_std": 0.9994445107877254,
"rewards/cosine_scaled_reward": -0.27905938774347305,
"rewards/format_reward": 0.2500000037252903,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 3051.1251220703125,
"epoch": 0.16857142857142857,
"grad_norm": 0.021791962906718254,
"kl": 0.0004928112030029297,
"learning_rate": 4.904846243842949e-07,
"loss": 0.1472,
"reward": -0.0736542553640902,
"reward_std": 0.4684867039322853,
"rewards/cosine_scaled_reward": -0.20349380746483803,
"rewards/format_reward": 0.3333333395421505,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 3008.375030517578,
"epoch": 0.16914285714285715,
"grad_norm": 0.044715363532304764,
"kl": 0.0005588531494140625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.2406,
"reward": -0.45781777799129486,
"reward_std": 0.3647055197507143,
"rewards/cosine_scaled_reward": -0.33307556062936783,
"rewards/format_reward": 0.2083333395421505,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 3294.1666870117188,
"epoch": 0.1697142857142857,
"grad_norm": 0.01236710138618946,
"kl": 0.00040149688720703125,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0176,
"reward": 0.3674662681296468,
"reward_std": 0.8067609528079629,
"rewards/cosine_scaled_reward": 0.017066428757971153,
"rewards/format_reward": 0.3333333432674408,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 2191.2084045410156,
"epoch": 0.1702857142857143,
"grad_norm": 0.02793304994702339,
"kl": 0.0004506111145019531,
"learning_rate": 4.811563736721829e-07,
"loss": 0.2163,
"reward": 0.3552871508290991,
"reward_std": 0.6271071489900351,
"rewards/cosine_scaled_reward": -0.07235642522573471,
"rewards/format_reward": 0.5,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 2937.0833740234375,
"epoch": 0.17085714285714285,
"grad_norm": 0.02537981979548931,
"kl": 0.0006856918334960938,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0741,
"reward": 0.014573439490050077,
"reward_std": 0.6083418875932693,
"rewards/cosine_scaled_reward": -0.20104661397635937,
"rewards/format_reward": 0.4166666828095913,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 2810.375,
"epoch": 0.17142857142857143,
"grad_norm": 0.015299368649721146,
"kl": 0.00049591064453125,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0634,
"reward": -0.10026557371020317,
"reward_std": 0.45571645349264145,
"rewards/cosine_scaled_reward": -0.23763278499245644,
"rewards/format_reward": 0.375,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 2589.4583435058594,
"epoch": 0.172,
"grad_norm": 0.03871985152363777,
"kl": 0.000347137451171875,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.1152,
"reward": 0.054252732545137405,
"reward_std": 0.2450561560690403,
"rewards/cosine_scaled_reward": -0.1812069695442915,
"rewards/format_reward": 0.4166666716337204,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2865.7083435058594,
"epoch": 0.17257142857142857,
"grad_norm": 0.02540568634867668,
"kl": 0.0004420280456542969,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0172,
"reward": 0.15939845889806747,
"reward_std": 0.35454079881310463,
"rewards/cosine_scaled_reward": -0.04530075564980507,
"rewards/format_reward": 0.25,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 3046.25,
"epoch": 0.17314285714285715,
"grad_norm": 0.014560963958501816,
"kl": 0.0003390312194824219,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0832,
"reward": 0.007433712482452393,
"reward_std": 0.2396685965359211,
"rewards/cosine_scaled_reward": -0.1212831512093544,
"rewards/format_reward": 0.25,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 3481.0416870117188,
"epoch": 0.1737142857142857,
"grad_norm": 0.016006356105208397,
"kl": 0.00033545494079589844,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0346,
"reward": -0.22439849376678467,
"reward_std": 0.6627911329269409,
"rewards/cosine_scaled_reward": -0.17469927296042442,
"rewards/format_reward": 0.1250000037252903,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 2904.5418090820312,
"epoch": 0.1742857142857143,
"grad_norm": 0.025142712518572807,
"kl": 0.00030517578125,
"learning_rate": 4.59514935484316e-07,
"loss": 0.1243,
"reward": 0.8032534047961235,
"reward_std": 0.8733390048146248,
"rewards/cosine_scaled_reward": 0.13079336285591125,
"rewards/format_reward": 0.5416666753590107,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 2188.0833587646484,
"epoch": 0.17485714285714285,
"grad_norm": 0.03312807157635689,
"kl": 0.0004315376281738281,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0318,
"reward": 0.7741055563092232,
"reward_std": 0.8599487096071243,
"rewards/cosine_scaled_reward": 0.0745527446269989,
"rewards/format_reward": 0.625,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 3403.7500610351562,
"epoch": 0.17542857142857143,
"grad_norm": 0.01129495445638895,
"kl": 0.0003752708435058594,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0876,
"reward": -0.2145760916173458,
"reward_std": 0.883152648806572,
"rewards/cosine_scaled_reward": -0.2739547099918127,
"rewards/format_reward": 0.3333333395421505,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2245.3333435058594,
"epoch": 0.176,
"grad_norm": 0.02003965899348259,
"kl": 0.00042700767517089844,
"learning_rate": 4.503031760712397e-07,
"loss": 0.1665,
"reward": 0.9610237777233124,
"reward_std": 0.924444355070591,
"rewards/cosine_scaled_reward": 0.1888452209532261,
"rewards/format_reward": 0.5833333432674408,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17657142857142857,
"grad_norm": 0.015420272946357727,
"kl": 0.0005145072937011719,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0,
"reward": -0.4918478289619088,
"reward_std": 0.18744987901300192,
"rewards/cosine_scaled_reward": -0.24592391401529312,
"rewards/format_reward": 0.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17714285714285713,
"grad_norm": 0.011936242692172527,
"kl": 0.00030040740966796875,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0,
"reward": -0.5715262293815613,
"reward_std": 0.19434408470988274,
"rewards/cosine_scaled_reward": -0.28576310351490974,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 3039.9583435058594,
"epoch": 0.1777142857142857,
"grad_norm": 0.013624753803014755,
"kl": 0.0003497600555419922,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0761,
"reward": 0.008271858096122742,
"reward_std": 0.42439935728907585,
"rewards/cosine_scaled_reward": -0.12086406722664833,
"rewards/format_reward": 0.25,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1782857142857143,
"grad_norm": 0.014998997561633587,
"kl": 0.0006074905395507812,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0,
"reward": -0.5124310553073883,
"reward_std": 0.2232176773250103,
"rewards/cosine_scaled_reward": -0.25621553510427475,
"rewards/format_reward": 0.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2576.7916870117188,
"epoch": 0.17885714285714285,
"grad_norm": 0.028986027464270592,
"kl": 0.0004107952117919922,
"learning_rate": 4.350494089288943e-07,
"loss": 0.1646,
"reward": 0.38414837792515755,
"reward_std": 0.8268394228070974,
"rewards/cosine_scaled_reward": -0.03709249012172222,
"rewards/format_reward": 0.4583333358168602,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2342.250030517578,
"epoch": 0.17942857142857144,
"grad_norm": 0.07766366004943848,
"kl": 0.0004897117614746094,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.2676,
"reward": 1.206177432090044,
"reward_std": 1.3236718773841858,
"rewards/cosine_scaled_reward": 0.29058872163295746,
"rewards/format_reward": 0.6250000111758709,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2941.8333740234375,
"epoch": 0.18,
"grad_norm": 0.015173373743891716,
"kl": 0.00037097930908203125,
"learning_rate": 4.2898608072313045e-07,
"loss": -0.0539,
"reward": -0.3268696665763855,
"reward_std": 0.39195265993475914,
"rewards/cosine_scaled_reward": -0.33010151237249374,
"rewards/format_reward": 0.3333333358168602,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 3459.75,
"epoch": 0.18057142857142858,
"grad_norm": 0.01420869305729866,
"kl": 0.0004048347473144531,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0212,
"reward": -0.1601060489192605,
"reward_std": 0.7861558832228184,
"rewards/cosine_scaled_reward": -0.16338635561987758,
"rewards/format_reward": 0.1666666716337204,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2739.7083740234375,
"epoch": 0.18114285714285713,
"grad_norm": 0.02297414094209671,
"kl": 0.002285003662109375,
"learning_rate": 4.2294634442070553e-07,
"loss": -0.0329,
"reward": 0.5244220271706581,
"reward_std": 0.9719415307044983,
"rewards/cosine_scaled_reward": 0.0330443549901247,
"rewards/format_reward": 0.4583333358168602,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 2666.5416870117188,
"epoch": 0.18171428571428572,
"grad_norm": 0.022833170369267464,
"kl": 0.0004475116729736328,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.1241,
"reward": 0.19690169394016266,
"reward_std": 0.6580867804586887,
"rewards/cosine_scaled_reward": -0.19321582466363907,
"rewards/format_reward": 0.583333358168602,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 2698.3750610351562,
"epoch": 0.18228571428571427,
"grad_norm": 0.034383926540613174,
"kl": 0.0003724098205566406,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.2278,
"reward": 0.6201122887432575,
"reward_std": 0.6080379486083984,
"rewards/cosine_scaled_reward": 0.06005614344030619,
"rewards/format_reward": 0.5000000074505806,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2862.3750610351562,
"epoch": 0.18285714285714286,
"grad_norm": 0.0345999151468277,
"kl": 0.00040721893310546875,
"learning_rate": 4.1393354916230005e-07,
"loss": -0.0318,
"reward": 0.12236133217811584,
"reward_std": 0.6523439809679985,
"rewards/cosine_scaled_reward": -0.10548599809408188,
"rewards/format_reward": 0.3333333358168602,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 3358.125,
"epoch": 0.18342857142857144,
"grad_norm": 0.013232111930847168,
"kl": 0.0003991127014160156,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0404,
"reward": -0.11832981812767684,
"reward_std": 0.42844806239008904,
"rewards/cosine_scaled_reward": -0.16333157755434513,
"rewards/format_reward": 0.2083333395421505,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 3566.4166870117188,
"epoch": 0.184,
"grad_norm": 0.016278283670544624,
"kl": 0.00032711029052734375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0102,
"reward": -0.4220607131719589,
"reward_std": 0.5171524062752724,
"rewards/cosine_scaled_reward": -0.23186369240283966,
"rewards/format_reward": 0.0416666679084301,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 3138.6666870117188,
"epoch": 0.18457142857142858,
"grad_norm": 0.013172892853617668,
"kl": 0.0003600120544433594,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0537,
"reward": 0.05053871381096542,
"reward_std": 0.8983977390453219,
"rewards/cosine_scaled_reward": -0.16223064810037613,
"rewards/format_reward": 0.3750000111758709,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 3010.291748046875,
"epoch": 0.18514285714285714,
"grad_norm": 0.025909846648573875,
"kl": 0.00040721893310546875,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0602,
"reward": 0.5590743962675333,
"reward_std": 0.8683347813785076,
"rewards/cosine_scaled_reward": 0.029537230730056763,
"rewards/format_reward": 0.5000000223517418,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2910.000030517578,
"epoch": 0.18571428571428572,
"grad_norm": 0.015413357876241207,
"kl": 0.0004220008850097656,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0317,
"reward": -0.0408908948302269,
"reward_std": 0.5930759366601706,
"rewards/cosine_scaled_reward": -0.20794545486569405,
"rewards/format_reward": 0.375,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2532.166748046875,
"epoch": 0.18628571428571428,
"grad_norm": 0.030060861259698868,
"kl": 0.0005965232849121094,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0679,
"reward": 0.6265107244253159,
"reward_std": 0.9718786887824535,
"rewards/cosine_scaled_reward": 0.06325538456439972,
"rewards/format_reward": 0.5,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 3148.875,
"epoch": 0.18685714285714286,
"grad_norm": 0.041659917682409286,
"kl": 0.0004248619079589844,
"learning_rate": 3.931425787051832e-07,
"loss": 0.1537,
"reward": -0.35396782122552395,
"reward_std": 0.2636511065065861,
"rewards/cosine_scaled_reward": -0.2603172492235899,
"rewards/format_reward": 0.1666666716337204,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2493.916717529297,
"epoch": 0.18742857142857142,
"grad_norm": 0.020772553980350494,
"kl": 0.0006198883056640625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.1175,
"reward": 0.18326607067137957,
"reward_std": 0.49889209493994713,
"rewards/cosine_scaled_reward": -0.11670029908418655,
"rewards/format_reward": 0.4166666716337204,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 3093.291748046875,
"epoch": 0.188,
"grad_norm": 0.011293930932879448,
"kl": 0.0003113746643066406,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0669,
"reward": 0.5585142355412245,
"reward_std": 1.4061576128005981,
"rewards/cosine_scaled_reward": 0.09175711218267679,
"rewards/format_reward": 0.3750000111758709,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2978.7916870117188,
"epoch": 0.18857142857142858,
"grad_norm": 0.05254025384783745,
"kl": 0.000415802001953125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.2134,
"reward": 0.02450428158044815,
"reward_std": 0.6984777390025556,
"rewards/cosine_scaled_reward": -0.11274785548448563,
"rewards/format_reward": 0.2500000111758709,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2632.4583587646484,
"epoch": 0.18914285714285714,
"grad_norm": 0.020709240809082985,
"kl": 0.000537872314453125,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0682,
"reward": 0.45295886788517237,
"reward_std": 0.2939260210841894,
"rewards/cosine_scaled_reward": 0.018146060872823,
"rewards/format_reward": 0.4166666679084301,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 2125.0416870117188,
"epoch": 0.18971428571428572,
"grad_norm": 0.019701264798641205,
"kl": 0.00028228759765625,
"learning_rate": 3.785183306423767e-07,
"loss": 0.198,
"reward": 0.22587934881448746,
"reward_std": 0.6166221983730793,
"rewards/cosine_scaled_reward": -0.15789367514662445,
"rewards/format_reward": 0.541666679084301,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 2881.3333740234375,
"epoch": 0.19028571428571428,
"grad_norm": 0.019184157252311707,
"kl": 0.0005030632019042969,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0184,
"reward": -0.2152223140001297,
"reward_std": 0.38293132930994034,
"rewards/cosine_scaled_reward": -0.25344450399279594,
"rewards/format_reward": 0.2916666679084301,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 3178.7083435058594,
"epoch": 0.19085714285714286,
"grad_norm": 0.014846621081233025,
"kl": 0.000293731689453125,
"learning_rate": 3.72726140684072e-07,
"loss": 0.1053,
"reward": 0.18489570170640945,
"reward_std": 1.0315047651529312,
"rewards/cosine_scaled_reward": -0.03255215287208557,
"rewards/format_reward": 0.2500000074505806,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2227.750030517578,
"epoch": 0.19142857142857142,
"grad_norm": 0.022946475073695183,
"kl": 0.00047016143798828125,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0083,
"reward": 0.32345347106456757,
"reward_std": 0.5401728432625532,
"rewards/cosine_scaled_reward": -0.1924399547278881,
"rewards/format_reward": 0.7083333395421505,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 2682.9166870117188,
"epoch": 0.192,
"grad_norm": 0.029810767620801926,
"kl": 0.0006122589111328125,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.1395,
"reward": 0.37340210494585335,
"reward_std": 0.7611111477017403,
"rewards/cosine_scaled_reward": -0.08413228765130043,
"rewards/format_reward": 0.5416666716337204,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2755.2500228881836,
"epoch": 0.19257142857142856,
"grad_norm": 0.030772754922509193,
"kl": 0.0004162788391113281,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0926,
"reward": 0.6443270817399025,
"reward_std": 0.9492446109652519,
"rewards/cosine_scaled_reward": 0.09299685433506966,
"rewards/format_reward": 0.4583333358168602,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2727.2083435058594,
"epoch": 0.19314285714285714,
"grad_norm": 0.019305266439914703,
"kl": 0.0005497932434082031,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0282,
"reward": 0.6710007563233376,
"reward_std": 1.21895881742239,
"rewards/cosine_scaled_reward": 0.08550036698579788,
"rewards/format_reward": 0.5000000074505806,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.19371428571428573,
"grad_norm": 0.01511505339294672,
"kl": 0.0004024505615234375,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0,
"reward": -0.610577579587698,
"reward_std": 0.23075975850224495,
"rewards/cosine_scaled_reward": -0.305288789793849,
"rewards/format_reward": 0.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2202.125015258789,
"epoch": 0.19428571428571428,
"grad_norm": 0.02869362384080887,
"kl": 0.0005574226379394531,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0131,
"reward": 0.296954870223999,
"reward_std": 0.4164566658437252,
"rewards/cosine_scaled_reward": -0.10152260027825832,
"rewards/format_reward": 0.5,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1960.1667175292969,
"epoch": 0.19485714285714287,
"grad_norm": 0.05206461623311043,
"kl": 0.00039577484130859375,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.268,
"reward": 0.24809112399816513,
"reward_std": 0.6666374318301678,
"rewards/cosine_scaled_reward": -0.23012111335992813,
"rewards/format_reward": 0.7083333432674408,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2733.4166717529297,
"epoch": 0.19542857142857142,
"grad_norm": 0.028285467997193336,
"kl": 0.000484466552734375,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.1274,
"reward": 0.2500569764524698,
"reward_std": 0.38465849310159683,
"rewards/cosine_scaled_reward": -0.062471505254507065,
"rewards/format_reward": 0.3750000037252903,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 2453.500030517578,
"epoch": 0.196,
"grad_norm": 0.03303263336420059,
"kl": 0.0004892349243164062,
"learning_rate": 3.471051066897562e-07,
"loss": 0.2808,
"reward": -0.11763790249824524,
"reward_std": 0.5739464350044727,
"rewards/cosine_scaled_reward": -0.30881896358914673,
"rewards/format_reward": 0.5000000223517418,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2497.2083435058594,
"epoch": 0.19657142857142856,
"grad_norm": 0.05244883522391319,
"kl": 0.00046515464782714844,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0989,
"reward": 0.34542886912822723,
"reward_std": 0.9526717662811279,
"rewards/cosine_scaled_reward": -0.03561893478035927,
"rewards/format_reward": 0.4166666679084301,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1832.7500305175781,
"epoch": 0.19714285714285715,
"grad_norm": 0.041937097907066345,
"kl": 0.0008487701416015625,
"learning_rate": 3.4151678419606233e-07,
"loss": -0.1656,
"reward": 1.009265385568142,
"reward_std": 0.40681467205286026,
"rewards/cosine_scaled_reward": 0.15046602487564087,
"rewards/format_reward": 0.7083333432674408,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1977142857142857,
"grad_norm": 0.011569972150027752,
"kl": 0.000385284423828125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0,
"reward": -0.31521460227668285,
"reward_std": 0.19862121110782027,
"rewards/cosine_scaled_reward": -0.19927397277206182,
"rewards/format_reward": 0.0833333358168602,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 2886.0416870117188,
"epoch": 0.1982857142857143,
"grad_norm": 0.013902098871767521,
"kl": 0.00035190582275390625,
"learning_rate": 3.359691059183761e-07,
"loss": -0.0006,
"reward": 0.06203071027994156,
"reward_std": 0.5102610923349857,
"rewards/cosine_scaled_reward": -0.11481798812747002,
"rewards/format_reward": 0.2916666679084301,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 2697.4166870117188,
"epoch": 0.19885714285714284,
"grad_norm": 0.028411809355020523,
"kl": 0.0004925727844238281,
"learning_rate": 3.3321084665422803e-07,
"loss": -0.1683,
"reward": 0.16876617819070816,
"reward_std": 0.468828896060586,
"rewards/cosine_scaled_reward": -0.16561690717935562,
"rewards/format_reward": 0.5,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 3510.6666870117188,
"epoch": 0.19942857142857143,
"grad_norm": 0.012191089801490307,
"kl": 0.00038242340087890625,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0276,
"reward": -0.3254171907901764,
"reward_std": 0.6530030593276024,
"rewards/cosine_scaled_reward": -0.2460419312119484,
"rewards/format_reward": 0.1666666716337204,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2198.541717529297,
"epoch": 0.2,
"grad_norm": 0.01535298302769661,
"kl": 0.0005538463592529297,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0941,
"reward": 1.250352792441845,
"reward_std": 0.6242426857352257,
"rewards/cosine_scaled_reward": 0.2710097096860409,
"rewards/format_reward": 0.7083333395421505,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 2659.791748046875,
"epoch": 0.20057142857142857,
"grad_norm": 0.017833461984992027,
"kl": 0.0005612373352050781,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0465,
"reward": 0.1603565402328968,
"reward_std": 0.8732819259166718,
"rewards/cosine_scaled_reward": -0.1698217373341322,
"rewards/format_reward": 0.5,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2180.500015258789,
"epoch": 0.20114285714285715,
"grad_norm": 0.01808895543217659,
"kl": 0.0004558563232421875,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0133,
"reward": 0.2774962969124317,
"reward_std": 0.7046881169080734,
"rewards/cosine_scaled_reward": -0.15291852178052068,
"rewards/format_reward": 0.5833333358168602,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2483.750030517578,
"epoch": 0.2017142857142857,
"grad_norm": 0.023005694150924683,
"kl": 0.0004525184631347656,
"learning_rate": 3.195807108082429e-07,
"loss": 0.2053,
"reward": 0.28026173263788223,
"reward_std": 0.7538251765072346,
"rewards/cosine_scaled_reward": -0.13070247694849968,
"rewards/format_reward": 0.541666679084301,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 2603.5833435058594,
"epoch": 0.2022857142857143,
"grad_norm": 0.019903168082237244,
"kl": 0.0005669593811035156,
"learning_rate": 3.168878457820915e-07,
"loss": 0.098,
"reward": 0.14617812633514404,
"reward_std": 0.5162135027348995,
"rewards/cosine_scaled_reward": -0.15607764571905136,
"rewards/format_reward": 0.4583333432674408,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2432.875030517578,
"epoch": 0.20285714285714285,
"grad_norm": 0.02829565852880478,
"kl": 0.0006160736083984375,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0099,
"reward": 0.19245748221874237,
"reward_std": 0.2699956987053156,
"rewards/cosine_scaled_reward": -0.1329379379749298,
"rewards/format_reward": 0.4583333432674408,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 2468.8334197998047,
"epoch": 0.20342857142857143,
"grad_norm": 0.02010870911180973,
"kl": 0.0003266334533691406,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0836,
"reward": 1.1018516272306442,
"reward_std": 0.832906199619174,
"rewards/cosine_scaled_reward": 0.2800924628973007,
"rewards/format_reward": 0.541666679084301,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 2738.2083740234375,
"epoch": 0.204,
"grad_norm": 0.02142561413347721,
"kl": 0.0004112720489501953,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0696,
"reward": -0.34853553399443626,
"reward_std": 0.1840939112007618,
"rewards/cosine_scaled_reward": -0.3617677837610245,
"rewards/format_reward": 0.375,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2853.416717529297,
"epoch": 0.20457142857142857,
"grad_norm": 0.016107890754938126,
"kl": 0.0005478858947753906,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0663,
"reward": 0.6284266784787178,
"reward_std": 0.72141382843256,
"rewards/cosine_scaled_reward": 0.10587997734546661,
"rewards/format_reward": 0.4166666679084301,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2372.000045776367,
"epoch": 0.20514285714285715,
"grad_norm": 0.021992484107613564,
"kl": 0.0005311965942382812,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.086,
"reward": 0.6230212822556496,
"reward_std": 0.8500233590602875,
"rewards/cosine_scaled_reward": -0.021822698414325714,
"rewards/format_reward": 0.6666666828095913,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 3455.2500610351562,
"epoch": 0.2057142857142857,
"grad_norm": 0.030560219660401344,
"kl": 0.00031185150146484375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0487,
"reward": -0.10205069184303284,
"reward_std": 0.8343977108597755,
"rewards/cosine_scaled_reward": -0.13435868825763464,
"rewards/format_reward": 0.1666666716337204,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 2516.9583740234375,
"epoch": 0.2062857142857143,
"grad_norm": 0.016366608440876007,
"kl": 0.0004210472106933594,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0794,
"reward": 0.12586134672164917,
"reward_std": 0.6533399596810341,
"rewards/cosine_scaled_reward": -0.18706931918859482,
"rewards/format_reward": 0.5000000111758709,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 3351.7916870117188,
"epoch": 0.20685714285714285,
"grad_norm": 0.01776767335832119,
"kl": 0.0003039836883544922,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0928,
"reward": 0.29054381139576435,
"reward_std": 1.0099711641669273,
"rewards/cosine_scaled_reward": -0.04222810734063387,
"rewards/format_reward": 0.3750000074505806,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 2496.8750610351562,
"epoch": 0.20742857142857143,
"grad_norm": 0.03273333981633186,
"kl": 0.0006189346313476562,
"learning_rate": 2.931788945420058e-07,
"loss": 0.2348,
"reward": 0.34358200430870056,
"reward_std": 0.7383445575833321,
"rewards/cosine_scaled_reward": -0.1407090239226818,
"rewards/format_reward": 0.6250000111758709,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 3452.375,
"epoch": 0.208,
"grad_norm": 0.014953386969864368,
"kl": 0.00035858154296875,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.043,
"reward": -0.4569111131131649,
"reward_std": 0.34297534823417664,
"rewards/cosine_scaled_reward": -0.2701222151517868,
"rewards/format_reward": 0.0833333358168602,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 2785.7083740234375,
"epoch": 0.20857142857142857,
"grad_norm": 0.02793431654572487,
"kl": 0.0006036758422851562,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0765,
"reward": 0.11223252862691879,
"reward_std": 0.619122963398695,
"rewards/cosine_scaled_reward": -0.21471708547323942,
"rewards/format_reward": 0.5416666865348816,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 3409.541748046875,
"epoch": 0.20914285714285713,
"grad_norm": 0.01248230878263712,
"kl": 0.0003552436828613281,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0491,
"reward": -0.3326341025531292,
"reward_std": 0.3651573769748211,
"rewards/cosine_scaled_reward": -0.2288170587271452,
"rewards/format_reward": 0.1250000037252903,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2808.4583587646484,
"epoch": 0.20971428571428571,
"grad_norm": 0.028179485350847244,
"kl": 0.0006113052368164062,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0589,
"reward": 0.1167896268889308,
"reward_std": 1.1129950881004333,
"rewards/cosine_scaled_reward": -0.10827185213565826,
"rewards/format_reward": 0.3333333358168602,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 2660.791748046875,
"epoch": 0.2102857142857143,
"grad_norm": 0.0340777263045311,
"kl": 0.0005307197570800781,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.1111,
"reward": 1.4332922995090485,
"reward_std": 1.385080635547638,
"rewards/cosine_scaled_reward": 0.34164613112807274,
"rewards/format_reward": 0.7500000149011612,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 3233.3750610351562,
"epoch": 0.21085714285714285,
"grad_norm": 0.013750105164945126,
"kl": 0.0003342628479003906,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0717,
"reward": 0.05912124365568161,
"reward_std": 0.874366108328104,
"rewards/cosine_scaled_reward": -0.09543937258422375,
"rewards/format_reward": 0.2500000074505806,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 3452.4166870117188,
"epoch": 0.21142857142857144,
"grad_norm": 0.012770992703735828,
"kl": 0.0004000663757324219,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0281,
"reward": 0.15689724683761597,
"reward_std": 0.2785376161336899,
"rewards/cosine_scaled_reward": -0.04655133932828903,
"rewards/format_reward": 0.25,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 2499.791717529297,
"epoch": 0.212,
"grad_norm": 0.018871566280722618,
"kl": 0.00035071372985839844,
"learning_rate": 2.729523361034538e-07,
"loss": 0.1363,
"reward": 0.5506950244307518,
"reward_std": 0.9160189777612686,
"rewards/cosine_scaled_reward": 0.025347519665956497,
"rewards/format_reward": 0.5000000037252903,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 3505.2083740234375,
"epoch": 0.21257142857142858,
"grad_norm": 0.01206301525235176,
"kl": 0.00035262107849121094,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0086,
"reward": -0.2348268087953329,
"reward_std": 0.3814601432532072,
"rewards/cosine_scaled_reward": -0.1799134025350213,
"rewards/format_reward": 0.125,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2949.166717529297,
"epoch": 0.21314285714285713,
"grad_norm": 0.01700667478144169,
"kl": 0.0003724098205566406,
"learning_rate": 2.6802828488599294e-07,
"loss": -0.0476,
"reward": 0.22861511493101716,
"reward_std": 0.5081553272902966,
"rewards/cosine_scaled_reward": -0.031525759026408195,
"rewards/format_reward": 0.2916666679084301,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 3467.0000610351562,
"epoch": 0.21371428571428572,
"grad_norm": 0.017147116363048553,
"kl": 0.00045490264892578125,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0457,
"reward": -0.4217256158590317,
"reward_std": 0.6266062669456005,
"rewards/cosine_scaled_reward": -0.31502948701381683,
"rewards/format_reward": 0.2083333358168602,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 2722.375,
"epoch": 0.21428571428571427,
"grad_norm": 0.02914683148264885,
"kl": 0.0004086494445800781,
"learning_rate": 2.631592046130896e-07,
"loss": 0.1972,
"reward": 0.23988548666238785,
"reward_std": 0.4570600874722004,
"rewards/cosine_scaled_reward": -0.06755725666880608,
"rewards/format_reward": 0.3750000149011612,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 3282.5834350585938,
"epoch": 0.21485714285714286,
"grad_norm": 0.01625804975628853,
"kl": 0.00047779083251953125,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0483,
"reward": 0.39857788383960724,
"reward_std": 0.9685629121959209,
"rewards/cosine_scaled_reward": -0.00904439389705658,
"rewards/format_reward": 0.416666679084301,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2434.125,
"epoch": 0.21542857142857144,
"grad_norm": 0.034063227474689484,
"kl": 0.0004830360412597656,
"learning_rate": 2.583460445215911e-07,
"loss": 0.164,
"reward": 0.0973532497882843,
"reward_std": 0.7279782295227051,
"rewards/cosine_scaled_reward": -0.22215671092271805,
"rewards/format_reward": 0.5416666753590107,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 2981.2083435058594,
"epoch": 0.216,
"grad_norm": 0.02709144353866577,
"kl": 0.0005640983581542969,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.2001,
"reward": -0.3218484600074589,
"reward_std": 0.3952440693974495,
"rewards/cosine_scaled_reward": -0.28592423163354397,
"rewards/format_reward": 0.2500000111758709,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 3029.7083740234375,
"epoch": 0.21657142857142858,
"grad_norm": 0.020819807425141335,
"kl": 0.0030651092529296875,
"learning_rate": 2.5358974294659373e-07,
"loss": -0.0893,
"reward": -0.21052202675491571,
"reward_std": 0.38894602842628956,
"rewards/cosine_scaled_reward": -0.2719276868738234,
"rewards/format_reward": 0.3333333358168602,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 3011.0833740234375,
"epoch": 0.21714285714285714,
"grad_norm": 0.015882771462202072,
"kl": 0.0006201267242431641,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0272,
"reward": 0.18160124588757753,
"reward_std": 1.1701444238424301,
"rewards/cosine_scaled_reward": -0.09669936696445802,
"rewards/format_reward": 0.3750000037252903,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 3127.2500610351562,
"epoch": 0.21771428571428572,
"grad_norm": 0.07095064222812653,
"kl": 0.0007042884826660156,
"learning_rate": 2.488912271385139e-07,
"loss": 0.1667,
"reward": -0.5090018883347511,
"reward_std": 0.36618487909436226,
"rewards/cosine_scaled_reward": -0.35866761952638626,
"rewards/format_reward": 0.2083333358168602,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 2951.2916870117188,
"epoch": 0.21828571428571428,
"grad_norm": 0.03872789442539215,
"kl": 0.0005259513854980469,
"learning_rate": 2.465639255873246e-07,
"loss": 0.1543,
"reward": -0.07253091287566349,
"reward_std": 0.7700534537434578,
"rewards/cosine_scaled_reward": -0.20293213427066803,
"rewards/format_reward": 0.3333333469927311,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2982.2916870117188,
"epoch": 0.21885714285714286,
"grad_norm": 0.014369679614901543,
"kl": 0.0005116462707519531,
"learning_rate": 2.4425141308231765e-07,
"loss": -0.0014,
"reward": 0.13170818611979485,
"reward_std": 0.41740766912698746,
"rewards/cosine_scaled_reward": -0.059145910665392876,
"rewards/format_reward": 0.25,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 3228.625,
"epoch": 0.21942857142857142,
"grad_norm": 0.012754120863974094,
"kl": 0.0003314018249511719,
"learning_rate": 2.4195380233209006e-07,
"loss": -0.0007,
"reward": 0.17525914683938026,
"reward_std": 0.6218334436416626,
"rewards/cosine_scaled_reward": -0.09987044055014849,
"rewards/format_reward": 0.3750000149011612,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2379.2916717529297,
"epoch": 0.22,
"grad_norm": 0.020193729549646378,
"kl": 0.0006628036499023438,
"learning_rate": 2.3967120531894857e-07,
"loss": -0.0018,
"reward": -0.0569206103682518,
"reward_std": 0.6073889955878258,
"rewards/cosine_scaled_reward": -0.257626973092556,
"rewards/format_reward": 0.4583333432674408,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 3001.1666870117188,
"epoch": 0.22057142857142858,
"grad_norm": 0.018432218581438065,
"kl": 0.0004229545593261719,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0952,
"reward": 0.21215322148054838,
"reward_std": 0.888201154768467,
"rewards/cosine_scaled_reward": -0.12309006974101067,
"rewards/format_reward": 0.4583333432674408,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 1933.2083740234375,
"epoch": 0.22114285714285714,
"grad_norm": 0.05453366041183472,
"kl": 0.0005445480346679688,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.1831,
"reward": 1.217309720814228,
"reward_std": 0.4588906615972519,
"rewards/cosine_scaled_reward": 0.33782152086496353,
"rewards/format_reward": 0.5416666679084301,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 1593.1667175292969,
"epoch": 0.22171428571428572,
"grad_norm": 0.026233471930027008,
"kl": 0.0007166862487792969,
"learning_rate": 2.3291460551638237e-07,
"loss": -0.0644,
"reward": 1.6789040267467499,
"reward_std": 1.0171207189559937,
"rewards/cosine_scaled_reward": 0.33945199474692345,
"rewards/format_reward": 1.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1572.4583587646484,
"epoch": 0.22228571428571428,
"grad_norm": 0.02512061409652233,
"kl": 0.0004048347473144531,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0294,
"reward": 1.3811066150665283,
"reward_std": 0.3954271301627159,
"rewards/cosine_scaled_reward": 0.3155532553792,
"rewards/format_reward": 0.75,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 3255.9583740234375,
"epoch": 0.22285714285714286,
"grad_norm": 0.03513728827238083,
"kl": 0.0005030632019042969,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.1731,
"reward": -0.12103257514536381,
"reward_std": 0.8534305840730667,
"rewards/cosine_scaled_reward": -0.16468296200037003,
"rewards/format_reward": 0.2083333395421505,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 2981.8333740234375,
"epoch": 0.22342857142857142,
"grad_norm": 0.03934527188539505,
"kl": 0.0004153251647949219,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.1821,
"reward": -0.06651334711932577,
"reward_std": 0.2963850498199463,
"rewards/cosine_scaled_reward": -0.2207566797733307,
"rewards/format_reward": 0.3750000111758709,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 3223.0833740234375,
"epoch": 0.224,
"grad_norm": 0.013558811508119106,
"kl": 0.00038242340087890625,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0057,
"reward": -0.2705407738685608,
"reward_std": 0.2837679469957948,
"rewards/cosine_scaled_reward": -0.2602703794836998,
"rewards/format_reward": 0.25,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 3398.0000610351562,
"epoch": 0.22457142857142856,
"grad_norm": 0.012919829227030277,
"kl": 0.0003833770751953125,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0292,
"reward": -0.32555074989795685,
"reward_std": 0.6584090702235699,
"rewards/cosine_scaled_reward": -0.2669420391321182,
"rewards/format_reward": 0.2083333395421505,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 2330.0833587646484,
"epoch": 0.22514285714285714,
"grad_norm": 0.03346144035458565,
"kl": 0.0005555152893066406,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.167,
"reward": 0.23031404614448547,
"reward_std": 0.7982805892825127,
"rewards/cosine_scaled_reward": -0.11400966346263885,
"rewards/format_reward": 0.4583333358168602,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 3490.75,
"epoch": 0.2257142857142857,
"grad_norm": 0.016392916440963745,
"kl": 0.0003719329833984375,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0408,
"reward": -0.04605567455291748,
"reward_std": 0.8931695856153965,
"rewards/cosine_scaled_reward": -0.10636119358241558,
"rewards/format_reward": 0.1666666716337204,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 2741.9166870117188,
"epoch": 0.22628571428571428,
"grad_norm": 0.012639901600778103,
"kl": 0.0002758502960205078,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0119,
"reward": 0.37444762885570526,
"reward_std": 0.4292972981929779,
"rewards/cosine_scaled_reward": -0.06277619302272797,
"rewards/format_reward": 0.5000000111758709,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 2962.8333435058594,
"epoch": 0.22685714285714287,
"grad_norm": 0.01602632738649845,
"kl": 0.00043392181396484375,
"learning_rate": 2.134908592756607e-07,
"loss": -0.0317,
"reward": 0.5750939613208175,
"reward_std": 0.7307236031629145,
"rewards/cosine_scaled_reward": 0.05838030157610774,
"rewards/format_reward": 0.4583333432674408,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 2762.5833740234375,
"epoch": 0.22742857142857142,
"grad_norm": 0.016584103927016258,
"kl": 0.00033473968505859375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0113,
"reward": 0.8510713949799538,
"reward_std": 0.8986274972558022,
"rewards/cosine_scaled_reward": 0.1338690184056759,
"rewards/format_reward": 0.5833333469927311,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 3111.7916870117188,
"epoch": 0.228,
"grad_norm": 0.04136970639228821,
"kl": 0.0006189346313476562,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0911,
"reward": -0.29911787807941437,
"reward_std": 0.36130358278751373,
"rewards/cosine_scaled_reward": -0.2537256069481373,
"rewards/format_reward": 0.2083333432674408,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 1176.083381652832,
"epoch": 0.22857142857142856,
"grad_norm": 0.032378729432821274,
"kl": 0.0008146762847900391,
"learning_rate": 2.0730776160846853e-07,
"loss": -0.0498,
"reward": 1.2029592096805573,
"reward_std": 0.8494270741939545,
"rewards/cosine_scaled_reward": 0.10147958248853683,
"rewards/format_reward": 1.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 2217.2083892822266,
"epoch": 0.22914285714285715,
"grad_norm": 0.03296418488025665,
"kl": 0.0005393028259277344,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0836,
"reward": 0.29183474462479353,
"reward_std": 0.5516791455447674,
"rewards/cosine_scaled_reward": -0.1874159649014473,
"rewards/format_reward": 0.6666666716337204,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2966.0833740234375,
"epoch": 0.2297142857142857,
"grad_norm": 0.023311948403716087,
"kl": 0.0004687309265136719,
"learning_rate": 2.032690407508949e-07,
"loss": -0.0487,
"reward": -0.14803513139486313,
"reward_std": 0.5541299842298031,
"rewards/cosine_scaled_reward": -0.21985089778900146,
"rewards/format_reward": 0.2916666679084301,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2138.375015258789,
"epoch": 0.2302857142857143,
"grad_norm": 0.0190444178879261,
"kl": 0.0005555152893066406,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.1245,
"reward": 0.7670382708311081,
"reward_std": 0.880347341299057,
"rewards/cosine_scaled_reward": 0.05018577980808914,
"rewards/format_reward": 0.6666666716337204,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 2700.1666717529297,
"epoch": 0.23085714285714284,
"grad_norm": 0.02195879817008972,
"kl": 0.0004572868347167969,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.1354,
"reward": -0.03147786110639572,
"reward_std": 0.5353209748864174,
"rewards/cosine_scaled_reward": -0.22407225891947746,
"rewards/format_reward": 0.4166666716337204,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2581.000045776367,
"epoch": 0.23142857142857143,
"grad_norm": 0.01655156910419464,
"kl": 0.00037384033203125,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.063,
"reward": 0.6779801677912474,
"reward_std": 0.5696643739938736,
"rewards/cosine_scaled_reward": 0.08899005688726902,
"rewards/format_reward": 0.5000000074505806,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 3439.5000610351562,
"epoch": 0.232,
"grad_norm": 0.02102508395910263,
"kl": 0.0006380081176757812,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0608,
"reward": -0.2339201234281063,
"reward_std": 0.7671289071440697,
"rewards/cosine_scaled_reward": -0.22112673026276752,
"rewards/format_reward": 0.2083333395421505,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 2525.6666870117188,
"epoch": 0.23257142857142857,
"grad_norm": 0.016638562083244324,
"kl": 0.00039958953857421875,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0268,
"reward": 0.5109657794237137,
"reward_std": 0.6168239414691925,
"rewards/cosine_scaled_reward": 0.04714955762028694,
"rewards/format_reward": 0.4166666716337204,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2517.6250610351562,
"epoch": 0.23314285714285715,
"grad_norm": 0.024556465446949005,
"kl": 0.0006203651428222656,
"learning_rate": 1.915615368891117e-07,
"loss": 0.1592,
"reward": 0.06803740188479424,
"reward_std": 0.8579247817397118,
"rewards/cosine_scaled_reward": -0.19514797255396843,
"rewards/format_reward": 0.4583333358168602,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 2717.2916870117188,
"epoch": 0.2337142857142857,
"grad_norm": 0.029170790687203407,
"kl": 0.0004508495330810547,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0397,
"reward": 0.8809500015340745,
"reward_std": 0.6774098351597786,
"rewards/cosine_scaled_reward": 0.12797498516738415,
"rewards/format_reward": 0.625,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 3492.8333740234375,
"epoch": 0.2342857142857143,
"grad_norm": 0.011575359851121902,
"kl": 0.0004987716674804688,
"learning_rate": 1.8779779118983867e-07,
"loss": -0.0028,
"reward": -0.09124118834733963,
"reward_std": 0.882003229111433,
"rewards/cosine_scaled_reward": -0.14978726860135794,
"rewards/format_reward": 0.2083333358168602,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 3483.4583740234375,
"epoch": 0.23485714285714285,
"grad_norm": 0.013307969085872173,
"kl": 0.00041484832763671875,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.038,
"reward": 0.0876331478357315,
"reward_std": 0.9785483591258526,
"rewards/cosine_scaled_reward": -0.06035009026527405,
"rewards/format_reward": 0.2083333395421505,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 3483.0,
"epoch": 0.23542857142857143,
"grad_norm": 0.014635894447565079,
"kl": 0.0004057884216308594,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0558,
"reward": -0.7163440883159637,
"reward_std": 0.2162969596683979,
"rewards/cosine_scaled_reward": -0.37900539487600327,
"rewards/format_reward": 0.0416666679084301,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 2679.916778564453,
"epoch": 0.236,
"grad_norm": 0.013161610811948776,
"kl": 0.00030541419982910156,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0978,
"reward": 0.9060661401599646,
"reward_std": 0.8712828233838081,
"rewards/cosine_scaled_reward": 0.11969973146915436,
"rewards/format_reward": 0.666666679084301,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2817.625030517578,
"epoch": 0.23657142857142857,
"grad_norm": 0.022518867626786232,
"kl": 0.00038623809814453125,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0441,
"reward": 0.13390246778726578,
"reward_std": 0.6470340602099895,
"rewards/cosine_scaled_reward": -0.09971543587744236,
"rewards/format_reward": 0.3333333358168602,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2841.2083740234375,
"epoch": 0.23714285714285716,
"grad_norm": 0.023023054003715515,
"kl": 0.0006165504455566406,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0876,
"reward": -0.059880852699279785,
"reward_std": 0.4681435003876686,
"rewards/cosine_scaled_reward": -0.19660709938034415,
"rewards/format_reward": 0.3333333358168602,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 3561.75,
"epoch": 0.2377142857142857,
"grad_norm": 0.018553022295236588,
"kl": 0.0003795623779296875,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0128,
"reward": -0.2655714526772499,
"reward_std": 0.3128821440041065,
"rewards/cosine_scaled_reward": -0.15361905843019485,
"rewards/format_reward": 0.0416666679084301,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2401.7917098999023,
"epoch": 0.2382857142857143,
"grad_norm": 0.026138195767998695,
"kl": 0.0005283355712890625,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0481,
"reward": 0.4319583922624588,
"reward_std": 0.5039754528552294,
"rewards/cosine_scaled_reward": -0.013187475502490997,
"rewards/format_reward": 0.4583333432674408,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2618.4166717529297,
"epoch": 0.23885714285714285,
"grad_norm": 0.03529448062181473,
"kl": 0.0008215904235839844,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.053,
"reward": -0.38310980424284935,
"reward_std": 0.3161798268556595,
"rewards/cosine_scaled_reward": -0.3790549263358116,
"rewards/format_reward": 0.375,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 2717.2083435058594,
"epoch": 0.23942857142857144,
"grad_norm": 0.015204512514173985,
"kl": 0.0004143714904785156,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0434,
"reward": -0.06527332402765751,
"reward_std": 0.3376622749492526,
"rewards/cosine_scaled_reward": -0.2618033364415169,
"rewards/format_reward": 0.4583333432674408,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 1704.7916870117188,
"epoch": 0.24,
"grad_norm": 0.05218733474612236,
"kl": 0.0005950927734375,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.249,
"reward": 0.7783117964863777,
"reward_std": 0.45935374312102795,
"rewards/cosine_scaled_reward": 0.03498924896121025,
"rewards/format_reward": 0.7083333432674408,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 3204.5,
"epoch": 0.24057142857142857,
"grad_norm": 0.01363091915845871,
"kl": 0.0003876686096191406,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0017,
"reward": -0.22349218279123306,
"reward_std": 0.22306939586997032,
"rewards/cosine_scaled_reward": -0.23674608767032623,
"rewards/format_reward": 0.25,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24114285714285713,
"grad_norm": 0.010621090419590473,
"kl": 0.0003581047058105469,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0,
"reward": -0.5770844966173172,
"reward_std": 0.2044766042381525,
"rewards/cosine_scaled_reward": -0.288542240858078,
"rewards/format_reward": 0.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2312.250030517578,
"epoch": 0.24171428571428571,
"grad_norm": 0.02375810779631138,
"kl": 0.0005865097045898438,
"learning_rate": 1.6508608292777203e-07,
"loss": -0.0847,
"reward": 0.414031776599586,
"reward_std": 0.732084047049284,
"rewards/cosine_scaled_reward": -0.10548414289951324,
"rewards/format_reward": 0.6250000037252903,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 3128.5833740234375,
"epoch": 0.2422857142857143,
"grad_norm": 0.01551869697868824,
"kl": 0.00045108795166015625,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0483,
"reward": 0.01406601071357727,
"reward_std": 0.8138113915920258,
"rewards/cosine_scaled_reward": -0.1388003290630877,
"rewards/format_reward": 0.2916666716337204,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24285714285714285,
"grad_norm": 0.011893173679709435,
"kl": 0.00039386749267578125,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0,
"reward": -0.5272135511040688,
"reward_std": 0.4391126446425915,
"rewards/cosine_scaled_reward": -0.2844401001930237,
"rewards/format_reward": 0.0416666679084301,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 3331.7916870117188,
"epoch": 0.24342857142857144,
"grad_norm": 0.011359743773937225,
"kl": 0.0003376007080078125,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.015,
"reward": 0.13842247426509857,
"reward_std": 0.2540343776345253,
"rewards/cosine_scaled_reward": -0.05578881502151489,
"rewards/format_reward": 0.25,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 3101.5834350585938,
"epoch": 0.244,
"grad_norm": 0.01774391531944275,
"kl": 0.00032806396484375,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0951,
"reward": 0.436421200633049,
"reward_std": 0.9749777019023895,
"rewards/cosine_scaled_reward": -0.05262273037806153,
"rewards/format_reward": 0.5416666828095913,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 2517.2083740234375,
"epoch": 0.24457142857142858,
"grad_norm": 0.018922699615359306,
"kl": 0.0004782676696777344,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0171,
"reward": 1.0143605917692184,
"reward_std": 0.9533870965242386,
"rewards/cosine_scaled_reward": 0.17384696938097477,
"rewards/format_reward": 0.6666666865348816,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 3139.4166870117188,
"epoch": 0.24514285714285713,
"grad_norm": 0.01879842020571232,
"kl": 0.0005502700805664062,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0225,
"reward": -0.2566852793097496,
"reward_std": 0.44601357355713844,
"rewards/cosine_scaled_reward": -0.23250930570065975,
"rewards/format_reward": 0.2083333432674408,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 3440.3333740234375,
"epoch": 0.24571428571428572,
"grad_norm": 0.012092203833162785,
"kl": 0.0003752708435058594,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0602,
"reward": -0.08506331103853881,
"reward_std": 0.7580065792426467,
"rewards/cosine_scaled_reward": -0.18836499378085136,
"rewards/format_reward": 0.291666679084301,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24628571428571427,
"grad_norm": 0.009651312604546547,
"kl": 0.000339508056640625,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0,
"reward": -0.5135565400123596,
"reward_std": 0.17573323473334312,
"rewards/cosine_scaled_reward": -0.2776115983724594,
"rewards/format_reward": 0.0416666679084301,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 2914.6250610351562,
"epoch": 0.24685714285714286,
"grad_norm": 0.04963121563196182,
"kl": 0.00048542022705078125,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0778,
"reward": 0.22040988504886627,
"reward_std": 0.7857857123017311,
"rewards/cosine_scaled_reward": -0.05646173283457756,
"rewards/format_reward": 0.3333333358168602,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 1992.5417022705078,
"epoch": 0.24742857142857144,
"grad_norm": 0.018762821331620216,
"kl": 0.0004487037658691406,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0176,
"reward": 0.7027051514014602,
"reward_std": 0.5680601857602596,
"rewards/cosine_scaled_reward": -0.002814119216054678,
"rewards/format_reward": 0.7083333432674408,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 3400.2500610351562,
"epoch": 0.248,
"grad_norm": 0.014792956411838531,
"kl": 0.000438690185546875,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0359,
"reward": -0.11403081566095352,
"reward_std": 0.695421889424324,
"rewards/cosine_scaled_reward": -0.14034874364733696,
"rewards/format_reward": 0.1666666716337204,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2794.375,
"epoch": 0.24857142857142858,
"grad_norm": 0.015310428105294704,
"kl": 0.0004611015319824219,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0315,
"reward": 0.5071351528167725,
"reward_std": 0.44683452136814594,
"rewards/cosine_scaled_reward": 0.10773422196507454,
"rewards/format_reward": 0.2916666679084301,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 2730.250030517578,
"epoch": 0.24914285714285714,
"grad_norm": 0.03859826177358627,
"kl": 0.000392913818359375,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.1139,
"reward": 0.25591667648404837,
"reward_std": 0.7985352799296379,
"rewards/cosine_scaled_reward": -0.08037500828504562,
"rewards/format_reward": 0.4166666716337204,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1703.7083435058594,
"epoch": 0.24971428571428572,
"grad_norm": 0.035596735775470734,
"kl": 0.0003857612609863281,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.1267,
"reward": 0.5573078580200672,
"reward_std": 0.6633748337626457,
"rewards/cosine_scaled_reward": -0.11717941612005234,
"rewards/format_reward": 0.791666679084301,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 2871.3333435058594,
"epoch": 0.2502857142857143,
"grad_norm": 0.012692431919276714,
"kl": 0.000335693359375,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0717,
"reward": -0.2980673983693123,
"reward_std": 0.15785099938511848,
"rewards/cosine_scaled_reward": -0.29486703872680664,
"rewards/format_reward": 0.2916666679084301,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 2338.7083435058594,
"epoch": 0.25085714285714283,
"grad_norm": 0.01559723261743784,
"kl": 0.0004553794860839844,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0277,
"reward": 0.19210883975028992,
"reward_std": 0.7682934515178204,
"rewards/cosine_scaled_reward": -0.17477891221642494,
"rewards/format_reward": 0.5416666679084301,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 1833.625015258789,
"epoch": 0.25142857142857145,
"grad_norm": 0.02628200314939022,
"kl": 0.0005588531494140625,
"learning_rate": 1.4019235263722034e-07,
"loss": -0.0343,
"reward": 0.7311004251241684,
"reward_std": 0.6230124272406101,
"rewards/cosine_scaled_reward": -0.009449809789657593,
"rewards/format_reward": 0.75,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.252,
"grad_norm": 0.011055848561227322,
"kl": 0.0003528594970703125,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0,
"reward": -0.23345018550753593,
"reward_std": 0.553386427462101,
"rewards/cosine_scaled_reward": -0.15839176578447223,
"rewards/format_reward": 0.0833333358168602,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2781.4583435058594,
"epoch": 0.25257142857142856,
"grad_norm": 0.040766630321741104,
"kl": 0.0006265640258789062,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0573,
"reward": 0.07009226828813553,
"reward_std": 0.8544792495667934,
"rewards/cosine_scaled_reward": -0.13162054121494293,
"rewards/format_reward": 0.3333333358168602,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 3411.25,
"epoch": 0.25314285714285717,
"grad_norm": 0.013510222546756268,
"kl": 0.0003809928894042969,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.085,
"reward": -0.4627658315002918,
"reward_std": 0.3685727119445801,
"rewards/cosine_scaled_reward": -0.3355495557188988,
"rewards/format_reward": 0.2083333395421505,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 3350.0833740234375,
"epoch": 0.2537142857142857,
"grad_norm": 0.017425937578082085,
"kl": 0.00029087066650390625,
"learning_rate": 1.351615817851748e-07,
"loss": 0.109,
"reward": -0.536733441054821,
"reward_std": 0.5179216116666794,
"rewards/cosine_scaled_reward": -0.3308667168021202,
"rewards/format_reward": 0.1250000037252903,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 2777.4583587646484,
"epoch": 0.2542857142857143,
"grad_norm": 0.01394882146269083,
"kl": 0.0004124641418457031,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0398,
"reward": 0.6419320218265057,
"reward_std": 1.290092408657074,
"rewards/cosine_scaled_reward": 0.11263268813490868,
"rewards/format_reward": 0.4166666716337204,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2162.0000610351562,
"epoch": 0.25485714285714284,
"grad_norm": 0.025396212935447693,
"kl": 0.0004458427429199219,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.2263,
"reward": 0.2729286514222622,
"reward_std": 0.4925660863518715,
"rewards/cosine_scaled_reward": -0.2177023496478796,
"rewards/format_reward": 0.7083333395421505,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2784.250045776367,
"epoch": 0.25542857142857145,
"grad_norm": 0.01916358806192875,
"kl": 0.00040149688720703125,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0456,
"reward": -0.28101276885718107,
"reward_std": 0.27371450141072273,
"rewards/cosine_scaled_reward": -0.28633972629904747,
"rewards/format_reward": 0.2916666679084301,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 3356.5833740234375,
"epoch": 0.256,
"grad_norm": 0.012381333857774734,
"kl": 0.0004825592041015625,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0587,
"reward": 0.8448215499520302,
"reward_std": 0.715430673211813,
"rewards/cosine_scaled_reward": 0.2557440847158432,
"rewards/format_reward": 0.3333333432674408,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 2940.3334350585938,
"epoch": 0.25657142857142856,
"grad_norm": 0.017700130119919777,
"kl": 0.0002903938293457031,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.1691,
"reward": 0.6478632241487503,
"reward_std": 1.2038164585828781,
"rewards/cosine_scaled_reward": 0.11559828370809555,
"rewards/format_reward": 0.4166666753590107,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 1642.2084197998047,
"epoch": 0.2571428571428571,
"grad_norm": 0.01771002635359764,
"kl": 0.0003179311752319336,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.1028,
"reward": 0.930170651525259,
"reward_std": 0.5905403085052967,
"rewards/cosine_scaled_reward": 0.027585337636992335,
"rewards/format_reward": 0.875,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 3493.4583740234375,
"epoch": 0.25771428571428573,
"grad_norm": 0.015439066104590893,
"kl": 0.00055694580078125,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0217,
"reward": -0.7016485892236233,
"reward_std": 0.41242050286382437,
"rewards/cosine_scaled_reward": -0.3924909606575966,
"rewards/format_reward": 0.0833333358168602,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2162.375015258789,
"epoch": 0.2582857142857143,
"grad_norm": 0.031103147193789482,
"kl": 0.0004916191101074219,
"learning_rate": 1.260741462457165e-07,
"loss": 0.1222,
"reward": 0.33962953090667725,
"reward_std": 0.5016037877649069,
"rewards/cosine_scaled_reward": -0.14268524572253227,
"rewards/format_reward": 0.625,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2658.2500228881836,
"epoch": 0.25885714285714284,
"grad_norm": 0.02379673905670643,
"kl": 0.0007123947143554688,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0397,
"reward": -0.1027427650988102,
"reward_std": 0.6555835595354438,
"rewards/cosine_scaled_reward": -0.2180380504578352,
"rewards/format_reward": 0.3333333358168602,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 2904.4583740234375,
"epoch": 0.25942857142857145,
"grad_norm": 0.024318231269717216,
"kl": 0.0005178451538085938,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.1233,
"reward": 0.12724535167217255,
"reward_std": 0.690078116953373,
"rewards/cosine_scaled_reward": -0.10304398089647293,
"rewards/format_reward": 0.3333333432674408,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 3007.0833740234375,
"epoch": 0.26,
"grad_norm": 0.014805259183049202,
"kl": 0.00030303001403808594,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0556,
"reward": 0.07904787175357342,
"reward_std": 0.41673495434224606,
"rewards/cosine_scaled_reward": -0.12714274739846587,
"rewards/format_reward": 0.3333333358168602,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 3369.75,
"epoch": 0.26057142857142856,
"grad_norm": 0.01267288252711296,
"kl": 0.000545501708984375,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0697,
"reward": -0.19169889390468597,
"reward_std": 0.5537064597010612,
"rewards/cosine_scaled_reward": -0.15834945812821388,
"rewards/format_reward": 0.1250000037252903,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2750.4166870117188,
"epoch": 0.2611428571428571,
"grad_norm": 0.01913038082420826,
"kl": 0.0005252361297607422,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0638,
"reward": -0.025101646780967712,
"reward_std": 0.8123595081269741,
"rewards/cosine_scaled_reward": -0.17921748850494623,
"rewards/format_reward": 0.3333333358168602,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 2731.291732788086,
"epoch": 0.26171428571428573,
"grad_norm": 0.018483366817235947,
"kl": 0.0005779266357421875,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.1314,
"reward": 0.18389775604009628,
"reward_std": 0.6026048362255096,
"rewards/cosine_scaled_reward": -0.11638446152210236,
"rewards/format_reward": 0.4166666716337204,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 2847.2083435058594,
"epoch": 0.2622857142857143,
"grad_norm": 0.019928766414523125,
"kl": 0.0004062652587890625,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.116,
"reward": 0.056805893778800964,
"reward_std": 0.897519065067172,
"rewards/cosine_scaled_reward": -0.15909705124795437,
"rewards/format_reward": 0.3750000111758709,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2855.7083740234375,
"epoch": 0.26285714285714284,
"grad_norm": 0.01562991738319397,
"kl": 0.0004324913024902344,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.011,
"reward": 0.4357723630964756,
"reward_std": 0.7417406067252159,
"rewards/cosine_scaled_reward": 0.030386213213205338,
"rewards/format_reward": 0.3750000037252903,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2806.1666870117188,
"epoch": 0.2634285714285714,
"grad_norm": 0.022745100781321526,
"kl": 0.0005688667297363281,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0915,
"reward": -0.35859447717666626,
"reward_std": 0.3101446107029915,
"rewards/cosine_scaled_reward": -0.32513057440519333,
"rewards/format_reward": 0.2916666679084301,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 3287.166748046875,
"epoch": 0.264,
"grad_norm": 0.014005818404257298,
"kl": 0.0002605915069580078,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.1217,
"reward": -0.07925862073898315,
"reward_std": 0.570786502212286,
"rewards/cosine_scaled_reward": -0.12296264991164207,
"rewards/format_reward": 0.1666666679084301,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 3048.8750610351562,
"epoch": 0.26457142857142857,
"grad_norm": 0.026238108053803444,
"kl": 0.0003509521484375,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.1634,
"reward": -0.2891614316031337,
"reward_std": 0.6285391822457314,
"rewards/cosine_scaled_reward": -0.2695807181298733,
"rewards/format_reward": 0.25,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2917.9583435058594,
"epoch": 0.2651428571428571,
"grad_norm": 0.014083731919527054,
"kl": 0.00032520294189453125,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0497,
"reward": 0.23886509239673615,
"reward_std": 0.5035260319709778,
"rewards/cosine_scaled_reward": -0.08890077471733093,
"rewards/format_reward": 0.4166666716337204,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 3480.75,
"epoch": 0.26571428571428574,
"grad_norm": 0.01106889545917511,
"kl": 0.0002949237823486328,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.022,
"reward": 0.06554645299911499,
"reward_std": 0.4903480280190706,
"rewards/cosine_scaled_reward": -0.0505601167678833,
"rewards/format_reward": 0.1666666716337204,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2717.875030517578,
"epoch": 0.2662857142857143,
"grad_norm": 0.017086246982216835,
"kl": 0.0003123283386230469,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0943,
"reward": -0.1813066378235817,
"reward_std": 0.2922391891479492,
"rewards/cosine_scaled_reward": -0.31981998309493065,
"rewards/format_reward": 0.4583333432674408,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 3453.916748046875,
"epoch": 0.26685714285714285,
"grad_norm": 0.013130133971571922,
"kl": 0.0003361701965332031,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0385,
"reward": 0.055451929569244385,
"reward_std": 0.7496693283319473,
"rewards/cosine_scaled_reward": -0.1181073747575283,
"rewards/format_reward": 0.2916666716337204,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 2965.5416870117188,
"epoch": 0.2674285714285714,
"grad_norm": 0.02877657674252987,
"kl": 0.0003726482391357422,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.1355,
"reward": 0.2859138697385788,
"reward_std": 0.8501931093633175,
"rewards/cosine_scaled_reward": -0.0445430725812912,
"rewards/format_reward": 0.3750000149011612,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 3065.666717529297,
"epoch": 0.268,
"grad_norm": 0.021241569891572,
"kl": 0.0003745555877685547,
"learning_rate": 1.1118279056249653e-07,
"loss": -0.0209,
"reward": -0.22108882665634155,
"reward_std": 0.5584007576107979,
"rewards/cosine_scaled_reward": -0.2772110812366009,
"rewards/format_reward": 0.3333333358168602,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2091.041717529297,
"epoch": 0.26857142857142857,
"grad_norm": 0.043246157467365265,
"kl": 0.00055694580078125,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.1196,
"reward": 1.0320170223712921,
"reward_std": 1.0782769918441772,
"rewards/cosine_scaled_reward": 0.2035085055977106,
"rewards/format_reward": 0.6250000149011612,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 2538.291717529297,
"epoch": 0.26914285714285713,
"grad_norm": 0.016464348882436752,
"kl": 0.00033664703369140625,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0476,
"reward": 0.2932426920160651,
"reward_std": 0.5623490735888481,
"rewards/cosine_scaled_reward": -0.1450453530997038,
"rewards/format_reward": 0.5833333432674408,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 3552.5833740234375,
"epoch": 0.26971428571428574,
"grad_norm": 0.0214456245303154,
"kl": 0.00042819976806640625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0135,
"reward": -0.35011430410668254,
"reward_std": 0.3236675038933754,
"rewards/cosine_scaled_reward": -0.23755715577863157,
"rewards/format_reward": 0.1250000037252903,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2889.4166717529297,
"epoch": 0.2702857142857143,
"grad_norm": 0.02036861516535282,
"kl": 0.0003829002380371094,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0107,
"reward": -0.004865109920501709,
"reward_std": 0.6773473080247641,
"rewards/cosine_scaled_reward": -0.14826588705182076,
"rewards/format_reward": 0.2916666679084301,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2525.9583740234375,
"epoch": 0.27085714285714285,
"grad_norm": 0.013575663790106773,
"kl": 0.0002644062042236328,
"learning_rate": 1.0797073717209013e-07,
"loss": -0.0323,
"reward": 0.5381094664335251,
"reward_std": 0.44665071181952953,
"rewards/cosine_scaled_reward": -0.0017786095850169659,
"rewards/format_reward": 0.5416666679084301,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 2851.9583740234375,
"epoch": 0.2714285714285714,
"grad_norm": 0.020593978464603424,
"kl": 0.0005025863647460938,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0314,
"reward": 0.17490556836128235,
"reward_std": 0.6029777117073536,
"rewards/cosine_scaled_reward": -0.12088055908679962,
"rewards/format_reward": 0.4166666865348816,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.272,
"grad_norm": 0.01853923127055168,
"kl": 0.000408172607421875,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0,
"reward": -0.6839941293001175,
"reward_std": 0.23570294678211212,
"rewards/cosine_scaled_reward": -0.34199706465005875,
"rewards/format_reward": 0.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 3223.4166870117188,
"epoch": 0.2725714285714286,
"grad_norm": 0.02092069201171398,
"kl": 0.00046443939208984375,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1149,
"reward": -0.4160095602273941,
"reward_std": 0.3700854703783989,
"rewards/cosine_scaled_reward": -0.33300479501485825,
"rewards/format_reward": 0.25,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.27314285714285713,
"grad_norm": 0.015490886755287647,
"kl": 0.0003581047058105469,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0,
"reward": -0.35857730358839035,
"reward_std": 0.1991352178156376,
"rewards/cosine_scaled_reward": -0.17928865179419518,
"rewards/format_reward": 0.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2192.2917098999023,
"epoch": 0.2737142857142857,
"grad_norm": 0.025250233709812164,
"kl": 0.0003457069396972656,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.2067,
"reward": 1.0132533311843872,
"reward_std": 0.5927854059264064,
"rewards/cosine_scaled_reward": 0.21495997160673141,
"rewards/format_reward": 0.5833333469927311,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 3373.9583740234375,
"epoch": 0.2742857142857143,
"grad_norm": 0.023761438205838203,
"kl": 0.0004482269287109375,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0815,
"reward": 0.2368101328611374,
"reward_std": 0.7100772261619568,
"rewards/cosine_scaled_reward": -0.006594939972274005,
"rewards/format_reward": 0.2500000111758709,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 3024.125,
"epoch": 0.27485714285714286,
"grad_norm": 0.01686985418200493,
"kl": 0.0006651878356933594,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0839,
"reward": -0.0927225798368454,
"reward_std": 0.6664447784423828,
"rewards/cosine_scaled_reward": -0.21302797086536884,
"rewards/format_reward": 0.3333333358168602,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 2792.125030517578,
"epoch": 0.2754285714285714,
"grad_norm": 0.016433361917734146,
"kl": 0.00051116943359375,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0005,
"reward": -0.018994301557540894,
"reward_std": 0.6018916461616755,
"rewards/cosine_scaled_reward": -0.19699716940522194,
"rewards/format_reward": 0.375,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 2972.1250610351562,
"epoch": 0.276,
"grad_norm": 0.031187044456601143,
"kl": 0.0005083084106445312,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.215,
"reward": -0.3035236857831478,
"reward_std": 0.6242531836032867,
"rewards/cosine_scaled_reward": -0.2975951712578535,
"rewards/format_reward": 0.291666679084301,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 2626.541717529297,
"epoch": 0.2765714285714286,
"grad_norm": 0.03856087848544121,
"kl": 0.0005278587341308594,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0923,
"reward": -0.07003412395715714,
"reward_std": 0.5723829306662083,
"rewards/cosine_scaled_reward": -0.26418372616171837,
"rewards/format_reward": 0.4583333395421505,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 3163.291748046875,
"epoch": 0.27714285714285714,
"grad_norm": 0.023002101108431816,
"kl": 0.0003609657287597656,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.1445,
"reward": -0.1619274765253067,
"reward_std": 0.6520496867597103,
"rewards/cosine_scaled_reward": -0.18513040244579315,
"rewards/format_reward": 0.2083333395421505,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2786.500045776367,
"epoch": 0.2777142857142857,
"grad_norm": 0.04367635026574135,
"kl": 0.00084686279296875,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.1118,
"reward": -0.10202455054968596,
"reward_std": 0.5869803428649902,
"rewards/cosine_scaled_reward": -0.19684561155736446,
"rewards/format_reward": 0.2916666679084301,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2069.625045776367,
"epoch": 0.2782857142857143,
"grad_norm": 0.01725645735859871,
"kl": 0.0004436969757080078,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0339,
"reward": 0.9145368824247271,
"reward_std": 0.6427567200735211,
"rewards/cosine_scaled_reward": 0.1656017464119941,
"rewards/format_reward": 0.5833333432674408,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 3266.9583740234375,
"epoch": 0.27885714285714286,
"grad_norm": 0.020370880141854286,
"kl": 0.0003273487091064453,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.069,
"reward": 0.08849874883890152,
"reward_std": 0.9649087898433208,
"rewards/cosine_scaled_reward": -0.10158396512269974,
"rewards/format_reward": 0.2916666679084301,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 2884.7500610351562,
"epoch": 0.2794285714285714,
"grad_norm": 0.031119707971811295,
"kl": 0.0006618499755859375,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.1788,
"reward": -0.1438409616239369,
"reward_std": 0.752083495259285,
"rewards/cosine_scaled_reward": -0.2802538275718689,
"rewards/format_reward": 0.4166666716337204,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2443.8333740234375,
"epoch": 0.28,
"grad_norm": 0.033809904009103775,
"kl": 0.0004324913024902344,
"learning_rate": 1.013262614978859e-07,
"loss": 0.1043,
"reward": 0.5690474957227707,
"reward_std": 0.7256521657109261,
"rewards/cosine_scaled_reward": 0.034523727372288704,
"rewards/format_reward": 0.5000000111758709,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2658.3750610351562,
"epoch": 0.2805714285714286,
"grad_norm": 0.02354743331670761,
"kl": 0.0004076957702636719,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0933,
"reward": 0.611915085464716,
"reward_std": 0.9881070479750633,
"rewards/cosine_scaled_reward": 0.05595753900706768,
"rewards/format_reward": 0.5000000037252903,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 2955.7083740234375,
"epoch": 0.28114285714285714,
"grad_norm": 0.017025692388415337,
"kl": 0.000514984130859375,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.1604,
"reward": -0.22188673401251435,
"reward_std": 0.46732087805867195,
"rewards/cosine_scaled_reward": -0.27761003375053406,
"rewards/format_reward": 0.3333333432674408,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2733.125045776367,
"epoch": 0.2817142857142857,
"grad_norm": 0.025227824226021767,
"kl": 0.0007300376892089844,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.051,
"reward": 0.24700819700956345,
"reward_std": 0.9385927617549896,
"rewards/cosine_scaled_reward": -0.12649590522050858,
"rewards/format_reward": 0.5,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2702.0416870117188,
"epoch": 0.2822857142857143,
"grad_norm": 0.024555031210184097,
"kl": 0.0006690025329589844,
"learning_rate": 1.005372381963547e-07,
"loss": 0.071,
"reward": 0.8110056445002556,
"reward_std": 1.0802773237228394,
"rewards/cosine_scaled_reward": 0.15550284087657928,
"rewards/format_reward": 0.5000000223517418,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.28285714285714286,
"grad_norm": 0.017543919384479523,
"kl": 0.00031828880310058594,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0,
"reward": -0.29333774745464325,
"reward_std": 0.505841463804245,
"rewards/cosine_scaled_reward": -0.18833555281162262,
"rewards/format_reward": 0.0833333358168602,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 1680.4167175292969,
"epoch": 0.2834285714285714,
"grad_norm": 0.0482969656586647,
"kl": 0.0008435249328613281,
"learning_rate": 1.002741278414069e-07,
"loss": 0.1698,
"reward": 0.8347217477858067,
"reward_std": 0.6243661791086197,
"rewards/cosine_scaled_reward": -0.06180577352643013,
"rewards/format_reward": 0.9583333432674408,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 3413.5000610351562,
"epoch": 0.284,
"grad_norm": 0.010940664447844028,
"kl": 0.000286102294921875,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0685,
"reward": -0.3512016786262393,
"reward_std": 0.293385605327785,
"rewards/cosine_scaled_reward": -0.23810084303840995,
"rewards/format_reward": 0.1250000037252903,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 2852.9166717529297,
"epoch": 0.2845714285714286,
"grad_norm": 0.031826820224523544,
"kl": 0.0005040168762207031,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0531,
"reward": -0.09286746755242348,
"reward_std": 0.689836498349905,
"rewards/cosine_scaled_reward": -0.19226707890629768,
"rewards/format_reward": 0.2916666679084301,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 2774.666748046875,
"epoch": 0.28514285714285714,
"grad_norm": 0.015145834535360336,
"kl": 0.0004925727844238281,
"learning_rate": 1.000438641958131e-07,
"loss": 0.047,
"reward": 1.141416186466813,
"reward_std": 1.473642259836197,
"rewards/cosine_scaled_reward": 0.2582080829888582,
"rewards/format_reward": 0.6250000111758709,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 2512.625030517578,
"epoch": 0.2857142857142857,
"grad_norm": 0.045064449310302734,
"kl": 0.0005974769592285156,
"learning_rate": 1.0001096618257236e-07,
"loss": -0.0824,
"reward": -0.023780837655067444,
"reward_std": 0.46510184183716774,
"rewards/cosine_scaled_reward": -0.2827237620949745,
"rewards/format_reward": 0.5416666679084301,
"step": 500
},
{
"epoch": 0.2857142857142857,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.07169770698851426,
"train_runtime": 25124.3049,
"train_samples_per_second": 0.478,
"train_steps_per_second": 0.02
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}