OpenRS-RLoRA-LoftQ-R32-5 / trainer_state.json

Model save

ee752c6 verified 2 months ago

233 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.2857142857142857,
	"eval_steps": 500,
	"global_step": 500,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio": 0.0,
	"completion_length": 3068.5000610351562,
	"epoch": 0.0005714285714285715,
	"grad_norm": 0.013173151761293411,
	"kl": 0.0005006790161132812,
	"learning_rate": 0.0,
	"loss": -0.0242,
	"reward": 0.200983926653862,
	"reward_std": 0.24425111338496208,
	"rewards/cosine_scaled_reward": -0.0453413650393486,
	"rewards/format_reward": 0.2916666679084301,
	"step": 1
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2930.9583740234375,
	"epoch": 0.001142857142857143,
	"grad_norm": 0.04316634312272072,
	"kl": 0.0003731250762939453,
	"learning_rate": 2e-08,
	"loss": 0.2092,
	"reward": -0.28063105791807175,
	"reward_std": 0.29903180059045553,
	"rewards/cosine_scaled_reward": -0.28614887595176697,
	"rewards/format_reward": 0.291666679084301,
	"step": 2
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2359.2500915527344,
	"epoch": 0.0017142857142857142,
	"grad_norm": 0.03820963203907013,
	"kl": 0.00048732757568359375,
	"learning_rate": 4e-08,
	"loss": 0.1661,
	"reward": 0.3625979460775852,
	"reward_std": 0.7691465243697166,
	"rewards/cosine_scaled_reward": -0.11036771535873413,
	"rewards/format_reward": 0.5833333544433117,
	"step": 3
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2455.5000610351562,
	"epoch": 0.002285714285714286,
	"grad_norm": 0.01873675175011158,
	"kl": 0.00042629241943359375,
	"learning_rate": 6e-08,
	"loss": 0.0368,
	"reward": 0.42465633153915405,
	"reward_std": 0.6839377954602242,
	"rewards/cosine_scaled_reward": -0.12100516259670258,
	"rewards/format_reward": 0.6666666679084301,
	"step": 4
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2971.2916870117188,
	"epoch": 0.002857142857142857,
	"grad_norm": 0.01593288779258728,
	"kl": 0.0005702972412109375,
	"learning_rate": 8e-08,
	"loss": 0.0526,
	"reward": -0.45133184641599655,
	"reward_std": 0.1987809967249632,
	"rewards/cosine_scaled_reward": -0.3506659045815468,
	"rewards/format_reward": 0.25,
	"step": 5
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2659.4583740234375,
	"epoch": 0.0034285714285714284,
	"grad_norm": 0.016305305063724518,
	"kl": 0.0003809928894042969,
	"learning_rate": 1e-07,
	"loss": 0.0749,
	"reward": -0.20897246897220612,
	"reward_std": 0.22619805298745632,
	"rewards/cosine_scaled_reward": -0.27115290239453316,
	"rewards/format_reward": 0.3333333358168602,
	"step": 6
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3150.0000610351562,
	"epoch": 0.004,
	"grad_norm": 0.01786160096526146,
	"kl": 0.00041294097900390625,
	"learning_rate": 1.2e-07,
	"loss": 0.1274,
	"reward": 0.03739733062684536,
	"reward_std": 0.6221467964351177,
	"rewards/cosine_scaled_reward": -0.12713466212153435,
	"rewards/format_reward": 0.2916666753590107,
	"step": 7
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2930.75,
	"epoch": 0.004571428571428572,
	"grad_norm": 0.013749959878623486,
	"kl": 0.0005197525024414062,
	"learning_rate": 1.4e-07,
	"loss": -0.0105,
	"reward": -0.5632898807525635,
	"reward_std": 0.14319632947444916,
	"rewards/cosine_scaled_reward": -0.40664494782686234,
	"rewards/format_reward": 0.25,
	"step": 8
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2527.7083435058594,
	"epoch": 0.005142857142857143,
	"grad_norm": 0.01508411392569542,
	"kl": 0.0005040168762207031,
	"learning_rate": 1.6e-07,
	"loss": -0.0156,
	"reward": 0.4341657906770706,
	"reward_std": 0.46268418058753014,
	"rewards/cosine_scaled_reward": -0.03291710093617439,
	"rewards/format_reward": 0.5,
	"step": 9
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2895.4584350585938,
	"epoch": 0.005714285714285714,
	"grad_norm": 0.024773668497800827,
	"kl": 0.00039124488830566406,
	"learning_rate": 1.8e-07,
	"loss": 0.1541,
	"reward": 0.5330872263293713,
	"reward_std": 1.0390121936798096,
	"rewards/cosine_scaled_reward": 0.016543611884117126,
	"rewards/format_reward": 0.5000000186264515,
	"step": 10
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2230.2916870117188,
	"epoch": 0.006285714285714286,
	"grad_norm": 0.02818623185157776,
	"kl": 0.0006093978881835938,
	"learning_rate": 2e-07,
	"loss": 0.1588,
	"reward": -0.14730040915310383,
	"reward_std": 0.23369846679270267,
	"rewards/cosine_scaled_reward": -0.34448356181383133,
	"rewards/format_reward": 0.541666679084301,
	"step": 11
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2591.7083740234375,
	"epoch": 0.006857142857142857,
	"grad_norm": 0.02190236933529377,
	"kl": 0.0006151199340820312,
	"learning_rate": 2.1999999999999998e-07,
	"loss": 0.0329,
	"reward": 0.48731680028140545,
	"reward_std": 0.8824571967124939,
	"rewards/cosine_scaled_reward": -0.0688415989279747,
	"rewards/format_reward": 0.6250000074505806,
	"step": 12
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2778.3334197998047,
	"epoch": 0.0074285714285714285,
	"grad_norm": 0.015030866488814354,
	"kl": 0.0003528594970703125,
	"learning_rate": 2.4e-07,
	"loss": 0.1056,
	"reward": 0.6901360005140305,
	"reward_std": 0.84443748742342,
	"rewards/cosine_scaled_reward": 0.07423467561602592,
	"rewards/format_reward": 0.541666679084301,
	"step": 13
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2315.8333587646484,
	"epoch": 0.008,
	"grad_norm": 0.013019426725804806,
	"kl": 0.0004930496215820312,
	"learning_rate": 2.6e-07,
	"loss": 0.001,
	"reward": 1.1444866992533207,
	"reward_std": 0.720286563038826,
	"rewards/cosine_scaled_reward": 0.2180766798555851,
	"rewards/format_reward": 0.7083333395421505,
	"step": 14
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2382.000030517578,
	"epoch": 0.008571428571428572,
	"grad_norm": 0.02887255884706974,
	"kl": 0.00041961669921875,
	"learning_rate": 2.8e-07,
	"loss": -0.0308,
	"reward": 0.2332791006192565,
	"reward_std": 0.48609885200858116,
	"rewards/cosine_scaled_reward": -0.21669380273669958,
	"rewards/format_reward": 0.6666666716337204,
	"step": 15
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3069.4583740234375,
	"epoch": 0.009142857142857144,
	"grad_norm": 0.018421335145831108,
	"kl": 0.000476837158203125,
	"learning_rate": 3e-07,
	"loss": 0.0859,
	"reward": 0.2359318658709526,
	"reward_std": 0.6105321571230888,
	"rewards/cosine_scaled_reward": -0.0695340558886528,
	"rewards/format_reward": 0.3750000149011612,
	"step": 16
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2575.5833435058594,
	"epoch": 0.009714285714285713,
	"grad_norm": 0.03597598895430565,
	"kl": 0.0007686614990234375,
	"learning_rate": 3.2e-07,
	"loss": 0.2179,
	"reward": -0.09196203667670488,
	"reward_std": 0.5380833484232426,
	"rewards/cosine_scaled_reward": -0.2334810234606266,
	"rewards/format_reward": 0.3750000149011612,
	"step": 17
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3274.9583740234375,
	"epoch": 0.010285714285714285,
	"grad_norm": 0.012147138826549053,
	"kl": 0.0004515647888183594,
	"learning_rate": 3.4000000000000003e-07,
	"loss": -0.0625,
	"reward": -0.1761876866221428,
	"reward_std": 0.6809441670775414,
	"rewards/cosine_scaled_reward": -0.2339271828532219,
	"rewards/format_reward": 0.2916666679084301,
	"step": 18
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2882.2083435058594,
	"epoch": 0.010857142857142857,
	"grad_norm": 0.026394149288535118,
	"kl": 0.0004787445068359375,
	"learning_rate": 3.6e-07,
	"loss": -0.0461,
	"reward": 0.3346722051501274,
	"reward_std": 0.3912115804851055,
	"rewards/cosine_scaled_reward": -0.02016391232609749,
	"rewards/format_reward": 0.3750000037252903,
	"step": 19
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1633.5833435058594,
	"epoch": 0.011428571428571429,
	"grad_norm": 0.01967485062777996,
	"kl": 0.0003612041473388672,
	"learning_rate": 3.7999999999999996e-07,
	"loss": 0.0766,
	"reward": 0.638333223760128,
	"reward_std": 0.8739089630544186,
	"rewards/cosine_scaled_reward": -0.0975000774487853,
	"rewards/format_reward": 0.8333333358168602,
	"step": 20
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1705.2916717529297,
	"epoch": 0.012,
	"grad_norm": 0.052289288491010666,
	"kl": 0.0007824897766113281,
	"learning_rate": 4e-07,
	"loss": 0.4594,
	"reward": 0.13058341294527054,
	"reward_std": 0.39700106158852577,
	"rewards/cosine_scaled_reward": -0.2680416405200958,
	"rewards/format_reward": 0.6666666865348816,
	"step": 21
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3156.9166870117188,
	"epoch": 0.012571428571428572,
	"grad_norm": 0.016474798321723938,
	"kl": 0.00043773651123046875,
	"learning_rate": 4.1999999999999995e-07,
	"loss": 0.0935,
	"reward": 1.028728973120451,
	"reward_std": 1.6666590571403503,
	"rewards/cosine_scaled_reward": 0.26436448842287064,
	"rewards/format_reward": 0.5000000149011612,
	"step": 22
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2439.041748046875,
	"epoch": 0.013142857142857144,
	"grad_norm": 0.019004346802830696,
	"kl": 0.0005197525024414062,
	"learning_rate": 4.3999999999999997e-07,
	"loss": 0.0789,
	"reward": 0.5590320900082588,
	"reward_std": 1.0339802950620651,
	"rewards/cosine_scaled_reward": -0.03298397921025753,
	"rewards/format_reward": 0.6250000149011612,
	"step": 23
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2758.0833740234375,
	"epoch": 0.013714285714285714,
	"grad_norm": 0.017480166628956795,
	"kl": 0.0005626678466796875,
	"learning_rate": 4.6e-07,
	"loss": -0.1518,
	"reward": 0.3504378944635391,
	"reward_std": 0.513374675065279,
	"rewards/cosine_scaled_reward": -0.09561440348625183,
	"rewards/format_reward": 0.5416666679084301,
	"step": 24
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1633.2083587646484,
	"epoch": 0.014285714285714285,
	"grad_norm": 0.03595641627907753,
	"kl": 0.00029969215393066406,
	"learning_rate": 4.8e-07,
	"loss": 0.0377,
	"reward": 0.6530582755804062,
	"reward_std": 0.629613857716322,
	"rewards/cosine_scaled_reward": -0.04847088688984513,
	"rewards/format_reward": 0.75,
	"step": 25
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2597.0416870117188,
	"epoch": 0.014857142857142857,
	"grad_norm": 0.01941561885178089,
	"kl": 0.0005736351013183594,
	"learning_rate": 5e-07,
	"loss": 0.0753,
	"reward": -0.07398717105388641,
	"reward_std": 0.7024243678897619,
	"rewards/cosine_scaled_reward": -0.2453269399702549,
	"rewards/format_reward": 0.4166666716337204,
	"step": 26
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3365.5833740234375,
	"epoch": 0.015428571428571429,
	"grad_norm": 0.015174021013081074,
	"kl": 0.00045490264892578125,
	"learning_rate": 5.2e-07,
	"loss": 0.1186,
	"reward": -0.35242245346307755,
	"reward_std": 0.28800780698657036,
	"rewards/cosine_scaled_reward": -0.21787790581583977,
	"rewards/format_reward": 0.0833333358168602,
	"step": 27
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2502.5833740234375,
	"epoch": 0.016,
	"grad_norm": 0.026485657319426537,
	"kl": 0.0006246566772460938,
	"learning_rate": 5.4e-07,
	"loss": 0.0892,
	"reward": 0.14434174199413974,
	"reward_std": 0.6618244834244251,
	"rewards/cosine_scaled_reward": -0.17782913893461227,
	"rewards/format_reward": 0.5000000074505806,
	"step": 28
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2873.0,
	"epoch": 0.01657142857142857,
	"grad_norm": 0.015148441307246685,
	"kl": 0.0004634857177734375,
	"learning_rate": 5.6e-07,
	"loss": -0.009,
	"reward": -0.17739348113536835,
	"reward_std": 0.48768409341573715,
	"rewards/cosine_scaled_reward": -0.21369674568995833,
	"rewards/format_reward": 0.25,
	"step": 29
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3555.2083740234375,
	"epoch": 0.017142857142857144,
	"grad_norm": 0.011238854378461838,
	"kl": 0.00047588348388671875,
	"learning_rate": 5.8e-07,
	"loss": 0.0107,
	"reward": -0.07929126173257828,
	"reward_std": 0.8475685454905033,
	"rewards/cosine_scaled_reward": -0.1021456066519022,
	"rewards/format_reward": 0.1250000037252903,
	"step": 30
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2528.625030517578,
	"epoch": 0.017714285714285714,
	"grad_norm": 0.06202416494488716,
	"kl": 0.0005381107330322266,
	"learning_rate": 6e-07,
	"loss": 0.2637,
	"reward": -0.021039772778749466,
	"reward_std": 0.860994964838028,
	"rewards/cosine_scaled_reward": -0.21885321522131562,
	"rewards/format_reward": 0.4166666679084301,
	"step": 31
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2219.9166870117188,
	"epoch": 0.018285714285714287,
	"grad_norm": 0.034875061362981796,
	"kl": 0.0004260540008544922,
	"learning_rate": 6.2e-07,
	"loss": 0.132,
	"reward": 0.3424109169282019,
	"reward_std": 0.8896235823631287,
	"rewards/cosine_scaled_reward": -0.09962787851691246,
	"rewards/format_reward": 0.541666679084301,
	"step": 32
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3474.4583740234375,
	"epoch": 0.018857142857142857,
	"grad_norm": 0.016711147502064705,
	"kl": 0.0005655288696289062,
	"learning_rate": 6.4e-07,
	"loss": 0.0639,
	"reward": -0.47737591713666916,
	"reward_std": 0.1697257850319147,
	"rewards/cosine_scaled_reward": -0.2595212906599045,
	"rewards/format_reward": 0.0416666679084301,
	"step": 33
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3159.6666870117188,
	"epoch": 0.019428571428571427,
	"grad_norm": 0.014976400882005692,
	"kl": 0.0004849433898925781,
	"learning_rate": 6.6e-07,
	"loss": 0.0609,
	"reward": 0.015222817659378052,
	"reward_std": 0.701315013691783,
	"rewards/cosine_scaled_reward": -0.17988859117031097,
	"rewards/format_reward": 0.3750000111758709,
	"step": 34
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3254.7916870117188,
	"epoch": 0.02,
	"grad_norm": 0.04339271038770676,
	"kl": 0.0004878044128417969,
	"learning_rate": 6.800000000000001e-07,
	"loss": 0.1246,
	"reward": -0.6728095263242722,
	"reward_std": 0.290258064866066,
	"rewards/cosine_scaled_reward": -0.41973811388015747,
	"rewards/format_reward": 0.1666666679084301,
	"step": 35
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2359.500030517578,
	"epoch": 0.02057142857142857,
	"grad_norm": 0.031043976545333862,
	"kl": 0.0005769729614257812,
	"learning_rate": 7e-07,
	"loss": 0.1315,
	"reward": 0.5186025649309158,
	"reward_std": 0.7595919780433178,
	"rewards/cosine_scaled_reward": -0.011532071977853775,
	"rewards/format_reward": 0.5416666716337204,
	"step": 36
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3505.2916870117188,
	"epoch": 0.021142857142857144,
	"grad_norm": 0.011476296000182629,
	"kl": 0.00041484832763671875,
	"learning_rate": 7.2e-07,
	"loss": 0.0309,
	"reward": 0.1381697803735733,
	"reward_std": 0.9006250277161598,
	"rewards/cosine_scaled_reward": -0.014248451218008995,
	"rewards/format_reward": 0.1666666679084301,
	"step": 37
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3085.5001220703125,
	"epoch": 0.021714285714285714,
	"grad_norm": 0.017438944429159164,
	"kl": 0.000644683837890625,
	"learning_rate": 7.4e-07,
	"loss": 0.0651,
	"reward": 0.5748194381594658,
	"reward_std": 1.0931934267282486,
	"rewards/cosine_scaled_reward": 0.016576368361711502,
	"rewards/format_reward": 0.541666679084301,
	"step": 38
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2326.2084045410156,
	"epoch": 0.022285714285714287,
	"grad_norm": 0.01982088014483452,
	"kl": 0.0006146430969238281,
	"learning_rate": 7.599999999999999e-07,
	"loss": 0.0144,
	"reward": 0.7864310666918755,
	"reward_std": 0.7996397092938423,
	"rewards/cosine_scaled_reward": 0.0807155417278409,
	"rewards/format_reward": 0.6250000149011612,
	"step": 39
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3484.5000610351562,
	"epoch": 0.022857142857142857,
	"grad_norm": 0.01842389442026615,
	"kl": 0.0004572868347167969,
	"learning_rate": 7.799999999999999e-07,
	"loss": 0.0465,
	"reward": -0.408206457272172,
	"reward_std": 0.35211328975856304,
	"rewards/cosine_scaled_reward": -0.2874365597963333,
	"rewards/format_reward": 0.1666666679084301,
	"step": 40
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2935.875,
	"epoch": 0.023428571428571427,
	"grad_norm": 0.016613401472568512,
	"kl": 0.00041675567626953125,
	"learning_rate": 8e-07,
	"loss": 0.0661,
	"reward": 0.06390659511089325,
	"reward_std": 0.4249320328235626,
	"rewards/cosine_scaled_reward": -0.11388003081083298,
	"rewards/format_reward": 0.2916666679084301,
	"step": 41
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2428.9583435058594,
	"epoch": 0.024,
	"grad_norm": 0.021880364045500755,
	"kl": 0.00039958953857421875,
	"learning_rate": 8.199999999999999e-07,
	"loss": -0.1435,
	"reward": 0.11024168506264687,
	"reward_std": 0.32544056698679924,
	"rewards/cosine_scaled_reward": -0.19487916305661201,
	"rewards/format_reward": 0.5,
	"step": 42
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3448.7083740234375,
	"epoch": 0.02457142857142857,
	"grad_norm": 0.016641981899738312,
	"kl": 0.0004696846008300781,
	"learning_rate": 8.399999999999999e-07,
	"loss": 0.0654,
	"reward": -0.2966616526246071,
	"reward_std": 0.5851836632937193,
	"rewards/cosine_scaled_reward": -0.2108308244496584,
	"rewards/format_reward": 0.1250000037252903,
	"step": 43
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3186.4584350585938,
	"epoch": 0.025142857142857144,
	"grad_norm": 0.025406604632735252,
	"kl": 0.0004544258117675781,
	"learning_rate": 8.599999999999999e-07,
	"loss": 0.0373,
	"reward": 0.24428799748420715,
	"reward_std": 0.6496499851346016,
	"rewards/cosine_scaled_reward": -0.08618932589888573,
	"rewards/format_reward": 0.4166666679084301,
	"step": 44
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3035.8750610351562,
	"epoch": 0.025714285714285714,
	"grad_norm": 0.022738995030522346,
	"kl": 0.00051116943359375,
	"learning_rate": 8.799999999999999e-07,
	"loss": 0.1805,
	"reward": -0.13624755293130875,
	"reward_std": 0.8101175278425217,
	"rewards/cosine_scaled_reward": -0.21395711041986942,
	"rewards/format_reward": 0.2916666753590107,
	"step": 45
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3030.5416870117188,
	"epoch": 0.026285714285714287,
	"grad_norm": 0.015439880080521107,
	"kl": 0.0004506111145019531,
	"learning_rate": 9e-07,
	"loss": 0.0927,
	"reward": 0.9518533200025558,
	"reward_std": 0.8967212848365307,
	"rewards/cosine_scaled_reward": 0.20509332790970802,
	"rewards/format_reward": 0.5416666716337204,
	"step": 46
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2392.0833587646484,
	"epoch": 0.026857142857142857,
	"grad_norm": 0.016110830008983612,
	"kl": 0.00034999847412109375,
	"learning_rate": 9.2e-07,
	"loss": 0.0866,
	"reward": 0.8118329793214798,
	"reward_std": 0.6570356953889132,
	"rewards/cosine_scaled_reward": 0.11424979940056801,
	"rewards/format_reward": 0.5833333432674408,
	"step": 47
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2063.5833435058594,
	"epoch": 0.027428571428571427,
	"grad_norm": 0.02758549526333809,
	"kl": 0.0005931854248046875,
	"learning_rate": 9.399999999999999e-07,
	"loss": 0.09,
	"reward": 0.6152995973825455,
	"reward_std": 0.7233676761388779,
	"rewards/cosine_scaled_reward": -0.025683537125587463,
	"rewards/format_reward": 0.6666666865348816,
	"step": 48
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2459.9583740234375,
	"epoch": 0.028,
	"grad_norm": 0.02223382703959942,
	"kl": 0.0005383491516113281,
	"learning_rate": 9.6e-07,
	"loss": 0.0228,
	"reward": 0.4767443835735321,
	"reward_std": 0.6502058878540993,
	"rewards/cosine_scaled_reward": 0.009205527603626251,
	"rewards/format_reward": 0.4583333432674408,
	"step": 49
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3291.4166870117188,
	"epoch": 0.02857142857142857,
	"grad_norm": 0.01337195374071598,
	"kl": 0.0004801750183105469,
	"learning_rate": 9.8e-07,
	"loss": -0.0675,
	"reward": 0.18172279000282288,
	"reward_std": 0.23446273803710938,
	"rewards/cosine_scaled_reward": -0.034138597548007965,
	"rewards/format_reward": 0.25,
	"step": 50
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2837.3333435058594,
	"epoch": 0.029142857142857144,
	"grad_norm": 0.0264517143368721,
	"kl": 0.00040531158447265625,
	"learning_rate": 1e-06,
	"loss": 0.0691,
	"reward": 0.0855257548391819,
	"reward_std": 0.5013507194817066,
	"rewards/cosine_scaled_reward": -0.10307044349610806,
	"rewards/format_reward": 0.2916666679084301,
	"step": 51
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3550.125,
	"epoch": 0.029714285714285714,
	"grad_norm": 0.01194742787629366,
	"kl": 0.0003275871276855469,
	"learning_rate": 9.999890338174275e-07,
	"loss": 0.0148,
	"reward": -0.3245684579014778,
	"reward_std": 0.8235821425914764,
	"rewards/cosine_scaled_reward": -0.22478425258304924,
	"rewards/format_reward": 0.1250000037252903,
	"step": 52
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2964.2083435058594,
	"epoch": 0.030285714285714287,
	"grad_norm": 0.01602712646126747,
	"kl": 0.00046443939208984375,
	"learning_rate": 9.999561358041868e-07,
	"loss": -0.0535,
	"reward": 0.2959368694573641,
	"reward_std": 0.18181271478533745,
	"rewards/cosine_scaled_reward": 0.022968419827520847,
	"rewards/format_reward": 0.25,
	"step": 53
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3124.0,
	"epoch": 0.030857142857142857,
	"grad_norm": 0.017948586493730545,
	"kl": 0.0005545616149902344,
	"learning_rate": 9.999013075636804e-07,
	"loss": 0.0975,
	"reward": -0.4570858801016584,
	"reward_std": 0.25594667345285416,
	"rewards/cosine_scaled_reward": -0.3327096067368984,
	"rewards/format_reward": 0.2083333432674408,
	"step": 54
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3167.3750610351562,
	"epoch": 0.03142857142857143,
	"grad_norm": 0.014247610233724117,
	"kl": 0.00048542022705078125,
	"learning_rate": 9.998245517681593e-07,
	"loss": -0.0067,
	"reward": 0.13806065171957016,
	"reward_std": 0.8922518789768219,
	"rewards/cosine_scaled_reward": -0.07680301181972027,
	"rewards/format_reward": 0.291666679084301,
	"step": 55
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3063.3750610351562,
	"epoch": 0.032,
	"grad_norm": 0.01854483038187027,
	"kl": 0.0007352828979492188,
	"learning_rate": 9.997258721585931e-07,
	"loss": 0.16,
	"reward": -0.01539595052599907,
	"reward_std": 0.6964320801198483,
	"rewards/cosine_scaled_reward": -0.15353131107985973,
	"rewards/format_reward": 0.291666679084301,
	"step": 56
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2957.666717529297,
	"epoch": 0.03257142857142857,
	"grad_norm": 0.034658752381801605,
	"kl": 0.00042319297790527344,
	"learning_rate": 9.996052735444862e-07,
	"loss": 0.1077,
	"reward": -0.11001442139968276,
	"reward_std": 0.6499116308987141,
	"rewards/cosine_scaled_reward": -0.2216738946735859,
	"rewards/format_reward": 0.3333333432674408,
	"step": 57
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3472.9583740234375,
	"epoch": 0.03314285714285714,
	"grad_norm": 0.011575725860893726,
	"kl": 0.0004363059997558594,
	"learning_rate": 9.994627618036452e-07,
	"loss": 0.0248,
	"reward": -0.1939047873020172,
	"reward_std": 0.8678329139947891,
	"rewards/cosine_scaled_reward": -0.1802857331931591,
	"rewards/format_reward": 0.1666666679084301,
	"step": 58
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2450.541748046875,
	"epoch": 0.03371428571428572,
	"grad_norm": 0.02538195252418518,
	"kl": 0.0005178451538085938,
	"learning_rate": 9.992983438818915e-07,
	"loss": 0.2097,
	"reward": 0.2084958329796791,
	"reward_std": 0.7872938960790634,
	"rewards/cosine_scaled_reward": -0.18741872906684875,
	"rewards/format_reward": 0.583333358168602,
	"step": 59
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3508.0833740234375,
	"epoch": 0.03428571428571429,
	"grad_norm": 0.018474752083420753,
	"kl": 0.00039386749267578125,
	"learning_rate": 9.991120277927223e-07,
	"loss": 0.0386,
	"reward": -0.1972892191261053,
	"reward_std": 0.7005422301590443,
	"rewards/cosine_scaled_reward": -0.18197794491425157,
	"rewards/format_reward": 0.1666666679084301,
	"step": 60
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2241.916702270508,
	"epoch": 0.03485714285714286,
	"grad_norm": 0.027353286743164062,
	"kl": 0.0003466606140136719,
	"learning_rate": 9.989038226169207e-07,
	"loss": 0.2232,
	"reward": 0.8048897292464972,
	"reward_std": 1.2643243670463562,
	"rewards/cosine_scaled_reward": 0.0899448562413454,
	"rewards/format_reward": 0.6250000111758709,
	"step": 61
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2604.1666870117188,
	"epoch": 0.03542857142857143,
	"grad_norm": 0.022513169795274734,
	"kl": 0.0006260871887207031,
	"learning_rate": 9.98673738502114e-07,
	"loss": -0.0144,
	"reward": -0.23419425124302506,
	"reward_std": 0.3903382420539856,
	"rewards/cosine_scaled_reward": -0.3254304677248001,
	"rewards/format_reward": 0.4166666716337204,
	"step": 62
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3552.2500610351562,
	"epoch": 0.036,
	"grad_norm": 0.01291267666965723,
	"kl": 0.0005128383636474609,
	"learning_rate": 9.98421786662277e-07,
	"loss": 0.0182,
	"reward": -0.18873221427202225,
	"reward_std": 0.5512382872402668,
	"rewards/cosine_scaled_reward": -0.1360327743459493,
	"rewards/format_reward": 0.0833333358168602,
	"step": 63
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3535.8333740234375,
	"epoch": 0.036571428571428574,
	"grad_norm": 0.011109953746199608,
	"kl": 0.00037360191345214844,
	"learning_rate": 9.981479793771866e-07,
	"loss": 0.01,
	"reward": -0.44818597845733166,
	"reward_std": 0.25114433094859123,
	"rewards/cosine_scaled_reward": -0.2865929929539561,
	"rewards/format_reward": 0.125,
	"step": 64
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3338.5416870117188,
	"epoch": 0.037142857142857144,
	"grad_norm": 0.018163377419114113,
	"kl": 0.0006074905395507812,
	"learning_rate": 9.97852329991824e-07,
	"loss": 0.0984,
	"reward": -0.24769322806969285,
	"reward_std": 0.49852345883846283,
	"rewards/cosine_scaled_reward": -0.20717995800077915,
	"rewards/format_reward": 0.1666666716337204,
	"step": 65
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2866.3334045410156,
	"epoch": 0.037714285714285714,
	"grad_norm": 0.03914084658026695,
	"kl": 0.0003857612609863281,
	"learning_rate": 9.975348529157229e-07,
	"loss": 0.1522,
	"reward": 0.5251417439430952,
	"reward_std": 1.087289422750473,
	"rewards/cosine_scaled_reward": -0.00826246291399002,
	"rewards/format_reward": 0.541666679084301,
	"step": 66
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2491.2917098999023,
	"epoch": 0.038285714285714284,
	"grad_norm": 0.03254377841949463,
	"kl": 0.0004949569702148438,
	"learning_rate": 9.971955636222684e-07,
	"loss": 0.0968,
	"reward": 0.676957952324301,
	"reward_std": 0.9317026361823082,
	"rewards/cosine_scaled_reward": 0.046812308952212334,
	"rewards/format_reward": 0.5833333432674408,
	"step": 67
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3545.0416870117188,
	"epoch": 0.038857142857142854,
	"grad_norm": 0.010356022976338863,
	"kl": 0.0002932548522949219,
	"learning_rate": 9.968344786479415e-07,
	"loss": 0.0228,
	"reward": -0.4004784324206412,
	"reward_std": 0.4494715537875891,
	"rewards/cosine_scaled_reward": -0.22107255086302757,
	"rewards/format_reward": 0.0416666679084301,
	"step": 68
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2902.666748046875,
	"epoch": 0.03942857142857143,
	"grad_norm": 0.022468766197562218,
	"kl": 0.0005145072937011719,
	"learning_rate": 9.964516155915151e-07,
	"loss": 0.039,
	"reward": -0.08480488415807486,
	"reward_std": 0.5629407912492752,
	"rewards/cosine_scaled_reward": -0.292402446269989,
	"rewards/format_reward": 0.5000000149011612,
	"step": 69
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2750.3334045410156,
	"epoch": 0.04,
	"grad_norm": 0.04673980176448822,
	"kl": 0.0007410049438476562,
	"learning_rate": 9.960469931131936e-07,
	"loss": 0.2347,
	"reward": 0.22281695902347565,
	"reward_std": 0.6946056261658669,
	"rewards/cosine_scaled_reward": -0.07609154284000397,
	"rewards/format_reward": 0.3750000037252903,
	"step": 70
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2139.7083892822266,
	"epoch": 0.04057142857142857,
	"grad_norm": 0.022997990250587463,
	"kl": 0.00041031837463378906,
	"learning_rate": 9.956206309337066e-07,
	"loss": 0.0503,
	"reward": 1.0238905474543571,
	"reward_std": 0.3659038320183754,
	"rewards/cosine_scaled_reward": 0.19944527000188828,
	"rewards/format_reward": 0.625,
	"step": 71
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3038.5833435058594,
	"epoch": 0.04114285714285714,
	"grad_norm": 0.015527274459600449,
	"kl": 0.0004353523254394531,
	"learning_rate": 9.951725498333448e-07,
	"loss": -0.0403,
	"reward": 0.057796329259872437,
	"reward_std": 0.43705910444259644,
	"rewards/cosine_scaled_reward": -0.09610185027122498,
	"rewards/format_reward": 0.25,
	"step": 72
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2542.416717529297,
	"epoch": 0.04171428571428572,
	"grad_norm": 0.031132198870182037,
	"kl": 0.0004696846008300781,
	"learning_rate": 9.947027716509488e-07,
	"loss": 0.087,
	"reward": 0.4634501487016678,
	"reward_std": 0.6224832870066166,
	"rewards/cosine_scaled_reward": -0.0807749442756176,
	"rewards/format_reward": 0.6250000149011612,
	"step": 73
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2811.375030517578,
	"epoch": 0.04228571428571429,
	"grad_norm": 0.017915023490786552,
	"kl": 0.0003814697265625,
	"learning_rate": 9.942113192828444e-07,
	"loss": 0.0416,
	"reward": 0.5657302290201187,
	"reward_std": 0.4264005981385708,
	"rewards/cosine_scaled_reward": 0.03286512568593025,
	"rewards/format_reward": 0.5,
	"step": 74
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2507.0000610351562,
	"epoch": 0.04285714285714286,
	"grad_norm": 0.04219174385070801,
	"kl": 0.000507354736328125,
	"learning_rate": 9.93698216681727e-07,
	"loss": 0.1314,
	"reward": 0.33457985022687353,
	"reward_std": 0.9787873476743698,
	"rewards/cosine_scaled_reward": -0.08271008729934692,
	"rewards/format_reward": 0.5000000037252903,
	"step": 75
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3257.2083740234375,
	"epoch": 0.04342857142857143,
	"grad_norm": 0.016020679846405983,
	"kl": 0.0005130767822265625,
	"learning_rate": 9.931634888554935e-07,
	"loss": 0.0017,
	"reward": 0.03311159461736679,
	"reward_std": 0.8149497471749783,
	"rewards/cosine_scaled_reward": -0.170944195240736,
	"rewards/format_reward": 0.3750000111758709,
	"step": 76
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2980.9584350585938,
	"epoch": 0.044,
	"grad_norm": 0.01863541640341282,
	"kl": 0.0004639625549316406,
	"learning_rate": 9.926071618660237e-07,
	"loss": 0.0415,
	"reward": 0.25933826714754105,
	"reward_std": 0.6607147231698036,
	"rewards/cosine_scaled_reward": -0.09949754178524017,
	"rewards/format_reward": 0.4583333358168602,
	"step": 77
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2690.5000915527344,
	"epoch": 0.044571428571428574,
	"grad_norm": 0.044503167271614075,
	"kl": 0.0007963180541992188,
	"learning_rate": 9.9202926282791e-07,
	"loss": 0.2539,
	"reward": 0.40623846650123596,
	"reward_std": 0.999249055981636,
	"rewards/cosine_scaled_reward": -0.026047438383102417,
	"rewards/format_reward": 0.4583333507180214,
	"step": 78
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2197.500030517578,
	"epoch": 0.045142857142857144,
	"grad_norm": 0.03637674078345299,
	"kl": 0.0004515647888183594,
	"learning_rate": 9.91429819907136e-07,
	"loss": 0.245,
	"reward": 0.35764368530362844,
	"reward_std": 0.7761036828160286,
	"rewards/cosine_scaled_reward": -0.13367816805839539,
	"rewards/format_reward": 0.6250000074505806,
	"step": 79
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3167.7500610351562,
	"epoch": 0.045714285714285714,
	"grad_norm": 0.016760632395744324,
	"kl": 0.0005090236663818359,
	"learning_rate": 9.908088623197048e-07,
	"loss": 0.0648,
	"reward": 0.6552756018936634,
	"reward_std": 0.888429120182991,
	"rewards/cosine_scaled_reward": 0.11930444650352001,
	"rewards/format_reward": 0.4166666828095913,
	"step": 80
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3178.4583740234375,
	"epoch": 0.046285714285714284,
	"grad_norm": 0.018774185329675674,
	"kl": 0.0004324913024902344,
	"learning_rate": 9.901664203302124e-07,
	"loss": 0.096,
	"reward": 0.3727112878113985,
	"reward_std": 0.6679159682244062,
	"rewards/cosine_scaled_reward": 0.019688975531607866,
	"rewards/format_reward": 0.3333333432674408,
	"step": 81
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3135.25,
	"epoch": 0.046857142857142854,
	"grad_norm": 0.014164636842906475,
	"kl": 0.0003552436828613281,
	"learning_rate": 9.895025252503755e-07,
	"loss": -0.0475,
	"reward": 0.1692640781402588,
	"reward_std": 0.2734921835362911,
	"rewards/cosine_scaled_reward": -0.0403679758310318,
	"rewards/format_reward": 0.25,
	"step": 82
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1449.3750762939453,
	"epoch": 0.04742857142857143,
	"grad_norm": 0.027206717059016228,
	"kl": 0.0006706714630126953,
	"learning_rate": 9.888172094375033e-07,
	"loss": 0.1933,
	"reward": 0.888194240629673,
	"reward_std": 0.8644632250070572,
	"rewards/cosine_scaled_reward": 0.027430432848632336,
	"rewards/format_reward": 0.8333333432674408,
	"step": 83
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2289.875030517578,
	"epoch": 0.048,
	"grad_norm": 0.018596787005662918,
	"kl": 0.0004329681396484375,
	"learning_rate": 9.881105062929221e-07,
	"loss": -0.0513,
	"reward": 0.2941868454217911,
	"reward_std": 0.6860168538987637,
	"rewards/cosine_scaled_reward": -0.18623991776257753,
	"rewards/format_reward": 0.6666666865348816,
	"step": 84
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2823.2083435058594,
	"epoch": 0.04857142857142857,
	"grad_norm": 0.01691795513033867,
	"kl": 0.0005006790161132812,
	"learning_rate": 9.873824502603459e-07,
	"loss": 0.0365,
	"reward": -0.15082548558712006,
	"reward_std": 0.26336194574832916,
	"rewards/cosine_scaled_reward": -0.24207941442728043,
	"rewards/format_reward": 0.3333333358168602,
	"step": 85
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2668.7083740234375,
	"epoch": 0.04914285714285714,
	"grad_norm": 0.014490959234535694,
	"kl": 0.0003361701965332031,
	"learning_rate": 9.866330768241983e-07,
	"loss": 0.0614,
	"reward": 1.0604897737503052,
	"reward_std": 1.551704853773117,
	"rewards/cosine_scaled_reward": 0.19691153056919575,
	"rewards/format_reward": 0.6666666865348816,
	"step": 86
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2975.3750610351562,
	"epoch": 0.04971428571428571,
	"grad_norm": 0.01852068305015564,
	"kl": 0.0004734992980957031,
	"learning_rate": 9.85862422507884e-07,
	"loss": -0.0494,
	"reward": 0.4187099374830723,
	"reward_std": 0.7260430231690407,
	"rewards/cosine_scaled_reward": 0.021854941733181477,
	"rewards/format_reward": 0.3750000037252903,
	"step": 87
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2801.7501220703125,
	"epoch": 0.05028571428571429,
	"grad_norm": 0.01830633357167244,
	"kl": 0.0005750656127929688,
	"learning_rate": 9.850705248720068e-07,
	"loss": 0.0263,
	"reward": 0.018878452479839325,
	"reward_std": 0.6697305515408516,
	"rewards/cosine_scaled_reward": -0.2822274398058653,
	"rewards/format_reward": 0.5833333395421505,
	"step": 88
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2697.416717529297,
	"epoch": 0.05085714285714286,
	"grad_norm": 0.023486582562327385,
	"kl": 0.0004010200500488281,
	"learning_rate": 9.8425742251254e-07,
	"loss": -0.0038,
	"reward": 0.29166828095912933,
	"reward_std": 0.7786018922924995,
	"rewards/cosine_scaled_reward": -0.08333254605531693,
	"rewards/format_reward": 0.4583333395421505,
	"step": 89
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2538.0833740234375,
	"epoch": 0.05142857142857143,
	"grad_norm": 0.010524451732635498,
	"kl": 0.0002994537353515625,
	"learning_rate": 9.83423155058946e-07,
	"loss": 0.0346,
	"reward": 0.7873217761516571,
	"reward_std": 0.763067714869976,
	"rewards/cosine_scaled_reward": 0.08116088062524796,
	"rewards/format_reward": 0.625,
	"step": 90
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3094.8333740234375,
	"epoch": 0.052,
	"grad_norm": 0.03540240228176117,
	"kl": 0.0004677772521972656,
	"learning_rate": 9.825677631722435e-07,
	"loss": 0.0673,
	"reward": 0.1670377204718534,
	"reward_std": 0.6299600079655647,
	"rewards/cosine_scaled_reward": -0.0831478089094162,
	"rewards/format_reward": 0.3333333432674408,
	"step": 91
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2744.541717529297,
	"epoch": 0.052571428571428575,
	"grad_norm": 0.03803950175642967,
	"kl": 0.0004138946533203125,
	"learning_rate": 9.816912885430258e-07,
	"loss": 0.1363,
	"reward": 0.2968802750110626,
	"reward_std": 0.4654075037688017,
	"rewards/cosine_scaled_reward": 0.002606801688671112,
	"rewards/format_reward": 0.2916666679084301,
	"step": 92
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2166.9583435058594,
	"epoch": 0.053142857142857144,
	"grad_norm": 0.047334711998701096,
	"kl": 0.000446319580078125,
	"learning_rate": 9.807937738894303e-07,
	"loss": 0.0582,
	"reward": 0.20845970511436462,
	"reward_std": 0.3487181942909956,
	"rewards/cosine_scaled_reward": -0.14577015489339828,
	"rewards/format_reward": 0.5,
	"step": 93
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2899.2083740234375,
	"epoch": 0.053714285714285714,
	"grad_norm": 0.013924806378781796,
	"kl": 0.00037860870361328125,
	"learning_rate": 9.798752629550546e-07,
	"loss": 0.0477,
	"reward": 0.4423799216747284,
	"reward_std": 0.8836686909198761,
	"rewards/cosine_scaled_reward": -0.0913100466132164,
	"rewards/format_reward": 0.6250000260770321,
	"step": 94
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2209.4583740234375,
	"epoch": 0.054285714285714284,
	"grad_norm": 0.0459674671292305,
	"kl": 0.0005049705505371094,
	"learning_rate": 9.78935800506826e-07,
	"loss": 0.2862,
	"reward": 0.9317682832479477,
	"reward_std": 0.7728890106081963,
	"rewards/cosine_scaled_reward": 0.07005079090595245,
	"rewards/format_reward": 0.791666679084301,
	"step": 95
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3184.416748046875,
	"epoch": 0.054857142857142854,
	"grad_norm": 0.017193680629134178,
	"kl": 0.00043773651123046875,
	"learning_rate": 9.779754323328192e-07,
	"loss": 0.0105,
	"reward": 0.38489115983247757,
	"reward_std": 1.058574389666319,
	"rewards/cosine_scaled_reward": -0.015887772024143487,
	"rewards/format_reward": 0.4166666828095913,
	"step": 96
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2542.9583740234375,
	"epoch": 0.05542857142857143,
	"grad_norm": 0.03288557752966881,
	"kl": 0.0006394386291503906,
	"learning_rate": 9.769942052400235e-07,
	"loss": 0.0173,
	"reward": -0.20064683258533478,
	"reward_std": 0.43674986250698566,
	"rewards/cosine_scaled_reward": -0.32949007861316204,
	"rewards/format_reward": 0.4583333432674408,
	"step": 97
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2007.2500457763672,
	"epoch": 0.056,
	"grad_norm": 0.020400483161211014,
	"kl": 0.0003914833068847656,
	"learning_rate": 9.759921670520634e-07,
	"loss": 0.0489,
	"reward": 1.0779699385166168,
	"reward_std": 1.257737785577774,
	"rewards/cosine_scaled_reward": 0.18481825292110443,
	"rewards/format_reward": 0.7083333358168602,
	"step": 98
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2389.5833587646484,
	"epoch": 0.05657142857142857,
	"grad_norm": 0.019844507798552513,
	"kl": 0.0003876686096191406,
	"learning_rate": 9.749693666068663e-07,
	"loss": 0.0556,
	"reward": 0.953456562012434,
	"reward_std": 0.7421619556844234,
	"rewards/cosine_scaled_reward": 0.2058949265629053,
	"rewards/format_reward": 0.541666679084301,
	"step": 99
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2124.0416717529297,
	"epoch": 0.05714285714285714,
	"grad_norm": 0.030665883794426918,
	"kl": 0.0005888938903808594,
	"learning_rate": 9.739258537542835e-07,
	"loss": -0.0126,
	"reward": 0.47896020486950874,
	"reward_std": 0.677450954914093,
	"rewards/cosine_scaled_reward": -0.03135322220623493,
	"rewards/format_reward": 0.5416666679084301,
	"step": 100
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3004.041748046875,
	"epoch": 0.05771428571428571,
	"grad_norm": 0.04486413672566414,
	"kl": 0.0004496574401855469,
	"learning_rate": 9.728616793536587e-07,
	"loss": 0.0735,
	"reward": 0.022397130727767944,
	"reward_std": 0.7927646785974503,
	"rewards/cosine_scaled_reward": -0.19713477417826653,
	"rewards/format_reward": 0.416666679084301,
	"step": 101
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3206.875,
	"epoch": 0.05828571428571429,
	"grad_norm": 0.035310786217451096,
	"kl": 0.0003910064697265625,
	"learning_rate": 9.717768952713511e-07,
	"loss": 0.1437,
	"reward": -0.08731867372989655,
	"reward_std": 0.9576195627450943,
	"rewards/cosine_scaled_reward": -0.16865937039256096,
	"rewards/format_reward": 0.2500000037252903,
	"step": 102
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3014.3750610351562,
	"epoch": 0.05885714285714286,
	"grad_norm": 0.031477976590394974,
	"kl": 0.0004429817199707031,
	"learning_rate": 9.706715543782064e-07,
	"loss": 0.1671,
	"reward": -0.22813038900494576,
	"reward_std": 0.688251368701458,
	"rewards/cosine_scaled_reward": -0.301565196365118,
	"rewards/format_reward": 0.3750000074505806,
	"step": 103
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3124.25,
	"epoch": 0.05942857142857143,
	"grad_norm": 0.02040957659482956,
	"kl": 0.0004067420959472656,
	"learning_rate": 9.695457105469804e-07,
	"loss": -0.0155,
	"reward": -0.01127801463007927,
	"reward_std": 0.39767561107873917,
	"rewards/cosine_scaled_reward": -0.1306389942765236,
	"rewards/format_reward": 0.25,
	"step": 104
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2859.416717529297,
	"epoch": 0.06,
	"grad_norm": 0.014855082146823406,
	"kl": 0.00042057037353515625,
	"learning_rate": 9.683994186497132e-07,
	"loss": 0.0882,
	"reward": -0.13864438608288765,
	"reward_std": 0.40612044744193554,
	"rewards/cosine_scaled_reward": -0.23598887026309967,
	"rewards/format_reward": 0.3333333358168602,
	"step": 105
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2882.0416717529297,
	"epoch": 0.060571428571428575,
	"grad_norm": 0.01957513391971588,
	"kl": 0.00047016143798828125,
	"learning_rate": 9.672327345550543e-07,
	"loss": -0.0236,
	"reward": 0.05208197236061096,
	"reward_std": 0.6449095346033573,
	"rewards/cosine_scaled_reward": -0.11979235336184502,
	"rewards/format_reward": 0.2916666679084301,
	"step": 106
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2631.000045776367,
	"epoch": 0.061142857142857145,
	"grad_norm": 0.018131662160158157,
	"kl": 0.0004868507385253906,
	"learning_rate": 9.66045715125541e-07,
	"loss": 0.0708,
	"reward": -0.09214365109801292,
	"reward_std": 0.40418105013668537,
	"rewards/cosine_scaled_reward": -0.2544051744043827,
	"rewards/format_reward": 0.4166666716337204,
	"step": 107
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2191.2916717529297,
	"epoch": 0.061714285714285715,
	"grad_norm": 0.03655322641134262,
	"kl": 0.0003566741943359375,
	"learning_rate": 9.648384182148252e-07,
	"loss": 0.1124,
	"reward": 0.02219559997320175,
	"reward_std": 0.20816783979535103,
	"rewards/cosine_scaled_reward": -0.23890221491456032,
	"rewards/format_reward": 0.5,
	"step": 108
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3294.125,
	"epoch": 0.062285714285714285,
	"grad_norm": 0.02335178665816784,
	"kl": 0.000438690185546875,
	"learning_rate": 9.636109026648554e-07,
	"loss": 0.034,
	"reward": -0.42653578519821167,
	"reward_std": 0.2618470564484596,
	"rewards/cosine_scaled_reward": -0.27576790004968643,
	"rewards/format_reward": 0.125,
	"step": 109
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3212.7916870117188,
	"epoch": 0.06285714285714286,
	"grad_norm": 0.013373509049415588,
	"kl": 0.0003647804260253906,
	"learning_rate": 9.623632283030077e-07,
	"loss": 0.0984,
	"reward": -0.35255161160603166,
	"reward_std": 0.34573741257190704,
	"rewards/cosine_scaled_reward": -0.2804424799978733,
	"rewards/format_reward": 0.2083333395421505,
	"step": 110
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2838.5833435058594,
	"epoch": 0.06342857142857143,
	"grad_norm": 0.02231280505657196,
	"kl": 0.000438690185546875,
	"learning_rate": 9.610954559391704e-07,
	"loss": 0.0041,
	"reward": -0.08670877665281296,
	"reward_std": 0.4552724286913872,
	"rewards/cosine_scaled_reward": -0.16835440043359995,
	"rewards/format_reward": 0.25,
	"step": 111
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3045.6666870117188,
	"epoch": 0.064,
	"grad_norm": 0.01650678738951683,
	"kl": 0.0003314018249511719,
	"learning_rate": 9.598076473627796e-07,
	"loss": -0.0241,
	"reward": 0.5308302510529757,
	"reward_std": 0.7812503390014172,
	"rewards/cosine_scaled_reward": 0.07791512738913298,
	"rewards/format_reward": 0.3750000037252903,
	"step": 112
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2850.4166870117188,
	"epoch": 0.06457142857142857,
	"grad_norm": 0.029017914086580276,
	"kl": 0.00040984153747558594,
	"learning_rate": 9.58499865339809e-07,
	"loss": 0.0898,
	"reward": 0.07424558838829398,
	"reward_std": 0.701502051204443,
	"rewards/cosine_scaled_reward": -0.17121053859591484,
	"rewards/format_reward": 0.4166666679084301,
	"step": 113
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3156.1666870117188,
	"epoch": 0.06514285714285714,
	"grad_norm": 0.013486391864717007,
	"kl": 0.00036525726318359375,
	"learning_rate": 9.571721736097088e-07,
	"loss": -0.0383,
	"reward": -0.05461219698190689,
	"reward_std": 0.5745192095637321,
	"rewards/cosine_scaled_reward": -0.1939727613935247,
	"rewards/format_reward": 0.3333333358168602,
	"step": 114
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2846.9583740234375,
	"epoch": 0.06571428571428571,
	"grad_norm": 0.01851780340075493,
	"kl": 0.0004734992980957031,
	"learning_rate": 9.55824636882301e-07,
	"loss": 0.1023,
	"reward": -0.03960709646344185,
	"reward_std": 0.7411026880145073,
	"rewards/cosine_scaled_reward": -0.18647022545337677,
	"rewards/format_reward": 0.3333333358168602,
	"step": 115
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3016.7083435058594,
	"epoch": 0.06628571428571428,
	"grad_norm": 0.012387678027153015,
	"kl": 0.0004115104675292969,
	"learning_rate": 9.54457320834625e-07,
	"loss": 0.0442,
	"reward": -0.4454263895750046,
	"reward_std": 0.34630244970321655,
	"rewards/cosine_scaled_reward": -0.3477131985127926,
	"rewards/format_reward": 0.25,
	"step": 116
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3214.125,
	"epoch": 0.06685714285714285,
	"grad_norm": 0.010518810711801052,
	"kl": 0.0004563331604003906,
	"learning_rate": 9.530702921077358e-07,
	"loss": -0.0368,
	"reward": 0.24042409658432007,
	"reward_std": 0.21273156255483627,
	"rewards/cosine_scaled_reward": -0.004787934944033623,
	"rewards/format_reward": 0.25,
	"step": 117
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2701.125045776367,
	"epoch": 0.06742857142857143,
	"grad_norm": 0.020669570192694664,
	"kl": 0.00047397613525390625,
	"learning_rate": 9.516636183034564e-07,
	"loss": 0.0176,
	"reward": 0.029145129024982452,
	"reward_std": 0.4890642985701561,
	"rewards/cosine_scaled_reward": -0.15209410339593887,
	"rewards/format_reward": 0.3333333358168602,
	"step": 118
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2052.625030517578,
	"epoch": 0.068,
	"grad_norm": 0.04713466763496399,
	"kl": 0.0004968643188476562,
	"learning_rate": 9.502373679810839e-07,
	"loss": 0.2029,
	"reward": 0.8692835792899132,
	"reward_std": 0.5231802985072136,
	"rewards/cosine_scaled_reward": 0.1429751217365265,
	"rewards/format_reward": 0.5833333358168602,
	"step": 119
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3555.3333740234375,
	"epoch": 0.06857142857142857,
	"grad_norm": 0.011495165526866913,
	"kl": 0.0003578662872314453,
	"learning_rate": 9.487916106540465e-07,
	"loss": 0.0078,
	"reward": -0.3606860190629959,
	"reward_std": 0.5378699135035276,
	"rewards/cosine_scaled_reward": -0.24284303188323975,
	"rewards/format_reward": 0.125,
	"step": 120
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3037.0,
	"epoch": 0.06914285714285714,
	"grad_norm": 0.040560223162174225,
	"kl": 0.00043773651123046875,
	"learning_rate": 9.473264167865171e-07,
	"loss": 0.1242,
	"reward": -0.06026811897754669,
	"reward_std": 0.48589139245450497,
	"rewards/cosine_scaled_reward": -0.15513405948877335,
	"rewards/format_reward": 0.25,
	"step": 121
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2017.791748046875,
	"epoch": 0.06971428571428571,
	"grad_norm": 0.021992556750774384,
	"kl": 0.0003094673156738281,
	"learning_rate": 9.458418577899774e-07,
	"loss": 0.2174,
	"reward": 1.1798989064991474,
	"reward_std": 0.7208777815103531,
	"rewards/cosine_scaled_reward": 0.21494942158460617,
	"rewards/format_reward": 0.7500000223517418,
	"step": 122
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2790.0833740234375,
	"epoch": 0.07028571428571428,
	"grad_norm": 0.016002673655748367,
	"kl": 0.0004038810729980469,
	"learning_rate": 9.443380060197385e-07,
	"loss": 0.0023,
	"reward": 0.33917365968227386,
	"reward_std": 1.0576607063412666,
	"rewards/cosine_scaled_reward": -0.08041317760944366,
	"rewards/format_reward": 0.5000000149011612,
	"step": 123
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3338.7500610351562,
	"epoch": 0.07085714285714285,
	"grad_norm": 0.010752753354609013,
	"kl": 0.00029778480529785156,
	"learning_rate": 9.428149347714143e-07,
	"loss": 0.0352,
	"reward": 0.3968821354210377,
	"reward_std": 0.5420339666306973,
	"rewards/cosine_scaled_reward": -0.009892286732792854,
	"rewards/format_reward": 0.4166666865348816,
	"step": 124
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2333.291717529297,
	"epoch": 0.07142857142857142,
	"grad_norm": 0.05020369216799736,
	"kl": 0.00043487548828125,
	"learning_rate": 9.412727182773486e-07,
	"loss": 0.1914,
	"reward": 0.29357251059263945,
	"reward_std": 0.5018073245882988,
	"rewards/cosine_scaled_reward": -0.18654709309339523,
	"rewards/format_reward": 0.6666666716337204,
	"step": 125
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1793.6666870117188,
	"epoch": 0.072,
	"grad_norm": 0.026897089555859566,
	"kl": 0.0003814697265625,
	"learning_rate": 9.397114317029974e-07,
	"loss": 0.1009,
	"reward": 0.33217477798461914,
	"reward_std": 0.8002122193574905,
	"rewards/cosine_scaled_reward": -0.20891261473298073,
	"rewards/format_reward": 0.7500000111758709,
	"step": 126
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2812.0834045410156,
	"epoch": 0.07257142857142856,
	"grad_norm": 0.013029903173446655,
	"kl": 0.0003662109375,
	"learning_rate": 9.381311511432658e-07,
	"loss": 0.0056,
	"reward": 0.6684755804017186,
	"reward_std": 0.9957198947668076,
	"rewards/cosine_scaled_reward": 0.10507109388709068,
	"rewards/format_reward": 0.4583333395421505,
	"step": 127
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3381.7500610351562,
	"epoch": 0.07314285714285715,
	"grad_norm": 0.014719157479703426,
	"kl": 0.0004086494445800781,
	"learning_rate": 9.36531953618799e-07,
	"loss": 0.0262,
	"reward": 0.14912248402833939,
	"reward_std": 0.6112966164946556,
	"rewards/cosine_scaled_reward": -0.0504387766122818,
	"rewards/format_reward": 0.2500000111758709,
	"step": 128
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2489.6666870117188,
	"epoch": 0.07371428571428572,
	"grad_norm": 0.02591477520763874,
	"kl": 0.0003113746643066406,
	"learning_rate": 9.34913917072228e-07,
	"loss": 0.0579,
	"reward": 0.2893460839986801,
	"reward_std": 0.34850335121154785,
	"rewards/cosine_scaled_reward": -0.10532695800065994,
	"rewards/format_reward": 0.5,
	"step": 129
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1840.0000305175781,
	"epoch": 0.07428571428571429,
	"grad_norm": 0.03489629924297333,
	"kl": 0.0003867149353027344,
	"learning_rate": 9.332771203643714e-07,
	"loss": 0.0713,
	"reward": 0.5813919119536877,
	"reward_std": 0.6657490562647581,
	"rewards/cosine_scaled_reward": -0.06347071845084429,
	"rewards/format_reward": 0.7083333432674408,
	"step": 130
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3027.5000610351562,
	"epoch": 0.07485714285714286,
	"grad_norm": 0.031031399965286255,
	"kl": 0.00043010711669921875,
	"learning_rate": 9.316216432703916e-07,
	"loss": -0.0614,
	"reward": -0.35745152831077576,
	"reward_std": 0.42021336406469345,
	"rewards/cosine_scaled_reward": -0.3453924432396889,
	"rewards/format_reward": 0.3333333358168602,
	"step": 131
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2721.041717529297,
	"epoch": 0.07542857142857143,
	"grad_norm": 0.019737781956791878,
	"kl": 0.0003380775451660156,
	"learning_rate": 9.299475664759068e-07,
	"loss": -0.1046,
	"reward": 0.52672129124403,
	"reward_std": 0.818169629201293,
	"rewards/cosine_scaled_reward": -0.049139365553855896,
	"rewards/format_reward": 0.6250000149011612,
	"step": 132
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3006.125,
	"epoch": 0.076,
	"grad_norm": 0.014350071549415588,
	"kl": 0.00035190582275390625,
	"learning_rate": 9.282549715730579e-07,
	"loss": 0.0195,
	"reward": 0.436140738427639,
	"reward_std": 0.799125649034977,
	"rewards/cosine_scaled_reward": 0.009737027809023857,
	"rewards/format_reward": 0.4166666716337204,
	"step": 133
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3342.0000610351562,
	"epoch": 0.07657142857142857,
	"grad_norm": 0.013594014570116997,
	"kl": 0.0003528594970703125,
	"learning_rate": 9.265439410565328e-07,
	"loss": 0.0027,
	"reward": 0.4128790497779846,
	"reward_std": 0.9192672446370125,
	"rewards/cosine_scaled_reward": 0.039772857911884785,
	"rewards/format_reward": 0.3333333395421505,
	"step": 134
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2860.0833435058594,
	"epoch": 0.07714285714285714,
	"grad_norm": 0.020422151312232018,
	"kl": 0.000537872314453125,
	"learning_rate": 9.248145583195447e-07,
	"loss": -0.0067,
	"reward": -0.4891085624694824,
	"reward_std": 0.28661480732262135,
	"rewards/cosine_scaled_reward": -0.3695542886853218,
	"rewards/format_reward": 0.25,
	"step": 135
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1929.3750610351562,
	"epoch": 0.07771428571428571,
	"grad_norm": 0.03446084260940552,
	"kl": 0.0007333755493164062,
	"learning_rate": 9.230669076497687e-07,
	"loss": 0.1816,
	"reward": 0.3107494944706559,
	"reward_std": 0.7272366061806679,
	"rewards/cosine_scaled_reward": -0.15712526440620422,
	"rewards/format_reward": 0.6250000037252903,
	"step": 136
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2655.000030517578,
	"epoch": 0.07828571428571429,
	"grad_norm": 0.01656595803797245,
	"kl": 0.0002579689025878906,
	"learning_rate": 9.213010742252327e-07,
	"loss": -0.0522,
	"reward": 0.6488993316888809,
	"reward_std": 0.77546676248312,
	"rewards/cosine_scaled_reward": 0.011949680745601654,
	"rewards/format_reward": 0.625,
	"step": 137
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2360.5416870117188,
	"epoch": 0.07885714285714286,
	"grad_norm": 0.01801217719912529,
	"kl": 0.0003211498260498047,
	"learning_rate": 9.195171441101668e-07,
	"loss": -0.0965,
	"reward": 0.3774528503417969,
	"reward_std": 0.27286792919039726,
	"rewards/cosine_scaled_reward": -0.06127360463142395,
	"rewards/format_reward": 0.5,
	"step": 138
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3113.3750610351562,
	"epoch": 0.07942857142857143,
	"grad_norm": 0.015139062888920307,
	"kl": 0.0002903938293457031,
	"learning_rate": 9.177152042508077e-07,
	"loss": 0.1314,
	"reward": 0.18484138697385788,
	"reward_std": 0.8956813514232635,
	"rewards/cosine_scaled_reward": -0.11591265327297151,
	"rewards/format_reward": 0.416666679084301,
	"step": 139
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2925.1666717529297,
	"epoch": 0.08,
	"grad_norm": 0.011720138601958752,
	"kl": 0.0002703666687011719,
	"learning_rate": 9.158953424711624e-07,
	"loss": 0.0138,
	"reward": 0.18188539519906044,
	"reward_std": 0.5702639557421207,
	"rewards/cosine_scaled_reward": -0.07572397217154503,
	"rewards/format_reward": 0.3333333358168602,
	"step": 140
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2260.7083740234375,
	"epoch": 0.08057142857142857,
	"grad_norm": 0.07121221721172333,
	"kl": 0.0005216598510742188,
	"learning_rate": 9.140576474687263e-07,
	"loss": 0.2931,
	"reward": 0.15240047996303474,
	"reward_std": 0.9472056925296783,
	"rewards/cosine_scaled_reward": -0.2154664322733879,
	"rewards/format_reward": 0.5833333544433117,
	"step": 141
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2265.5000610351562,
	"epoch": 0.08114285714285714,
	"grad_norm": 0.054452214390039444,
	"kl": 0.0006041526794433594,
	"learning_rate": 9.122022088101613e-07,
	"loss": 0.242,
	"reward": 0.7805991023778915,
	"reward_std": 0.9608509242534637,
	"rewards/cosine_scaled_reward": 0.05696620047092438,
	"rewards/format_reward": 0.6666666939854622,
	"step": 142
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2732.2917098999023,
	"epoch": 0.08171428571428571,
	"grad_norm": 0.017999855801463127,
	"kl": 0.0003368854522705078,
	"learning_rate": 9.103291169269299e-07,
	"loss": 0.0514,
	"reward": 1.025037132203579,
	"reward_std": 0.7192419245839119,
	"rewards/cosine_scaled_reward": 0.2625185213983059,
	"rewards/format_reward": 0.5000000111758709,
	"step": 143
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2587.8750915527344,
	"epoch": 0.08228571428571428,
	"grad_norm": 0.01989753730595112,
	"kl": 0.0004143714904785156,
	"learning_rate": 9.084384631108882e-07,
	"loss": 0.1676,
	"reward": 0.8056844659149647,
	"reward_std": 0.7985115312039852,
	"rewards/cosine_scaled_reward": 0.0695088729262352,
	"rewards/format_reward": 0.6666666679084301,
	"step": 144
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1967.666748046875,
	"epoch": 0.08285714285714285,
	"grad_norm": 0.06734281778335571,
	"kl": 0.0005283355712890625,
	"learning_rate": 9.065303395098358e-07,
	"loss": 0.2562,
	"reward": 0.2510095611214638,
	"reward_std": 0.6584825366735458,
	"rewards/cosine_scaled_reward": -0.2286618910729885,
	"rewards/format_reward": 0.7083333432674408,
	"step": 145
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2411.416717529297,
	"epoch": 0.08342857142857144,
	"grad_norm": 0.0633506253361702,
	"kl": 0.000423431396484375,
	"learning_rate": 9.046048391230247e-07,
	"loss": 0.2981,
	"reward": 0.670308992266655,
	"reward_std": 1.008466713130474,
	"rewards/cosine_scaled_reward": 0.0851544663310051,
	"rewards/format_reward": 0.5,
	"step": 146
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2494.041748046875,
	"epoch": 0.084,
	"grad_norm": 0.035755012184381485,
	"kl": 0.00038242340087890625,
	"learning_rate": 9.026620557966279e-07,
	"loss": 0.1709,
	"reward": 0.34743132442235947,
	"reward_std": 0.7957043498754501,
	"rewards/cosine_scaled_reward": -0.03461768664419651,
	"rewards/format_reward": 0.4166666716337204,
	"step": 147
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2072.750045776367,
	"epoch": 0.08457142857142858,
	"grad_norm": 0.026552215218544006,
	"kl": 0.0003905296325683594,
	"learning_rate": 9.007020842191634e-07,
	"loss": -0.01,
	"reward": 0.576688677072525,
	"reward_std": 0.5415612012147903,
	"rewards/cosine_scaled_reward": -0.08665566146373749,
	"rewards/format_reward": 0.7500000111758709,
	"step": 148
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2761.7084350585938,
	"epoch": 0.08514285714285715,
	"grad_norm": 0.08269675821065903,
	"kl": 0.0005259513854980469,
	"learning_rate": 8.987250199168808e-07,
	"loss": 0.2554,
	"reward": -0.020979389548301697,
	"reward_std": 0.6839594691991806,
	"rewards/cosine_scaled_reward": -0.2188230287283659,
	"rewards/format_reward": 0.416666679084301,
	"step": 149
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2854.875030517578,
	"epoch": 0.08571428571428572,
	"grad_norm": 0.024635691195726395,
	"kl": 0.0005636215209960938,
	"learning_rate": 8.967309592491052e-07,
	"loss": 0.215,
	"reward": -0.06861633434891701,
	"reward_std": 0.48412579856812954,
	"rewards/cosine_scaled_reward": -0.22180816903710365,
	"rewards/format_reward": 0.3750000149011612,
	"step": 150
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2962.0000915527344,
	"epoch": 0.08628571428571429,
	"grad_norm": 0.015141277574002743,
	"kl": 0.00032520294189453125,
	"learning_rate": 8.9471999940354e-07,
	"loss": 0.0071,
	"reward": 0.32280058413743973,
	"reward_std": 0.8727270662784576,
	"rewards/cosine_scaled_reward": -0.06776639446616173,
	"rewards/format_reward": 0.4583333395421505,
	"step": 151
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2867.7500610351562,
	"epoch": 0.08685714285714285,
	"grad_norm": 0.016482684761285782,
	"kl": 0.00044155120849609375,
	"learning_rate": 8.926922383915315e-07,
	"loss": 0.0131,
	"reward": -0.14668704383075237,
	"reward_std": 0.4973195120692253,
	"rewards/cosine_scaled_reward": -0.28167686983942986,
	"rewards/format_reward": 0.4166666716337204,
	"step": 152
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2524.2916717529297,
	"epoch": 0.08742857142857142,
	"grad_norm": 0.01825847662985325,
	"kl": 0.0002818107604980469,
	"learning_rate": 8.906477750432903e-07,
	"loss": -0.0077,
	"reward": 0.5729860588908195,
	"reward_std": 0.6833535395562649,
	"rewards/cosine_scaled_reward": -0.005173638463020325,
	"rewards/format_reward": 0.5833333358168602,
	"step": 153
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3422.1250610351562,
	"epoch": 0.088,
	"grad_norm": 0.019472761079669,
	"kl": 0.0004367828369140625,
	"learning_rate": 8.88586709003076e-07,
	"loss": 0.097,
	"reward": -0.15317635610699654,
	"reward_std": 0.7526141926646233,
	"rewards/cosine_scaled_reward": -0.15992150828242302,
	"rewards/format_reward": 0.1666666716337204,
	"step": 154
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3454.375,
	"epoch": 0.08857142857142856,
	"grad_norm": 0.010583397001028061,
	"kl": 0.00032520294189453125,
	"learning_rate": 8.865091407243394e-07,
	"loss": 0.033,
	"reward": -0.4739493057131767,
	"reward_std": 0.42491842061281204,
	"rewards/cosine_scaled_reward": -0.2786413189023733,
	"rewards/format_reward": 0.0833333358168602,
	"step": 155
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1829.5000610351562,
	"epoch": 0.08914285714285715,
	"grad_norm": 0.027142049744725227,
	"kl": 0.0004248619079589844,
	"learning_rate": 8.844151714648274e-07,
	"loss": 0.1219,
	"reward": 0.6520796567201614,
	"reward_std": 0.9076523296535015,
	"rewards/cosine_scaled_reward": -0.06979351304471493,
	"rewards/format_reward": 0.7916666865348816,
	"step": 156
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2761.4166870117188,
	"epoch": 0.08971428571428572,
	"grad_norm": 0.03456060215830803,
	"kl": 0.0031185150146484375,
	"learning_rate": 8.823049032816478e-07,
	"loss": 0.0636,
	"reward": -0.12644078209996223,
	"reward_std": 0.47517139464616776,
	"rewards/cosine_scaled_reward": -0.27155373618006706,
	"rewards/format_reward": 0.4166666716337204,
	"step": 157
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2967.3333740234375,
	"epoch": 0.09028571428571429,
	"grad_norm": 0.018295863643288612,
	"kl": 0.00030517578125,
	"learning_rate": 8.801784390262943e-07,
	"loss": 0.1293,
	"reward": 0.7170794606208801,
	"reward_std": 1.2831790447235107,
	"rewards/cosine_scaled_reward": 0.02520638657733798,
	"rewards/format_reward": 0.6666666939854622,
	"step": 158
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3502.5833740234375,
	"epoch": 0.09085714285714286,
	"grad_norm": 0.013578574173152447,
	"kl": 0.00040340423583984375,
	"learning_rate": 8.780358823396352e-07,
	"loss": 0.032,
	"reward": -0.382570318877697,
	"reward_std": 0.30739978328347206,
	"rewards/cosine_scaled_reward": -0.2537851668894291,
	"rewards/format_reward": 0.1250000037252903,
	"step": 159
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3438.4583740234375,
	"epoch": 0.09142857142857143,
	"grad_norm": 0.013279566541314125,
	"kl": 0.0003542900085449219,
	"learning_rate": 8.758773376468604e-07,
	"loss": 0.0503,
	"reward": 0.07213277881965041,
	"reward_std": 0.7591742426156998,
	"rewards/cosine_scaled_reward": -0.08893361687660217,
	"rewards/format_reward": 0.2500000037252903,
	"step": 160
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3217.7083740234375,
	"epoch": 0.092,
	"grad_norm": 0.020369982346892357,
	"kl": 0.00045013427734375,
	"learning_rate": 8.737029101523929e-07,
	"loss": -0.0123,
	"reward": 0.045274198055267334,
	"reward_std": 0.8290613554418087,
	"rewards/cosine_scaled_reward": -0.16486290469765663,
	"rewards/format_reward": 0.3750000111758709,
	"step": 161
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2882.9583740234375,
	"epoch": 0.09257142857142857,
	"grad_norm": 0.015230800956487656,
	"kl": 0.0003082752227783203,
	"learning_rate": 8.715127058347614e-07,
	"loss": 0.0195,
	"reward": 0.22908502910286188,
	"reward_std": 0.6315614283084869,
	"rewards/cosine_scaled_reward": -0.093790827319026,
	"rewards/format_reward": 0.4166666716337204,
	"step": 162
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3374.041748046875,
	"epoch": 0.09314285714285714,
	"grad_norm": 0.023900238797068596,
	"kl": 0.0012507438659667969,
	"learning_rate": 8.693068314414344e-07,
	"loss": 0.1131,
	"reward": -0.4366554766893387,
	"reward_std": 0.31389016658067703,
	"rewards/cosine_scaled_reward": -0.25999439880251884,
	"rewards/format_reward": 0.0833333358168602,
	"step": 163
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2721.291748046875,
	"epoch": 0.09371428571428571,
	"grad_norm": 0.012770951725542545,
	"kl": 0.00031685829162597656,
	"learning_rate": 8.670853944836176e-07,
	"loss": -0.0522,
	"reward": 0.07291282713413239,
	"reward_std": 0.6898190379142761,
	"rewards/cosine_scaled_reward": -0.2552102580666542,
	"rewards/format_reward": 0.5833333395421505,
	"step": 164
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3518.3333740234375,
	"epoch": 0.09428571428571429,
	"grad_norm": 0.01447351835668087,
	"kl": 0.0003514289855957031,
	"learning_rate": 8.648485032310144e-07,
	"loss": 0.0387,
	"reward": -0.5118223652243614,
	"reward_std": 0.4116092287003994,
	"rewards/cosine_scaled_reward": -0.2767445221543312,
	"rewards/format_reward": 0.0416666679084301,
	"step": 165
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.09485714285714286,
	"grad_norm": 0.01175595261156559,
	"kl": 0.0004019737243652344,
	"learning_rate": 8.625962667065487e-07,
	"loss": 0.0,
	"reward": -0.8086595237255096,
	"reward_std": 0.14668290875852108,
	"rewards/cosine_scaled_reward": -0.4043297544121742,
	"rewards/format_reward": 0.0,
	"step": 166
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2145.7916870117188,
	"epoch": 0.09542857142857143,
	"grad_norm": 0.033467333763837814,
	"kl": 0.0003383159637451172,
	"learning_rate": 8.603287946810513e-07,
	"loss": 0.1774,
	"reward": 1.582944918423891,
	"reward_std": 0.9087323695421219,
	"rewards/cosine_scaled_reward": 0.3748057931661606,
	"rewards/format_reward": 0.833333358168602,
	"step": 167
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3014.4583740234375,
	"epoch": 0.096,
	"grad_norm": 0.015946704894304276,
	"kl": 0.00036334991455078125,
	"learning_rate": 8.580461976679099e-07,
	"loss": 0.1448,
	"reward": -0.07501981779932976,
	"reward_std": 0.6305944435298443,
	"rewards/cosine_scaled_reward": -0.20417658984661102,
	"rewards/format_reward": 0.3333333432674408,
	"step": 168
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2566.9584197998047,
	"epoch": 0.09657142857142857,
	"grad_norm": 0.017763182520866394,
	"kl": 0.00031757354736328125,
	"learning_rate": 8.557485869176825e-07,
	"loss": -0.056,
	"reward": 0.9450984001159668,
	"reward_std": 1.3784255981445312,
	"rewards/cosine_scaled_reward": 0.18088253866881132,
	"rewards/format_reward": 0.5833333544433117,
	"step": 169
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3493.8333740234375,
	"epoch": 0.09714285714285714,
	"grad_norm": 0.012164680287241936,
	"kl": 0.0003337860107421875,
	"learning_rate": 8.534360744126753e-07,
	"loss": 0.0137,
	"reward": -0.02268831804394722,
	"reward_std": 0.5062807034701109,
	"rewards/cosine_scaled_reward": -0.07384415343403816,
	"rewards/format_reward": 0.1250000037252903,
	"step": 170
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2499.9583740234375,
	"epoch": 0.09771428571428571,
	"grad_norm": 0.016273025423288345,
	"kl": 0.0003085136413574219,
	"learning_rate": 8.511087728614862e-07,
	"loss": 0.0939,
	"reward": 0.18185213347896934,
	"reward_std": 0.5238135792315006,
	"rewards/cosine_scaled_reward": -0.1382406111806631,
	"rewards/format_reward": 0.4583333544433117,
	"step": 171
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2753.5833740234375,
	"epoch": 0.09828571428571428,
	"grad_norm": 0.02729148045182228,
	"kl": 0.00043773651123046875,
	"learning_rate": 8.487667956935087e-07,
	"loss": 0.2349,
	"reward": -0.1761530265212059,
	"reward_std": 0.31285534240305424,
	"rewards/cosine_scaled_reward": -0.27557652816176414,
	"rewards/format_reward": 0.3750000149011612,
	"step": 172
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2625.7916870117188,
	"epoch": 0.09885714285714285,
	"grad_norm": 0.020237158983945847,
	"kl": 0.0004603862762451172,
	"learning_rate": 8.464102570534061e-07,
	"loss": 0.1532,
	"reward": 0.3476742703933269,
	"reward_std": 1.086041659116745,
	"rewards/cosine_scaled_reward": -0.11782953515648842,
	"rewards/format_reward": 0.5833333544433117,
	"step": 173
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3466.7083740234375,
	"epoch": 0.09942857142857142,
	"grad_norm": 0.013751998543739319,
	"kl": 0.000400543212890625,
	"learning_rate": 8.440392717955475e-07,
	"loss": 0.0416,
	"reward": -0.5668784528970718,
	"reward_std": 0.2918172590434551,
	"rewards/cosine_scaled_reward": -0.3459392338991165,
	"rewards/format_reward": 0.1250000037252903,
	"step": 174
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2285.4166870117188,
	"epoch": 0.1,
	"grad_norm": 0.044584471732378006,
	"kl": 0.0002956390380859375,
	"learning_rate": 8.416539554784089e-07,
	"loss": 0.2817,
	"reward": 1.0394483506679535,
	"reward_std": 0.9922515600919724,
	"rewards/cosine_scaled_reward": 0.20722418278455734,
	"rewards/format_reward": 0.6250000149011612,
	"step": 175
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2696.000030517578,
	"epoch": 0.10057142857142858,
	"grad_norm": 0.02319377101957798,
	"kl": 0.000415802001953125,
	"learning_rate": 8.392544243589427e-07,
	"loss": 0.0886,
	"reward": 0.3795542363077402,
	"reward_std": 0.7756792306900024,
	"rewards/cosine_scaled_reward": -0.060222890228033066,
	"rewards/format_reward": 0.5000000223517418,
	"step": 176
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2909.4166870117188,
	"epoch": 0.10114285714285715,
	"grad_norm": 0.03342346474528313,
	"kl": 0.0003414154052734375,
	"learning_rate": 8.368407953869103e-07,
	"loss": 0.1555,
	"reward": 0.2533244490623474,
	"reward_std": 0.6474116146564484,
	"rewards/cosine_scaled_reward": -0.04000448310398497,
	"rewards/format_reward": 0.3333333432674408,
	"step": 177
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3399.6250610351562,
	"epoch": 0.10171428571428572,
	"grad_norm": 0.01095277164131403,
	"kl": 0.0003046989440917969,
	"learning_rate": 8.344131861991828e-07,
	"loss": 0.0506,
	"reward": -0.01267234981060028,
	"reward_std": 0.6906884871423244,
	"rewards/cosine_scaled_reward": -0.11050283908843994,
	"rewards/format_reward": 0.2083333395421505,
	"step": 178
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2949.000030517578,
	"epoch": 0.10228571428571429,
	"grad_norm": 0.012322952039539814,
	"kl": 0.0004057884216308594,
	"learning_rate": 8.319717151140072e-07,
	"loss": 0.1078,
	"reward": -0.04895609989762306,
	"reward_std": 0.553108676103875,
	"rewards/cosine_scaled_reward": -0.17031139694154263,
	"rewards/format_reward": 0.2916666679084301,
	"step": 179
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2829.7083740234375,
	"epoch": 0.10285714285714286,
	"grad_norm": 0.029738230630755424,
	"kl": 0.0003414154052734375,
	"learning_rate": 8.295165011252396e-07,
	"loss": 0.0411,
	"reward": 0.3339508920907974,
	"reward_std": 0.8846062198281288,
	"rewards/cosine_scaled_reward": -0.04135786276310682,
	"rewards/format_reward": 0.4166666716337204,
	"step": 180
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3196.7083740234375,
	"epoch": 0.10342857142857143,
	"grad_norm": 0.018469586968421936,
	"kl": 0.0003407001495361328,
	"learning_rate": 8.270476638965461e-07,
	"loss": 0.0479,
	"reward": 0.06904555298388004,
	"reward_std": 0.5129038351587951,
	"rewards/cosine_scaled_reward": -0.13214393705129623,
	"rewards/format_reward": 0.3333333432674408,
	"step": 181
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3226.416748046875,
	"epoch": 0.104,
	"grad_norm": 0.018400847911834717,
	"kl": 0.00044155120849609375,
	"learning_rate": 8.245653237555705e-07,
	"loss": 0.057,
	"reward": -0.3992947228252888,
	"reward_std": 0.5363226048648357,
	"rewards/cosine_scaled_reward": -0.30381404608488083,
	"rewards/format_reward": 0.2083333395421505,
	"step": 182
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2809.4583740234375,
	"epoch": 0.10457142857142857,
	"grad_norm": 0.015173462219536304,
	"kl": 0.0004189014434814453,
	"learning_rate": 8.220696016880687e-07,
	"loss": 0.0059,
	"reward": -0.08883734792470932,
	"reward_std": 0.4826326109468937,
	"rewards/cosine_scaled_reward": -0.27358534932136536,
	"rewards/format_reward": 0.4583333432674408,
	"step": 183
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2285.250030517578,
	"epoch": 0.10514285714285715,
	"grad_norm": 0.01784459501504898,
	"kl": 0.0003428459167480469,
	"learning_rate": 8.195606193320136e-07,
	"loss": 0.0369,
	"reward": 0.36116717755794525,
	"reward_std": 0.7178683169186115,
	"rewards/cosine_scaled_reward": -0.06941639818251133,
	"rewards/format_reward": 0.5000000111758709,
	"step": 184
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2192.3333740234375,
	"epoch": 0.10571428571428572,
	"grad_norm": 0.021139826625585556,
	"kl": 0.0005130767822265625,
	"learning_rate": 8.170384989716657e-07,
	"loss": 0.1298,
	"reward": -0.057670027017593384,
	"reward_std": 0.41697440296411514,
	"rewards/cosine_scaled_reward": -0.2996683418750763,
	"rewards/format_reward": 0.5416666679084301,
	"step": 185
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2678.541748046875,
	"epoch": 0.10628571428571429,
	"grad_norm": 0.016024595126509666,
	"kl": 0.0003867149353027344,
	"learning_rate": 8.145033635316128e-07,
	"loss": 0.0501,
	"reward": 0.8263338319957256,
	"reward_std": 0.7537258118391037,
	"rewards/cosine_scaled_reward": 0.14233355224132538,
	"rewards/format_reward": 0.5416666716337204,
	"step": 186
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3467.2083740234375,
	"epoch": 0.10685714285714286,
	"grad_norm": 0.02122497744858265,
	"kl": 0.00041675567626953125,
	"learning_rate": 8.119553365707802e-07,
	"loss": 0.0488,
	"reward": -0.3330922797322273,
	"reward_std": 0.4953814512118697,
	"rewards/cosine_scaled_reward": -0.2290461454540491,
	"rewards/format_reward": 0.1250000037252903,
	"step": 187
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3197.7083740234375,
	"epoch": 0.10742857142857143,
	"grad_norm": 0.015642890706658363,
	"kl": 0.000370025634765625,
	"learning_rate": 8.093945422764069e-07,
	"loss": 0.0347,
	"reward": 0.1139075756072998,
	"reward_std": 1.0698014255613089,
	"rewards/cosine_scaled_reward": -0.1097128726541996,
	"rewards/format_reward": 0.3333333469927311,
	"step": 188
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3420.2083740234375,
	"epoch": 0.108,
	"grad_norm": 0.016403522342443466,
	"kl": 0.0003390312194824219,
	"learning_rate": 8.068211054579943e-07,
	"loss": 0.0476,
	"reward": -0.16637829318642616,
	"reward_std": 0.6985251531004906,
	"rewards/cosine_scaled_reward": -0.20818914845585823,
	"rewards/format_reward": 0.2500000074505806,
	"step": 189
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2964.5833740234375,
	"epoch": 0.10857142857142857,
	"grad_norm": 0.015617966651916504,
	"kl": 0.0003600120544433594,
	"learning_rate": 8.04235151541222e-07,
	"loss": 0.0077,
	"reward": -0.2868319842964411,
	"reward_std": 0.26455265283584595,
	"rewards/cosine_scaled_reward": -0.2892493214458227,
	"rewards/format_reward": 0.2916666679084301,
	"step": 190
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2810.2916717529297,
	"epoch": 0.10914285714285714,
	"grad_norm": 0.019974973052740097,
	"kl": 0.0004630088806152344,
	"learning_rate": 8.01636806561836e-07,
	"loss": 0.066,
	"reward": -0.06471758894622326,
	"reward_std": 0.3940250463783741,
	"rewards/cosine_scaled_reward": -0.26152546517550945,
	"rewards/format_reward": 0.4583333358168602,
	"step": 191
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2830.5416717529297,
	"epoch": 0.10971428571428571,
	"grad_norm": 0.016707738861441612,
	"kl": 0.0004711151123046875,
	"learning_rate": 7.990261971595048e-07,
	"loss": 0.0331,
	"reward": 0.1446765586733818,
	"reward_std": 0.1306634098291397,
	"rewards/cosine_scaled_reward": -0.052661728113889694,
	"rewards/format_reward": 0.25,
	"step": 192
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3578.6666870117188,
	"epoch": 0.11028571428571429,
	"grad_norm": 0.01071973703801632,
	"kl": 0.0003151893615722656,
	"learning_rate": 7.964034505716476e-07,
	"loss": 0.0019,
	"reward": -0.34736843407154083,
	"reward_std": 0.33792266994714737,
	"rewards/cosine_scaled_reward": -0.23618422076106071,
	"rewards/format_reward": 0.1250000037252903,
	"step": 193
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2222.666732788086,
	"epoch": 0.11085714285714286,
	"grad_norm": 0.05484706535935402,
	"kl": 0.0006213188171386719,
	"learning_rate": 7.93768694627233e-07,
	"loss": 0.0984,
	"reward": 0.5483606606721878,
	"reward_std": 1.1607089042663574,
	"rewards/cosine_scaled_reward": -0.01748633268289268,
	"rewards/format_reward": 0.5833333432674408,
	"step": 194
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2849.8334045410156,
	"epoch": 0.11142857142857143,
	"grad_norm": 0.021573448553681374,
	"kl": 0.0005321502685546875,
	"learning_rate": 7.911220577405484e-07,
	"loss": -0.0482,
	"reward": -0.2685957998037338,
	"reward_std": 0.45445217937231064,
	"rewards/cosine_scaled_reward": -0.3426312282681465,
	"rewards/format_reward": 0.4166666716337204,
	"step": 195
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2637.9166870117188,
	"epoch": 0.112,
	"grad_norm": 0.021516846492886543,
	"kl": 0.0003535747528076172,
	"learning_rate": 7.884636689049422e-07,
	"loss": -0.0182,
	"reward": 0.022181347012519836,
	"reward_std": 0.4800866097211838,
	"rewards/cosine_scaled_reward": -0.15557599812746048,
	"rewards/format_reward": 0.3333333358168602,
	"step": 196
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3041.875,
	"epoch": 0.11257142857142857,
	"grad_norm": 0.01381740253418684,
	"kl": 0.0003528594970703125,
	"learning_rate": 7.857936576865356e-07,
	"loss": -0.0073,
	"reward": -0.33803367614746094,
	"reward_std": 0.4302855357527733,
	"rewards/cosine_scaled_reward": -0.29401686880737543,
	"rewards/format_reward": 0.25,
	"step": 197
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2317.8333892822266,
	"epoch": 0.11314285714285714,
	"grad_norm": 0.03078615479171276,
	"kl": 0.0003762245178222656,
	"learning_rate": 7.831121542179086e-07,
	"loss": 0.0924,
	"reward": 0.597826175391674,
	"reward_std": 0.8695001602172852,
	"rewards/cosine_scaled_reward": 0.007246408611536026,
	"rewards/format_reward": 0.5833333469927311,
	"step": 198
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3303.9583740234375,
	"epoch": 0.11371428571428571,
	"grad_norm": 0.019953317940235138,
	"kl": 0.0003972053527832031,
	"learning_rate": 7.804192891917571e-07,
	"loss": 0.0989,
	"reward": 0.565569007769227,
	"reward_std": 0.5124572534114122,
	"rewards/cosine_scaled_reward": 0.07445115875452757,
	"rewards/format_reward": 0.4166666716337204,
	"step": 199
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2750.8333435058594,
	"epoch": 0.11428571428571428,
	"grad_norm": 0.05455951392650604,
	"kl": 0.0007038116455078125,
	"learning_rate": 7.777151938545235e-07,
	"loss": 0.1193,
	"reward": -0.07477065362036228,
	"reward_std": 0.8196274563670158,
	"rewards/cosine_scaled_reward": -0.24571867287158966,
	"rewards/format_reward": 0.4166666716337204,
	"step": 200
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2873.4166870117188,
	"epoch": 0.11485714285714285,
	"grad_norm": 0.06070086359977722,
	"kl": 0.0005712509155273438,
	"learning_rate": 7.75e-07,
	"loss": 0.2315,
	"reward": -0.16712947003543377,
	"reward_std": 0.3942112438380718,
	"rewards/cosine_scaled_reward": -0.22939807549118996,
	"rewards/format_reward": 0.2916666716337204,
	"step": 201
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2792.7083740234375,
	"epoch": 0.11542857142857142,
	"grad_norm": 0.02964051626622677,
	"kl": 0.0003209114074707031,
	"learning_rate": 7.72273839962904e-07,
	"loss": 0.0298,
	"reward": 0.44592857360839844,
	"reward_std": 0.7889965083450079,
	"rewards/cosine_scaled_reward": 0.05629761889576912,
	"rewards/format_reward": 0.3333333358168602,
	"step": 202
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3046.7500610351562,
	"epoch": 0.116,
	"grad_norm": 0.018877137452363968,
	"kl": 0.0003161430358886719,
	"learning_rate": 7.695368466124296e-07,
	"loss": 0.0529,
	"reward": 0.408734455704689,
	"reward_std": 0.6842712201178074,
	"rewards/cosine_scaled_reward": -0.0247994652017951,
	"rewards/format_reward": 0.4583333544433117,
	"step": 203
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2634.5833435058594,
	"epoch": 0.11657142857142858,
	"grad_norm": 0.02069164253771305,
	"kl": 0.0003864765167236328,
	"learning_rate": 7.667891533457718e-07,
	"loss": 0.0833,
	"reward": -0.21086983382701874,
	"reward_std": 0.22330690175294876,
	"rewards/cosine_scaled_reward": -0.2929349225014448,
	"rewards/format_reward": 0.375,
	"step": 204
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2709.7083435058594,
	"epoch": 0.11714285714285715,
	"grad_norm": 0.023381751030683517,
	"kl": 0.000301361083984375,
	"learning_rate": 7.640308940816239e-07,
	"loss": 0.0675,
	"reward": 0.27661415934562683,
	"reward_std": 0.6943130940198898,
	"rewards/cosine_scaled_reward": -0.09085960499942303,
	"rewards/format_reward": 0.4583333432674408,
	"step": 205
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3047.625030517578,
	"epoch": 0.11771428571428572,
	"grad_norm": 0.02151002548635006,
	"kl": 0.0004744529724121094,
	"learning_rate": 7.612622032536507e-07,
	"loss": -0.0196,
	"reward": -0.10782860964536667,
	"reward_std": 0.5117022879421711,
	"rewards/cosine_scaled_reward": -0.19974764343351126,
	"rewards/format_reward": 0.2916666679084301,
	"step": 206
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2933.7916870117188,
	"epoch": 0.11828571428571429,
	"grad_norm": 0.0185316763818264,
	"kl": 0.0003197193145751953,
	"learning_rate": 7.584832158039378e-07,
	"loss": 0.0325,
	"reward": 0.4302789755165577,
	"reward_std": 1.0025385729968548,
	"rewards/cosine_scaled_reward": 0.006806140765547752,
	"rewards/format_reward": 0.4166666679084301,
	"step": 207
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2270.75,
	"epoch": 0.11885714285714286,
	"grad_norm": 0.013073590584099293,
	"kl": 0.00028967857360839844,
	"learning_rate": 7.556940671764124e-07,
	"loss": -0.0037,
	"reward": 0.8428396731615067,
	"reward_std": 0.33612318709492683,
	"rewards/cosine_scaled_reward": 0.17141985148191452,
	"rewards/format_reward": 0.5,
	"step": 208
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2534.0834197998047,
	"epoch": 0.11942857142857143,
	"grad_norm": 0.04636608809232712,
	"kl": 0.0002626180648803711,
	"learning_rate": 7.528948933102438e-07,
	"loss": 0.1077,
	"reward": 0.41008343175053596,
	"reward_std": 0.38527223095297813,
	"rewards/cosine_scaled_reward": -0.003291614353656769,
	"rewards/format_reward": 0.4166666716337204,
	"step": 209
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2664.375030517578,
	"epoch": 0.12,
	"grad_norm": 0.06337418407201767,
	"kl": 0.0005369186401367188,
	"learning_rate": 7.500858306332172e-07,
	"loss": 0.1954,
	"reward": -0.11931078508496284,
	"reward_std": 0.4521569199860096,
	"rewards/cosine_scaled_reward": -0.2679887441918254,
	"rewards/format_reward": 0.4166666828095913,
	"step": 210
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2690.75,
	"epoch": 0.12057142857142857,
	"grad_norm": 0.02364625595510006,
	"kl": 0.0003635883331298828,
	"learning_rate": 7.472670160550848e-07,
	"loss": 0.1012,
	"reward": 0.4544009678065777,
	"reward_std": 0.9030434042215347,
	"rewards/cosine_scaled_reward": 0.018867140635848045,
	"rewards/format_reward": 0.4166666679084301,
	"step": 211
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3231.4584350585938,
	"epoch": 0.12114285714285715,
	"grad_norm": 0.02378404326736927,
	"kl": 0.0004596710205078125,
	"learning_rate": 7.444385869608921e-07,
	"loss": 0.126,
	"reward": 0.3596238009631634,
	"reward_std": 1.0245484188199043,
	"rewards/cosine_scaled_reward": 0.03397855442017317,
	"rewards/format_reward": 0.2916666679084301,
	"step": 212
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2259.166732788086,
	"epoch": 0.12171428571428572,
	"grad_norm": 0.0168690737336874,
	"kl": 0.000362396240234375,
	"learning_rate": 7.416006812042827e-07,
	"loss": 0.0384,
	"reward": 0.7057138048112392,
	"reward_std": 1.0122921094298363,
	"rewards/cosine_scaled_reward": 0.04035688715521246,
	"rewards/format_reward": 0.6250000037252903,
	"step": 213
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2541.250045776367,
	"epoch": 0.12228571428571429,
	"grad_norm": 0.029833870008587837,
	"kl": 0.0004019737243652344,
	"learning_rate": 7.387534371007797e-07,
	"loss": 0.1661,
	"reward": 0.027080008760094643,
	"reward_std": 0.8045615777373314,
	"rewards/cosine_scaled_reward": -0.19479333609342575,
	"rewards/format_reward": 0.4166666716337204,
	"step": 214
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2826.3333740234375,
	"epoch": 0.12285714285714286,
	"grad_norm": 0.01670445129275322,
	"kl": 0.0003113746643066406,
	"learning_rate": 7.358969934210438e-07,
	"loss": 0.0458,
	"reward": -0.15282147377729416,
	"reward_std": 0.6320948153734207,
	"rewards/cosine_scaled_reward": -0.28474406246095896,
	"rewards/format_reward": 0.4166666679084301,
	"step": 215
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1534.750015258789,
	"epoch": 0.12342857142857143,
	"grad_norm": 0.028196675702929497,
	"kl": 0.0003743171691894531,
	"learning_rate": 7.330314893841101e-07,
	"loss": 0.173,
	"reward": 1.9182523787021637,
	"reward_std": 0.8897372838109732,
	"rewards/cosine_scaled_reward": 0.5007927343249321,
	"rewards/format_reward": 0.9166666716337204,
	"step": 216
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3460.5416870117188,
	"epoch": 0.124,
	"grad_norm": 0.012388592585921288,
	"kl": 0.00035881996154785156,
	"learning_rate": 7.301570646506027e-07,
	"loss": 0.0433,
	"reward": -0.15055294707417488,
	"reward_std": 0.5311995670199394,
	"rewards/cosine_scaled_reward": -0.137776467949152,
	"rewards/format_reward": 0.1250000037252903,
	"step": 217
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2624.8334045410156,
	"epoch": 0.12457142857142857,
	"grad_norm": 0.03056301549077034,
	"kl": 0.00043702125549316406,
	"learning_rate": 7.27273859315928e-07,
	"loss": 0.2376,
	"reward": 0.47333015874028206,
	"reward_std": 0.7044993788003922,
	"rewards/cosine_scaled_reward": -0.034168269485235214,
	"rewards/format_reward": 0.5416666865348816,
	"step": 218
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2094.5834045410156,
	"epoch": 0.12514285714285714,
	"grad_norm": 0.05004338175058365,
	"kl": 0.0005245208740234375,
	"learning_rate": 7.243820139034464e-07,
	"loss": 0.1989,
	"reward": 0.7468737373128533,
	"reward_std": 0.8828459084033966,
	"rewards/cosine_scaled_reward": 0.019270192831754684,
	"rewards/format_reward": 0.7083333432674408,
	"step": 219
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2917.791748046875,
	"epoch": 0.12571428571428572,
	"grad_norm": 0.01651514135301113,
	"kl": 0.0003859996795654297,
	"learning_rate": 7.214816693576234e-07,
	"loss": 0.1265,
	"reward": 0.40257575549185276,
	"reward_std": 0.8734332285821438,
	"rewards/cosine_scaled_reward": -0.04871212877333164,
	"rewards/format_reward": 0.5000000149011612,
	"step": 220
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1825.2500457763672,
	"epoch": 0.12628571428571428,
	"grad_norm": 0.022186581045389175,
	"kl": 0.0004601478576660156,
	"learning_rate": 7.185729670371604e-07,
	"loss": -0.0246,
	"reward": 0.5106580704450607,
	"reward_std": 0.6262499652802944,
	"rewards/cosine_scaled_reward": -0.11967097967863083,
	"rewards/format_reward": 0.75,
	"step": 221
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1936.2917098999023,
	"epoch": 0.12685714285714286,
	"grad_norm": 0.025714771822094917,
	"kl": 0.0005955696105957031,
	"learning_rate": 7.156560487081051e-07,
	"loss": -0.0667,
	"reward": 0.5173665434122086,
	"reward_std": 0.4710990320891142,
	"rewards/cosine_scaled_reward": -0.11631673201918602,
	"rewards/format_reward": 0.75,
	"step": 222
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3248.3333740234375,
	"epoch": 0.12742857142857142,
	"grad_norm": 0.014303294010460377,
	"kl": 0.00043010711669921875,
	"learning_rate": 7.127310565369415e-07,
	"loss": 0.0878,
	"reward": -0.4517449364066124,
	"reward_std": 0.20166658982634544,
	"rewards/cosine_scaled_reward": -0.3092058040201664,
	"rewards/format_reward": 0.1666666716337204,
	"step": 223
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3256.8333740234375,
	"epoch": 0.128,
	"grad_norm": 0.01596376858651638,
	"kl": 0.000354766845703125,
	"learning_rate": 7.097981330836616e-07,
	"loss": 0.0683,
	"reward": 0.5998236387968063,
	"reward_std": 0.8237827308475971,
	"rewards/cosine_scaled_reward": 0.11241178959608078,
	"rewards/format_reward": 0.3750000149011612,
	"step": 224
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2851.2083740234375,
	"epoch": 0.12857142857142856,
	"grad_norm": 0.017591828480362892,
	"kl": 0.0004096031188964844,
	"learning_rate": 7.068574212948169e-07,
	"loss": 0.0511,
	"reward": 0.10140763968229294,
	"reward_std": 0.5019327104091644,
	"rewards/cosine_scaled_reward": -0.11596284806728363,
	"rewards/format_reward": 0.3333333358168602,
	"step": 225
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2292.000030517578,
	"epoch": 0.12914285714285714,
	"grad_norm": 0.032970964908599854,
	"kl": 0.00045108795166015625,
	"learning_rate": 7.039090644965509e-07,
	"loss": 0.1386,
	"reward": 0.4276478886604309,
	"reward_std": 0.7602110169827938,
	"rewards/cosine_scaled_reward": -0.057009367272257805,
	"rewards/format_reward": 0.541666679084301,
	"step": 226
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2093.6250610351562,
	"epoch": 0.12971428571428573,
	"grad_norm": 0.015598502941429615,
	"kl": 0.0003306865692138672,
	"learning_rate": 7.009532063876148e-07,
	"loss": 0.1292,
	"reward": 0.6084302365779877,
	"reward_std": 0.6128123812377453,
	"rewards/cosine_scaled_reward": -0.008284901501610875,
	"rewards/format_reward": 0.625,
	"step": 227
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1409.5833587646484,
	"epoch": 0.13028571428571428,
	"grad_norm": 0.03161853179335594,
	"kl": 0.0005328655242919922,
	"learning_rate": 6.979899910323624e-07,
	"loss": 0.0405,
	"reward": 0.6997113339602947,
	"reward_std": 0.7971856854856014,
	"rewards/cosine_scaled_reward": -0.04597766697406769,
	"rewards/format_reward": 0.7916666679084301,
	"step": 228
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2726.4166870117188,
	"epoch": 0.13085714285714287,
	"grad_norm": 0.03210078924894333,
	"kl": 0.00038051605224609375,
	"learning_rate": 6.950195628537299e-07,
	"loss": 0.0564,
	"reward": 0.28624797612428665,
	"reward_std": 0.706984382122755,
	"rewards/cosine_scaled_reward": -0.06520934589207172,
	"rewards/format_reward": 0.4166666679084301,
	"step": 229
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3351.7083740234375,
	"epoch": 0.13142857142857142,
	"grad_norm": 0.020253852009773254,
	"kl": 0.0004558563232421875,
	"learning_rate": 6.920420666261961e-07,
	"loss": 0.0537,
	"reward": 0.20329816453158855,
	"reward_std": 0.920021902769804,
	"rewards/cosine_scaled_reward": -0.04418425913900137,
	"rewards/format_reward": 0.2916666753590107,
	"step": 230
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3248.7916870117188,
	"epoch": 0.132,
	"grad_norm": 0.027326960116624832,
	"kl": 0.0003800392150878906,
	"learning_rate": 6.890576474687263e-07,
	"loss": 0.0472,
	"reward": 0.02445869892835617,
	"reward_std": 0.4333410616964102,
	"rewards/cosine_scaled_reward": -0.07110398076474667,
	"rewards/format_reward": 0.1666666716337204,
	"step": 231
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2847.8333435058594,
	"epoch": 0.13257142857142856,
	"grad_norm": 0.017697490751743317,
	"kl": 0.0004267692565917969,
	"learning_rate": 6.860664508377001e-07,
	"loss": 0.0322,
	"reward": -0.29599685221910477,
	"reward_std": 0.34860160388052464,
	"rewards/cosine_scaled_reward": -0.29383176099509,
	"rewards/format_reward": 0.2916666679084301,
	"step": 232
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3569.7916870117188,
	"epoch": 0.13314285714285715,
	"grad_norm": 0.011569323018193245,
	"kl": 0.0003650188446044922,
	"learning_rate": 6.83068622519821e-07,
	"loss": 0.0082,
	"reward": -0.27050644531846046,
	"reward_std": 0.28310155123472214,
	"rewards/cosine_scaled_reward": -0.15608655102550983,
	"rewards/format_reward": 0.0416666679084301,
	"step": 233
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3336.3750610351562,
	"epoch": 0.1337142857142857,
	"grad_norm": 0.014362377114593983,
	"kl": 0.0003769397735595703,
	"learning_rate": 6.800643086250121e-07,
	"loss": 0.1096,
	"reward": 0.16344637423753738,
	"reward_std": 1.1342220231890678,
	"rewards/cosine_scaled_reward": -0.022443480789661407,
	"rewards/format_reward": 0.2083333358168602,
	"step": 234
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2698.5416717529297,
	"epoch": 0.13428571428571429,
	"grad_norm": 0.016733834519982338,
	"kl": 0.0003490447998046875,
	"learning_rate": 6.770536555792944e-07,
	"loss": 0.0561,
	"reward": 0.6713685989379883,
	"reward_std": 0.4799320735037327,
	"rewards/cosine_scaled_reward": 0.14818426966667175,
	"rewards/format_reward": 0.3750000037252903,
	"step": 235
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1749.2916717529297,
	"epoch": 0.13485714285714287,
	"grad_norm": 0.032998789101839066,
	"kl": 0.00046539306640625,
	"learning_rate": 6.740368101176495e-07,
	"loss": 0.0971,
	"reward": 0.6648036018013954,
	"reward_std": 0.8384054228663445,
	"rewards/cosine_scaled_reward": -0.10509821772575378,
	"rewards/format_reward": 0.8750000149011612,
	"step": 236
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1968.3333435058594,
	"epoch": 0.13542857142857143,
	"grad_norm": 0.02814081497490406,
	"kl": 0.0006561279296875,
	"learning_rate": 6.710139192768694e-07,
	"loss": 0.0855,
	"reward": 0.15130121633410454,
	"reward_std": 0.4565184935927391,
	"rewards/cosine_scaled_reward": -0.21601606532931328,
	"rewards/format_reward": 0.5833333358168602,
	"step": 237
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2913.3333435058594,
	"epoch": 0.136,
	"grad_norm": 0.017682794481515884,
	"kl": 0.0003323554992675781,
	"learning_rate": 6.679851303883891e-07,
	"loss": -0.0443,
	"reward": 0.043509919196367264,
	"reward_std": 0.8419067375361919,
	"rewards/cosine_scaled_reward": -0.18657837435603142,
	"rewards/format_reward": 0.4166666679084301,
	"step": 238
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3364.416748046875,
	"epoch": 0.13657142857142857,
	"grad_norm": 0.028049411252141,
	"kl": 0.0004544258117675781,
	"learning_rate": 6.649505910711058e-07,
	"loss": 0.1081,
	"reward": 0.21441528294235468,
	"reward_std": 1.0792418122291565,
	"rewards/cosine_scaled_reward": -0.03862569108605385,
	"rewards/format_reward": 0.2916666753590107,
	"step": 239
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2725.7083740234375,
	"epoch": 0.13714285714285715,
	"grad_norm": 0.046624064445495605,
	"kl": 0.0005068778991699219,
	"learning_rate": 6.619104492241847e-07,
	"loss": 0.2407,
	"reward": -0.037649777717888355,
	"reward_std": 0.7788064442574978,
	"rewards/cosine_scaled_reward": -0.22715822607278824,
	"rewards/format_reward": 0.4166666716337204,
	"step": 240
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2704.25,
	"epoch": 0.1377142857142857,
	"grad_norm": 0.03354218974709511,
	"kl": 0.00043392181396484375,
	"learning_rate": 6.588648530198504e-07,
	"loss": 0.027,
	"reward": 0.41115298634395003,
	"reward_std": 0.5296461880207062,
	"rewards/cosine_scaled_reward": -0.002756841480731964,
	"rewards/format_reward": 0.4166666679084301,
	"step": 241
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3075.8750610351562,
	"epoch": 0.1382857142857143,
	"grad_norm": 0.01612684689462185,
	"kl": 0.0004563331604003906,
	"learning_rate": 6.558139508961654e-07,
	"loss": 0.1636,
	"reward": -0.1777523159980774,
	"reward_std": 0.47497741878032684,
	"rewards/cosine_scaled_reward": -0.2138761579990387,
	"rewards/format_reward": 0.2500000037252903,
	"step": 242
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2829.500030517578,
	"epoch": 0.13885714285714285,
	"grad_norm": 0.022913776338100433,
	"kl": 0.00047969818115234375,
	"learning_rate": 6.527578915497951e-07,
	"loss": 0.1179,
	"reward": -0.3900511562824249,
	"reward_std": 0.19519304856657982,
	"rewards/cosine_scaled_reward": -0.34085891395807266,
	"rewards/format_reward": 0.2916666679084301,
	"step": 243
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3008.4166870117188,
	"epoch": 0.13942857142857143,
	"grad_norm": 0.017667554318904877,
	"kl": 0.0005576610565185547,
	"learning_rate": 6.496968239287603e-07,
	"loss": 0.1161,
	"reward": -0.07407113164663315,
	"reward_std": 0.9030572213232517,
	"rewards/cosine_scaled_reward": -0.22453556954860687,
	"rewards/format_reward": 0.3750000111758709,
	"step": 244
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2979.1250610351562,
	"epoch": 0.14,
	"grad_norm": 0.015229357406497002,
	"kl": 0.0004878044128417969,
	"learning_rate": 6.466308972251785e-07,
	"loss": 0.0894,
	"reward": 0.2635510638356209,
	"reward_std": 1.100995272397995,
	"rewards/cosine_scaled_reward": -0.05572447320446372,
	"rewards/format_reward": 0.3750000149011612,
	"step": 245
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2819.0416870117188,
	"epoch": 0.14057142857142857,
	"grad_norm": 0.044691551476716995,
	"kl": 0.00034236907958984375,
	"learning_rate": 6.435602608679916e-07,
	"loss": -0.046,
	"reward": 0.28528738766908646,
	"reward_std": 0.8157100006937981,
	"rewards/cosine_scaled_reward": -0.04485631617717445,
	"rewards/format_reward": 0.375,
	"step": 246
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3443.8750610351562,
	"epoch": 0.14114285714285715,
	"grad_norm": 0.011280644685029984,
	"kl": 0.00034046173095703125,
	"learning_rate": 6.404850645156841e-07,
	"loss": 0.0628,
	"reward": 0.2559690326452255,
	"reward_std": 1.1299069225788116,
	"rewards/cosine_scaled_reward": 0.00298450980335474,
	"rewards/format_reward": 0.2500000074505806,
	"step": 247
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1982.8333435058594,
	"epoch": 0.1417142857142857,
	"grad_norm": 0.051233626902103424,
	"kl": 0.0006239414215087891,
	"learning_rate": 6.374054580489873e-07,
	"loss": 0.1208,
	"reward": 0.47601281851530075,
	"reward_std": 0.8072874061763287,
	"rewards/cosine_scaled_reward": -0.09532693400979042,
	"rewards/format_reward": 0.6666666716337204,
	"step": 248
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3090.5000610351562,
	"epoch": 0.1422857142857143,
	"grad_norm": 0.011510615237057209,
	"kl": 0.0003566741943359375,
	"learning_rate": 6.343215915635761e-07,
	"loss": -0.0075,
	"reward": -0.1677398905158043,
	"reward_std": 0.7362043038010597,
	"rewards/cosine_scaled_reward": -0.271369943395257,
	"rewards/format_reward": 0.3750000037252903,
	"step": 249
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2831.6666870117188,
	"epoch": 0.14285714285714285,
	"grad_norm": 0.012945250608026981,
	"kl": 0.000339508056640625,
	"learning_rate": 6.31233615362752e-07,
	"loss": 0.0464,
	"reward": 0.08572675287723541,
	"reward_std": 0.4463801756501198,
	"rewards/cosine_scaled_reward": -0.10296999663114548,
	"rewards/format_reward": 0.2916666679084301,
	"step": 250
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2145.7084045410156,
	"epoch": 0.14342857142857143,
	"grad_norm": 0.02382800169289112,
	"kl": 0.00037860870361328125,
	"learning_rate": 6.281416799501187e-07,
	"loss": 0.1758,
	"reward": 0.016892731189727783,
	"reward_std": 0.4844237770885229,
	"rewards/cosine_scaled_reward": -0.2832203172147274,
	"rewards/format_reward": 0.5833333432674408,
	"step": 251
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2960.5416870117188,
	"epoch": 0.144,
	"grad_norm": 0.021183207631111145,
	"kl": 0.0004506111145019531,
	"learning_rate": 6.25045936022246e-07,
	"loss": 0.1659,
	"reward": -0.04246724583208561,
	"reward_std": 0.6862606927752495,
	"rewards/cosine_scaled_reward": -0.18790028244256973,
	"rewards/format_reward": 0.3333333395421505,
	"step": 252
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2839.4166870117188,
	"epoch": 0.14457142857142857,
	"grad_norm": 0.015497619286179543,
	"kl": 0.0003509521484375,
	"learning_rate": 6.219465344613258e-07,
	"loss": 0.0335,
	"reward": -0.19672805070877075,
	"reward_std": 0.37255218997597694,
	"rewards/cosine_scaled_reward": -0.24419735372066498,
	"rewards/format_reward": 0.2916666679084301,
	"step": 253
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2932.6666717529297,
	"epoch": 0.14514285714285713,
	"grad_norm": 0.014029218815267086,
	"kl": 0.0002608299255371094,
	"learning_rate": 6.188436263278172e-07,
	"loss": 0.1358,
	"reward": 0.564534567296505,
	"reward_std": 0.7931031864136457,
	"rewards/cosine_scaled_reward": 0.03226728364825249,
	"rewards/format_reward": 0.5000000074505806,
	"step": 254
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2817.3333740234375,
	"epoch": 0.1457142857142857,
	"grad_norm": 0.02292151190340519,
	"kl": 0.0003943443298339844,
	"learning_rate": 6.157373628530852e-07,
	"loss": 0.0033,
	"reward": 0.7924370467662811,
	"reward_std": 0.5020520761609077,
	"rewards/cosine_scaled_reward": 0.16705189645290375,
	"rewards/format_reward": 0.4583333432674408,
	"step": 255
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3312.5000610351562,
	"epoch": 0.1462857142857143,
	"grad_norm": 0.024594873189926147,
	"kl": 0.00034236907958984375,
	"learning_rate": 6.126278954320294e-07,
	"loss": 0.1007,
	"reward": -0.39764899387955666,
	"reward_std": 0.4253292456269264,
	"rewards/cosine_scaled_reward": -0.302991159260273,
	"rewards/format_reward": 0.2083333395421505,
	"step": 256
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2919.0833740234375,
	"epoch": 0.14685714285714285,
	"grad_norm": 0.026216557249426842,
	"kl": 0.0004639625549316406,
	"learning_rate": 6.095153756157051e-07,
	"loss": 0.2025,
	"reward": -0.20567850768566132,
	"reward_std": 0.6880289539694786,
	"rewards/cosine_scaled_reward": -0.2695059161633253,
	"rewards/format_reward": 0.3333333432674408,
	"step": 257
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2133.8333435058594,
	"epoch": 0.14742857142857144,
	"grad_norm": 0.03468165546655655,
	"kl": 0.0003616809844970703,
	"learning_rate": 6.06399955103937e-07,
	"loss": 0.1963,
	"reward": 0.30517828464508057,
	"reward_std": 0.593671128153801,
	"rewards/cosine_scaled_reward": -0.1390775376930833,
	"rewards/format_reward": 0.5833333358168602,
	"step": 258
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3469.2916870117188,
	"epoch": 0.148,
	"grad_norm": 0.02112772688269615,
	"kl": 0.0003986358642578125,
	"learning_rate": 6.032817857379256e-07,
	"loss": 0.0331,
	"reward": -0.01782984286546707,
	"reward_std": 0.8413332737982273,
	"rewards/cosine_scaled_reward": -0.09224824234843254,
	"rewards/format_reward": 0.1666666679084301,
	"step": 259
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3433.7083740234375,
	"epoch": 0.14857142857142858,
	"grad_norm": 0.014742922969162464,
	"kl": 0.000308990478515625,
	"learning_rate": 6.001610194928464e-07,
	"loss": 0.0383,
	"reward": 0.31882617622613907,
	"reward_std": 0.15408218186348677,
	"rewards/cosine_scaled_reward": 0.03441305831074715,
	"rewards/format_reward": 0.25,
	"step": 260
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3461.291748046875,
	"epoch": 0.14914285714285713,
	"grad_norm": 0.014058803208172321,
	"kl": 0.00042438507080078125,
	"learning_rate": 5.97037808470444e-07,
	"loss": 0.0671,
	"reward": -0.2942545488476753,
	"reward_std": 0.612834420055151,
	"rewards/cosine_scaled_reward": -0.2096272725611925,
	"rewards/format_reward": 0.1250000037252903,
	"step": 261
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3319.7083740234375,
	"epoch": 0.14971428571428572,
	"grad_norm": 0.013714558444917202,
	"kl": 0.0003371238708496094,
	"learning_rate": 5.939123048916173e-07,
	"loss": 0.104,
	"reward": 0.14693566597998142,
	"reward_std": 0.5481071509420872,
	"rewards/cosine_scaled_reward": -0.11403219401836395,
	"rewards/format_reward": 0.3750000149011612,
	"step": 262
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2869.166717529297,
	"epoch": 0.15028571428571427,
	"grad_norm": 0.04431964457035065,
	"kl": 0.0004782676696777344,
	"learning_rate": 5.907846610890011e-07,
	"loss": 0.2064,
	"reward": -0.06633574888110161,
	"reward_std": 0.24844567105174065,
	"rewards/cosine_scaled_reward": -0.17900121491402388,
	"rewards/format_reward": 0.291666679084301,
	"step": 263
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3500.7500610351562,
	"epoch": 0.15085714285714286,
	"grad_norm": 0.011211477220058441,
	"kl": 0.00043582916259765625,
	"learning_rate": 5.87655029499542e-07,
	"loss": 0.0411,
	"reward": -0.3998759835958481,
	"reward_std": 0.430798327550292,
	"rewards/cosine_scaled_reward": -0.24160464480519295,
	"rewards/format_reward": 0.0833333358168602,
	"step": 264
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3510.6250610351562,
	"epoch": 0.15142857142857144,
	"grad_norm": 0.015797875821590424,
	"kl": 0.0004782676696777344,
	"learning_rate": 5.845235626570683e-07,
	"loss": 0.0301,
	"reward": 0.03237064555287361,
	"reward_std": 0.7703893817961216,
	"rewards/cosine_scaled_reward": -0.06714800372719765,
	"rewards/format_reward": 0.1666666679084301,
	"step": 265
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2977.3333740234375,
	"epoch": 0.152,
	"grad_norm": 0.02703353762626648,
	"kl": 0.000396728515625,
	"learning_rate": 5.813904131848564e-07,
	"loss": 0.0458,
	"reward": -0.099398122751154,
	"reward_std": 0.3674992090091109,
	"rewards/cosine_scaled_reward": -0.21636574110016227,
	"rewards/format_reward": 0.3333333358168602,
	"step": 266
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2061.0833740234375,
	"epoch": 0.15257142857142858,
	"grad_norm": 0.0498061366379261,
	"kl": 0.0005326271057128906,
	"learning_rate": 5.78255733788191e-07,
	"loss": 0.1127,
	"reward": 0.3922936078161001,
	"reward_std": 0.6864228155463934,
	"rewards/cosine_scaled_reward": -0.13718653097748756,
	"rewards/format_reward": 0.6666666716337204,
	"step": 267
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2333.4583435058594,
	"epoch": 0.15314285714285714,
	"grad_norm": 0.028823453933000565,
	"kl": 0.0005085468292236328,
	"learning_rate": 5.751196772469237e-07,
	"loss": 0.186,
	"reward": 0.22259462717920542,
	"reward_std": 0.5068696048110723,
	"rewards/cosine_scaled_reward": -0.15953603573143482,
	"rewards/format_reward": 0.5416666716337204,
	"step": 268
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3267.3333740234375,
	"epoch": 0.15371428571428572,
	"grad_norm": 0.027852864935994148,
	"kl": 0.0002601146697998047,
	"learning_rate": 5.71982396408026e-07,
	"loss": 0.1001,
	"reward": -0.2786406707018614,
	"reward_std": 0.3324854364618659,
	"rewards/cosine_scaled_reward": -0.22265366930514574,
	"rewards/format_reward": 0.1666666679084301,
	"step": 269
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2147.2083740234375,
	"epoch": 0.15428571428571428,
	"grad_norm": 0.025380106642842293,
	"kl": 0.000537872314453125,
	"learning_rate": 5.688440441781398e-07,
	"loss": 0.1412,
	"reward": 0.5648728758096695,
	"reward_std": 0.5805850811302662,
	"rewards/cosine_scaled_reward": 0.011603094637393951,
	"rewards/format_reward": 0.5416666679084301,
	"step": 270
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2535.7500610351562,
	"epoch": 0.15485714285714286,
	"grad_norm": 0.04058884456753731,
	"kl": 0.0005526542663574219,
	"learning_rate": 5.657047735161255e-07,
	"loss": 0.0614,
	"reward": 0.19145794212818146,
	"reward_std": 0.5669713392853737,
	"rewards/cosine_scaled_reward": -0.17510437592864037,
	"rewards/format_reward": 0.5416666679084301,
	"step": 271
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2707.7500610351562,
	"epoch": 0.15542857142857142,
	"grad_norm": 0.02463456057012081,
	"kl": 0.0004875659942626953,
	"learning_rate": 5.625647374256061e-07,
	"loss": -0.0531,
	"reward": 0.32263614796102047,
	"reward_std": 0.8555684071034193,
	"rewards/cosine_scaled_reward": -0.10951526463031769,
	"rewards/format_reward": 0.5416666716337204,
	"step": 272
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3203.8750610351562,
	"epoch": 0.156,
	"grad_norm": 0.021756043657660484,
	"kl": 0.0005960464477539062,
	"learning_rate": 5.594240889475106e-07,
	"loss": 0.0959,
	"reward": -0.3803365007042885,
	"reward_std": 0.4917794167995453,
	"rewards/cosine_scaled_reward": -0.31516825407743454,
	"rewards/format_reward": 0.2500000037252903,
	"step": 273
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3559.375,
	"epoch": 0.15657142857142858,
	"grad_norm": 0.014902282506227493,
	"kl": 0.0003399848937988281,
	"learning_rate": 5.562829811526154e-07,
	"loss": 0.014,
	"reward": -0.4870794676244259,
	"reward_std": 0.32457295805215836,
	"rewards/cosine_scaled_reward": -0.26437306217849255,
	"rewards/format_reward": 0.0416666679084301,
	"step": 274
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2296.2083435058594,
	"epoch": 0.15714285714285714,
	"grad_norm": 0.019861867651343346,
	"kl": 0.0003972053527832031,
	"learning_rate": 5.531415671340826e-07,
	"loss": -0.0306,
	"reward": 0.20605197548866272,
	"reward_std": 0.666698768734932,
	"rewards/cosine_scaled_reward": -0.14697400853037834,
	"rewards/format_reward": 0.5,
	"step": 275
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2113.9583892822266,
	"epoch": 0.15771428571428572,
	"grad_norm": 0.02660396508872509,
	"kl": 0.00037980079650878906,
	"learning_rate": 5.5e-07,
	"loss": 0.189,
	"reward": 0.8675991147756577,
	"reward_std": 0.5296240104362369,
	"rewards/cosine_scaled_reward": 0.12129955366253853,
	"rewards/format_reward": 0.625,
	"step": 276
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2935.2916870117188,
	"epoch": 0.15828571428571428,
	"grad_norm": 0.014423711225390434,
	"kl": 0.0004062652587890625,
	"learning_rate": 5.468584328659172e-07,
	"loss": 0.0398,
	"reward": 0.7772083282470703,
	"reward_std": 0.564259335398674,
	"rewards/cosine_scaled_reward": 0.15943749248981476,
	"rewards/format_reward": 0.4583333432674408,
	"step": 277
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2655.375030517578,
	"epoch": 0.15885714285714286,
	"grad_norm": 0.018322305753827095,
	"kl": 0.0004067420959472656,
	"learning_rate": 5.437170188473847e-07,
	"loss": 0.1151,
	"reward": -0.03639080002903938,
	"reward_std": 0.4445139616727829,
	"rewards/cosine_scaled_reward": -0.24736207351088524,
	"rewards/format_reward": 0.4583333432674408,
	"step": 278
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2705.1250610351562,
	"epoch": 0.15942857142857142,
	"grad_norm": 0.03632061555981636,
	"kl": 0.000385284423828125,
	"learning_rate": 5.405759110524894e-07,
	"loss": 0.135,
	"reward": 0.6486790850758553,
	"reward_std": 0.898019090294838,
	"rewards/cosine_scaled_reward": 0.05350620858371258,
	"rewards/format_reward": 0.5416666828095913,
	"step": 279
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2446.2083435058594,
	"epoch": 0.16,
	"grad_norm": 0.013082285411655903,
	"kl": 0.000347137451171875,
	"learning_rate": 5.37435262574394e-07,
	"loss": -0.0428,
	"reward": 0.37989859376102686,
	"reward_std": 0.29897986352443695,
	"rewards/cosine_scaled_reward": -0.060050718020647764,
	"rewards/format_reward": 0.5,
	"step": 280
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2319.2083435058594,
	"epoch": 0.16057142857142856,
	"grad_norm": 0.045566096901893616,
	"kl": 0.0005693435668945312,
	"learning_rate": 5.342952264838747e-07,
	"loss": 0.1853,
	"reward": 0.5609744489192963,
	"reward_std": 0.6304305791854858,
	"rewards/cosine_scaled_reward": 0.05132052768021822,
	"rewards/format_reward": 0.4583333395421505,
	"step": 281
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3381.8750610351562,
	"epoch": 0.16114285714285714,
	"grad_norm": 0.02684849686920643,
	"kl": 0.0007226467132568359,
	"learning_rate": 5.311559558218603e-07,
	"loss": 0.0416,
	"reward": 0.4639171026647091,
	"reward_std": 0.9287250991910696,
	"rewards/cosine_scaled_reward": 0.08612522296607494,
	"rewards/format_reward": 0.2916666716337204,
	"step": 282
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2586.375030517578,
	"epoch": 0.16171428571428573,
	"grad_norm": 0.0842081606388092,
	"kl": 0.000537872314453125,
	"learning_rate": 5.28017603591974e-07,
	"loss": 0.2945,
	"reward": -0.23938731849193573,
	"reward_std": 0.5152188688516617,
	"rewards/cosine_scaled_reward": -0.30719365179538727,
	"rewards/format_reward": 0.3750000149011612,
	"step": 283
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2352.8750610351562,
	"epoch": 0.16228571428571428,
	"grad_norm": 0.026272239163517952,
	"kl": 0.00044083595275878906,
	"learning_rate": 5.248803227530763e-07,
	"loss": 0.186,
	"reward": 1.0214342884719372,
	"reward_std": 1.2180421352386475,
	"rewards/cosine_scaled_reward": 0.2398838261142373,
	"rewards/format_reward": 0.5416666753590107,
	"step": 284
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3469.25,
	"epoch": 0.16285714285714287,
	"grad_norm": 0.018204184249043465,
	"kl": 0.0004673004150390625,
	"learning_rate": 5.21744266211809e-07,
	"loss": 0.063,
	"reward": -0.13403620570898056,
	"reward_std": 0.6568610742688179,
	"rewards/cosine_scaled_reward": -0.12951810285449028,
	"rewards/format_reward": 0.1250000037252903,
	"step": 285
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2795.166748046875,
	"epoch": 0.16342857142857142,
	"grad_norm": 0.013621930964291096,
	"kl": 0.0003037452697753906,
	"learning_rate": 5.186095868151436e-07,
	"loss": -0.039,
	"reward": 1.5945213325321674,
	"reward_std": 0.5471338629722595,
	"rewards/cosine_scaled_reward": 0.4222606960684061,
	"rewards/format_reward": 0.75,
	"step": 286
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2879.833465576172,
	"epoch": 0.164,
	"grad_norm": 0.01253263745456934,
	"kl": 0.0004277229309082031,
	"learning_rate": 5.154764373429315e-07,
	"loss": 0.0411,
	"reward": 0.21084657812025398,
	"reward_std": 1.322334498167038,
	"rewards/cosine_scaled_reward": -0.12374338880181313,
	"rewards/format_reward": 0.4583333395421505,
	"step": 287
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3213.2916870117188,
	"epoch": 0.16457142857142856,
	"grad_norm": 0.024232791736721992,
	"kl": 0.0003829002380371094,
	"learning_rate": 5.123449705004581e-07,
	"loss": 0.0237,
	"reward": -0.3808862268924713,
	"reward_std": 0.41033722274005413,
	"rewards/cosine_scaled_reward": -0.294609775301069,
	"rewards/format_reward": 0.2083333432674408,
	"step": 288
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2690.750045776367,
	"epoch": 0.16514285714285715,
	"grad_norm": 0.053029682487249374,
	"kl": 0.00045490264892578125,
	"learning_rate": 5.09215338910999e-07,
	"loss": 0.1226,
	"reward": -0.21079005533829331,
	"reward_std": 0.3236595541238785,
	"rewards/cosine_scaled_reward": -0.29289503768086433,
	"rewards/format_reward": 0.3750000037252903,
	"step": 289
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2208.916717529297,
	"epoch": 0.1657142857142857,
	"grad_norm": 0.03633257746696472,
	"kl": 0.0009098052978515625,
	"learning_rate": 5.060876951083828e-07,
	"loss": 0.0996,
	"reward": 0.3243631422519684,
	"reward_std": 0.7893142104148865,
	"rewards/cosine_scaled_reward": -0.17115176958031952,
	"rewards/format_reward": 0.6666666828095913,
	"step": 290
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3469.4166870117188,
	"epoch": 0.1662857142857143,
	"grad_norm": 0.013034985400736332,
	"kl": 0.00041294097900390625,
	"learning_rate": 5.02962191529556e-07,
	"loss": 0.0095,
	"reward": -0.2475043497979641,
	"reward_std": 0.5379906464368105,
	"rewards/cosine_scaled_reward": -0.227918840944767,
	"rewards/format_reward": 0.2083333358168602,
	"step": 291
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2733.541748046875,
	"epoch": 0.16685714285714287,
	"grad_norm": 0.023631447926163673,
	"kl": 0.00041866302490234375,
	"learning_rate": 4.998389805071536e-07,
	"loss": 0.0501,
	"reward": -0.05327422299887985,
	"reward_std": 0.6902021616697311,
	"rewards/cosine_scaled_reward": -0.29747044667601585,
	"rewards/format_reward": 0.5416666865348816,
	"step": 292
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1974.3333587646484,
	"epoch": 0.16742857142857143,
	"grad_norm": 0.025896325707435608,
	"kl": 0.0005052089691162109,
	"learning_rate": 4.967182142620745e-07,
	"loss": -0.0503,
	"reward": 1.0392636209726334,
	"reward_std": 0.7821149528026581,
	"rewards/cosine_scaled_reward": 0.16546513326466084,
	"rewards/format_reward": 0.7083333432674408,
	"step": 293
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3210.666748046875,
	"epoch": 0.168,
	"grad_norm": 0.016182757914066315,
	"kl": 0.0005517005920410156,
	"learning_rate": 4.93600044896063e-07,
	"loss": 0.1227,
	"reward": -0.3081187531352043,
	"reward_std": 0.9994445107877254,
	"rewards/cosine_scaled_reward": -0.27905938774347305,
	"rewards/format_reward": 0.2500000037252903,
	"step": 294
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3051.1251220703125,
	"epoch": 0.16857142857142857,
	"grad_norm": 0.021791962906718254,
	"kl": 0.0004928112030029297,
	"learning_rate": 4.904846243842949e-07,
	"loss": 0.1472,
	"reward": -0.0736542553640902,
	"reward_std": 0.4684867039322853,
	"rewards/cosine_scaled_reward": -0.20349380746483803,
	"rewards/format_reward": 0.3333333395421505,
	"step": 295
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3008.375030517578,
	"epoch": 0.16914285714285715,
	"grad_norm": 0.044715363532304764,
	"kl": 0.0005588531494140625,
	"learning_rate": 4.873721045679706e-07,
	"loss": 0.2406,
	"reward": -0.45781777799129486,
	"reward_std": 0.3647055197507143,
	"rewards/cosine_scaled_reward": -0.33307556062936783,
	"rewards/format_reward": 0.2083333395421505,
	"step": 296
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3294.1666870117188,
	"epoch": 0.1697142857142857,
	"grad_norm": 0.01236710138618946,
	"kl": 0.00040149688720703125,
	"learning_rate": 4.842626371469149e-07,
	"loss": 0.0176,
	"reward": 0.3674662681296468,
	"reward_std": 0.8067609528079629,
	"rewards/cosine_scaled_reward": 0.017066428757971153,
	"rewards/format_reward": 0.3333333432674408,
	"step": 297
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2191.2084045410156,
	"epoch": 0.1702857142857143,
	"grad_norm": 0.02793304994702339,
	"kl": 0.0004506111145019531,
	"learning_rate": 4.811563736721829e-07,
	"loss": 0.2163,
	"reward": 0.3552871508290991,
	"reward_std": 0.6271071489900351,
	"rewards/cosine_scaled_reward": -0.07235642522573471,
	"rewards/format_reward": 0.5,
	"step": 298
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2937.0833740234375,
	"epoch": 0.17085714285714285,
	"grad_norm": 0.02537981979548931,
	"kl": 0.0006856918334960938,
	"learning_rate": 4.780534655386743e-07,
	"loss": 0.0741,
	"reward": 0.014573439490050077,
	"reward_std": 0.6083418875932693,
	"rewards/cosine_scaled_reward": -0.20104661397635937,
	"rewards/format_reward": 0.4166666828095913,
	"step": 299
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2810.375,
	"epoch": 0.17142857142857143,
	"grad_norm": 0.015299368649721146,
	"kl": 0.00049591064453125,
	"learning_rate": 4.749540639777539e-07,
	"loss": 0.0634,
	"reward": -0.10026557371020317,
	"reward_std": 0.45571645349264145,
	"rewards/cosine_scaled_reward": -0.23763278499245644,
	"rewards/format_reward": 0.375,
	"step": 300
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2589.4583435058594,
	"epoch": 0.172,
	"grad_norm": 0.03871985152363777,
	"kl": 0.000347137451171875,
	"learning_rate": 4.7185832004988133e-07,
	"loss": 0.1152,
	"reward": 0.054252732545137405,
	"reward_std": 0.2450561560690403,
	"rewards/cosine_scaled_reward": -0.1812069695442915,
	"rewards/format_reward": 0.4166666716337204,
	"step": 301
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2865.7083435058594,
	"epoch": 0.17257142857142857,
	"grad_norm": 0.02540568634867668,
	"kl": 0.0004420280456542969,
	"learning_rate": 4.68766384637248e-07,
	"loss": 0.0172,
	"reward": 0.15939845889806747,
	"reward_std": 0.35454079881310463,
	"rewards/cosine_scaled_reward": -0.04530075564980507,
	"rewards/format_reward": 0.25,
	"step": 302
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3046.25,
	"epoch": 0.17314285714285715,
	"grad_norm": 0.014560963958501816,
	"kl": 0.0003390312194824219,
	"learning_rate": 4.656784084364238e-07,
	"loss": 0.0832,
	"reward": 0.007433712482452393,
	"reward_std": 0.2396685965359211,
	"rewards/cosine_scaled_reward": -0.1212831512093544,
	"rewards/format_reward": 0.25,
	"step": 303
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3481.0416870117188,
	"epoch": 0.1737142857142857,
	"grad_norm": 0.016006356105208397,
	"kl": 0.00033545494079589844,
	"learning_rate": 4.6259454195101267e-07,
	"loss": 0.0346,
	"reward": -0.22439849376678467,
	"reward_std": 0.6627911329269409,
	"rewards/cosine_scaled_reward": -0.17469927296042442,
	"rewards/format_reward": 0.1250000037252903,
	"step": 304
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2904.5418090820312,
	"epoch": 0.1742857142857143,
	"grad_norm": 0.025142712518572807,
	"kl": 0.00030517578125,
	"learning_rate": 4.59514935484316e-07,
	"loss": 0.1243,
	"reward": 0.8032534047961235,
	"reward_std": 0.8733390048146248,
	"rewards/cosine_scaled_reward": 0.13079336285591125,
	"rewards/format_reward": 0.5416666753590107,
	"step": 305
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2188.0833587646484,
	"epoch": 0.17485714285714285,
	"grad_norm": 0.03312807157635689,
	"kl": 0.0004315376281738281,
	"learning_rate": 4.5643973913200837e-07,
	"loss": 0.0318,
	"reward": 0.7741055563092232,
	"reward_std": 0.8599487096071243,
	"rewards/cosine_scaled_reward": 0.0745527446269989,
	"rewards/format_reward": 0.625,
	"step": 306
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3403.7500610351562,
	"epoch": 0.17542857142857143,
	"grad_norm": 0.01129495445638895,
	"kl": 0.0003752708435058594,
	"learning_rate": 4.5336910277482155e-07,
	"loss": 0.0876,
	"reward": -0.2145760916173458,
	"reward_std": 0.883152648806572,
	"rewards/cosine_scaled_reward": -0.2739547099918127,
	"rewards/format_reward": 0.3333333395421505,
	"step": 307
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2245.3333435058594,
	"epoch": 0.176,
	"grad_norm": 0.02003965899348259,
	"kl": 0.00042700767517089844,
	"learning_rate": 4.503031760712397e-07,
	"loss": 0.1665,
	"reward": 0.9610237777233124,
	"reward_std": 0.924444355070591,
	"rewards/cosine_scaled_reward": 0.1888452209532261,
	"rewards/format_reward": 0.5833333432674408,
	"step": 308
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.17657142857142857,
	"grad_norm": 0.015420272946357727,
	"kl": 0.0005145072937011719,
	"learning_rate": 4.4724210845020494e-07,
	"loss": 0.0,
	"reward": -0.4918478289619088,
	"reward_std": 0.18744987901300192,
	"rewards/cosine_scaled_reward": -0.24592391401529312,
	"rewards/format_reward": 0.0,
	"step": 309
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.17714285714285713,
	"grad_norm": 0.011936242692172527,
	"kl": 0.00030040740966796875,
	"learning_rate": 4.441860491038345e-07,
	"loss": 0.0,
	"reward": -0.5715262293815613,
	"reward_std": 0.19434408470988274,
	"rewards/cosine_scaled_reward": -0.28576310351490974,
	"rewards/format_reward": 0.0,
	"step": 310
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3039.9583435058594,
	"epoch": 0.1777142857142857,
	"grad_norm": 0.013624753803014755,
	"kl": 0.0003497600555419922,
	"learning_rate": 4.4113514698014953e-07,
	"loss": -0.0761,
	"reward": 0.008271858096122742,
	"reward_std": 0.42439935728907585,
	"rewards/cosine_scaled_reward": -0.12086406722664833,
	"rewards/format_reward": 0.25,
	"step": 311
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.1782857142857143,
	"grad_norm": 0.014998997561633587,
	"kl": 0.0006074905395507812,
	"learning_rate": 4.3808955077581546e-07,
	"loss": 0.0,
	"reward": -0.5124310553073883,
	"reward_std": 0.2232176773250103,
	"rewards/cosine_scaled_reward": -0.25621553510427475,
	"rewards/format_reward": 0.0,
	"step": 312
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2576.7916870117188,
	"epoch": 0.17885714285714285,
	"grad_norm": 0.028986027464270592,
	"kl": 0.0004107952117919922,
	"learning_rate": 4.350494089288943e-07,
	"loss": 0.1646,
	"reward": 0.38414837792515755,
	"reward_std": 0.8268394228070974,
	"rewards/cosine_scaled_reward": -0.03709249012172222,
	"rewards/format_reward": 0.4583333358168602,
	"step": 313
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2342.250030517578,
	"epoch": 0.17942857142857144,
	"grad_norm": 0.07766366004943848,
	"kl": 0.0004897117614746094,
	"learning_rate": 4.3201486961161093e-07,
	"loss": 0.2676,
	"reward": 1.206177432090044,
	"reward_std": 1.3236718773841858,
	"rewards/cosine_scaled_reward": 0.29058872163295746,
	"rewards/format_reward": 0.6250000111758709,
	"step": 314
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2941.8333740234375,
	"epoch": 0.18,
	"grad_norm": 0.015173373743891716,
	"kl": 0.00037097930908203125,
	"learning_rate": 4.2898608072313045e-07,
	"loss": -0.0539,
	"reward": -0.3268696665763855,
	"reward_std": 0.39195265993475914,
	"rewards/cosine_scaled_reward": -0.33010151237249374,
	"rewards/format_reward": 0.3333333358168602,
	"step": 315
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3459.75,
	"epoch": 0.18057142857142858,
	"grad_norm": 0.01420869305729866,
	"kl": 0.0004048347473144531,
	"learning_rate": 4.2596318988235037e-07,
	"loss": 0.0212,
	"reward": -0.1601060489192605,
	"reward_std": 0.7861558832228184,
	"rewards/cosine_scaled_reward": -0.16338635561987758,
	"rewards/format_reward": 0.1666666716337204,
	"step": 316
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2739.7083740234375,
	"epoch": 0.18114285714285713,
	"grad_norm": 0.02297414094209671,
	"kl": 0.002285003662109375,
	"learning_rate": 4.2294634442070553e-07,
	"loss": -0.0329,
	"reward": 0.5244220271706581,
	"reward_std": 0.9719415307044983,
	"rewards/cosine_scaled_reward": 0.0330443549901247,
	"rewards/format_reward": 0.4583333358168602,
	"step": 317
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2666.5416870117188,
	"epoch": 0.18171428571428572,
	"grad_norm": 0.022833170369267464,
	"kl": 0.0004475116729736328,
	"learning_rate": 4.1993569137498776e-07,
	"loss": 0.1241,
	"reward": 0.19690169394016266,
	"reward_std": 0.6580867804586887,
	"rewards/cosine_scaled_reward": -0.19321582466363907,
	"rewards/format_reward": 0.583333358168602,
	"step": 318
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2698.3750610351562,
	"epoch": 0.18228571428571427,
	"grad_norm": 0.034383926540613174,
	"kl": 0.0003724098205566406,
	"learning_rate": 4.1693137748017915e-07,
	"loss": 0.2278,
	"reward": 0.6201122887432575,
	"reward_std": 0.6080379486083984,
	"rewards/cosine_scaled_reward": 0.06005614344030619,
	"rewards/format_reward": 0.5000000074505806,
	"step": 319
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2862.3750610351562,
	"epoch": 0.18285714285714286,
	"grad_norm": 0.0345999151468277,
	"kl": 0.00040721893310546875,
	"learning_rate": 4.1393354916230005e-07,
	"loss": -0.0318,
	"reward": 0.12236133217811584,
	"reward_std": 0.6523439809679985,
	"rewards/cosine_scaled_reward": -0.10548599809408188,
	"rewards/format_reward": 0.3333333358168602,
	"step": 320
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3358.125,
	"epoch": 0.18342857142857144,
	"grad_norm": 0.013232111930847168,
	"kl": 0.0003991127014160156,
	"learning_rate": 4.1094235253127374e-07,
	"loss": 0.0404,
	"reward": -0.11832981812767684,
	"reward_std": 0.42844806239008904,
	"rewards/cosine_scaled_reward": -0.16333157755434513,
	"rewards/format_reward": 0.2083333395421505,
	"step": 321
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3566.4166870117188,
	"epoch": 0.184,
	"grad_norm": 0.016278283670544624,
	"kl": 0.00032711029052734375,
	"learning_rate": 4.079579333738039e-07,
	"loss": 0.0102,
	"reward": -0.4220607131719589,
	"reward_std": 0.5171524062752724,
	"rewards/cosine_scaled_reward": -0.23186369240283966,
	"rewards/format_reward": 0.0416666679084301,
	"step": 322
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3138.6666870117188,
	"epoch": 0.18457142857142858,
	"grad_norm": 0.013172892853617668,
	"kl": 0.0003600120544433594,
	"learning_rate": 4.0498043714627006e-07,
	"loss": 0.0537,
	"reward": 0.05053871381096542,
	"reward_std": 0.8983977390453219,
	"rewards/cosine_scaled_reward": -0.16223064810037613,
	"rewards/format_reward": 0.3750000111758709,
	"step": 323
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3010.291748046875,
	"epoch": 0.18514285714285714,
	"grad_norm": 0.025909846648573875,
	"kl": 0.00040721893310546875,
	"learning_rate": 4.020100089676376e-07,
	"loss": 0.0602,
	"reward": 0.5590743962675333,
	"reward_std": 0.8683347813785076,
	"rewards/cosine_scaled_reward": 0.029537230730056763,
	"rewards/format_reward": 0.5000000223517418,
	"step": 324
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2910.000030517578,
	"epoch": 0.18571428571428572,
	"grad_norm": 0.015413357876241207,
	"kl": 0.0004220008850097656,
	"learning_rate": 3.9904679361238526e-07,
	"loss": 0.0317,
	"reward": -0.0408908948302269,
	"reward_std": 0.5930759366601706,
	"rewards/cosine_scaled_reward": -0.20794545486569405,
	"rewards/format_reward": 0.375,
	"step": 325
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2532.166748046875,
	"epoch": 0.18628571428571428,
	"grad_norm": 0.030060861259698868,
	"kl": 0.0005965232849121094,
	"learning_rate": 3.9609093550344907e-07,
	"loss": 0.0679,
	"reward": 0.6265107244253159,
	"reward_std": 0.9718786887824535,
	"rewards/cosine_scaled_reward": 0.06325538456439972,
	"rewards/format_reward": 0.5,
	"step": 326
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3148.875,
	"epoch": 0.18685714285714286,
	"grad_norm": 0.041659917682409286,
	"kl": 0.0004248619079589844,
	"learning_rate": 3.931425787051832e-07,
	"loss": 0.1537,
	"reward": -0.35396782122552395,
	"reward_std": 0.2636511065065861,
	"rewards/cosine_scaled_reward": -0.2603172492235899,
	"rewards/format_reward": 0.1666666716337204,
	"step": 327
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2493.916717529297,
	"epoch": 0.18742857142857142,
	"grad_norm": 0.020772553980350494,
	"kl": 0.0006198883056640625,
	"learning_rate": 3.902018669163384e-07,
	"loss": 0.1175,
	"reward": 0.18326607067137957,
	"reward_std": 0.49889209493994713,
	"rewards/cosine_scaled_reward": -0.11670029908418655,
	"rewards/format_reward": 0.4166666716337204,
	"step": 328
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3093.291748046875,
	"epoch": 0.188,
	"grad_norm": 0.011293930932879448,
	"kl": 0.0003113746643066406,
	"learning_rate": 3.872689434630585e-07,
	"loss": 0.0669,
	"reward": 0.5585142355412245,
	"reward_std": 1.4061576128005981,
	"rewards/cosine_scaled_reward": 0.09175711218267679,
	"rewards/format_reward": 0.3750000111758709,
	"step": 329
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2978.7916870117188,
	"epoch": 0.18857142857142858,
	"grad_norm": 0.05254025384783745,
	"kl": 0.000415802001953125,
	"learning_rate": 3.843439512918949e-07,
	"loss": 0.2134,
	"reward": 0.02450428158044815,
	"reward_std": 0.6984777390025556,
	"rewards/cosine_scaled_reward": -0.11274785548448563,
	"rewards/format_reward": 0.2500000111758709,
	"step": 330
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2632.4583587646484,
	"epoch": 0.18914285714285714,
	"grad_norm": 0.020709240809082985,
	"kl": 0.000537872314453125,
	"learning_rate": 3.8142703296283953e-07,
	"loss": 0.0682,
	"reward": 0.45295886788517237,
	"reward_std": 0.2939260210841894,
	"rewards/cosine_scaled_reward": 0.018146060872823,
	"rewards/format_reward": 0.4166666679084301,
	"step": 331
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2125.0416870117188,
	"epoch": 0.18971428571428572,
	"grad_norm": 0.019701264798641205,
	"kl": 0.00028228759765625,
	"learning_rate": 3.785183306423767e-07,
	"loss": 0.198,
	"reward": 0.22587934881448746,
	"reward_std": 0.6166221983730793,
	"rewards/cosine_scaled_reward": -0.15789367514662445,
	"rewards/format_reward": 0.541666679084301,
	"step": 332
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2881.3333740234375,
	"epoch": 0.19028571428571428,
	"grad_norm": 0.019184157252311707,
	"kl": 0.0005030632019042969,
	"learning_rate": 3.7561798609655373e-07,
	"loss": 0.0184,
	"reward": -0.2152223140001297,
	"reward_std": 0.38293132930994034,
	"rewards/cosine_scaled_reward": -0.25344450399279594,
	"rewards/format_reward": 0.2916666679084301,
	"step": 333
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3178.7083435058594,
	"epoch": 0.19085714285714286,
	"grad_norm": 0.014846621081233025,
	"kl": 0.000293731689453125,
	"learning_rate": 3.72726140684072e-07,
	"loss": 0.1053,
	"reward": 0.18489570170640945,
	"reward_std": 1.0315047651529312,
	"rewards/cosine_scaled_reward": -0.03255215287208557,
	"rewards/format_reward": 0.2500000074505806,
	"step": 334
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2227.750030517578,
	"epoch": 0.19142857142857142,
	"grad_norm": 0.022946475073695183,
	"kl": 0.00047016143798828125,
	"learning_rate": 3.6984293534939737e-07,
	"loss": 0.0083,
	"reward": 0.32345347106456757,
	"reward_std": 0.5401728432625532,
	"rewards/cosine_scaled_reward": -0.1924399547278881,
	"rewards/format_reward": 0.7083333395421505,
	"step": 335
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2682.9166870117188,
	"epoch": 0.192,
	"grad_norm": 0.029810767620801926,
	"kl": 0.0006122589111328125,
	"learning_rate": 3.6696851061588994e-07,
	"loss": 0.1395,
	"reward": 0.37340210494585335,
	"reward_std": 0.7611111477017403,
	"rewards/cosine_scaled_reward": -0.08413228765130043,
	"rewards/format_reward": 0.5416666716337204,
	"step": 336
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2755.2500228881836,
	"epoch": 0.19257142857142856,
	"grad_norm": 0.030772754922509193,
	"kl": 0.0004162788391113281,
	"learning_rate": 3.641030065789562e-07,
	"loss": 0.0926,
	"reward": 0.6443270817399025,
	"reward_std": 0.9492446109652519,
	"rewards/cosine_scaled_reward": 0.09299685433506966,
	"rewards/format_reward": 0.4583333358168602,
	"step": 337
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2727.2083435058594,
	"epoch": 0.19314285714285714,
	"grad_norm": 0.019305266439914703,
	"kl": 0.0005497932434082031,
	"learning_rate": 3.612465628992203e-07,
	"loss": 0.0282,
	"reward": 0.6710007563233376,
	"reward_std": 1.21895881742239,
	"rewards/cosine_scaled_reward": 0.08550036698579788,
	"rewards/format_reward": 0.5000000074505806,
	"step": 338
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.19371428571428573,
	"grad_norm": 0.01511505339294672,
	"kl": 0.0004024505615234375,
	"learning_rate": 3.5839931879571725e-07,
	"loss": 0.0,
	"reward": -0.610577579587698,
	"reward_std": 0.23075975850224495,
	"rewards/cosine_scaled_reward": -0.305288789793849,
	"rewards/format_reward": 0.0,
	"step": 339
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2202.125015258789,
	"epoch": 0.19428571428571428,
	"grad_norm": 0.02869362384080887,
	"kl": 0.0005574226379394531,
	"learning_rate": 3.555614130391079e-07,
	"loss": 0.0131,
	"reward": 0.296954870223999,
	"reward_std": 0.4164566658437252,
	"rewards/cosine_scaled_reward": -0.10152260027825832,
	"rewards/format_reward": 0.5,
	"step": 340
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1960.1667175292969,
	"epoch": 0.19485714285714287,
	"grad_norm": 0.05206461623311043,
	"kl": 0.00039577484130859375,
	"learning_rate": 3.5273298394491515e-07,
	"loss": 0.268,
	"reward": 0.24809112399816513,
	"reward_std": 0.6666374318301678,
	"rewards/cosine_scaled_reward": -0.23012111335992813,
	"rewards/format_reward": 0.7083333432674408,
	"step": 341
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2733.4166717529297,
	"epoch": 0.19542857142857142,
	"grad_norm": 0.028285467997193336,
	"kl": 0.000484466552734375,
	"learning_rate": 3.4991416936678276e-07,
	"loss": 0.1274,
	"reward": 0.2500569764524698,
	"reward_std": 0.38465849310159683,
	"rewards/cosine_scaled_reward": -0.062471505254507065,
	"rewards/format_reward": 0.3750000037252903,
	"step": 342
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2453.500030517578,
	"epoch": 0.196,
	"grad_norm": 0.03303263336420059,
	"kl": 0.0004892349243164062,
	"learning_rate": 3.471051066897562e-07,
	"loss": 0.2808,
	"reward": -0.11763790249824524,
	"reward_std": 0.5739464350044727,
	"rewards/cosine_scaled_reward": -0.30881896358914673,
	"rewards/format_reward": 0.5000000223517418,
	"step": 343
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2497.2083435058594,
	"epoch": 0.19657142857142856,
	"grad_norm": 0.05244883522391319,
	"kl": 0.00046515464782714844,
	"learning_rate": 3.4430593282358777e-07,
	"loss": 0.0989,
	"reward": 0.34542886912822723,
	"reward_std": 0.9526717662811279,
	"rewards/cosine_scaled_reward": -0.03561893478035927,
	"rewards/format_reward": 0.4166666679084301,
	"step": 344
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1832.7500305175781,
	"epoch": 0.19714285714285715,
	"grad_norm": 0.041937097907066345,
	"kl": 0.0008487701416015625,
	"learning_rate": 3.4151678419606233e-07,
	"loss": -0.1656,
	"reward": 1.009265385568142,
	"reward_std": 0.40681467205286026,
	"rewards/cosine_scaled_reward": 0.15046602487564087,
	"rewards/format_reward": 0.7083333432674408,
	"step": 345
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.1977142857142857,
	"grad_norm": 0.011569972150027752,
	"kl": 0.000385284423828125,
	"learning_rate": 3.387377967463493e-07,
	"loss": 0.0,
	"reward": -0.31521460227668285,
	"reward_std": 0.19862121110782027,
	"rewards/cosine_scaled_reward": -0.19927397277206182,
	"rewards/format_reward": 0.0833333358168602,
	"step": 346
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2886.0416870117188,
	"epoch": 0.1982857142857143,
	"grad_norm": 0.013902098871767521,
	"kl": 0.00035190582275390625,
	"learning_rate": 3.359691059183761e-07,
	"loss": -0.0006,
	"reward": 0.06203071027994156,
	"reward_std": 0.5102610923349857,
	"rewards/cosine_scaled_reward": -0.11481798812747002,
	"rewards/format_reward": 0.2916666679084301,
	"step": 347
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2697.4166870117188,
	"epoch": 0.19885714285714284,
	"grad_norm": 0.028411809355020523,
	"kl": 0.0004925727844238281,
	"learning_rate": 3.3321084665422803e-07,
	"loss": -0.1683,
	"reward": 0.16876617819070816,
	"reward_std": 0.468828896060586,
	"rewards/cosine_scaled_reward": -0.16561690717935562,
	"rewards/format_reward": 0.5,
	"step": 348
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3510.6666870117188,
	"epoch": 0.19942857142857143,
	"grad_norm": 0.012191089801490307,
	"kl": 0.00038242340087890625,
	"learning_rate": 3.3046315338757026e-07,
	"loss": 0.0276,
	"reward": -0.3254171907901764,
	"reward_std": 0.6530030593276024,
	"rewards/cosine_scaled_reward": -0.2460419312119484,
	"rewards/format_reward": 0.1666666716337204,
	"step": 349
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2198.541717529297,
	"epoch": 0.2,
	"grad_norm": 0.01535298302769661,
	"kl": 0.0005538463592529297,
	"learning_rate": 3.2772616003709616e-07,
	"loss": 0.0941,
	"reward": 1.250352792441845,
	"reward_std": 0.6242426857352257,
	"rewards/cosine_scaled_reward": 0.2710097096860409,
	"rewards/format_reward": 0.7083333395421505,
	"step": 350
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2659.791748046875,
	"epoch": 0.20057142857142857,
	"grad_norm": 0.017833461984992027,
	"kl": 0.0005612373352050781,
	"learning_rate": 3.250000000000001e-07,
	"loss": 0.0465,
	"reward": 0.1603565402328968,
	"reward_std": 0.8732819259166718,
	"rewards/cosine_scaled_reward": -0.1698217373341322,
	"rewards/format_reward": 0.5,
	"step": 351
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2180.500015258789,
	"epoch": 0.20114285714285715,
	"grad_norm": 0.01808895543217659,
	"kl": 0.0004558563232421875,
	"learning_rate": 3.222848061454764e-07,
	"loss": 0.0133,
	"reward": 0.2774962969124317,
	"reward_std": 0.7046881169080734,
	"rewards/cosine_scaled_reward": -0.15291852178052068,
	"rewards/format_reward": 0.5833333358168602,
	"step": 352
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2483.750030517578,
	"epoch": 0.2017142857142857,
	"grad_norm": 0.023005694150924683,
	"kl": 0.0004525184631347656,
	"learning_rate": 3.195807108082429e-07,
	"loss": 0.2053,
	"reward": 0.28026173263788223,
	"reward_std": 0.7538251765072346,
	"rewards/cosine_scaled_reward": -0.13070247694849968,
	"rewards/format_reward": 0.541666679084301,
	"step": 353
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2603.5833435058594,
	"epoch": 0.2022857142857143,
	"grad_norm": 0.019903168082237244,
	"kl": 0.0005669593811035156,
	"learning_rate": 3.168878457820915e-07,
	"loss": 0.098,
	"reward": 0.14617812633514404,
	"reward_std": 0.5162135027348995,
	"rewards/cosine_scaled_reward": -0.15607764571905136,
	"rewards/format_reward": 0.4583333432674408,
	"step": 354
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2432.875030517578,
	"epoch": 0.20285714285714285,
	"grad_norm": 0.02829565852880478,
	"kl": 0.0006160736083984375,
	"learning_rate": 3.142063423134644e-07,
	"loss": 0.0099,
	"reward": 0.19245748221874237,
	"reward_std": 0.2699956987053156,
	"rewards/cosine_scaled_reward": -0.1329379379749298,
	"rewards/format_reward": 0.4583333432674408,
	"step": 355
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2468.8334197998047,
	"epoch": 0.20342857142857143,
	"grad_norm": 0.02010870911180973,
	"kl": 0.0003266334533691406,
	"learning_rate": 3.115363310950578e-07,
	"loss": 0.0836,
	"reward": 1.1018516272306442,
	"reward_std": 0.832906199619174,
	"rewards/cosine_scaled_reward": 0.2800924628973007,
	"rewards/format_reward": 0.541666679084301,
	"step": 356
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2738.2083740234375,
	"epoch": 0.204,
	"grad_norm": 0.02142561413347721,
	"kl": 0.0004112720489501953,
	"learning_rate": 3.0887794225945143e-07,
	"loss": 0.0696,
	"reward": -0.34853553399443626,
	"reward_std": 0.1840939112007618,
	"rewards/cosine_scaled_reward": -0.3617677837610245,
	"rewards/format_reward": 0.375,
	"step": 357
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2853.416717529297,
	"epoch": 0.20457142857142857,
	"grad_norm": 0.016107890754938126,
	"kl": 0.0005478858947753906,
	"learning_rate": 3.062313053727671e-07,
	"loss": 0.0663,
	"reward": 0.6284266784787178,
	"reward_std": 0.72141382843256,
	"rewards/cosine_scaled_reward": 0.10587997734546661,
	"rewards/format_reward": 0.4166666679084301,
	"step": 358
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2372.000045776367,
	"epoch": 0.20514285714285715,
	"grad_norm": 0.021992484107613564,
	"kl": 0.0005311965942382812,
	"learning_rate": 3.0359654942835247e-07,
	"loss": 0.086,
	"reward": 0.6230212822556496,
	"reward_std": 0.8500233590602875,
	"rewards/cosine_scaled_reward": -0.021822698414325714,
	"rewards/format_reward": 0.6666666828095913,
	"step": 359
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3455.2500610351562,
	"epoch": 0.2057142857142857,
	"grad_norm": 0.030560219660401344,
	"kl": 0.00031185150146484375,
	"learning_rate": 3.0097380284049523e-07,
	"loss": 0.0487,
	"reward": -0.10205069184303284,
	"reward_std": 0.8343977108597755,
	"rewards/cosine_scaled_reward": -0.13435868825763464,
	"rewards/format_reward": 0.1666666716337204,
	"step": 360
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2516.9583740234375,
	"epoch": 0.2062857142857143,
	"grad_norm": 0.016366608440876007,
	"kl": 0.0004210472106933594,
	"learning_rate": 2.9836319343816397e-07,
	"loss": 0.0794,
	"reward": 0.12586134672164917,
	"reward_std": 0.6533399596810341,
	"rewards/cosine_scaled_reward": -0.18706931918859482,
	"rewards/format_reward": 0.5000000111758709,
	"step": 361
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3351.7916870117188,
	"epoch": 0.20685714285714285,
	"grad_norm": 0.01776767335832119,
	"kl": 0.0003039836883544922,
	"learning_rate": 2.9576484845877793e-07,
	"loss": 0.0928,
	"reward": 0.29054381139576435,
	"reward_std": 1.0099711641669273,
	"rewards/cosine_scaled_reward": -0.04222810734063387,
	"rewards/format_reward": 0.3750000074505806,
	"step": 362
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2496.8750610351562,
	"epoch": 0.20742857142857143,
	"grad_norm": 0.03273333981633186,
	"kl": 0.0006189346313476562,
	"learning_rate": 2.931788945420058e-07,
	"loss": 0.2348,
	"reward": 0.34358200430870056,
	"reward_std": 0.7383445575833321,
	"rewards/cosine_scaled_reward": -0.1407090239226818,
	"rewards/format_reward": 0.6250000111758709,
	"step": 363
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3452.375,
	"epoch": 0.208,
	"grad_norm": 0.014953386969864368,
	"kl": 0.00035858154296875,
	"learning_rate": 2.9060545772359305e-07,
	"loss": 0.043,
	"reward": -0.4569111131131649,
	"reward_std": 0.34297534823417664,
	"rewards/cosine_scaled_reward": -0.2701222151517868,
	"rewards/format_reward": 0.0833333358168602,
	"step": 364
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2785.7083740234375,
	"epoch": 0.20857142857142857,
	"grad_norm": 0.02793431654572487,
	"kl": 0.0006036758422851562,
	"learning_rate": 2.8804466342921987e-07,
	"loss": 0.0765,
	"reward": 0.11223252862691879,
	"reward_std": 0.619122963398695,
	"rewards/cosine_scaled_reward": -0.21471708547323942,
	"rewards/format_reward": 0.5416666865348816,
	"step": 365
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3409.541748046875,
	"epoch": 0.20914285714285713,
	"grad_norm": 0.01248230878263712,
	"kl": 0.0003552436828613281,
	"learning_rate": 2.854966364683872e-07,
	"loss": 0.0491,
	"reward": -0.3326341025531292,
	"reward_std": 0.3651573769748211,
	"rewards/cosine_scaled_reward": -0.2288170587271452,
	"rewards/format_reward": 0.1250000037252903,
	"step": 366
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2808.4583587646484,
	"epoch": 0.20971428571428571,
	"grad_norm": 0.028179485350847244,
	"kl": 0.0006113052368164062,
	"learning_rate": 2.829615010283344e-07,
	"loss": 0.0589,
	"reward": 0.1167896268889308,
	"reward_std": 1.1129950881004333,
	"rewards/cosine_scaled_reward": -0.10827185213565826,
	"rewards/format_reward": 0.3333333358168602,
	"step": 367
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2660.791748046875,
	"epoch": 0.2102857142857143,
	"grad_norm": 0.0340777263045311,
	"kl": 0.0005307197570800781,
	"learning_rate": 2.8043938066798645e-07,
	"loss": 0.1111,
	"reward": 1.4332922995090485,
	"reward_std": 1.385080635547638,
	"rewards/cosine_scaled_reward": 0.34164613112807274,
	"rewards/format_reward": 0.7500000149011612,
	"step": 368
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3233.3750610351562,
	"epoch": 0.21085714285714285,
	"grad_norm": 0.013750105164945126,
	"kl": 0.0003342628479003906,
	"learning_rate": 2.7793039831193133e-07,
	"loss": 0.0717,
	"reward": 0.05912124365568161,
	"reward_std": 0.874366108328104,
	"rewards/cosine_scaled_reward": -0.09543937258422375,
	"rewards/format_reward": 0.2500000074505806,
	"step": 369
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3452.4166870117188,
	"epoch": 0.21142857142857144,
	"grad_norm": 0.012770992703735828,
	"kl": 0.0004000663757324219,
	"learning_rate": 2.7543467624442956e-07,
	"loss": 0.0281,
	"reward": 0.15689724683761597,
	"reward_std": 0.2785376161336899,
	"rewards/cosine_scaled_reward": -0.04655133932828903,
	"rewards/format_reward": 0.25,
	"step": 370
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2499.791717529297,
	"epoch": 0.212,
	"grad_norm": 0.018871566280722618,
	"kl": 0.00035071372985839844,
	"learning_rate": 2.729523361034538e-07,
	"loss": 0.1363,
	"reward": 0.5506950244307518,
	"reward_std": 0.9160189777612686,
	"rewards/cosine_scaled_reward": 0.025347519665956497,
	"rewards/format_reward": 0.5000000037252903,
	"step": 371
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3505.2083740234375,
	"epoch": 0.21257142857142858,
	"grad_norm": 0.01206301525235176,
	"kl": 0.00035262107849121094,
	"learning_rate": 2.7048349887476037e-07,
	"loss": 0.0086,
	"reward": -0.2348268087953329,
	"reward_std": 0.3814601432532072,
	"rewards/cosine_scaled_reward": -0.1799134025350213,
	"rewards/format_reward": 0.125,
	"step": 372
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2949.166717529297,
	"epoch": 0.21314285714285713,
	"grad_norm": 0.01700667478144169,
	"kl": 0.0003724098205566406,
	"learning_rate": 2.6802828488599294e-07,
	"loss": -0.0476,
	"reward": 0.22861511493101716,
	"reward_std": 0.5081553272902966,
	"rewards/cosine_scaled_reward": -0.031525759026408195,
	"rewards/format_reward": 0.2916666679084301,
	"step": 373
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3467.0000610351562,
	"epoch": 0.21371428571428572,
	"grad_norm": 0.017147116363048553,
	"kl": 0.00045490264892578125,
	"learning_rate": 2.655868138008171e-07,
	"loss": 0.0457,
	"reward": -0.4217256158590317,
	"reward_std": 0.6266062669456005,
	"rewards/cosine_scaled_reward": -0.31502948701381683,
	"rewards/format_reward": 0.2083333358168602,
	"step": 374
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2722.375,
	"epoch": 0.21428571428571427,
	"grad_norm": 0.02914683148264885,
	"kl": 0.0004086494445800781,
	"learning_rate": 2.631592046130896e-07,
	"loss": 0.1972,
	"reward": 0.23988548666238785,
	"reward_std": 0.4570600874722004,
	"rewards/cosine_scaled_reward": -0.06755725666880608,
	"rewards/format_reward": 0.3750000149011612,
	"step": 375
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3282.5834350585938,
	"epoch": 0.21485714285714286,
	"grad_norm": 0.01625804975628853,
	"kl": 0.00047779083251953125,
	"learning_rate": 2.6074557564105724e-07,
	"loss": 0.0483,
	"reward": 0.39857788383960724,
	"reward_std": 0.9685629121959209,
	"rewards/cosine_scaled_reward": -0.00904439389705658,
	"rewards/format_reward": 0.416666679084301,
	"step": 376
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2434.125,
	"epoch": 0.21542857142857144,
	"grad_norm": 0.034063227474689484,
	"kl": 0.0004830360412597656,
	"learning_rate": 2.583460445215911e-07,
	"loss": 0.164,
	"reward": 0.0973532497882843,
	"reward_std": 0.7279782295227051,
	"rewards/cosine_scaled_reward": -0.22215671092271805,
	"rewards/format_reward": 0.5416666753590107,
	"step": 377
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2981.2083435058594,
	"epoch": 0.216,
	"grad_norm": 0.02709144353866577,
	"kl": 0.0005640983581542969,
	"learning_rate": 2.5596072820445254e-07,
	"loss": 0.2001,
	"reward": -0.3218484600074589,
	"reward_std": 0.3952440693974495,
	"rewards/cosine_scaled_reward": -0.28592423163354397,
	"rewards/format_reward": 0.2500000111758709,
	"step": 378
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3029.7083740234375,
	"epoch": 0.21657142857142858,
	"grad_norm": 0.020819807425141335,
	"kl": 0.0030651092529296875,
	"learning_rate": 2.5358974294659373e-07,
	"loss": -0.0893,
	"reward": -0.21052202675491571,
	"reward_std": 0.38894602842628956,
	"rewards/cosine_scaled_reward": -0.2719276868738234,
	"rewards/format_reward": 0.3333333358168602,
	"step": 379
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3011.0833740234375,
	"epoch": 0.21714285714285714,
	"grad_norm": 0.015882771462202072,
	"kl": 0.0006201267242431641,
	"learning_rate": 2.512332043064913e-07,
	"loss": 0.0272,
	"reward": 0.18160124588757753,
	"reward_std": 1.1701444238424301,
	"rewards/cosine_scaled_reward": -0.09669936696445802,
	"rewards/format_reward": 0.3750000037252903,
	"step": 380
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3127.2500610351562,
	"epoch": 0.21771428571428572,
	"grad_norm": 0.07095064222812653,
	"kl": 0.0007042884826660156,
	"learning_rate": 2.488912271385139e-07,
	"loss": 0.1667,
	"reward": -0.5090018883347511,
	"reward_std": 0.36618487909436226,
	"rewards/cosine_scaled_reward": -0.35866761952638626,
	"rewards/format_reward": 0.2083333358168602,
	"step": 381
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2951.2916870117188,
	"epoch": 0.21828571428571428,
	"grad_norm": 0.03872789442539215,
	"kl": 0.0005259513854980469,
	"learning_rate": 2.465639255873246e-07,
	"loss": 0.1543,
	"reward": -0.07253091287566349,
	"reward_std": 0.7700534537434578,
	"rewards/cosine_scaled_reward": -0.20293213427066803,
	"rewards/format_reward": 0.3333333469927311,
	"step": 382
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2982.2916870117188,
	"epoch": 0.21885714285714286,
	"grad_norm": 0.014369679614901543,
	"kl": 0.0005116462707519531,
	"learning_rate": 2.4425141308231765e-07,
	"loss": -0.0014,
	"reward": 0.13170818611979485,
	"reward_std": 0.41740766912698746,
	"rewards/cosine_scaled_reward": -0.059145910665392876,
	"rewards/format_reward": 0.25,
	"step": 383
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3228.625,
	"epoch": 0.21942857142857142,
	"grad_norm": 0.012754120863974094,
	"kl": 0.0003314018249511719,
	"learning_rate": 2.4195380233209006e-07,
	"loss": -0.0007,
	"reward": 0.17525914683938026,
	"reward_std": 0.6218334436416626,
	"rewards/cosine_scaled_reward": -0.09987044055014849,
	"rewards/format_reward": 0.3750000149011612,
	"step": 384
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2379.2916717529297,
	"epoch": 0.22,
	"grad_norm": 0.020193729549646378,
	"kl": 0.0006628036499023438,
	"learning_rate": 2.3967120531894857e-07,
	"loss": -0.0018,
	"reward": -0.0569206103682518,
	"reward_std": 0.6073889955878258,
	"rewards/cosine_scaled_reward": -0.257626973092556,
	"rewards/format_reward": 0.4583333432674408,
	"step": 385
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3001.1666870117188,
	"epoch": 0.22057142857142858,
	"grad_norm": 0.018432218581438065,
	"kl": 0.0004229545593261719,
	"learning_rate": 2.374037332934512e-07,
	"loss": 0.0952,
	"reward": 0.21215322148054838,
	"reward_std": 0.888201154768467,
	"rewards/cosine_scaled_reward": -0.12309006974101067,
	"rewards/format_reward": 0.4583333432674408,
	"step": 386
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1933.2083740234375,
	"epoch": 0.22114285714285714,
	"grad_norm": 0.05453366041183472,
	"kl": 0.0005445480346679688,
	"learning_rate": 2.3515149676898552e-07,
	"loss": 0.1831,
	"reward": 1.217309720814228,
	"reward_std": 0.4588906615972519,
	"rewards/cosine_scaled_reward": 0.33782152086496353,
	"rewards/format_reward": 0.5416666679084301,
	"step": 387
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1593.1667175292969,
	"epoch": 0.22171428571428572,
	"grad_norm": 0.026233471930027008,
	"kl": 0.0007166862487792969,
	"learning_rate": 2.3291460551638237e-07,
	"loss": -0.0644,
	"reward": 1.6789040267467499,
	"reward_std": 1.0171207189559937,
	"rewards/cosine_scaled_reward": 0.33945199474692345,
	"rewards/format_reward": 1.0,
	"step": 388
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1572.4583587646484,
	"epoch": 0.22228571428571428,
	"grad_norm": 0.02512061409652233,
	"kl": 0.0004048347473144531,
	"learning_rate": 2.306931685585657e-07,
	"loss": 0.0294,
	"reward": 1.3811066150665283,
	"reward_std": 0.3954271301627159,
	"rewards/cosine_scaled_reward": 0.3155532553792,
	"rewards/format_reward": 0.75,
	"step": 389
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3255.9583740234375,
	"epoch": 0.22285714285714286,
	"grad_norm": 0.03513728827238083,
	"kl": 0.0005030632019042969,
	"learning_rate": 2.2848729416523859e-07,
	"loss": 0.1731,
	"reward": -0.12103257514536381,
	"reward_std": 0.8534305840730667,
	"rewards/cosine_scaled_reward": -0.16468296200037003,
	"rewards/format_reward": 0.2083333395421505,
	"step": 390
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2981.8333740234375,
	"epoch": 0.22342857142857142,
	"grad_norm": 0.03934527188539505,
	"kl": 0.0004153251647949219,
	"learning_rate": 2.2629708984760706e-07,
	"loss": 0.1821,
	"reward": -0.06651334711932577,
	"reward_std": 0.2963850498199463,
	"rewards/cosine_scaled_reward": -0.2207566797733307,
	"rewards/format_reward": 0.3750000111758709,
	"step": 391
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3223.0833740234375,
	"epoch": 0.224,
	"grad_norm": 0.013558811508119106,
	"kl": 0.00038242340087890625,
	"learning_rate": 2.2412266235313973e-07,
	"loss": 0.0057,
	"reward": -0.2705407738685608,
	"reward_std": 0.2837679469957948,
	"rewards/cosine_scaled_reward": -0.2602703794836998,
	"rewards/format_reward": 0.25,
	"step": 392
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3398.0000610351562,
	"epoch": 0.22457142857142856,
	"grad_norm": 0.012919829227030277,
	"kl": 0.0003833770751953125,
	"learning_rate": 2.2196411766036487e-07,
	"loss": 0.0292,
	"reward": -0.32555074989795685,
	"reward_std": 0.6584090702235699,
	"rewards/cosine_scaled_reward": -0.2669420391321182,
	"rewards/format_reward": 0.2083333395421505,
	"step": 393
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2330.0833587646484,
	"epoch": 0.22514285714285714,
	"grad_norm": 0.03346144035458565,
	"kl": 0.0005555152893066406,
	"learning_rate": 2.1982156097370557e-07,
	"loss": 0.167,
	"reward": 0.23031404614448547,
	"reward_std": 0.7982805892825127,
	"rewards/cosine_scaled_reward": -0.11400966346263885,
	"rewards/format_reward": 0.4583333358168602,
	"step": 394
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3490.75,
	"epoch": 0.2257142857142857,
	"grad_norm": 0.016392916440963745,
	"kl": 0.0003719329833984375,
	"learning_rate": 2.1769509671835223e-07,
	"loss": 0.0408,
	"reward": -0.04605567455291748,
	"reward_std": 0.8931695856153965,
	"rewards/cosine_scaled_reward": -0.10636119358241558,
	"rewards/format_reward": 0.1666666716337204,
	"step": 395
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2741.9166870117188,
	"epoch": 0.22628571428571428,
	"grad_norm": 0.012639901600778103,
	"kl": 0.0002758502960205078,
	"learning_rate": 2.1558482853517253e-07,
	"loss": 0.0119,
	"reward": 0.37444762885570526,
	"reward_std": 0.4292972981929779,
	"rewards/cosine_scaled_reward": -0.06277619302272797,
	"rewards/format_reward": 0.5000000111758709,
	"step": 396
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2962.8333435058594,
	"epoch": 0.22685714285714287,
	"grad_norm": 0.01602632738649845,
	"kl": 0.00043392181396484375,
	"learning_rate": 2.134908592756607e-07,
	"loss": -0.0317,
	"reward": 0.5750939613208175,
	"reward_std": 0.7307236031629145,
	"rewards/cosine_scaled_reward": 0.05838030157610774,
	"rewards/format_reward": 0.4583333432674408,
	"step": 397
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2762.5833740234375,
	"epoch": 0.22742857142857142,
	"grad_norm": 0.016584103927016258,
	"kl": 0.00033473968505859375,
	"learning_rate": 2.1141329099692406e-07,
	"loss": 0.0113,
	"reward": 0.8510713949799538,
	"reward_std": 0.8986274972558022,
	"rewards/cosine_scaled_reward": 0.1338690184056759,
	"rewards/format_reward": 0.5833333469927311,
	"step": 398
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3111.7916870117188,
	"epoch": 0.228,
	"grad_norm": 0.04136970639228821,
	"kl": 0.0006189346313476562,
	"learning_rate": 2.0935222495670968e-07,
	"loss": 0.0911,
	"reward": -0.29911787807941437,
	"reward_std": 0.36130358278751373,
	"rewards/cosine_scaled_reward": -0.2537256069481373,
	"rewards/format_reward": 0.2083333432674408,
	"step": 399
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1176.083381652832,
	"epoch": 0.22857142857142856,
	"grad_norm": 0.032378729432821274,
	"kl": 0.0008146762847900391,
	"learning_rate": 2.0730776160846853e-07,
	"loss": -0.0498,
	"reward": 1.2029592096805573,
	"reward_std": 0.8494270741939545,
	"rewards/cosine_scaled_reward": 0.10147958248853683,
	"rewards/format_reward": 1.0,
	"step": 400
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2217.2083892822266,
	"epoch": 0.22914285714285715,
	"grad_norm": 0.03296418488025665,
	"kl": 0.0005393028259277344,
	"learning_rate": 2.0528000059645995e-07,
	"loss": 0.0836,
	"reward": 0.29183474462479353,
	"reward_std": 0.5516791455447674,
	"rewards/cosine_scaled_reward": -0.1874159649014473,
	"rewards/format_reward": 0.6666666716337204,
	"step": 401
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2966.0833740234375,
	"epoch": 0.2297142857142857,
	"grad_norm": 0.023311948403716087,
	"kl": 0.0004687309265136719,
	"learning_rate": 2.032690407508949e-07,
	"loss": -0.0487,
	"reward": -0.14803513139486313,
	"reward_std": 0.5541299842298031,
	"rewards/cosine_scaled_reward": -0.21985089778900146,
	"rewards/format_reward": 0.2916666679084301,
	"step": 402
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2138.375015258789,
	"epoch": 0.2302857142857143,
	"grad_norm": 0.0190444178879261,
	"kl": 0.0005555152893066406,
	"learning_rate": 2.0127498008311922e-07,
	"loss": 0.1245,
	"reward": 0.7670382708311081,
	"reward_std": 0.880347341299057,
	"rewards/cosine_scaled_reward": 0.05018577980808914,
	"rewards/format_reward": 0.6666666716337204,
	"step": 403
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2700.1666717529297,
	"epoch": 0.23085714285714284,
	"grad_norm": 0.02195879817008972,
	"kl": 0.0004572868347167969,
	"learning_rate": 1.9929791578083655e-07,
	"loss": 0.1354,
	"reward": -0.03147786110639572,
	"reward_std": 0.5353209748864174,
	"rewards/cosine_scaled_reward": -0.22407225891947746,
	"rewards/format_reward": 0.4166666716337204,
	"step": 404
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2581.000045776367,
	"epoch": 0.23142857142857143,
	"grad_norm": 0.01655156910419464,
	"kl": 0.00037384033203125,
	"learning_rate": 1.9733794420337213e-07,
	"loss": 0.063,
	"reward": 0.6779801677912474,
	"reward_std": 0.5696643739938736,
	"rewards/cosine_scaled_reward": 0.08899005688726902,
	"rewards/format_reward": 0.5000000074505806,
	"step": 405
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3439.5000610351562,
	"epoch": 0.232,
	"grad_norm": 0.02102508395910263,
	"kl": 0.0006380081176757812,
	"learning_rate": 1.9539516087697517e-07,
	"loss": 0.0608,
	"reward": -0.2339201234281063,
	"reward_std": 0.7671289071440697,
	"rewards/cosine_scaled_reward": -0.22112673026276752,
	"rewards/format_reward": 0.2083333395421505,
	"step": 406
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2525.6666870117188,
	"epoch": 0.23257142857142857,
	"grad_norm": 0.016638562083244324,
	"kl": 0.00039958953857421875,
	"learning_rate": 1.934696604901642e-07,
	"loss": 0.0268,
	"reward": 0.5109657794237137,
	"reward_std": 0.6168239414691925,
	"rewards/cosine_scaled_reward": 0.04714955762028694,
	"rewards/format_reward": 0.4166666716337204,
	"step": 407
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2517.6250610351562,
	"epoch": 0.23314285714285715,
	"grad_norm": 0.024556465446949005,
	"kl": 0.0006203651428222656,
	"learning_rate": 1.915615368891117e-07,
	"loss": 0.1592,
	"reward": 0.06803740188479424,
	"reward_std": 0.8579247817397118,
	"rewards/cosine_scaled_reward": -0.19514797255396843,
	"rewards/format_reward": 0.4583333358168602,
	"step": 408
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2717.2916870117188,
	"epoch": 0.2337142857142857,
	"grad_norm": 0.029170790687203407,
	"kl": 0.0004508495330810547,
	"learning_rate": 1.8967088307307e-07,
	"loss": 0.0397,
	"reward": 0.8809500015340745,
	"reward_std": 0.6774098351597786,
	"rewards/cosine_scaled_reward": 0.12797498516738415,
	"rewards/format_reward": 0.625,
	"step": 409
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3492.8333740234375,
	"epoch": 0.2342857142857143,
	"grad_norm": 0.011575359851121902,
	"kl": 0.0004987716674804688,
	"learning_rate": 1.8779779118983867e-07,
	"loss": -0.0028,
	"reward": -0.09124118834733963,
	"reward_std": 0.882003229111433,
	"rewards/cosine_scaled_reward": -0.14978726860135794,
	"rewards/format_reward": 0.2083333358168602,
	"step": 410
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3483.4583740234375,
	"epoch": 0.23485714285714285,
	"grad_norm": 0.013307969085872173,
	"kl": 0.00041484832763671875,
	"learning_rate": 1.8594235253127372e-07,
	"loss": 0.038,
	"reward": 0.0876331478357315,
	"reward_std": 0.9785483591258526,
	"rewards/cosine_scaled_reward": -0.06035009026527405,
	"rewards/format_reward": 0.2083333395421505,
	"step": 411
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3483.0,
	"epoch": 0.23542857142857143,
	"grad_norm": 0.014635894447565079,
	"kl": 0.0004057884216308594,
	"learning_rate": 1.8410465752883758e-07,
	"loss": 0.0558,
	"reward": -0.7163440883159637,
	"reward_std": 0.2162969596683979,
	"rewards/cosine_scaled_reward": -0.37900539487600327,
	"rewards/format_reward": 0.0416666679084301,
	"step": 412
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2679.916778564453,
	"epoch": 0.236,
	"grad_norm": 0.013161610811948776,
	"kl": 0.00030541419982910156,
	"learning_rate": 1.822847957491922e-07,
	"loss": 0.0978,
	"reward": 0.9060661401599646,
	"reward_std": 0.8712828233838081,
	"rewards/cosine_scaled_reward": 0.11969973146915436,
	"rewards/format_reward": 0.666666679084301,
	"step": 413
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2817.625030517578,
	"epoch": 0.23657142857142857,
	"grad_norm": 0.022518867626786232,
	"kl": 0.00038623809814453125,
	"learning_rate": 1.804828558898332e-07,
	"loss": 0.0441,
	"reward": 0.13390246778726578,
	"reward_std": 0.6470340602099895,
	"rewards/cosine_scaled_reward": -0.09971543587744236,
	"rewards/format_reward": 0.3333333358168602,
	"step": 414
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2841.2083740234375,
	"epoch": 0.23714285714285716,
	"grad_norm": 0.023023054003715515,
	"kl": 0.0006165504455566406,
	"learning_rate": 1.7869892577476722e-07,
	"loss": 0.0876,
	"reward": -0.059880852699279785,
	"reward_std": 0.4681435003876686,
	"rewards/cosine_scaled_reward": -0.19660709938034415,
	"rewards/format_reward": 0.3333333358168602,
	"step": 415
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3561.75,
	"epoch": 0.2377142857142857,
	"grad_norm": 0.018553022295236588,
	"kl": 0.0003795623779296875,
	"learning_rate": 1.7693309235023127e-07,
	"loss": 0.0128,
	"reward": -0.2655714526772499,
	"reward_std": 0.3128821440041065,
	"rewards/cosine_scaled_reward": -0.15361905843019485,
	"rewards/format_reward": 0.0416666679084301,
	"step": 416
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2401.7917098999023,
	"epoch": 0.2382857142857143,
	"grad_norm": 0.026138195767998695,
	"kl": 0.0005283355712890625,
	"learning_rate": 1.7518544168045524e-07,
	"loss": 0.0481,
	"reward": 0.4319583922624588,
	"reward_std": 0.5039754528552294,
	"rewards/cosine_scaled_reward": -0.013187475502490997,
	"rewards/format_reward": 0.4583333432674408,
	"step": 417
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2618.4166717529297,
	"epoch": 0.23885714285714285,
	"grad_norm": 0.03529448062181473,
	"kl": 0.0008215904235839844,
	"learning_rate": 1.7345605894346726e-07,
	"loss": 0.053,
	"reward": -0.38310980424284935,
	"reward_std": 0.3161798268556595,
	"rewards/cosine_scaled_reward": -0.3790549263358116,
	"rewards/format_reward": 0.375,
	"step": 418
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2717.2083435058594,
	"epoch": 0.23942857142857144,
	"grad_norm": 0.015204512514173985,
	"kl": 0.0004143714904785156,
	"learning_rate": 1.7174502842694212e-07,
	"loss": 0.0434,
	"reward": -0.06527332402765751,
	"reward_std": 0.3376622749492526,
	"rewards/cosine_scaled_reward": -0.2618033364415169,
	"rewards/format_reward": 0.4583333432674408,
	"step": 419
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1704.7916870117188,
	"epoch": 0.24,
	"grad_norm": 0.05218733474612236,
	"kl": 0.0005950927734375,
	"learning_rate": 1.7005243352409333e-07,
	"loss": 0.249,
	"reward": 0.7783117964863777,
	"reward_std": 0.45935374312102795,
	"rewards/cosine_scaled_reward": 0.03498924896121025,
	"rewards/format_reward": 0.7083333432674408,
	"step": 420
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3204.5,
	"epoch": 0.24057142857142857,
	"grad_norm": 0.01363091915845871,
	"kl": 0.0003876686096191406,
	"learning_rate": 1.6837835672960831e-07,
	"loss": 0.0017,
	"reward": -0.22349218279123306,
	"reward_std": 0.22306939586997032,
	"rewards/cosine_scaled_reward": -0.23674608767032623,
	"rewards/format_reward": 0.25,
	"step": 421
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.24114285714285713,
	"grad_norm": 0.010621090419590473,
	"kl": 0.0003581047058105469,
	"learning_rate": 1.6672287963562852e-07,
	"loss": 0.0,
	"reward": -0.5770844966173172,
	"reward_std": 0.2044766042381525,
	"rewards/cosine_scaled_reward": -0.288542240858078,
	"rewards/format_reward": 0.0,
	"step": 422
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2312.250030517578,
	"epoch": 0.24171428571428571,
	"grad_norm": 0.02375810779631138,
	"kl": 0.0005865097045898438,
	"learning_rate": 1.6508608292777203e-07,
	"loss": -0.0847,
	"reward": 0.414031776599586,
	"reward_std": 0.732084047049284,
	"rewards/cosine_scaled_reward": -0.10548414289951324,
	"rewards/format_reward": 0.6250000037252903,
	"step": 423
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3128.5833740234375,
	"epoch": 0.2422857142857143,
	"grad_norm": 0.01551869697868824,
	"kl": 0.00045108795166015625,
	"learning_rate": 1.6346804638120098e-07,
	"loss": 0.0483,
	"reward": 0.01406601071357727,
	"reward_std": 0.8138113915920258,
	"rewards/cosine_scaled_reward": -0.1388003290630877,
	"rewards/format_reward": 0.2916666716337204,
	"step": 424
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.24285714285714285,
	"grad_norm": 0.011893173679709435,
	"kl": 0.00039386749267578125,
	"learning_rate": 1.6186884885673413e-07,
	"loss": 0.0,
	"reward": -0.5272135511040688,
	"reward_std": 0.4391126446425915,
	"rewards/cosine_scaled_reward": -0.2844401001930237,
	"rewards/format_reward": 0.0416666679084301,
	"step": 425
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3331.7916870117188,
	"epoch": 0.24342857142857144,
	"grad_norm": 0.011359743773937225,
	"kl": 0.0003376007080078125,
	"learning_rate": 1.6028856829700258e-07,
	"loss": 0.015,
	"reward": 0.13842247426509857,
	"reward_std": 0.2540343776345253,
	"rewards/cosine_scaled_reward": -0.05578881502151489,
	"rewards/format_reward": 0.25,
	"step": 426
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3101.5834350585938,
	"epoch": 0.244,
	"grad_norm": 0.01774391531944275,
	"kl": 0.00032806396484375,
	"learning_rate": 1.5872728172265146e-07,
	"loss": 0.0951,
	"reward": 0.436421200633049,
	"reward_std": 0.9749777019023895,
	"rewards/cosine_scaled_reward": -0.05262273037806153,
	"rewards/format_reward": 0.5416666828095913,
	"step": 427
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2517.2083740234375,
	"epoch": 0.24457142857142858,
	"grad_norm": 0.018922699615359306,
	"kl": 0.0004782676696777344,
	"learning_rate": 1.5718506522858572e-07,
	"loss": 0.0171,
	"reward": 1.0143605917692184,
	"reward_std": 0.9533870965242386,
	"rewards/cosine_scaled_reward": 0.17384696938097477,
	"rewards/format_reward": 0.6666666865348816,
	"step": 428
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3139.4166870117188,
	"epoch": 0.24514285714285713,
	"grad_norm": 0.01879842020571232,
	"kl": 0.0005502700805664062,
	"learning_rate": 1.5566199398026147e-07,
	"loss": 0.0225,
	"reward": -0.2566852793097496,
	"reward_std": 0.44601357355713844,
	"rewards/cosine_scaled_reward": -0.23250930570065975,
	"rewards/format_reward": 0.2083333432674408,
	"step": 429
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3440.3333740234375,
	"epoch": 0.24571428571428572,
	"grad_norm": 0.012092203833162785,
	"kl": 0.0003752708435058594,
	"learning_rate": 1.5415814221002265e-07,
	"loss": 0.0602,
	"reward": -0.08506331103853881,
	"reward_std": 0.7580065792426467,
	"rewards/cosine_scaled_reward": -0.18836499378085136,
	"rewards/format_reward": 0.291666679084301,
	"step": 430
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.24628571428571427,
	"grad_norm": 0.009651312604546547,
	"kl": 0.000339508056640625,
	"learning_rate": 1.5267358321348285e-07,
	"loss": 0.0,
	"reward": -0.5135565400123596,
	"reward_std": 0.17573323473334312,
	"rewards/cosine_scaled_reward": -0.2776115983724594,
	"rewards/format_reward": 0.0416666679084301,
	"step": 431
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2914.6250610351562,
	"epoch": 0.24685714285714286,
	"grad_norm": 0.04963121563196182,
	"kl": 0.00048542022705078125,
	"learning_rate": 1.5120838934595337e-07,
	"loss": 0.0778,
	"reward": 0.22040988504886627,
	"reward_std": 0.7857857123017311,
	"rewards/cosine_scaled_reward": -0.05646173283457756,
	"rewards/format_reward": 0.3333333358168602,
	"step": 432
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1992.5417022705078,
	"epoch": 0.24742857142857144,
	"grad_norm": 0.018762821331620216,
	"kl": 0.0004487037658691406,
	"learning_rate": 1.4976263201891613e-07,
	"loss": 0.0176,
	"reward": 0.7027051514014602,
	"reward_std": 0.5680601857602596,
	"rewards/cosine_scaled_reward": -0.002814119216054678,
	"rewards/format_reward": 0.7083333432674408,
	"step": 433
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3400.2500610351562,
	"epoch": 0.248,
	"grad_norm": 0.014792956411838531,
	"kl": 0.000438690185546875,
	"learning_rate": 1.483363816965435e-07,
	"loss": 0.0359,
	"reward": -0.11403081566095352,
	"reward_std": 0.695421889424324,
	"rewards/cosine_scaled_reward": -0.14034874364733696,
	"rewards/format_reward": 0.1666666716337204,
	"step": 434
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2794.375,
	"epoch": 0.24857142857142858,
	"grad_norm": 0.015310428105294704,
	"kl": 0.0004611015319824219,
	"learning_rate": 1.469297078922642e-07,
	"loss": 0.0315,
	"reward": 0.5071351528167725,
	"reward_std": 0.44683452136814594,
	"rewards/cosine_scaled_reward": 0.10773422196507454,
	"rewards/format_reward": 0.2916666679084301,
	"step": 435
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2730.250030517578,
	"epoch": 0.24914285714285714,
	"grad_norm": 0.03859826177358627,
	"kl": 0.000392913818359375,
	"learning_rate": 1.4554267916537495e-07,
	"loss": 0.1139,
	"reward": 0.25591667648404837,
	"reward_std": 0.7985352799296379,
	"rewards/cosine_scaled_reward": -0.08037500828504562,
	"rewards/format_reward": 0.4166666716337204,
	"step": 436
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1703.7083435058594,
	"epoch": 0.24971428571428572,
	"grad_norm": 0.035596735775470734,
	"kl": 0.0003857612609863281,
	"learning_rate": 1.4417536311769885e-07,
	"loss": 0.1267,
	"reward": 0.5573078580200672,
	"reward_std": 0.6633748337626457,
	"rewards/cosine_scaled_reward": -0.11717941612005234,
	"rewards/format_reward": 0.791666679084301,
	"step": 437
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2871.3333435058594,
	"epoch": 0.2502857142857143,
	"grad_norm": 0.012692431919276714,
	"kl": 0.000335693359375,
	"learning_rate": 1.4282782639029128e-07,
	"loss": 0.0717,
	"reward": -0.2980673983693123,
	"reward_std": 0.15785099938511848,
	"rewards/cosine_scaled_reward": -0.29486703872680664,
	"rewards/format_reward": 0.2916666679084301,
	"step": 438
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2338.7083435058594,
	"epoch": 0.25085714285714283,
	"grad_norm": 0.01559723261743784,
	"kl": 0.0004553794860839844,
	"learning_rate": 1.4150013466019114e-07,
	"loss": 0.0277,
	"reward": 0.19210883975028992,
	"reward_std": 0.7682934515178204,
	"rewards/cosine_scaled_reward": -0.17477891221642494,
	"rewards/format_reward": 0.5416666679084301,
	"step": 439
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1833.625015258789,
	"epoch": 0.25142857142857145,
	"grad_norm": 0.02628200314939022,
	"kl": 0.0005588531494140625,
	"learning_rate": 1.4019235263722034e-07,
	"loss": -0.0343,
	"reward": 0.7311004251241684,
	"reward_std": 0.6230124272406101,
	"rewards/cosine_scaled_reward": -0.009449809789657593,
	"rewards/format_reward": 0.75,
	"step": 440
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.252,
	"grad_norm": 0.011055848561227322,
	"kl": 0.0003528594970703125,
	"learning_rate": 1.3890454406082956e-07,
	"loss": 0.0,
	"reward": -0.23345018550753593,
	"reward_std": 0.553386427462101,
	"rewards/cosine_scaled_reward": -0.15839176578447223,
	"rewards/format_reward": 0.0833333358168602,
	"step": 441
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2781.4583435058594,
	"epoch": 0.25257142857142856,
	"grad_norm": 0.040766630321741104,
	"kl": 0.0006265640258789062,
	"learning_rate": 1.3763677169699217e-07,
	"loss": 0.0573,
	"reward": 0.07009226828813553,
	"reward_std": 0.8544792495667934,
	"rewards/cosine_scaled_reward": -0.13162054121494293,
	"rewards/format_reward": 0.3333333358168602,
	"step": 442
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3411.25,
	"epoch": 0.25314285714285717,
	"grad_norm": 0.013510222546756268,
	"kl": 0.0003809928894042969,
	"learning_rate": 1.3638909733514452e-07,
	"loss": 0.085,
	"reward": -0.4627658315002918,
	"reward_std": 0.3685727119445801,
	"rewards/cosine_scaled_reward": -0.3355495557188988,
	"rewards/format_reward": 0.2083333395421505,
	"step": 443
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3350.0833740234375,
	"epoch": 0.2537142857142857,
	"grad_norm": 0.017425937578082085,
	"kl": 0.00029087066650390625,
	"learning_rate": 1.351615817851748e-07,
	"loss": 0.109,
	"reward": -0.536733441054821,
	"reward_std": 0.5179216116666794,
	"rewards/cosine_scaled_reward": -0.3308667168021202,
	"rewards/format_reward": 0.1250000037252903,
	"step": 444
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2777.4583587646484,
	"epoch": 0.2542857142857143,
	"grad_norm": 0.01394882146269083,
	"kl": 0.0004124641418457031,
	"learning_rate": 1.3395428487445914e-07,
	"loss": 0.0398,
	"reward": 0.6419320218265057,
	"reward_std": 1.290092408657074,
	"rewards/cosine_scaled_reward": 0.11263268813490868,
	"rewards/format_reward": 0.4166666716337204,
	"step": 445
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2162.0000610351562,
	"epoch": 0.25485714285714284,
	"grad_norm": 0.025396212935447693,
	"kl": 0.0004458427429199219,
	"learning_rate": 1.3276726544494571e-07,
	"loss": 0.2263,
	"reward": 0.2729286514222622,
	"reward_std": 0.4925660863518715,
	"rewards/cosine_scaled_reward": -0.2177023496478796,
	"rewards/format_reward": 0.7083333395421505,
	"step": 446
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2784.250045776367,
	"epoch": 0.25542857142857145,
	"grad_norm": 0.01916358806192875,
	"kl": 0.00040149688720703125,
	"learning_rate": 1.316005813502869e-07,
	"loss": 0.0456,
	"reward": -0.28101276885718107,
	"reward_std": 0.27371450141072273,
	"rewards/cosine_scaled_reward": -0.28633972629904747,
	"rewards/format_reward": 0.2916666679084301,
	"step": 447
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3356.5833740234375,
	"epoch": 0.256,
	"grad_norm": 0.012381333857774734,
	"kl": 0.0004825592041015625,
	"learning_rate": 1.3045428945301953e-07,
	"loss": 0.0587,
	"reward": 0.8448215499520302,
	"reward_std": 0.715430673211813,
	"rewards/cosine_scaled_reward": 0.2557440847158432,
	"rewards/format_reward": 0.3333333432674408,
	"step": 448
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2940.3334350585938,
	"epoch": 0.25657142857142856,
	"grad_norm": 0.017700130119919777,
	"kl": 0.0002903938293457031,
	"learning_rate": 1.2932844562179352e-07,
	"loss": 0.1691,
	"reward": 0.6478632241487503,
	"reward_std": 1.2038164585828781,
	"rewards/cosine_scaled_reward": 0.11559828370809555,
	"rewards/format_reward": 0.4166666753590107,
	"step": 449
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1642.2084197998047,
	"epoch": 0.2571428571428571,
	"grad_norm": 0.01771002635359764,
	"kl": 0.0003179311752319336,
	"learning_rate": 1.2822310472864885e-07,
	"loss": 0.1028,
	"reward": 0.930170651525259,
	"reward_std": 0.5905403085052967,
	"rewards/cosine_scaled_reward": 0.027585337636992335,
	"rewards/format_reward": 0.875,
	"step": 450
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3493.4583740234375,
	"epoch": 0.25771428571428573,
	"grad_norm": 0.015439066104590893,
	"kl": 0.00055694580078125,
	"learning_rate": 1.2713832064634125e-07,
	"loss": 0.0217,
	"reward": -0.7016485892236233,
	"reward_std": 0.41242050286382437,
	"rewards/cosine_scaled_reward": -0.3924909606575966,
	"rewards/format_reward": 0.0833333358168602,
	"step": 451
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2162.375015258789,
	"epoch": 0.2582857142857143,
	"grad_norm": 0.031103147193789482,
	"kl": 0.0004916191101074219,
	"learning_rate": 1.260741462457165e-07,
	"loss": 0.1222,
	"reward": 0.33962953090667725,
	"reward_std": 0.5016037877649069,
	"rewards/cosine_scaled_reward": -0.14268524572253227,
	"rewards/format_reward": 0.625,
	"step": 452
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2658.2500228881836,
	"epoch": 0.25885714285714284,
	"grad_norm": 0.02379673905670643,
	"kl": 0.0007123947143554688,
	"learning_rate": 1.2503063339313356e-07,
	"loss": 0.0397,
	"reward": -0.1027427650988102,
	"reward_std": 0.6555835595354438,
	"rewards/cosine_scaled_reward": -0.2180380504578352,
	"rewards/format_reward": 0.3333333358168602,
	"step": 453
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2904.4583740234375,
	"epoch": 0.25942857142857145,
	"grad_norm": 0.024318231269717216,
	"kl": 0.0005178451538085938,
	"learning_rate": 1.2400783294793668e-07,
	"loss": 0.1233,
	"reward": 0.12724535167217255,
	"reward_std": 0.690078116953373,
	"rewards/cosine_scaled_reward": -0.10304398089647293,
	"rewards/format_reward": 0.3333333432674408,
	"step": 454
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3007.0833740234375,
	"epoch": 0.26,
	"grad_norm": 0.014805259183049202,
	"kl": 0.00030303001403808594,
	"learning_rate": 1.2300579475997657e-07,
	"loss": 0.0556,
	"reward": 0.07904787175357342,
	"reward_std": 0.41673495434224606,
	"rewards/cosine_scaled_reward": -0.12714274739846587,
	"rewards/format_reward": 0.3333333358168602,
	"step": 455
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3369.75,
	"epoch": 0.26057142857142856,
	"grad_norm": 0.01267288252711296,
	"kl": 0.000545501708984375,
	"learning_rate": 1.220245676671809e-07,
	"loss": 0.0697,
	"reward": -0.19169889390468597,
	"reward_std": 0.5537064597010612,
	"rewards/cosine_scaled_reward": -0.15834945812821388,
	"rewards/format_reward": 0.1250000037252903,
	"step": 456
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2750.4166870117188,
	"epoch": 0.2611428571428571,
	"grad_norm": 0.01913038082420826,
	"kl": 0.0005252361297607422,
	"learning_rate": 1.2106419949317388e-07,
	"loss": 0.0638,
	"reward": -0.025101646780967712,
	"reward_std": 0.8123595081269741,
	"rewards/cosine_scaled_reward": -0.17921748850494623,
	"rewards/format_reward": 0.3333333358168602,
	"step": 457
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2731.291732788086,
	"epoch": 0.26171428571428573,
	"grad_norm": 0.018483366817235947,
	"kl": 0.0005779266357421875,
	"learning_rate": 1.2012473704494537e-07,
	"loss": 0.1314,
	"reward": 0.18389775604009628,
	"reward_std": 0.6026048362255096,
	"rewards/cosine_scaled_reward": -0.11638446152210236,
	"rewards/format_reward": 0.4166666716337204,
	"step": 458
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2847.2083435058594,
	"epoch": 0.2622857142857143,
	"grad_norm": 0.019928766414523125,
	"kl": 0.0004062652587890625,
	"learning_rate": 1.1920622611056974e-07,
	"loss": 0.116,
	"reward": 0.056805893778800964,
	"reward_std": 0.897519065067172,
	"rewards/cosine_scaled_reward": -0.15909705124795437,
	"rewards/format_reward": 0.3750000111758709,
	"step": 459
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2855.7083740234375,
	"epoch": 0.26285714285714284,
	"grad_norm": 0.01562991738319397,
	"kl": 0.0004324913024902344,
	"learning_rate": 1.1830871145697412e-07,
	"loss": 0.011,
	"reward": 0.4357723630964756,
	"reward_std": 0.7417406067252159,
	"rewards/cosine_scaled_reward": 0.030386213213205338,
	"rewards/format_reward": 0.3750000037252903,
	"step": 460
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2806.1666870117188,
	"epoch": 0.2634285714285714,
	"grad_norm": 0.022745100781321526,
	"kl": 0.0005688667297363281,
	"learning_rate": 1.1743223682775649e-07,
	"loss": 0.0915,
	"reward": -0.35859447717666626,
	"reward_std": 0.3101446107029915,
	"rewards/cosine_scaled_reward": -0.32513057440519333,
	"rewards/format_reward": 0.2916666679084301,
	"step": 461
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3287.166748046875,
	"epoch": 0.264,
	"grad_norm": 0.014005818404257298,
	"kl": 0.0002605915069580078,
	"learning_rate": 1.1657684494105386e-07,
	"loss": 0.1217,
	"reward": -0.07925862073898315,
	"reward_std": 0.570786502212286,
	"rewards/cosine_scaled_reward": -0.12296264991164207,
	"rewards/format_reward": 0.1666666679084301,
	"step": 462
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3048.8750610351562,
	"epoch": 0.26457142857142857,
	"grad_norm": 0.026238108053803444,
	"kl": 0.0003509521484375,
	"learning_rate": 1.1574257748745986e-07,
	"loss": 0.1634,
	"reward": -0.2891614316031337,
	"reward_std": 0.6285391822457314,
	"rewards/cosine_scaled_reward": -0.2695807181298733,
	"rewards/format_reward": 0.25,
	"step": 463
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2917.9583435058594,
	"epoch": 0.2651428571428571,
	"grad_norm": 0.014083731919527054,
	"kl": 0.00032520294189453125,
	"learning_rate": 1.1492947512799328e-07,
	"loss": 0.0497,
	"reward": 0.23886509239673615,
	"reward_std": 0.5035260319709778,
	"rewards/cosine_scaled_reward": -0.08890077471733093,
	"rewards/format_reward": 0.4166666716337204,
	"step": 464
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3480.75,
	"epoch": 0.26571428571428574,
	"grad_norm": 0.01106889545917511,
	"kl": 0.0002949237823486328,
	"learning_rate": 1.1413757749211602e-07,
	"loss": 0.022,
	"reward": 0.06554645299911499,
	"reward_std": 0.4903480280190706,
	"rewards/cosine_scaled_reward": -0.0505601167678833,
	"rewards/format_reward": 0.1666666716337204,
	"step": 465
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2717.875030517578,
	"epoch": 0.2662857142857143,
	"grad_norm": 0.017086246982216835,
	"kl": 0.0003123283386230469,
	"learning_rate": 1.1336692317580158e-07,
	"loss": 0.0943,
	"reward": -0.1813066378235817,
	"reward_std": 0.2922391891479492,
	"rewards/cosine_scaled_reward": -0.31981998309493065,
	"rewards/format_reward": 0.4583333432674408,
	"step": 466
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3453.916748046875,
	"epoch": 0.26685714285714285,
	"grad_norm": 0.013130133971571922,
	"kl": 0.0003361701965332031,
	"learning_rate": 1.1261754973965422e-07,
	"loss": 0.0385,
	"reward": 0.055451929569244385,
	"reward_std": 0.7496693283319473,
	"rewards/cosine_scaled_reward": -0.1181073747575283,
	"rewards/format_reward": 0.2916666716337204,
	"step": 467
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2965.5416870117188,
	"epoch": 0.2674285714285714,
	"grad_norm": 0.02877657674252987,
	"kl": 0.0003726482391357422,
	"learning_rate": 1.1188949370707787e-07,
	"loss": 0.1355,
	"reward": 0.2859138697385788,
	"reward_std": 0.8501931093633175,
	"rewards/cosine_scaled_reward": -0.0445430725812912,
	"rewards/format_reward": 0.3750000149011612,
	"step": 468
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3065.666717529297,
	"epoch": 0.268,
	"grad_norm": 0.021241569891572,
	"kl": 0.0003745555877685547,
	"learning_rate": 1.1118279056249653e-07,
	"loss": -0.0209,
	"reward": -0.22108882665634155,
	"reward_std": 0.5584007576107979,
	"rewards/cosine_scaled_reward": -0.2772110812366009,
	"rewards/format_reward": 0.3333333358168602,
	"step": 469
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2091.041717529297,
	"epoch": 0.26857142857142857,
	"grad_norm": 0.043246157467365265,
	"kl": 0.00055694580078125,
	"learning_rate": 1.1049747474962444e-07,
	"loss": 0.1196,
	"reward": 1.0320170223712921,
	"reward_std": 1.0782769918441772,
	"rewards/cosine_scaled_reward": 0.2035085055977106,
	"rewards/format_reward": 0.6250000149011612,
	"step": 470
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2538.291717529297,
	"epoch": 0.26914285714285713,
	"grad_norm": 0.016464348882436752,
	"kl": 0.00033664703369140625,
	"learning_rate": 1.0983357966978745e-07,
	"loss": 0.0476,
	"reward": 0.2932426920160651,
	"reward_std": 0.5623490735888481,
	"rewards/cosine_scaled_reward": -0.1450453530997038,
	"rewards/format_reward": 0.5833333432674408,
	"step": 471
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3552.5833740234375,
	"epoch": 0.26971428571428574,
	"grad_norm": 0.0214456245303154,
	"kl": 0.00042819976806640625,
	"learning_rate": 1.0919113768029517e-07,
	"loss": 0.0135,
	"reward": -0.35011430410668254,
	"reward_std": 0.3236675038933754,
	"rewards/cosine_scaled_reward": -0.23755715577863157,
	"rewards/format_reward": 0.1250000037252903,
	"step": 472
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2889.4166717529297,
	"epoch": 0.2702857142857143,
	"grad_norm": 0.02036861516535282,
	"kl": 0.0003829002380371094,
	"learning_rate": 1.0857018009286381e-07,
	"loss": 0.0107,
	"reward": -0.004865109920501709,
	"reward_std": 0.6773473080247641,
	"rewards/cosine_scaled_reward": -0.14826588705182076,
	"rewards/format_reward": 0.2916666679084301,
	"step": 473
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2525.9583740234375,
	"epoch": 0.27085714285714285,
	"grad_norm": 0.013575663790106773,
	"kl": 0.0002644062042236328,
	"learning_rate": 1.0797073717209013e-07,
	"loss": -0.0323,
	"reward": 0.5381094664335251,
	"reward_std": 0.44665071181952953,
	"rewards/cosine_scaled_reward": -0.0017786095850169659,
	"rewards/format_reward": 0.5416666679084301,
	"step": 474
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2851.9583740234375,
	"epoch": 0.2714285714285714,
	"grad_norm": 0.020593978464603424,
	"kl": 0.0005025863647460938,
	"learning_rate": 1.0739283813397639e-07,
	"loss": 0.0314,
	"reward": 0.17490556836128235,
	"reward_std": 0.6029777117073536,
	"rewards/cosine_scaled_reward": -0.12088055908679962,
	"rewards/format_reward": 0.4166666865348816,
	"step": 475
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.272,
	"grad_norm": 0.01853923127055168,
	"kl": 0.000408172607421875,
	"learning_rate": 1.068365111445064e-07,
	"loss": 0.0,
	"reward": -0.6839941293001175,
	"reward_std": 0.23570294678211212,
	"rewards/cosine_scaled_reward": -0.34199706465005875,
	"rewards/format_reward": 0.0,
	"step": 476
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3223.4166870117188,
	"epoch": 0.2725714285714286,
	"grad_norm": 0.02092069201171398,
	"kl": 0.00046443939208984375,
	"learning_rate": 1.063017833182728e-07,
	"loss": 0.1149,
	"reward": -0.4160095602273941,
	"reward_std": 0.3700854703783989,
	"rewards/cosine_scaled_reward": -0.33300479501485825,
	"rewards/format_reward": 0.25,
	"step": 477
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.27314285714285713,
	"grad_norm": 0.015490886755287647,
	"kl": 0.0003581047058105469,
	"learning_rate": 1.0578868071715544e-07,
	"loss": 0.0,
	"reward": -0.35857730358839035,
	"reward_std": 0.1991352178156376,
	"rewards/cosine_scaled_reward": -0.17928865179419518,
	"rewards/format_reward": 0.0,
	"step": 478
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2192.2917098999023,
	"epoch": 0.2737142857142857,
	"grad_norm": 0.025250233709812164,
	"kl": 0.0003457069396972656,
	"learning_rate": 1.0529722834905125e-07,
	"loss": 0.2067,
	"reward": 1.0132533311843872,
	"reward_std": 0.5927854059264064,
	"rewards/cosine_scaled_reward": 0.21495997160673141,
	"rewards/format_reward": 0.5833333469927311,
	"step": 479
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3373.9583740234375,
	"epoch": 0.2742857142857143,
	"grad_norm": 0.023761438205838203,
	"kl": 0.0004482269287109375,
	"learning_rate": 1.0482745016665526e-07,
	"loss": 0.0815,
	"reward": 0.2368101328611374,
	"reward_std": 0.7100772261619568,
	"rewards/cosine_scaled_reward": -0.006594939972274005,
	"rewards/format_reward": 0.2500000111758709,
	"step": 480
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3024.125,
	"epoch": 0.27485714285714286,
	"grad_norm": 0.01686985418200493,
	"kl": 0.0006651878356933594,
	"learning_rate": 1.0437936906629334e-07,
	"loss": 0.0839,
	"reward": -0.0927225798368454,
	"reward_std": 0.6664447784423828,
	"rewards/cosine_scaled_reward": -0.21302797086536884,
	"rewards/format_reward": 0.3333333358168602,
	"step": 481
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2792.125030517578,
	"epoch": 0.2754285714285714,
	"grad_norm": 0.016433361917734146,
	"kl": 0.00051116943359375,
	"learning_rate": 1.0395300688680625e-07,
	"loss": 0.0005,
	"reward": -0.018994301557540894,
	"reward_std": 0.6018916461616755,
	"rewards/cosine_scaled_reward": -0.19699716940522194,
	"rewards/format_reward": 0.375,
	"step": 482
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2972.1250610351562,
	"epoch": 0.276,
	"grad_norm": 0.031187044456601143,
	"kl": 0.0005083084106445312,
	"learning_rate": 1.0354838440848501e-07,
	"loss": 0.215,
	"reward": -0.3035236857831478,
	"reward_std": 0.6242531836032867,
	"rewards/cosine_scaled_reward": -0.2975951712578535,
	"rewards/format_reward": 0.291666679084301,
	"step": 483
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2626.541717529297,
	"epoch": 0.2765714285714286,
	"grad_norm": 0.03856087848544121,
	"kl": 0.0005278587341308594,
	"learning_rate": 1.0316552135205837e-07,
	"loss": 0.0923,
	"reward": -0.07003412395715714,
	"reward_std": 0.5723829306662083,
	"rewards/cosine_scaled_reward": -0.26418372616171837,
	"rewards/format_reward": 0.4583333395421505,
	"step": 484
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3163.291748046875,
	"epoch": 0.27714285714285714,
	"grad_norm": 0.023002101108431816,
	"kl": 0.0003609657287597656,
	"learning_rate": 1.0280443637773163e-07,
	"loss": 0.1445,
	"reward": -0.1619274765253067,
	"reward_std": 0.6520496867597103,
	"rewards/cosine_scaled_reward": -0.18513040244579315,
	"rewards/format_reward": 0.2083333395421505,
	"step": 485
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2786.500045776367,
	"epoch": 0.2777142857142857,
	"grad_norm": 0.04367635026574135,
	"kl": 0.00084686279296875,
	"learning_rate": 1.0246514708427701e-07,
	"loss": 0.1118,
	"reward": -0.10202455054968596,
	"reward_std": 0.5869803428649902,
	"rewards/cosine_scaled_reward": -0.19684561155736446,
	"rewards/format_reward": 0.2916666679084301,
	"step": 486
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2069.625045776367,
	"epoch": 0.2782857142857143,
	"grad_norm": 0.01725645735859871,
	"kl": 0.0004436969757080078,
	"learning_rate": 1.0214767000817596e-07,
	"loss": 0.0339,
	"reward": 0.9145368824247271,
	"reward_std": 0.6427567200735211,
	"rewards/cosine_scaled_reward": 0.1656017464119941,
	"rewards/format_reward": 0.5833333432674408,
	"step": 487
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3266.9583740234375,
	"epoch": 0.27885714285714286,
	"grad_norm": 0.020370880141854286,
	"kl": 0.0003273487091064453,
	"learning_rate": 1.0185202062281336e-07,
	"loss": 0.069,
	"reward": 0.08849874883890152,
	"reward_std": 0.9649087898433208,
	"rewards/cosine_scaled_reward": -0.10158396512269974,
	"rewards/format_reward": 0.2916666679084301,
	"step": 488
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2884.7500610351562,
	"epoch": 0.2794285714285714,
	"grad_norm": 0.031119707971811295,
	"kl": 0.0006618499755859375,
	"learning_rate": 1.0157821333772304e-07,
	"loss": 0.1788,
	"reward": -0.1438409616239369,
	"reward_std": 0.752083495259285,
	"rewards/cosine_scaled_reward": -0.2802538275718689,
	"rewards/format_reward": 0.4166666716337204,
	"step": 489
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2443.8333740234375,
	"epoch": 0.28,
	"grad_norm": 0.033809904009103775,
	"kl": 0.0004324913024902344,
	"learning_rate": 1.013262614978859e-07,
	"loss": 0.1043,
	"reward": 0.5690474957227707,
	"reward_std": 0.7256521657109261,
	"rewards/cosine_scaled_reward": 0.034523727372288704,
	"rewards/format_reward": 0.5000000111758709,
	"step": 490
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2658.3750610351562,
	"epoch": 0.2805714285714286,
	"grad_norm": 0.02354743331670761,
	"kl": 0.0004076957702636719,
	"learning_rate": 1.0109617738307911e-07,
	"loss": 0.0933,
	"reward": 0.611915085464716,
	"reward_std": 0.9881070479750633,
	"rewards/cosine_scaled_reward": 0.05595753900706768,
	"rewards/format_reward": 0.5000000037252903,
	"step": 491
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2955.7083740234375,
	"epoch": 0.28114285714285714,
	"grad_norm": 0.017025692388415337,
	"kl": 0.000514984130859375,
	"learning_rate": 1.0088797220727779e-07,
	"loss": 0.1604,
	"reward": -0.22188673401251435,
	"reward_std": 0.46732087805867195,
	"rewards/cosine_scaled_reward": -0.27761003375053406,
	"rewards/format_reward": 0.3333333432674408,
	"step": 492
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2733.125045776367,
	"epoch": 0.2817142857142857,
	"grad_norm": 0.025227824226021767,
	"kl": 0.0007300376892089844,
	"learning_rate": 1.0070165611810855e-07,
	"loss": 0.051,
	"reward": 0.24700819700956345,
	"reward_std": 0.9385927617549896,
	"rewards/cosine_scaled_reward": -0.12649590522050858,
	"rewards/format_reward": 0.5,
	"step": 493
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2702.0416870117188,
	"epoch": 0.2822857142857143,
	"grad_norm": 0.024555031210184097,
	"kl": 0.0006690025329589844,
	"learning_rate": 1.005372381963547e-07,
	"loss": 0.071,
	"reward": 0.8110056445002556,
	"reward_std": 1.0802773237228394,
	"rewards/cosine_scaled_reward": 0.15550284087657928,
	"rewards/format_reward": 0.5000000223517418,
	"step": 494
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3584.0,
	"epoch": 0.28285714285714286,
	"grad_norm": 0.017543919384479523,
	"kl": 0.00031828880310058594,
	"learning_rate": 1.0039472645551372e-07,
	"loss": 0.0,
	"reward": -0.29333774745464325,
	"reward_std": 0.505841463804245,
	"rewards/cosine_scaled_reward": -0.18833555281162262,
	"rewards/format_reward": 0.0833333358168602,
	"step": 495
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1680.4167175292969,
	"epoch": 0.2834285714285714,
	"grad_norm": 0.0482969656586647,
	"kl": 0.0008435249328613281,
	"learning_rate": 1.002741278414069e-07,
	"loss": 0.1698,
	"reward": 0.8347217477858067,
	"reward_std": 0.6243661791086197,
	"rewards/cosine_scaled_reward": -0.06180577352643013,
	"rewards/format_reward": 0.9583333432674408,
	"step": 496
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 3413.5000610351562,
	"epoch": 0.284,
	"grad_norm": 0.010940664447844028,
	"kl": 0.000286102294921875,
	"learning_rate": 1.0017544823184055e-07,
	"loss": 0.0685,
	"reward": -0.3512016786262393,
	"reward_std": 0.293385605327785,
	"rewards/cosine_scaled_reward": -0.23810084303840995,
	"rewards/format_reward": 0.1250000037252903,
	"step": 497
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2852.9166717529297,
	"epoch": 0.2845714285714286,
	"grad_norm": 0.031826820224523544,
	"kl": 0.0005040168762207031,
	"learning_rate": 1.0009869243631952e-07,
	"loss": 0.0531,
	"reward": -0.09286746755242348,
	"reward_std": 0.689836498349905,
	"rewards/cosine_scaled_reward": -0.19226707890629768,
	"rewards/format_reward": 0.2916666679084301,
	"step": 498
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2774.666748046875,
	"epoch": 0.28514285714285714,
	"grad_norm": 0.015145834535360336,
	"kl": 0.0004925727844238281,
	"learning_rate": 1.000438641958131e-07,
	"loss": 0.047,
	"reward": 1.141416186466813,
	"reward_std": 1.473642259836197,
	"rewards/cosine_scaled_reward": 0.2582080829888582,
	"rewards/format_reward": 0.6250000111758709,
	"step": 499
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 2512.625030517578,
	"epoch": 0.2857142857142857,
	"grad_norm": 0.045064449310302734,
	"kl": 0.0005974769592285156,
	"learning_rate": 1.0001096618257236e-07,
	"loss": -0.0824,
	"reward": -0.023780837655067444,
	"reward_std": 0.46510184183716774,
	"rewards/cosine_scaled_reward": -0.2827237620949745,
	"rewards/format_reward": 0.5416666679084301,
	"step": 500
	},
	{
	"epoch": 0.2857142857142857,
	"step": 500,
	"total_flos": 0.0,
	"train_loss": 0.07169770698851426,
	"train_runtime": 25124.3049,
	"train_samples_per_second": 0.478,
	"train_steps_per_second": 0.02
	}
	],
	"logging_steps": 1,
	"max_steps": 500,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 1,
	"save_steps": 50,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 6,
	"trial_name": null,
	"trial_params": null
	}