|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2857142857142857, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3068.5000610351562, |
|
"epoch": 0.0005714285714285715, |
|
"grad_norm": 0.013173151761293411, |
|
"kl": 0.0005006790161132812, |
|
"learning_rate": 0.0, |
|
"loss": -0.0242, |
|
"reward": 0.200983926653862, |
|
"reward_std": 0.24425111338496208, |
|
"rewards/cosine_scaled_reward": -0.0453413650393486, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2930.9583740234375, |
|
"epoch": 0.001142857142857143, |
|
"grad_norm": 0.04316634312272072, |
|
"kl": 0.0003731250762939453, |
|
"learning_rate": 2e-08, |
|
"loss": 0.2092, |
|
"reward": -0.28063105791807175, |
|
"reward_std": 0.29903180059045553, |
|
"rewards/cosine_scaled_reward": -0.28614887595176697, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.2500915527344, |
|
"epoch": 0.0017142857142857142, |
|
"grad_norm": 0.03820963203907013, |
|
"kl": 0.00048732757568359375, |
|
"learning_rate": 4e-08, |
|
"loss": 0.1661, |
|
"reward": 0.3625979460775852, |
|
"reward_std": 0.7691465243697166, |
|
"rewards/cosine_scaled_reward": -0.11036771535873413, |
|
"rewards/format_reward": 0.5833333544433117, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2455.5000610351562, |
|
"epoch": 0.002285714285714286, |
|
"grad_norm": 0.01873675175011158, |
|
"kl": 0.00042629241943359375, |
|
"learning_rate": 6e-08, |
|
"loss": 0.0368, |
|
"reward": 0.42465633153915405, |
|
"reward_std": 0.6839377954602242, |
|
"rewards/cosine_scaled_reward": -0.12100516259670258, |
|
"rewards/format_reward": 0.6666666679084301, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2971.2916870117188, |
|
"epoch": 0.002857142857142857, |
|
"grad_norm": 0.01593288779258728, |
|
"kl": 0.0005702972412109375, |
|
"learning_rate": 8e-08, |
|
"loss": 0.0526, |
|
"reward": -0.45133184641599655, |
|
"reward_std": 0.1987809967249632, |
|
"rewards/cosine_scaled_reward": -0.3506659045815468, |
|
"rewards/format_reward": 0.25, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2659.4583740234375, |
|
"epoch": 0.0034285714285714284, |
|
"grad_norm": 0.016305305063724518, |
|
"kl": 0.0003809928894042969, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0749, |
|
"reward": -0.20897246897220612, |
|
"reward_std": 0.22619805298745632, |
|
"rewards/cosine_scaled_reward": -0.27115290239453316, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3150.0000610351562, |
|
"epoch": 0.004, |
|
"grad_norm": 0.01786160096526146, |
|
"kl": 0.00041294097900390625, |
|
"learning_rate": 1.2e-07, |
|
"loss": 0.1274, |
|
"reward": 0.03739733062684536, |
|
"reward_std": 0.6221467964351177, |
|
"rewards/cosine_scaled_reward": -0.12713466212153435, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2930.75, |
|
"epoch": 0.004571428571428572, |
|
"grad_norm": 0.013749959878623486, |
|
"kl": 0.0005197525024414062, |
|
"learning_rate": 1.4e-07, |
|
"loss": -0.0105, |
|
"reward": -0.5632898807525635, |
|
"reward_std": 0.14319632947444916, |
|
"rewards/cosine_scaled_reward": -0.40664494782686234, |
|
"rewards/format_reward": 0.25, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2527.7083435058594, |
|
"epoch": 0.005142857142857143, |
|
"grad_norm": 0.01508411392569542, |
|
"kl": 0.0005040168762207031, |
|
"learning_rate": 1.6e-07, |
|
"loss": -0.0156, |
|
"reward": 0.4341657906770706, |
|
"reward_std": 0.46268418058753014, |
|
"rewards/cosine_scaled_reward": -0.03291710093617439, |
|
"rewards/format_reward": 0.5, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2895.4584350585938, |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 0.024773668497800827, |
|
"kl": 0.00039124488830566406, |
|
"learning_rate": 1.8e-07, |
|
"loss": 0.1541, |
|
"reward": 0.5330872263293713, |
|
"reward_std": 1.0390121936798096, |
|
"rewards/cosine_scaled_reward": 0.016543611884117126, |
|
"rewards/format_reward": 0.5000000186264515, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2230.2916870117188, |
|
"epoch": 0.006285714285714286, |
|
"grad_norm": 0.02818623185157776, |
|
"kl": 0.0006093978881835938, |
|
"learning_rate": 2e-07, |
|
"loss": 0.1588, |
|
"reward": -0.14730040915310383, |
|
"reward_std": 0.23369846679270267, |
|
"rewards/cosine_scaled_reward": -0.34448356181383133, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2591.7083740234375, |
|
"epoch": 0.006857142857142857, |
|
"grad_norm": 0.02190236933529377, |
|
"kl": 0.0006151199340820312, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 0.0329, |
|
"reward": 0.48731680028140545, |
|
"reward_std": 0.8824571967124939, |
|
"rewards/cosine_scaled_reward": -0.0688415989279747, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2778.3334197998047, |
|
"epoch": 0.0074285714285714285, |
|
"grad_norm": 0.015030866488814354, |
|
"kl": 0.0003528594970703125, |
|
"learning_rate": 2.4e-07, |
|
"loss": 0.1056, |
|
"reward": 0.6901360005140305, |
|
"reward_std": 0.84443748742342, |
|
"rewards/cosine_scaled_reward": 0.07423467561602592, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2315.8333587646484, |
|
"epoch": 0.008, |
|
"grad_norm": 0.013019426725804806, |
|
"kl": 0.0004930496215820312, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.001, |
|
"reward": 1.1444866992533207, |
|
"reward_std": 0.720286563038826, |
|
"rewards/cosine_scaled_reward": 0.2180766798555851, |
|
"rewards/format_reward": 0.7083333395421505, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2382.000030517578, |
|
"epoch": 0.008571428571428572, |
|
"grad_norm": 0.02887255884706974, |
|
"kl": 0.00041961669921875, |
|
"learning_rate": 2.8e-07, |
|
"loss": -0.0308, |
|
"reward": 0.2332791006192565, |
|
"reward_std": 0.48609885200858116, |
|
"rewards/cosine_scaled_reward": -0.21669380273669958, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3069.4583740234375, |
|
"epoch": 0.009142857142857144, |
|
"grad_norm": 0.018421335145831108, |
|
"kl": 0.000476837158203125, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0859, |
|
"reward": 0.2359318658709526, |
|
"reward_std": 0.6105321571230888, |
|
"rewards/cosine_scaled_reward": -0.0695340558886528, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2575.5833435058594, |
|
"epoch": 0.009714285714285713, |
|
"grad_norm": 0.03597598895430565, |
|
"kl": 0.0007686614990234375, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.2179, |
|
"reward": -0.09196203667670488, |
|
"reward_std": 0.5380833484232426, |
|
"rewards/cosine_scaled_reward": -0.2334810234606266, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3274.9583740234375, |
|
"epoch": 0.010285714285714285, |
|
"grad_norm": 0.012147138826549053, |
|
"kl": 0.0004515647888183594, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": -0.0625, |
|
"reward": -0.1761876866221428, |
|
"reward_std": 0.6809441670775414, |
|
"rewards/cosine_scaled_reward": -0.2339271828532219, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.2083435058594, |
|
"epoch": 0.010857142857142857, |
|
"grad_norm": 0.026394149288535118, |
|
"kl": 0.0004787445068359375, |
|
"learning_rate": 3.6e-07, |
|
"loss": -0.0461, |
|
"reward": 0.3346722051501274, |
|
"reward_std": 0.3912115804851055, |
|
"rewards/cosine_scaled_reward": -0.02016391232609749, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1633.5833435058594, |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 0.01967485062777996, |
|
"kl": 0.0003612041473388672, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 0.0766, |
|
"reward": 0.638333223760128, |
|
"reward_std": 0.8739089630544186, |
|
"rewards/cosine_scaled_reward": -0.0975000774487853, |
|
"rewards/format_reward": 0.8333333358168602, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1705.2916717529297, |
|
"epoch": 0.012, |
|
"grad_norm": 0.052289288491010666, |
|
"kl": 0.0007824897766113281, |
|
"learning_rate": 4e-07, |
|
"loss": 0.4594, |
|
"reward": 0.13058341294527054, |
|
"reward_std": 0.39700106158852577, |
|
"rewards/cosine_scaled_reward": -0.2680416405200958, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3156.9166870117188, |
|
"epoch": 0.012571428571428572, |
|
"grad_norm": 0.016474798321723938, |
|
"kl": 0.00043773651123046875, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0935, |
|
"reward": 1.028728973120451, |
|
"reward_std": 1.6666590571403503, |
|
"rewards/cosine_scaled_reward": 0.26436448842287064, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2439.041748046875, |
|
"epoch": 0.013142857142857144, |
|
"grad_norm": 0.019004346802830696, |
|
"kl": 0.0005197525024414062, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 0.0789, |
|
"reward": 0.5590320900082588, |
|
"reward_std": 1.0339802950620651, |
|
"rewards/cosine_scaled_reward": -0.03298397921025753, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2758.0833740234375, |
|
"epoch": 0.013714285714285714, |
|
"grad_norm": 0.017480166628956795, |
|
"kl": 0.0005626678466796875, |
|
"learning_rate": 4.6e-07, |
|
"loss": -0.1518, |
|
"reward": 0.3504378944635391, |
|
"reward_std": 0.513374675065279, |
|
"rewards/cosine_scaled_reward": -0.09561440348625183, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1633.2083587646484, |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.03595641627907753, |
|
"kl": 0.00029969215393066406, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0377, |
|
"reward": 0.6530582755804062, |
|
"reward_std": 0.629613857716322, |
|
"rewards/cosine_scaled_reward": -0.04847088688984513, |
|
"rewards/format_reward": 0.75, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2597.0416870117188, |
|
"epoch": 0.014857142857142857, |
|
"grad_norm": 0.01941561885178089, |
|
"kl": 0.0005736351013183594, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0753, |
|
"reward": -0.07398717105388641, |
|
"reward_std": 0.7024243678897619, |
|
"rewards/cosine_scaled_reward": -0.2453269399702549, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3365.5833740234375, |
|
"epoch": 0.015428571428571429, |
|
"grad_norm": 0.015174021013081074, |
|
"kl": 0.00045490264892578125, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.1186, |
|
"reward": -0.35242245346307755, |
|
"reward_std": 0.28800780698657036, |
|
"rewards/cosine_scaled_reward": -0.21787790581583977, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2502.5833740234375, |
|
"epoch": 0.016, |
|
"grad_norm": 0.026485657319426537, |
|
"kl": 0.0006246566772460938, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0892, |
|
"reward": 0.14434174199413974, |
|
"reward_std": 0.6618244834244251, |
|
"rewards/cosine_scaled_reward": -0.17782913893461227, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2873.0, |
|
"epoch": 0.01657142857142857, |
|
"grad_norm": 0.015148441307246685, |
|
"kl": 0.0004634857177734375, |
|
"learning_rate": 5.6e-07, |
|
"loss": -0.009, |
|
"reward": -0.17739348113536835, |
|
"reward_std": 0.48768409341573715, |
|
"rewards/cosine_scaled_reward": -0.21369674568995833, |
|
"rewards/format_reward": 0.25, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.2083740234375, |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 0.011238854378461838, |
|
"kl": 0.00047588348388671875, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0107, |
|
"reward": -0.07929126173257828, |
|
"reward_std": 0.8475685454905033, |
|
"rewards/cosine_scaled_reward": -0.1021456066519022, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2528.625030517578, |
|
"epoch": 0.017714285714285714, |
|
"grad_norm": 0.06202416494488716, |
|
"kl": 0.0005381107330322266, |
|
"learning_rate": 6e-07, |
|
"loss": 0.2637, |
|
"reward": -0.021039772778749466, |
|
"reward_std": 0.860994964838028, |
|
"rewards/cosine_scaled_reward": -0.21885321522131562, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2219.9166870117188, |
|
"epoch": 0.018285714285714287, |
|
"grad_norm": 0.034875061362981796, |
|
"kl": 0.0004260540008544922, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.132, |
|
"reward": 0.3424109169282019, |
|
"reward_std": 0.8896235823631287, |
|
"rewards/cosine_scaled_reward": -0.09962787851691246, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3474.4583740234375, |
|
"epoch": 0.018857142857142857, |
|
"grad_norm": 0.016711147502064705, |
|
"kl": 0.0005655288696289062, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0639, |
|
"reward": -0.47737591713666916, |
|
"reward_std": 0.1697257850319147, |
|
"rewards/cosine_scaled_reward": -0.2595212906599045, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3159.6666870117188, |
|
"epoch": 0.019428571428571427, |
|
"grad_norm": 0.014976400882005692, |
|
"kl": 0.0004849433898925781, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0609, |
|
"reward": 0.015222817659378052, |
|
"reward_std": 0.701315013691783, |
|
"rewards/cosine_scaled_reward": -0.17988859117031097, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3254.7916870117188, |
|
"epoch": 0.02, |
|
"grad_norm": 0.04339271038770676, |
|
"kl": 0.0004878044128417969, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.1246, |
|
"reward": -0.6728095263242722, |
|
"reward_std": 0.290258064866066, |
|
"rewards/cosine_scaled_reward": -0.41973811388015747, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.500030517578, |
|
"epoch": 0.02057142857142857, |
|
"grad_norm": 0.031043976545333862, |
|
"kl": 0.0005769729614257812, |
|
"learning_rate": 7e-07, |
|
"loss": 0.1315, |
|
"reward": 0.5186025649309158, |
|
"reward_std": 0.7595919780433178, |
|
"rewards/cosine_scaled_reward": -0.011532071977853775, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3505.2916870117188, |
|
"epoch": 0.021142857142857144, |
|
"grad_norm": 0.011476296000182629, |
|
"kl": 0.00041484832763671875, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0309, |
|
"reward": 0.1381697803735733, |
|
"reward_std": 0.9006250277161598, |
|
"rewards/cosine_scaled_reward": -0.014248451218008995, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3085.5001220703125, |
|
"epoch": 0.021714285714285714, |
|
"grad_norm": 0.017438944429159164, |
|
"kl": 0.000644683837890625, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0651, |
|
"reward": 0.5748194381594658, |
|
"reward_std": 1.0931934267282486, |
|
"rewards/cosine_scaled_reward": 0.016576368361711502, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2326.2084045410156, |
|
"epoch": 0.022285714285714287, |
|
"grad_norm": 0.01982088014483452, |
|
"kl": 0.0006146430969238281, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0144, |
|
"reward": 0.7864310666918755, |
|
"reward_std": 0.7996397092938423, |
|
"rewards/cosine_scaled_reward": 0.0807155417278409, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3484.5000610351562, |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 0.01842389442026615, |
|
"kl": 0.0004572868347167969, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0465, |
|
"reward": -0.408206457272172, |
|
"reward_std": 0.35211328975856304, |
|
"rewards/cosine_scaled_reward": -0.2874365597963333, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.875, |
|
"epoch": 0.023428571428571427, |
|
"grad_norm": 0.016613401472568512, |
|
"kl": 0.00041675567626953125, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0661, |
|
"reward": 0.06390659511089325, |
|
"reward_std": 0.4249320328235626, |
|
"rewards/cosine_scaled_reward": -0.11388003081083298, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2428.9583435058594, |
|
"epoch": 0.024, |
|
"grad_norm": 0.021880364045500755, |
|
"kl": 0.00039958953857421875, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": -0.1435, |
|
"reward": 0.11024168506264687, |
|
"reward_std": 0.32544056698679924, |
|
"rewards/cosine_scaled_reward": -0.19487916305661201, |
|
"rewards/format_reward": 0.5, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3448.7083740234375, |
|
"epoch": 0.02457142857142857, |
|
"grad_norm": 0.016641981899738312, |
|
"kl": 0.0004696846008300781, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.0654, |
|
"reward": -0.2966616526246071, |
|
"reward_std": 0.5851836632937193, |
|
"rewards/cosine_scaled_reward": -0.2108308244496584, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3186.4584350585938, |
|
"epoch": 0.025142857142857144, |
|
"grad_norm": 0.025406604632735252, |
|
"kl": 0.0004544258117675781, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0373, |
|
"reward": 0.24428799748420715, |
|
"reward_std": 0.6496499851346016, |
|
"rewards/cosine_scaled_reward": -0.08618932589888573, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3035.8750610351562, |
|
"epoch": 0.025714285714285714, |
|
"grad_norm": 0.022738995030522346, |
|
"kl": 0.00051116943359375, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.1805, |
|
"reward": -0.13624755293130875, |
|
"reward_std": 0.8101175278425217, |
|
"rewards/cosine_scaled_reward": -0.21395711041986942, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3030.5416870117188, |
|
"epoch": 0.026285714285714287, |
|
"grad_norm": 0.015439880080521107, |
|
"kl": 0.0004506111145019531, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0927, |
|
"reward": 0.9518533200025558, |
|
"reward_std": 0.8967212848365307, |
|
"rewards/cosine_scaled_reward": 0.20509332790970802, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2392.0833587646484, |
|
"epoch": 0.026857142857142857, |
|
"grad_norm": 0.016110830008983612, |
|
"kl": 0.00034999847412109375, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0866, |
|
"reward": 0.8118329793214798, |
|
"reward_std": 0.6570356953889132, |
|
"rewards/cosine_scaled_reward": 0.11424979940056801, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2063.5833435058594, |
|
"epoch": 0.027428571428571427, |
|
"grad_norm": 0.02758549526333809, |
|
"kl": 0.0005931854248046875, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.09, |
|
"reward": 0.6152995973825455, |
|
"reward_std": 0.7233676761388779, |
|
"rewards/cosine_scaled_reward": -0.025683537125587463, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2459.9583740234375, |
|
"epoch": 0.028, |
|
"grad_norm": 0.02223382703959942, |
|
"kl": 0.0005383491516113281, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0228, |
|
"reward": 0.4767443835735321, |
|
"reward_std": 0.6502058878540993, |
|
"rewards/cosine_scaled_reward": 0.009205527603626251, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3291.4166870117188, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.01337195374071598, |
|
"kl": 0.0004801750183105469, |
|
"learning_rate": 9.8e-07, |
|
"loss": -0.0675, |
|
"reward": 0.18172279000282288, |
|
"reward_std": 0.23446273803710938, |
|
"rewards/cosine_scaled_reward": -0.034138597548007965, |
|
"rewards/format_reward": 0.25, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2837.3333435058594, |
|
"epoch": 0.029142857142857144, |
|
"grad_norm": 0.0264517143368721, |
|
"kl": 0.00040531158447265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0691, |
|
"reward": 0.0855257548391819, |
|
"reward_std": 0.5013507194817066, |
|
"rewards/cosine_scaled_reward": -0.10307044349610806, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3550.125, |
|
"epoch": 0.029714285714285714, |
|
"grad_norm": 0.01194742787629366, |
|
"kl": 0.0003275871276855469, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.0148, |
|
"reward": -0.3245684579014778, |
|
"reward_std": 0.8235821425914764, |
|
"rewards/cosine_scaled_reward": -0.22478425258304924, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2964.2083435058594, |
|
"epoch": 0.030285714285714287, |
|
"grad_norm": 0.01602712646126747, |
|
"kl": 0.00046443939208984375, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": -0.0535, |
|
"reward": 0.2959368694573641, |
|
"reward_std": 0.18181271478533745, |
|
"rewards/cosine_scaled_reward": 0.022968419827520847, |
|
"rewards/format_reward": 0.25, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3124.0, |
|
"epoch": 0.030857142857142857, |
|
"grad_norm": 0.017948586493730545, |
|
"kl": 0.0005545616149902344, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": 0.0975, |
|
"reward": -0.4570858801016584, |
|
"reward_std": 0.25594667345285416, |
|
"rewards/cosine_scaled_reward": -0.3327096067368984, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3167.3750610351562, |
|
"epoch": 0.03142857142857143, |
|
"grad_norm": 0.014247610233724117, |
|
"kl": 0.00048542022705078125, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": -0.0067, |
|
"reward": 0.13806065171957016, |
|
"reward_std": 0.8922518789768219, |
|
"rewards/cosine_scaled_reward": -0.07680301181972027, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3063.3750610351562, |
|
"epoch": 0.032, |
|
"grad_norm": 0.01854483038187027, |
|
"kl": 0.0007352828979492188, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.16, |
|
"reward": -0.01539595052599907, |
|
"reward_std": 0.6964320801198483, |
|
"rewards/cosine_scaled_reward": -0.15353131107985973, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2957.666717529297, |
|
"epoch": 0.03257142857142857, |
|
"grad_norm": 0.034658752381801605, |
|
"kl": 0.00042319297790527344, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.1077, |
|
"reward": -0.11001442139968276, |
|
"reward_std": 0.6499116308987141, |
|
"rewards/cosine_scaled_reward": -0.2216738946735859, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3472.9583740234375, |
|
"epoch": 0.03314285714285714, |
|
"grad_norm": 0.011575725860893726, |
|
"kl": 0.0004363059997558594, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0248, |
|
"reward": -0.1939047873020172, |
|
"reward_std": 0.8678329139947891, |
|
"rewards/cosine_scaled_reward": -0.1802857331931591, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2450.541748046875, |
|
"epoch": 0.03371428571428572, |
|
"grad_norm": 0.02538195252418518, |
|
"kl": 0.0005178451538085938, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.2097, |
|
"reward": 0.2084958329796791, |
|
"reward_std": 0.7872938960790634, |
|
"rewards/cosine_scaled_reward": -0.18741872906684875, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3508.0833740234375, |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 0.018474752083420753, |
|
"kl": 0.00039386749267578125, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0386, |
|
"reward": -0.1972892191261053, |
|
"reward_std": 0.7005422301590443, |
|
"rewards/cosine_scaled_reward": -0.18197794491425157, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2241.916702270508, |
|
"epoch": 0.03485714285714286, |
|
"grad_norm": 0.027353286743164062, |
|
"kl": 0.0003466606140136719, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.2232, |
|
"reward": 0.8048897292464972, |
|
"reward_std": 1.2643243670463562, |
|
"rewards/cosine_scaled_reward": 0.0899448562413454, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2604.1666870117188, |
|
"epoch": 0.03542857142857143, |
|
"grad_norm": 0.022513169795274734, |
|
"kl": 0.0006260871887207031, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": -0.0144, |
|
"reward": -0.23419425124302506, |
|
"reward_std": 0.3903382420539856, |
|
"rewards/cosine_scaled_reward": -0.3254304677248001, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3552.2500610351562, |
|
"epoch": 0.036, |
|
"grad_norm": 0.01291267666965723, |
|
"kl": 0.0005128383636474609, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": 0.0182, |
|
"reward": -0.18873221427202225, |
|
"reward_std": 0.5512382872402668, |
|
"rewards/cosine_scaled_reward": -0.1360327743459493, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3535.8333740234375, |
|
"epoch": 0.036571428571428574, |
|
"grad_norm": 0.011109953746199608, |
|
"kl": 0.00037360191345214844, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.01, |
|
"reward": -0.44818597845733166, |
|
"reward_std": 0.25114433094859123, |
|
"rewards/cosine_scaled_reward": -0.2865929929539561, |
|
"rewards/format_reward": 0.125, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3338.5416870117188, |
|
"epoch": 0.037142857142857144, |
|
"grad_norm": 0.018163377419114113, |
|
"kl": 0.0006074905395507812, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0984, |
|
"reward": -0.24769322806969285, |
|
"reward_std": 0.49852345883846283, |
|
"rewards/cosine_scaled_reward": -0.20717995800077915, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2866.3334045410156, |
|
"epoch": 0.037714285714285714, |
|
"grad_norm": 0.03914084658026695, |
|
"kl": 0.0003857612609863281, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.1522, |
|
"reward": 0.5251417439430952, |
|
"reward_std": 1.087289422750473, |
|
"rewards/cosine_scaled_reward": -0.00826246291399002, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2491.2917098999023, |
|
"epoch": 0.038285714285714284, |
|
"grad_norm": 0.03254377841949463, |
|
"kl": 0.0004949569702148438, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0968, |
|
"reward": 0.676957952324301, |
|
"reward_std": 0.9317026361823082, |
|
"rewards/cosine_scaled_reward": 0.046812308952212334, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3545.0416870117188, |
|
"epoch": 0.038857142857142854, |
|
"grad_norm": 0.010356022976338863, |
|
"kl": 0.0002932548522949219, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0228, |
|
"reward": -0.4004784324206412, |
|
"reward_std": 0.4494715537875891, |
|
"rewards/cosine_scaled_reward": -0.22107255086302757, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2902.666748046875, |
|
"epoch": 0.03942857142857143, |
|
"grad_norm": 0.022468766197562218, |
|
"kl": 0.0005145072937011719, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": 0.039, |
|
"reward": -0.08480488415807486, |
|
"reward_std": 0.5629407912492752, |
|
"rewards/cosine_scaled_reward": -0.292402446269989, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2750.3334045410156, |
|
"epoch": 0.04, |
|
"grad_norm": 0.04673980176448822, |
|
"kl": 0.0007410049438476562, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": 0.2347, |
|
"reward": 0.22281695902347565, |
|
"reward_std": 0.6946056261658669, |
|
"rewards/cosine_scaled_reward": -0.07609154284000397, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2139.7083892822266, |
|
"epoch": 0.04057142857142857, |
|
"grad_norm": 0.022997990250587463, |
|
"kl": 0.00041031837463378906, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0503, |
|
"reward": 1.0238905474543571, |
|
"reward_std": 0.3659038320183754, |
|
"rewards/cosine_scaled_reward": 0.19944527000188828, |
|
"rewards/format_reward": 0.625, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3038.5833435058594, |
|
"epoch": 0.04114285714285714, |
|
"grad_norm": 0.015527274459600449, |
|
"kl": 0.0004353523254394531, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": -0.0403, |
|
"reward": 0.057796329259872437, |
|
"reward_std": 0.43705910444259644, |
|
"rewards/cosine_scaled_reward": -0.09610185027122498, |
|
"rewards/format_reward": 0.25, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2542.416717529297, |
|
"epoch": 0.04171428571428572, |
|
"grad_norm": 0.031132198870182037, |
|
"kl": 0.0004696846008300781, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.087, |
|
"reward": 0.4634501487016678, |
|
"reward_std": 0.6224832870066166, |
|
"rewards/cosine_scaled_reward": -0.0807749442756176, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2811.375030517578, |
|
"epoch": 0.04228571428571429, |
|
"grad_norm": 0.017915023490786552, |
|
"kl": 0.0003814697265625, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0416, |
|
"reward": 0.5657302290201187, |
|
"reward_std": 0.4264005981385708, |
|
"rewards/cosine_scaled_reward": 0.03286512568593025, |
|
"rewards/format_reward": 0.5, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2507.0000610351562, |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.04219174385070801, |
|
"kl": 0.000507354736328125, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.1314, |
|
"reward": 0.33457985022687353, |
|
"reward_std": 0.9787873476743698, |
|
"rewards/cosine_scaled_reward": -0.08271008729934692, |
|
"rewards/format_reward": 0.5000000037252903, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3257.2083740234375, |
|
"epoch": 0.04342857142857143, |
|
"grad_norm": 0.016020679846405983, |
|
"kl": 0.0005130767822265625, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0017, |
|
"reward": 0.03311159461736679, |
|
"reward_std": 0.8149497471749783, |
|
"rewards/cosine_scaled_reward": -0.170944195240736, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2980.9584350585938, |
|
"epoch": 0.044, |
|
"grad_norm": 0.01863541640341282, |
|
"kl": 0.0004639625549316406, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0415, |
|
"reward": 0.25933826714754105, |
|
"reward_std": 0.6607147231698036, |
|
"rewards/cosine_scaled_reward": -0.09949754178524017, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2690.5000915527344, |
|
"epoch": 0.044571428571428574, |
|
"grad_norm": 0.044503167271614075, |
|
"kl": 0.0007963180541992188, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": 0.2539, |
|
"reward": 0.40623846650123596, |
|
"reward_std": 0.999249055981636, |
|
"rewards/cosine_scaled_reward": -0.026047438383102417, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2197.500030517578, |
|
"epoch": 0.045142857142857144, |
|
"grad_norm": 0.03637674078345299, |
|
"kl": 0.0004515647888183594, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.245, |
|
"reward": 0.35764368530362844, |
|
"reward_std": 0.7761036828160286, |
|
"rewards/cosine_scaled_reward": -0.13367816805839539, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3167.7500610351562, |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.016760632395744324, |
|
"kl": 0.0005090236663818359, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": 0.0648, |
|
"reward": 0.6552756018936634, |
|
"reward_std": 0.888429120182991, |
|
"rewards/cosine_scaled_reward": 0.11930444650352001, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3178.4583740234375, |
|
"epoch": 0.046285714285714284, |
|
"grad_norm": 0.018774185329675674, |
|
"kl": 0.0004324913024902344, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.096, |
|
"reward": 0.3727112878113985, |
|
"reward_std": 0.6679159682244062, |
|
"rewards/cosine_scaled_reward": 0.019688975531607866, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3135.25, |
|
"epoch": 0.046857142857142854, |
|
"grad_norm": 0.014164636842906475, |
|
"kl": 0.0003552436828613281, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": -0.0475, |
|
"reward": 0.1692640781402588, |
|
"reward_std": 0.2734921835362911, |
|
"rewards/cosine_scaled_reward": -0.0403679758310318, |
|
"rewards/format_reward": 0.25, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1449.3750762939453, |
|
"epoch": 0.04742857142857143, |
|
"grad_norm": 0.027206717059016228, |
|
"kl": 0.0006706714630126953, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": 0.1933, |
|
"reward": 0.888194240629673, |
|
"reward_std": 0.8644632250070572, |
|
"rewards/cosine_scaled_reward": 0.027430432848632336, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2289.875030517578, |
|
"epoch": 0.048, |
|
"grad_norm": 0.018596787005662918, |
|
"kl": 0.0004329681396484375, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": -0.0513, |
|
"reward": 0.2941868454217911, |
|
"reward_std": 0.6860168538987637, |
|
"rewards/cosine_scaled_reward": -0.18623991776257753, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2823.2083435058594, |
|
"epoch": 0.04857142857142857, |
|
"grad_norm": 0.01691795513033867, |
|
"kl": 0.0005006790161132812, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0365, |
|
"reward": -0.15082548558712006, |
|
"reward_std": 0.26336194574832916, |
|
"rewards/cosine_scaled_reward": -0.24207941442728043, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2668.7083740234375, |
|
"epoch": 0.04914285714285714, |
|
"grad_norm": 0.014490959234535694, |
|
"kl": 0.0003361701965332031, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0614, |
|
"reward": 1.0604897737503052, |
|
"reward_std": 1.551704853773117, |
|
"rewards/cosine_scaled_reward": 0.19691153056919575, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2975.3750610351562, |
|
"epoch": 0.04971428571428571, |
|
"grad_norm": 0.01852068305015564, |
|
"kl": 0.0004734992980957031, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": -0.0494, |
|
"reward": 0.4187099374830723, |
|
"reward_std": 0.7260430231690407, |
|
"rewards/cosine_scaled_reward": 0.021854941733181477, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2801.7501220703125, |
|
"epoch": 0.05028571428571429, |
|
"grad_norm": 0.01830633357167244, |
|
"kl": 0.0005750656127929688, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": 0.0263, |
|
"reward": 0.018878452479839325, |
|
"reward_std": 0.6697305515408516, |
|
"rewards/cosine_scaled_reward": -0.2822274398058653, |
|
"rewards/format_reward": 0.5833333395421505, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2697.416717529297, |
|
"epoch": 0.05085714285714286, |
|
"grad_norm": 0.023486582562327385, |
|
"kl": 0.0004010200500488281, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": -0.0038, |
|
"reward": 0.29166828095912933, |
|
"reward_std": 0.7786018922924995, |
|
"rewards/cosine_scaled_reward": -0.08333254605531693, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2538.0833740234375, |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.010524451732635498, |
|
"kl": 0.0002994537353515625, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": 0.0346, |
|
"reward": 0.7873217761516571, |
|
"reward_std": 0.763067714869976, |
|
"rewards/cosine_scaled_reward": 0.08116088062524796, |
|
"rewards/format_reward": 0.625, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3094.8333740234375, |
|
"epoch": 0.052, |
|
"grad_norm": 0.03540240228176117, |
|
"kl": 0.0004677772521972656, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0673, |
|
"reward": 0.1670377204718534, |
|
"reward_std": 0.6299600079655647, |
|
"rewards/cosine_scaled_reward": -0.0831478089094162, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2744.541717529297, |
|
"epoch": 0.052571428571428575, |
|
"grad_norm": 0.03803950175642967, |
|
"kl": 0.0004138946533203125, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.1363, |
|
"reward": 0.2968802750110626, |
|
"reward_std": 0.4654075037688017, |
|
"rewards/cosine_scaled_reward": 0.002606801688671112, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2166.9583435058594, |
|
"epoch": 0.053142857142857144, |
|
"grad_norm": 0.047334711998701096, |
|
"kl": 0.000446319580078125, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0582, |
|
"reward": 0.20845970511436462, |
|
"reward_std": 0.3487181942909956, |
|
"rewards/cosine_scaled_reward": -0.14577015489339828, |
|
"rewards/format_reward": 0.5, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2899.2083740234375, |
|
"epoch": 0.053714285714285714, |
|
"grad_norm": 0.013924806378781796, |
|
"kl": 0.00037860870361328125, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": 0.0477, |
|
"reward": 0.4423799216747284, |
|
"reward_std": 0.8836686909198761, |
|
"rewards/cosine_scaled_reward": -0.0913100466132164, |
|
"rewards/format_reward": 0.6250000260770321, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2209.4583740234375, |
|
"epoch": 0.054285714285714284, |
|
"grad_norm": 0.0459674671292305, |
|
"kl": 0.0005049705505371094, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.2862, |
|
"reward": 0.9317682832479477, |
|
"reward_std": 0.7728890106081963, |
|
"rewards/cosine_scaled_reward": 0.07005079090595245, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3184.416748046875, |
|
"epoch": 0.054857142857142854, |
|
"grad_norm": 0.017193680629134178, |
|
"kl": 0.00043773651123046875, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0105, |
|
"reward": 0.38489115983247757, |
|
"reward_std": 1.058574389666319, |
|
"rewards/cosine_scaled_reward": -0.015887772024143487, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2542.9583740234375, |
|
"epoch": 0.05542857142857143, |
|
"grad_norm": 0.03288557752966881, |
|
"kl": 0.0006394386291503906, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0173, |
|
"reward": -0.20064683258533478, |
|
"reward_std": 0.43674986250698566, |
|
"rewards/cosine_scaled_reward": -0.32949007861316204, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2007.2500457763672, |
|
"epoch": 0.056, |
|
"grad_norm": 0.020400483161211014, |
|
"kl": 0.0003914833068847656, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0489, |
|
"reward": 1.0779699385166168, |
|
"reward_std": 1.257737785577774, |
|
"rewards/cosine_scaled_reward": 0.18481825292110443, |
|
"rewards/format_reward": 0.7083333358168602, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2389.5833587646484, |
|
"epoch": 0.05657142857142857, |
|
"grad_norm": 0.019844507798552513, |
|
"kl": 0.0003876686096191406, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0556, |
|
"reward": 0.953456562012434, |
|
"reward_std": 0.7421619556844234, |
|
"rewards/cosine_scaled_reward": 0.2058949265629053, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2124.0416717529297, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.030665883794426918, |
|
"kl": 0.0005888938903808594, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": -0.0126, |
|
"reward": 0.47896020486950874, |
|
"reward_std": 0.677450954914093, |
|
"rewards/cosine_scaled_reward": -0.03135322220623493, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3004.041748046875, |
|
"epoch": 0.05771428571428571, |
|
"grad_norm": 0.04486413672566414, |
|
"kl": 0.0004496574401855469, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0735, |
|
"reward": 0.022397130727767944, |
|
"reward_std": 0.7927646785974503, |
|
"rewards/cosine_scaled_reward": -0.19713477417826653, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3206.875, |
|
"epoch": 0.05828571428571429, |
|
"grad_norm": 0.035310786217451096, |
|
"kl": 0.0003910064697265625, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.1437, |
|
"reward": -0.08731867372989655, |
|
"reward_std": 0.9576195627450943, |
|
"rewards/cosine_scaled_reward": -0.16865937039256096, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3014.3750610351562, |
|
"epoch": 0.05885714285714286, |
|
"grad_norm": 0.031477976590394974, |
|
"kl": 0.0004429817199707031, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.1671, |
|
"reward": -0.22813038900494576, |
|
"reward_std": 0.688251368701458, |
|
"rewards/cosine_scaled_reward": -0.301565196365118, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3124.25, |
|
"epoch": 0.05942857142857143, |
|
"grad_norm": 0.02040957659482956, |
|
"kl": 0.0004067420959472656, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": -0.0155, |
|
"reward": -0.01127801463007927, |
|
"reward_std": 0.39767561107873917, |
|
"rewards/cosine_scaled_reward": -0.1306389942765236, |
|
"rewards/format_reward": 0.25, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2859.416717529297, |
|
"epoch": 0.06, |
|
"grad_norm": 0.014855082146823406, |
|
"kl": 0.00042057037353515625, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0882, |
|
"reward": -0.13864438608288765, |
|
"reward_std": 0.40612044744193554, |
|
"rewards/cosine_scaled_reward": -0.23598887026309967, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.0416717529297, |
|
"epoch": 0.060571428571428575, |
|
"grad_norm": 0.01957513391971588, |
|
"kl": 0.00047016143798828125, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": -0.0236, |
|
"reward": 0.05208197236061096, |
|
"reward_std": 0.6449095346033573, |
|
"rewards/cosine_scaled_reward": -0.11979235336184502, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2631.000045776367, |
|
"epoch": 0.061142857142857145, |
|
"grad_norm": 0.018131662160158157, |
|
"kl": 0.0004868507385253906, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0708, |
|
"reward": -0.09214365109801292, |
|
"reward_std": 0.40418105013668537, |
|
"rewards/cosine_scaled_reward": -0.2544051744043827, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2191.2916717529297, |
|
"epoch": 0.061714285714285715, |
|
"grad_norm": 0.03655322641134262, |
|
"kl": 0.0003566741943359375, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.1124, |
|
"reward": 0.02219559997320175, |
|
"reward_std": 0.20816783979535103, |
|
"rewards/cosine_scaled_reward": -0.23890221491456032, |
|
"rewards/format_reward": 0.5, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3294.125, |
|
"epoch": 0.062285714285714285, |
|
"grad_norm": 0.02335178665816784, |
|
"kl": 0.000438690185546875, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.034, |
|
"reward": -0.42653578519821167, |
|
"reward_std": 0.2618470564484596, |
|
"rewards/cosine_scaled_reward": -0.27576790004968643, |
|
"rewards/format_reward": 0.125, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3212.7916870117188, |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 0.013373509049415588, |
|
"kl": 0.0003647804260253906, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": 0.0984, |
|
"reward": -0.35255161160603166, |
|
"reward_std": 0.34573741257190704, |
|
"rewards/cosine_scaled_reward": -0.2804424799978733, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2838.5833435058594, |
|
"epoch": 0.06342857142857143, |
|
"grad_norm": 0.02231280505657196, |
|
"kl": 0.000438690185546875, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0041, |
|
"reward": -0.08670877665281296, |
|
"reward_std": 0.4552724286913872, |
|
"rewards/cosine_scaled_reward": -0.16835440043359995, |
|
"rewards/format_reward": 0.25, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3045.6666870117188, |
|
"epoch": 0.064, |
|
"grad_norm": 0.01650678738951683, |
|
"kl": 0.0003314018249511719, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": -0.0241, |
|
"reward": 0.5308302510529757, |
|
"reward_std": 0.7812503390014172, |
|
"rewards/cosine_scaled_reward": 0.07791512738913298, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2850.4166870117188, |
|
"epoch": 0.06457142857142857, |
|
"grad_norm": 0.029017914086580276, |
|
"kl": 0.00040984153747558594, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": 0.0898, |
|
"reward": 0.07424558838829398, |
|
"reward_std": 0.701502051204443, |
|
"rewards/cosine_scaled_reward": -0.17121053859591484, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3156.1666870117188, |
|
"epoch": 0.06514285714285714, |
|
"grad_norm": 0.013486391864717007, |
|
"kl": 0.00036525726318359375, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": -0.0383, |
|
"reward": -0.05461219698190689, |
|
"reward_std": 0.5745192095637321, |
|
"rewards/cosine_scaled_reward": -0.1939727613935247, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2846.9583740234375, |
|
"epoch": 0.06571428571428571, |
|
"grad_norm": 0.01851780340075493, |
|
"kl": 0.0004734992980957031, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": 0.1023, |
|
"reward": -0.03960709646344185, |
|
"reward_std": 0.7411026880145073, |
|
"rewards/cosine_scaled_reward": -0.18647022545337677, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3016.7083435058594, |
|
"epoch": 0.06628571428571428, |
|
"grad_norm": 0.012387678027153015, |
|
"kl": 0.0004115104675292969, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0442, |
|
"reward": -0.4454263895750046, |
|
"reward_std": 0.34630244970321655, |
|
"rewards/cosine_scaled_reward": -0.3477131985127926, |
|
"rewards/format_reward": 0.25, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3214.125, |
|
"epoch": 0.06685714285714285, |
|
"grad_norm": 0.010518810711801052, |
|
"kl": 0.0004563331604003906, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": -0.0368, |
|
"reward": 0.24042409658432007, |
|
"reward_std": 0.21273156255483627, |
|
"rewards/cosine_scaled_reward": -0.004787934944033623, |
|
"rewards/format_reward": 0.25, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2701.125045776367, |
|
"epoch": 0.06742857142857143, |
|
"grad_norm": 0.020669570192694664, |
|
"kl": 0.00047397613525390625, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0176, |
|
"reward": 0.029145129024982452, |
|
"reward_std": 0.4890642985701561, |
|
"rewards/cosine_scaled_reward": -0.15209410339593887, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2052.625030517578, |
|
"epoch": 0.068, |
|
"grad_norm": 0.04713466763496399, |
|
"kl": 0.0004968643188476562, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.2029, |
|
"reward": 0.8692835792899132, |
|
"reward_std": 0.5231802985072136, |
|
"rewards/cosine_scaled_reward": 0.1429751217365265, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.3333740234375, |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.011495165526866913, |
|
"kl": 0.0003578662872314453, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0078, |
|
"reward": -0.3606860190629959, |
|
"reward_std": 0.5378699135035276, |
|
"rewards/cosine_scaled_reward": -0.24284303188323975, |
|
"rewards/format_reward": 0.125, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3037.0, |
|
"epoch": 0.06914285714285714, |
|
"grad_norm": 0.040560223162174225, |
|
"kl": 0.00043773651123046875, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.1242, |
|
"reward": -0.06026811897754669, |
|
"reward_std": 0.48589139245450497, |
|
"rewards/cosine_scaled_reward": -0.15513405948877335, |
|
"rewards/format_reward": 0.25, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2017.791748046875, |
|
"epoch": 0.06971428571428571, |
|
"grad_norm": 0.021992556750774384, |
|
"kl": 0.0003094673156738281, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.2174, |
|
"reward": 1.1798989064991474, |
|
"reward_std": 0.7208777815103531, |
|
"rewards/cosine_scaled_reward": 0.21494942158460617, |
|
"rewards/format_reward": 0.7500000223517418, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2790.0833740234375, |
|
"epoch": 0.07028571428571428, |
|
"grad_norm": 0.016002673655748367, |
|
"kl": 0.0004038810729980469, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": 0.0023, |
|
"reward": 0.33917365968227386, |
|
"reward_std": 1.0576607063412666, |
|
"rewards/cosine_scaled_reward": -0.08041317760944366, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3338.7500610351562, |
|
"epoch": 0.07085714285714285, |
|
"grad_norm": 0.010752753354609013, |
|
"kl": 0.00029778480529785156, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.0352, |
|
"reward": 0.3968821354210377, |
|
"reward_std": 0.5420339666306973, |
|
"rewards/cosine_scaled_reward": -0.009892286732792854, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2333.291717529297, |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.05020369216799736, |
|
"kl": 0.00043487548828125, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.1914, |
|
"reward": 0.29357251059263945, |
|
"reward_std": 0.5018073245882988, |
|
"rewards/cosine_scaled_reward": -0.18654709309339523, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1793.6666870117188, |
|
"epoch": 0.072, |
|
"grad_norm": 0.026897089555859566, |
|
"kl": 0.0003814697265625, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.1009, |
|
"reward": 0.33217477798461914, |
|
"reward_std": 0.8002122193574905, |
|
"rewards/cosine_scaled_reward": -0.20891261473298073, |
|
"rewards/format_reward": 0.7500000111758709, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2812.0834045410156, |
|
"epoch": 0.07257142857142856, |
|
"grad_norm": 0.013029903173446655, |
|
"kl": 0.0003662109375, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.0056, |
|
"reward": 0.6684755804017186, |
|
"reward_std": 0.9957198947668076, |
|
"rewards/cosine_scaled_reward": 0.10507109388709068, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3381.7500610351562, |
|
"epoch": 0.07314285714285715, |
|
"grad_norm": 0.014719157479703426, |
|
"kl": 0.0004086494445800781, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": 0.0262, |
|
"reward": 0.14912248402833939, |
|
"reward_std": 0.6112966164946556, |
|
"rewards/cosine_scaled_reward": -0.0504387766122818, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2489.6666870117188, |
|
"epoch": 0.07371428571428572, |
|
"grad_norm": 0.02591477520763874, |
|
"kl": 0.0003113746643066406, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": 0.0579, |
|
"reward": 0.2893460839986801, |
|
"reward_std": 0.34850335121154785, |
|
"rewards/cosine_scaled_reward": -0.10532695800065994, |
|
"rewards/format_reward": 0.5, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1840.0000305175781, |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 0.03489629924297333, |
|
"kl": 0.0003867149353027344, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": 0.0713, |
|
"reward": 0.5813919119536877, |
|
"reward_std": 0.6657490562647581, |
|
"rewards/cosine_scaled_reward": -0.06347071845084429, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3027.5000610351562, |
|
"epoch": 0.07485714285714286, |
|
"grad_norm": 0.031031399965286255, |
|
"kl": 0.00043010711669921875, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": -0.0614, |
|
"reward": -0.35745152831077576, |
|
"reward_std": 0.42021336406469345, |
|
"rewards/cosine_scaled_reward": -0.3453924432396889, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2721.041717529297, |
|
"epoch": 0.07542857142857143, |
|
"grad_norm": 0.019737781956791878, |
|
"kl": 0.0003380775451660156, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": -0.1046, |
|
"reward": 0.52672129124403, |
|
"reward_std": 0.818169629201293, |
|
"rewards/cosine_scaled_reward": -0.049139365553855896, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3006.125, |
|
"epoch": 0.076, |
|
"grad_norm": 0.014350071549415588, |
|
"kl": 0.00035190582275390625, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.0195, |
|
"reward": 0.436140738427639, |
|
"reward_std": 0.799125649034977, |
|
"rewards/cosine_scaled_reward": 0.009737027809023857, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3342.0000610351562, |
|
"epoch": 0.07657142857142857, |
|
"grad_norm": 0.013594014570116997, |
|
"kl": 0.0003528594970703125, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0027, |
|
"reward": 0.4128790497779846, |
|
"reward_std": 0.9192672446370125, |
|
"rewards/cosine_scaled_reward": 0.039772857911884785, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2860.0833435058594, |
|
"epoch": 0.07714285714285714, |
|
"grad_norm": 0.020422151312232018, |
|
"kl": 0.000537872314453125, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": -0.0067, |
|
"reward": -0.4891085624694824, |
|
"reward_std": 0.28661480732262135, |
|
"rewards/cosine_scaled_reward": -0.3695542886853218, |
|
"rewards/format_reward": 0.25, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1929.3750610351562, |
|
"epoch": 0.07771428571428571, |
|
"grad_norm": 0.03446084260940552, |
|
"kl": 0.0007333755493164062, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.1816, |
|
"reward": 0.3107494944706559, |
|
"reward_std": 0.7272366061806679, |
|
"rewards/cosine_scaled_reward": -0.15712526440620422, |
|
"rewards/format_reward": 0.6250000037252903, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2655.000030517578, |
|
"epoch": 0.07828571428571429, |
|
"grad_norm": 0.01656595803797245, |
|
"kl": 0.0002579689025878906, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": -0.0522, |
|
"reward": 0.6488993316888809, |
|
"reward_std": 0.77546676248312, |
|
"rewards/cosine_scaled_reward": 0.011949680745601654, |
|
"rewards/format_reward": 0.625, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2360.5416870117188, |
|
"epoch": 0.07885714285714286, |
|
"grad_norm": 0.01801217719912529, |
|
"kl": 0.0003211498260498047, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": -0.0965, |
|
"reward": 0.3774528503417969, |
|
"reward_std": 0.27286792919039726, |
|
"rewards/cosine_scaled_reward": -0.06127360463142395, |
|
"rewards/format_reward": 0.5, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3113.3750610351562, |
|
"epoch": 0.07942857142857143, |
|
"grad_norm": 0.015139062888920307, |
|
"kl": 0.0002903938293457031, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.1314, |
|
"reward": 0.18484138697385788, |
|
"reward_std": 0.8956813514232635, |
|
"rewards/cosine_scaled_reward": -0.11591265327297151, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2925.1666717529297, |
|
"epoch": 0.08, |
|
"grad_norm": 0.011720138601958752, |
|
"kl": 0.0002703666687011719, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0138, |
|
"reward": 0.18188539519906044, |
|
"reward_std": 0.5702639557421207, |
|
"rewards/cosine_scaled_reward": -0.07572397217154503, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2260.7083740234375, |
|
"epoch": 0.08057142857142857, |
|
"grad_norm": 0.07121221721172333, |
|
"kl": 0.0005216598510742188, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.2931, |
|
"reward": 0.15240047996303474, |
|
"reward_std": 0.9472056925296783, |
|
"rewards/cosine_scaled_reward": -0.2154664322733879, |
|
"rewards/format_reward": 0.5833333544433117, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2265.5000610351562, |
|
"epoch": 0.08114285714285714, |
|
"grad_norm": 0.054452214390039444, |
|
"kl": 0.0006041526794433594, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": 0.242, |
|
"reward": 0.7805991023778915, |
|
"reward_std": 0.9608509242534637, |
|
"rewards/cosine_scaled_reward": 0.05696620047092438, |
|
"rewards/format_reward": 0.6666666939854622, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2732.2917098999023, |
|
"epoch": 0.08171428571428571, |
|
"grad_norm": 0.017999855801463127, |
|
"kl": 0.0003368854522705078, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.0514, |
|
"reward": 1.025037132203579, |
|
"reward_std": 0.7192419245839119, |
|
"rewards/cosine_scaled_reward": 0.2625185213983059, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2587.8750915527344, |
|
"epoch": 0.08228571428571428, |
|
"grad_norm": 0.01989753730595112, |
|
"kl": 0.0004143714904785156, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.1676, |
|
"reward": 0.8056844659149647, |
|
"reward_std": 0.7985115312039852, |
|
"rewards/cosine_scaled_reward": 0.0695088729262352, |
|
"rewards/format_reward": 0.6666666679084301, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1967.666748046875, |
|
"epoch": 0.08285714285714285, |
|
"grad_norm": 0.06734281778335571, |
|
"kl": 0.0005283355712890625, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.2562, |
|
"reward": 0.2510095611214638, |
|
"reward_std": 0.6584825366735458, |
|
"rewards/cosine_scaled_reward": -0.2286618910729885, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2411.416717529297, |
|
"epoch": 0.08342857142857144, |
|
"grad_norm": 0.0633506253361702, |
|
"kl": 0.000423431396484375, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.2981, |
|
"reward": 0.670308992266655, |
|
"reward_std": 1.008466713130474, |
|
"rewards/cosine_scaled_reward": 0.0851544663310051, |
|
"rewards/format_reward": 0.5, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2494.041748046875, |
|
"epoch": 0.084, |
|
"grad_norm": 0.035755012184381485, |
|
"kl": 0.00038242340087890625, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": 0.1709, |
|
"reward": 0.34743132442235947, |
|
"reward_std": 0.7957043498754501, |
|
"rewards/cosine_scaled_reward": -0.03461768664419651, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2072.750045776367, |
|
"epoch": 0.08457142857142858, |
|
"grad_norm": 0.026552215218544006, |
|
"kl": 0.0003905296325683594, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": -0.01, |
|
"reward": 0.576688677072525, |
|
"reward_std": 0.5415612012147903, |
|
"rewards/cosine_scaled_reward": -0.08665566146373749, |
|
"rewards/format_reward": 0.7500000111758709, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2761.7084350585938, |
|
"epoch": 0.08514285714285715, |
|
"grad_norm": 0.08269675821065903, |
|
"kl": 0.0005259513854980469, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.2554, |
|
"reward": -0.020979389548301697, |
|
"reward_std": 0.6839594691991806, |
|
"rewards/cosine_scaled_reward": -0.2188230287283659, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2854.875030517578, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.024635691195726395, |
|
"kl": 0.0005636215209960938, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": 0.215, |
|
"reward": -0.06861633434891701, |
|
"reward_std": 0.48412579856812954, |
|
"rewards/cosine_scaled_reward": -0.22180816903710365, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2962.0000915527344, |
|
"epoch": 0.08628571428571429, |
|
"grad_norm": 0.015141277574002743, |
|
"kl": 0.00032520294189453125, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0071, |
|
"reward": 0.32280058413743973, |
|
"reward_std": 0.8727270662784576, |
|
"rewards/cosine_scaled_reward": -0.06776639446616173, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2867.7500610351562, |
|
"epoch": 0.08685714285714285, |
|
"grad_norm": 0.016482684761285782, |
|
"kl": 0.00044155120849609375, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0131, |
|
"reward": -0.14668704383075237, |
|
"reward_std": 0.4973195120692253, |
|
"rewards/cosine_scaled_reward": -0.28167686983942986, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2524.2916717529297, |
|
"epoch": 0.08742857142857142, |
|
"grad_norm": 0.01825847662985325, |
|
"kl": 0.0002818107604980469, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": -0.0077, |
|
"reward": 0.5729860588908195, |
|
"reward_std": 0.6833535395562649, |
|
"rewards/cosine_scaled_reward": -0.005173638463020325, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3422.1250610351562, |
|
"epoch": 0.088, |
|
"grad_norm": 0.019472761079669, |
|
"kl": 0.0004367828369140625, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.097, |
|
"reward": -0.15317635610699654, |
|
"reward_std": 0.7526141926646233, |
|
"rewards/cosine_scaled_reward": -0.15992150828242302, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3454.375, |
|
"epoch": 0.08857142857142856, |
|
"grad_norm": 0.010583397001028061, |
|
"kl": 0.00032520294189453125, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.033, |
|
"reward": -0.4739493057131767, |
|
"reward_std": 0.42491842061281204, |
|
"rewards/cosine_scaled_reward": -0.2786413189023733, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1829.5000610351562, |
|
"epoch": 0.08914285714285715, |
|
"grad_norm": 0.027142049744725227, |
|
"kl": 0.0004248619079589844, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.1219, |
|
"reward": 0.6520796567201614, |
|
"reward_std": 0.9076523296535015, |
|
"rewards/cosine_scaled_reward": -0.06979351304471493, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2761.4166870117188, |
|
"epoch": 0.08971428571428572, |
|
"grad_norm": 0.03456060215830803, |
|
"kl": 0.0031185150146484375, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": 0.0636, |
|
"reward": -0.12644078209996223, |
|
"reward_std": 0.47517139464616776, |
|
"rewards/cosine_scaled_reward": -0.27155373618006706, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2967.3333740234375, |
|
"epoch": 0.09028571428571429, |
|
"grad_norm": 0.018295863643288612, |
|
"kl": 0.00030517578125, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.1293, |
|
"reward": 0.7170794606208801, |
|
"reward_std": 1.2831790447235107, |
|
"rewards/cosine_scaled_reward": 0.02520638657733798, |
|
"rewards/format_reward": 0.6666666939854622, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3502.5833740234375, |
|
"epoch": 0.09085714285714286, |
|
"grad_norm": 0.013578574173152447, |
|
"kl": 0.00040340423583984375, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.032, |
|
"reward": -0.382570318877697, |
|
"reward_std": 0.30739978328347206, |
|
"rewards/cosine_scaled_reward": -0.2537851668894291, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3438.4583740234375, |
|
"epoch": 0.09142857142857143, |
|
"grad_norm": 0.013279566541314125, |
|
"kl": 0.0003542900085449219, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": 0.0503, |
|
"reward": 0.07213277881965041, |
|
"reward_std": 0.7591742426156998, |
|
"rewards/cosine_scaled_reward": -0.08893361687660217, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3217.7083740234375, |
|
"epoch": 0.092, |
|
"grad_norm": 0.020369982346892357, |
|
"kl": 0.00045013427734375, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": -0.0123, |
|
"reward": 0.045274198055267334, |
|
"reward_std": 0.8290613554418087, |
|
"rewards/cosine_scaled_reward": -0.16486290469765663, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.9583740234375, |
|
"epoch": 0.09257142857142857, |
|
"grad_norm": 0.015230800956487656, |
|
"kl": 0.0003082752227783203, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": 0.0195, |
|
"reward": 0.22908502910286188, |
|
"reward_std": 0.6315614283084869, |
|
"rewards/cosine_scaled_reward": -0.093790827319026, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3374.041748046875, |
|
"epoch": 0.09314285714285714, |
|
"grad_norm": 0.023900238797068596, |
|
"kl": 0.0012507438659667969, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.1131, |
|
"reward": -0.4366554766893387, |
|
"reward_std": 0.31389016658067703, |
|
"rewards/cosine_scaled_reward": -0.25999439880251884, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2721.291748046875, |
|
"epoch": 0.09371428571428571, |
|
"grad_norm": 0.012770951725542545, |
|
"kl": 0.00031685829162597656, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": -0.0522, |
|
"reward": 0.07291282713413239, |
|
"reward_std": 0.6898190379142761, |
|
"rewards/cosine_scaled_reward": -0.2552102580666542, |
|
"rewards/format_reward": 0.5833333395421505, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3518.3333740234375, |
|
"epoch": 0.09428571428571429, |
|
"grad_norm": 0.01447351835668087, |
|
"kl": 0.0003514289855957031, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0387, |
|
"reward": -0.5118223652243614, |
|
"reward_std": 0.4116092287003994, |
|
"rewards/cosine_scaled_reward": -0.2767445221543312, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09485714285714286, |
|
"grad_norm": 0.01175595261156559, |
|
"kl": 0.0004019737243652344, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0, |
|
"reward": -0.8086595237255096, |
|
"reward_std": 0.14668290875852108, |
|
"rewards/cosine_scaled_reward": -0.4043297544121742, |
|
"rewards/format_reward": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2145.7916870117188, |
|
"epoch": 0.09542857142857143, |
|
"grad_norm": 0.033467333763837814, |
|
"kl": 0.0003383159637451172, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.1774, |
|
"reward": 1.582944918423891, |
|
"reward_std": 0.9087323695421219, |
|
"rewards/cosine_scaled_reward": 0.3748057931661606, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3014.4583740234375, |
|
"epoch": 0.096, |
|
"grad_norm": 0.015946704894304276, |
|
"kl": 0.00036334991455078125, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": 0.1448, |
|
"reward": -0.07501981779932976, |
|
"reward_std": 0.6305944435298443, |
|
"rewards/cosine_scaled_reward": -0.20417658984661102, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2566.9584197998047, |
|
"epoch": 0.09657142857142857, |
|
"grad_norm": 0.017763182520866394, |
|
"kl": 0.00031757354736328125, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": -0.056, |
|
"reward": 0.9450984001159668, |
|
"reward_std": 1.3784255981445312, |
|
"rewards/cosine_scaled_reward": 0.18088253866881132, |
|
"rewards/format_reward": 0.5833333544433117, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3493.8333740234375, |
|
"epoch": 0.09714285714285714, |
|
"grad_norm": 0.012164680287241936, |
|
"kl": 0.0003337860107421875, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.0137, |
|
"reward": -0.02268831804394722, |
|
"reward_std": 0.5062807034701109, |
|
"rewards/cosine_scaled_reward": -0.07384415343403816, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2499.9583740234375, |
|
"epoch": 0.09771428571428571, |
|
"grad_norm": 0.016273025423288345, |
|
"kl": 0.0003085136413574219, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0939, |
|
"reward": 0.18185213347896934, |
|
"reward_std": 0.5238135792315006, |
|
"rewards/cosine_scaled_reward": -0.1382406111806631, |
|
"rewards/format_reward": 0.4583333544433117, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2753.5833740234375, |
|
"epoch": 0.09828571428571428, |
|
"grad_norm": 0.02729148045182228, |
|
"kl": 0.00043773651123046875, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": 0.2349, |
|
"reward": -0.1761530265212059, |
|
"reward_std": 0.31285534240305424, |
|
"rewards/cosine_scaled_reward": -0.27557652816176414, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2625.7916870117188, |
|
"epoch": 0.09885714285714285, |
|
"grad_norm": 0.020237158983945847, |
|
"kl": 0.0004603862762451172, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.1532, |
|
"reward": 0.3476742703933269, |
|
"reward_std": 1.086041659116745, |
|
"rewards/cosine_scaled_reward": -0.11782953515648842, |
|
"rewards/format_reward": 0.5833333544433117, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3466.7083740234375, |
|
"epoch": 0.09942857142857142, |
|
"grad_norm": 0.013751998543739319, |
|
"kl": 0.000400543212890625, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.0416, |
|
"reward": -0.5668784528970718, |
|
"reward_std": 0.2918172590434551, |
|
"rewards/cosine_scaled_reward": -0.3459392338991165, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2285.4166870117188, |
|
"epoch": 0.1, |
|
"grad_norm": 0.044584471732378006, |
|
"kl": 0.0002956390380859375, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.2817, |
|
"reward": 1.0394483506679535, |
|
"reward_std": 0.9922515600919724, |
|
"rewards/cosine_scaled_reward": 0.20722418278455734, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2696.000030517578, |
|
"epoch": 0.10057142857142858, |
|
"grad_norm": 0.02319377101957798, |
|
"kl": 0.000415802001953125, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0886, |
|
"reward": 0.3795542363077402, |
|
"reward_std": 0.7756792306900024, |
|
"rewards/cosine_scaled_reward": -0.060222890228033066, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2909.4166870117188, |
|
"epoch": 0.10114285714285715, |
|
"grad_norm": 0.03342346474528313, |
|
"kl": 0.0003414154052734375, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": 0.1555, |
|
"reward": 0.2533244490623474, |
|
"reward_std": 0.6474116146564484, |
|
"rewards/cosine_scaled_reward": -0.04000448310398497, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3399.6250610351562, |
|
"epoch": 0.10171428571428572, |
|
"grad_norm": 0.01095277164131403, |
|
"kl": 0.0003046989440917969, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0506, |
|
"reward": -0.01267234981060028, |
|
"reward_std": 0.6906884871423244, |
|
"rewards/cosine_scaled_reward": -0.11050283908843994, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2949.000030517578, |
|
"epoch": 0.10228571428571429, |
|
"grad_norm": 0.012322952039539814, |
|
"kl": 0.0004057884216308594, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.1078, |
|
"reward": -0.04895609989762306, |
|
"reward_std": 0.553108676103875, |
|
"rewards/cosine_scaled_reward": -0.17031139694154263, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2829.7083740234375, |
|
"epoch": 0.10285714285714286, |
|
"grad_norm": 0.029738230630755424, |
|
"kl": 0.0003414154052734375, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": 0.0411, |
|
"reward": 0.3339508920907974, |
|
"reward_std": 0.8846062198281288, |
|
"rewards/cosine_scaled_reward": -0.04135786276310682, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3196.7083740234375, |
|
"epoch": 0.10342857142857143, |
|
"grad_norm": 0.018469586968421936, |
|
"kl": 0.0003407001495361328, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0479, |
|
"reward": 0.06904555298388004, |
|
"reward_std": 0.5129038351587951, |
|
"rewards/cosine_scaled_reward": -0.13214393705129623, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3226.416748046875, |
|
"epoch": 0.104, |
|
"grad_norm": 0.018400847911834717, |
|
"kl": 0.00044155120849609375, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.057, |
|
"reward": -0.3992947228252888, |
|
"reward_std": 0.5363226048648357, |
|
"rewards/cosine_scaled_reward": -0.30381404608488083, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2809.4583740234375, |
|
"epoch": 0.10457142857142857, |
|
"grad_norm": 0.015173462219536304, |
|
"kl": 0.0004189014434814453, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0059, |
|
"reward": -0.08883734792470932, |
|
"reward_std": 0.4826326109468937, |
|
"rewards/cosine_scaled_reward": -0.27358534932136536, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2285.250030517578, |
|
"epoch": 0.10514285714285715, |
|
"grad_norm": 0.01784459501504898, |
|
"kl": 0.0003428459167480469, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.0369, |
|
"reward": 0.36116717755794525, |
|
"reward_std": 0.7178683169186115, |
|
"rewards/cosine_scaled_reward": -0.06941639818251133, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2192.3333740234375, |
|
"epoch": 0.10571428571428572, |
|
"grad_norm": 0.021139826625585556, |
|
"kl": 0.0005130767822265625, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.1298, |
|
"reward": -0.057670027017593384, |
|
"reward_std": 0.41697440296411514, |
|
"rewards/cosine_scaled_reward": -0.2996683418750763, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2678.541748046875, |
|
"epoch": 0.10628571428571429, |
|
"grad_norm": 0.016024595126509666, |
|
"kl": 0.0003867149353027344, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0501, |
|
"reward": 0.8263338319957256, |
|
"reward_std": 0.7537258118391037, |
|
"rewards/cosine_scaled_reward": 0.14233355224132538, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3467.2083740234375, |
|
"epoch": 0.10685714285714286, |
|
"grad_norm": 0.02122497744858265, |
|
"kl": 0.00041675567626953125, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0488, |
|
"reward": -0.3330922797322273, |
|
"reward_std": 0.4953814512118697, |
|
"rewards/cosine_scaled_reward": -0.2290461454540491, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3197.7083740234375, |
|
"epoch": 0.10742857142857143, |
|
"grad_norm": 0.015642890706658363, |
|
"kl": 0.000370025634765625, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": 0.0347, |
|
"reward": 0.1139075756072998, |
|
"reward_std": 1.0698014255613089, |
|
"rewards/cosine_scaled_reward": -0.1097128726541996, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.2083740234375, |
|
"epoch": 0.108, |
|
"grad_norm": 0.016403522342443466, |
|
"kl": 0.0003390312194824219, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.0476, |
|
"reward": -0.16637829318642616, |
|
"reward_std": 0.6985251531004906, |
|
"rewards/cosine_scaled_reward": -0.20818914845585823, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2964.5833740234375, |
|
"epoch": 0.10857142857142857, |
|
"grad_norm": 0.015617966651916504, |
|
"kl": 0.0003600120544433594, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0077, |
|
"reward": -0.2868319842964411, |
|
"reward_std": 0.26455265283584595, |
|
"rewards/cosine_scaled_reward": -0.2892493214458227, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2810.2916717529297, |
|
"epoch": 0.10914285714285714, |
|
"grad_norm": 0.019974973052740097, |
|
"kl": 0.0004630088806152344, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.066, |
|
"reward": -0.06471758894622326, |
|
"reward_std": 0.3940250463783741, |
|
"rewards/cosine_scaled_reward": -0.26152546517550945, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2830.5416717529297, |
|
"epoch": 0.10971428571428571, |
|
"grad_norm": 0.016707738861441612, |
|
"kl": 0.0004711151123046875, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0331, |
|
"reward": 0.1446765586733818, |
|
"reward_std": 0.1306634098291397, |
|
"rewards/cosine_scaled_reward": -0.052661728113889694, |
|
"rewards/format_reward": 0.25, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3578.6666870117188, |
|
"epoch": 0.11028571428571429, |
|
"grad_norm": 0.01071973703801632, |
|
"kl": 0.0003151893615722656, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0019, |
|
"reward": -0.34736843407154083, |
|
"reward_std": 0.33792266994714737, |
|
"rewards/cosine_scaled_reward": -0.23618422076106071, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2222.666732788086, |
|
"epoch": 0.11085714285714286, |
|
"grad_norm": 0.05484706535935402, |
|
"kl": 0.0006213188171386719, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0984, |
|
"reward": 0.5483606606721878, |
|
"reward_std": 1.1607089042663574, |
|
"rewards/cosine_scaled_reward": -0.01748633268289268, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2849.8334045410156, |
|
"epoch": 0.11142857142857143, |
|
"grad_norm": 0.021573448553681374, |
|
"kl": 0.0005321502685546875, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": -0.0482, |
|
"reward": -0.2685957998037338, |
|
"reward_std": 0.45445217937231064, |
|
"rewards/cosine_scaled_reward": -0.3426312282681465, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2637.9166870117188, |
|
"epoch": 0.112, |
|
"grad_norm": 0.021516846492886543, |
|
"kl": 0.0003535747528076172, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": -0.0182, |
|
"reward": 0.022181347012519836, |
|
"reward_std": 0.4800866097211838, |
|
"rewards/cosine_scaled_reward": -0.15557599812746048, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3041.875, |
|
"epoch": 0.11257142857142857, |
|
"grad_norm": 0.01381740253418684, |
|
"kl": 0.0003528594970703125, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": -0.0073, |
|
"reward": -0.33803367614746094, |
|
"reward_std": 0.4302855357527733, |
|
"rewards/cosine_scaled_reward": -0.29401686880737543, |
|
"rewards/format_reward": 0.25, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2317.8333892822266, |
|
"epoch": 0.11314285714285714, |
|
"grad_norm": 0.03078615479171276, |
|
"kl": 0.0003762245178222656, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.0924, |
|
"reward": 0.597826175391674, |
|
"reward_std": 0.8695001602172852, |
|
"rewards/cosine_scaled_reward": 0.007246408611536026, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3303.9583740234375, |
|
"epoch": 0.11371428571428571, |
|
"grad_norm": 0.019953317940235138, |
|
"kl": 0.0003972053527832031, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": 0.0989, |
|
"reward": 0.565569007769227, |
|
"reward_std": 0.5124572534114122, |
|
"rewards/cosine_scaled_reward": 0.07445115875452757, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2750.8333435058594, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.05455951392650604, |
|
"kl": 0.0007038116455078125, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.1193, |
|
"reward": -0.07477065362036228, |
|
"reward_std": 0.8196274563670158, |
|
"rewards/cosine_scaled_reward": -0.24571867287158966, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2873.4166870117188, |
|
"epoch": 0.11485714285714285, |
|
"grad_norm": 0.06070086359977722, |
|
"kl": 0.0005712509155273438, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.2315, |
|
"reward": -0.16712947003543377, |
|
"reward_std": 0.3942112438380718, |
|
"rewards/cosine_scaled_reward": -0.22939807549118996, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2792.7083740234375, |
|
"epoch": 0.11542857142857142, |
|
"grad_norm": 0.02964051626622677, |
|
"kl": 0.0003209114074707031, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": 0.0298, |
|
"reward": 0.44592857360839844, |
|
"reward_std": 0.7889965083450079, |
|
"rewards/cosine_scaled_reward": 0.05629761889576912, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3046.7500610351562, |
|
"epoch": 0.116, |
|
"grad_norm": 0.018877137452363968, |
|
"kl": 0.0003161430358886719, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.0529, |
|
"reward": 0.408734455704689, |
|
"reward_std": 0.6842712201178074, |
|
"rewards/cosine_scaled_reward": -0.0247994652017951, |
|
"rewards/format_reward": 0.4583333544433117, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2634.5833435058594, |
|
"epoch": 0.11657142857142858, |
|
"grad_norm": 0.02069164253771305, |
|
"kl": 0.0003864765167236328, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.0833, |
|
"reward": -0.21086983382701874, |
|
"reward_std": 0.22330690175294876, |
|
"rewards/cosine_scaled_reward": -0.2929349225014448, |
|
"rewards/format_reward": 0.375, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2709.7083435058594, |
|
"epoch": 0.11714285714285715, |
|
"grad_norm": 0.023381751030683517, |
|
"kl": 0.000301361083984375, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.0675, |
|
"reward": 0.27661415934562683, |
|
"reward_std": 0.6943130940198898, |
|
"rewards/cosine_scaled_reward": -0.09085960499942303, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3047.625030517578, |
|
"epoch": 0.11771428571428572, |
|
"grad_norm": 0.02151002548635006, |
|
"kl": 0.0004744529724121094, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": -0.0196, |
|
"reward": -0.10782860964536667, |
|
"reward_std": 0.5117022879421711, |
|
"rewards/cosine_scaled_reward": -0.19974764343351126, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2933.7916870117188, |
|
"epoch": 0.11828571428571429, |
|
"grad_norm": 0.0185316763818264, |
|
"kl": 0.0003197193145751953, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": 0.0325, |
|
"reward": 0.4302789755165577, |
|
"reward_std": 1.0025385729968548, |
|
"rewards/cosine_scaled_reward": 0.006806140765547752, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2270.75, |
|
"epoch": 0.11885714285714286, |
|
"grad_norm": 0.013073590584099293, |
|
"kl": 0.00028967857360839844, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": -0.0037, |
|
"reward": 0.8428396731615067, |
|
"reward_std": 0.33612318709492683, |
|
"rewards/cosine_scaled_reward": 0.17141985148191452, |
|
"rewards/format_reward": 0.5, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2534.0834197998047, |
|
"epoch": 0.11942857142857143, |
|
"grad_norm": 0.04636608809232712, |
|
"kl": 0.0002626180648803711, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.1077, |
|
"reward": 0.41008343175053596, |
|
"reward_std": 0.38527223095297813, |
|
"rewards/cosine_scaled_reward": -0.003291614353656769, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2664.375030517578, |
|
"epoch": 0.12, |
|
"grad_norm": 0.06337418407201767, |
|
"kl": 0.0005369186401367188, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.1954, |
|
"reward": -0.11931078508496284, |
|
"reward_std": 0.4521569199860096, |
|
"rewards/cosine_scaled_reward": -0.2679887441918254, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2690.75, |
|
"epoch": 0.12057142857142857, |
|
"grad_norm": 0.02364625595510006, |
|
"kl": 0.0003635883331298828, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.1012, |
|
"reward": 0.4544009678065777, |
|
"reward_std": 0.9030434042215347, |
|
"rewards/cosine_scaled_reward": 0.018867140635848045, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3231.4584350585938, |
|
"epoch": 0.12114285714285715, |
|
"grad_norm": 0.02378404326736927, |
|
"kl": 0.0004596710205078125, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.126, |
|
"reward": 0.3596238009631634, |
|
"reward_std": 1.0245484188199043, |
|
"rewards/cosine_scaled_reward": 0.03397855442017317, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2259.166732788086, |
|
"epoch": 0.12171428571428572, |
|
"grad_norm": 0.0168690737336874, |
|
"kl": 0.000362396240234375, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": 0.0384, |
|
"reward": 0.7057138048112392, |
|
"reward_std": 1.0122921094298363, |
|
"rewards/cosine_scaled_reward": 0.04035688715521246, |
|
"rewards/format_reward": 0.6250000037252903, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2541.250045776367, |
|
"epoch": 0.12228571428571429, |
|
"grad_norm": 0.029833870008587837, |
|
"kl": 0.0004019737243652344, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": 0.1661, |
|
"reward": 0.027080008760094643, |
|
"reward_std": 0.8045615777373314, |
|
"rewards/cosine_scaled_reward": -0.19479333609342575, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2826.3333740234375, |
|
"epoch": 0.12285714285714286, |
|
"grad_norm": 0.01670445129275322, |
|
"kl": 0.0003113746643066406, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.0458, |
|
"reward": -0.15282147377729416, |
|
"reward_std": 0.6320948153734207, |
|
"rewards/cosine_scaled_reward": -0.28474406246095896, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1534.750015258789, |
|
"epoch": 0.12342857142857143, |
|
"grad_norm": 0.028196675702929497, |
|
"kl": 0.0003743171691894531, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.173, |
|
"reward": 1.9182523787021637, |
|
"reward_std": 0.8897372838109732, |
|
"rewards/cosine_scaled_reward": 0.5007927343249321, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3460.5416870117188, |
|
"epoch": 0.124, |
|
"grad_norm": 0.012388592585921288, |
|
"kl": 0.00035881996154785156, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.0433, |
|
"reward": -0.15055294707417488, |
|
"reward_std": 0.5311995670199394, |
|
"rewards/cosine_scaled_reward": -0.137776467949152, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2624.8334045410156, |
|
"epoch": 0.12457142857142857, |
|
"grad_norm": 0.03056301549077034, |
|
"kl": 0.00043702125549316406, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": 0.2376, |
|
"reward": 0.47333015874028206, |
|
"reward_std": 0.7044993788003922, |
|
"rewards/cosine_scaled_reward": -0.034168269485235214, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2094.5834045410156, |
|
"epoch": 0.12514285714285714, |
|
"grad_norm": 0.05004338175058365, |
|
"kl": 0.0005245208740234375, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.1989, |
|
"reward": 0.7468737373128533, |
|
"reward_std": 0.8828459084033966, |
|
"rewards/cosine_scaled_reward": 0.019270192831754684, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2917.791748046875, |
|
"epoch": 0.12571428571428572, |
|
"grad_norm": 0.01651514135301113, |
|
"kl": 0.0003859996795654297, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.1265, |
|
"reward": 0.40257575549185276, |
|
"reward_std": 0.8734332285821438, |
|
"rewards/cosine_scaled_reward": -0.04871212877333164, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1825.2500457763672, |
|
"epoch": 0.12628571428571428, |
|
"grad_norm": 0.022186581045389175, |
|
"kl": 0.0004601478576660156, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": -0.0246, |
|
"reward": 0.5106580704450607, |
|
"reward_std": 0.6262499652802944, |
|
"rewards/cosine_scaled_reward": -0.11967097967863083, |
|
"rewards/format_reward": 0.75, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1936.2917098999023, |
|
"epoch": 0.12685714285714286, |
|
"grad_norm": 0.025714771822094917, |
|
"kl": 0.0005955696105957031, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": -0.0667, |
|
"reward": 0.5173665434122086, |
|
"reward_std": 0.4710990320891142, |
|
"rewards/cosine_scaled_reward": -0.11631673201918602, |
|
"rewards/format_reward": 0.75, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3248.3333740234375, |
|
"epoch": 0.12742857142857142, |
|
"grad_norm": 0.014303294010460377, |
|
"kl": 0.00043010711669921875, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.0878, |
|
"reward": -0.4517449364066124, |
|
"reward_std": 0.20166658982634544, |
|
"rewards/cosine_scaled_reward": -0.3092058040201664, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3256.8333740234375, |
|
"epoch": 0.128, |
|
"grad_norm": 0.01596376858651638, |
|
"kl": 0.000354766845703125, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0683, |
|
"reward": 0.5998236387968063, |
|
"reward_std": 0.8237827308475971, |
|
"rewards/cosine_scaled_reward": 0.11241178959608078, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2851.2083740234375, |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.017591828480362892, |
|
"kl": 0.0004096031188964844, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.0511, |
|
"reward": 0.10140763968229294, |
|
"reward_std": 0.5019327104091644, |
|
"rewards/cosine_scaled_reward": -0.11596284806728363, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2292.000030517578, |
|
"epoch": 0.12914285714285714, |
|
"grad_norm": 0.032970964908599854, |
|
"kl": 0.00045108795166015625, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.1386, |
|
"reward": 0.4276478886604309, |
|
"reward_std": 0.7602110169827938, |
|
"rewards/cosine_scaled_reward": -0.057009367272257805, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2093.6250610351562, |
|
"epoch": 0.12971428571428573, |
|
"grad_norm": 0.015598502941429615, |
|
"kl": 0.0003306865692138672, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.1292, |
|
"reward": 0.6084302365779877, |
|
"reward_std": 0.6128123812377453, |
|
"rewards/cosine_scaled_reward": -0.008284901501610875, |
|
"rewards/format_reward": 0.625, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1409.5833587646484, |
|
"epoch": 0.13028571428571428, |
|
"grad_norm": 0.03161853179335594, |
|
"kl": 0.0005328655242919922, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.0405, |
|
"reward": 0.6997113339602947, |
|
"reward_std": 0.7971856854856014, |
|
"rewards/cosine_scaled_reward": -0.04597766697406769, |
|
"rewards/format_reward": 0.7916666679084301, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2726.4166870117188, |
|
"epoch": 0.13085714285714287, |
|
"grad_norm": 0.03210078924894333, |
|
"kl": 0.00038051605224609375, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.0564, |
|
"reward": 0.28624797612428665, |
|
"reward_std": 0.706984382122755, |
|
"rewards/cosine_scaled_reward": -0.06520934589207172, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3351.7083740234375, |
|
"epoch": 0.13142857142857142, |
|
"grad_norm": 0.020253852009773254, |
|
"kl": 0.0004558563232421875, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": 0.0537, |
|
"reward": 0.20329816453158855, |
|
"reward_std": 0.920021902769804, |
|
"rewards/cosine_scaled_reward": -0.04418425913900137, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3248.7916870117188, |
|
"epoch": 0.132, |
|
"grad_norm": 0.027326960116624832, |
|
"kl": 0.0003800392150878906, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0472, |
|
"reward": 0.02445869892835617, |
|
"reward_std": 0.4333410616964102, |
|
"rewards/cosine_scaled_reward": -0.07110398076474667, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2847.8333435058594, |
|
"epoch": 0.13257142857142856, |
|
"grad_norm": 0.017697490751743317, |
|
"kl": 0.0004267692565917969, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0322, |
|
"reward": -0.29599685221910477, |
|
"reward_std": 0.34860160388052464, |
|
"rewards/cosine_scaled_reward": -0.29383176099509, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3569.7916870117188, |
|
"epoch": 0.13314285714285715, |
|
"grad_norm": 0.011569323018193245, |
|
"kl": 0.0003650188446044922, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": 0.0082, |
|
"reward": -0.27050644531846046, |
|
"reward_std": 0.28310155123472214, |
|
"rewards/cosine_scaled_reward": -0.15608655102550983, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3336.3750610351562, |
|
"epoch": 0.1337142857142857, |
|
"grad_norm": 0.014362377114593983, |
|
"kl": 0.0003769397735595703, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.1096, |
|
"reward": 0.16344637423753738, |
|
"reward_std": 1.1342220231890678, |
|
"rewards/cosine_scaled_reward": -0.022443480789661407, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2698.5416717529297, |
|
"epoch": 0.13428571428571429, |
|
"grad_norm": 0.016733834519982338, |
|
"kl": 0.0003490447998046875, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0561, |
|
"reward": 0.6713685989379883, |
|
"reward_std": 0.4799320735037327, |
|
"rewards/cosine_scaled_reward": 0.14818426966667175, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1749.2916717529297, |
|
"epoch": 0.13485714285714287, |
|
"grad_norm": 0.032998789101839066, |
|
"kl": 0.00046539306640625, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0971, |
|
"reward": 0.6648036018013954, |
|
"reward_std": 0.8384054228663445, |
|
"rewards/cosine_scaled_reward": -0.10509821772575378, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1968.3333435058594, |
|
"epoch": 0.13542857142857143, |
|
"grad_norm": 0.02814081497490406, |
|
"kl": 0.0006561279296875, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.0855, |
|
"reward": 0.15130121633410454, |
|
"reward_std": 0.4565184935927391, |
|
"rewards/cosine_scaled_reward": -0.21601606532931328, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2913.3333435058594, |
|
"epoch": 0.136, |
|
"grad_norm": 0.017682794481515884, |
|
"kl": 0.0003323554992675781, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": -0.0443, |
|
"reward": 0.043509919196367264, |
|
"reward_std": 0.8419067375361919, |
|
"rewards/cosine_scaled_reward": -0.18657837435603142, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3364.416748046875, |
|
"epoch": 0.13657142857142857, |
|
"grad_norm": 0.028049411252141, |
|
"kl": 0.0004544258117675781, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.1081, |
|
"reward": 0.21441528294235468, |
|
"reward_std": 1.0792418122291565, |
|
"rewards/cosine_scaled_reward": -0.03862569108605385, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2725.7083740234375, |
|
"epoch": 0.13714285714285715, |
|
"grad_norm": 0.046624064445495605, |
|
"kl": 0.0005068778991699219, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.2407, |
|
"reward": -0.037649777717888355, |
|
"reward_std": 0.7788064442574978, |
|
"rewards/cosine_scaled_reward": -0.22715822607278824, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2704.25, |
|
"epoch": 0.1377142857142857, |
|
"grad_norm": 0.03354218974709511, |
|
"kl": 0.00043392181396484375, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.027, |
|
"reward": 0.41115298634395003, |
|
"reward_std": 0.5296461880207062, |
|
"rewards/cosine_scaled_reward": -0.002756841480731964, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.8750610351562, |
|
"epoch": 0.1382857142857143, |
|
"grad_norm": 0.01612684689462185, |
|
"kl": 0.0004563331604003906, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.1636, |
|
"reward": -0.1777523159980774, |
|
"reward_std": 0.47497741878032684, |
|
"rewards/cosine_scaled_reward": -0.2138761579990387, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2829.500030517578, |
|
"epoch": 0.13885714285714285, |
|
"grad_norm": 0.022913776338100433, |
|
"kl": 0.00047969818115234375, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.1179, |
|
"reward": -0.3900511562824249, |
|
"reward_std": 0.19519304856657982, |
|
"rewards/cosine_scaled_reward": -0.34085891395807266, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3008.4166870117188, |
|
"epoch": 0.13942857142857143, |
|
"grad_norm": 0.017667554318904877, |
|
"kl": 0.0005576610565185547, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.1161, |
|
"reward": -0.07407113164663315, |
|
"reward_std": 0.9030572213232517, |
|
"rewards/cosine_scaled_reward": -0.22453556954860687, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2979.1250610351562, |
|
"epoch": 0.14, |
|
"grad_norm": 0.015229357406497002, |
|
"kl": 0.0004878044128417969, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": 0.0894, |
|
"reward": 0.2635510638356209, |
|
"reward_std": 1.100995272397995, |
|
"rewards/cosine_scaled_reward": -0.05572447320446372, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2819.0416870117188, |
|
"epoch": 0.14057142857142857, |
|
"grad_norm": 0.044691551476716995, |
|
"kl": 0.00034236907958984375, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": -0.046, |
|
"reward": 0.28528738766908646, |
|
"reward_std": 0.8157100006937981, |
|
"rewards/cosine_scaled_reward": -0.04485631617717445, |
|
"rewards/format_reward": 0.375, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3443.8750610351562, |
|
"epoch": 0.14114285714285715, |
|
"grad_norm": 0.011280644685029984, |
|
"kl": 0.00034046173095703125, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0628, |
|
"reward": 0.2559690326452255, |
|
"reward_std": 1.1299069225788116, |
|
"rewards/cosine_scaled_reward": 0.00298450980335474, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1982.8333435058594, |
|
"epoch": 0.1417142857142857, |
|
"grad_norm": 0.051233626902103424, |
|
"kl": 0.0006239414215087891, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": 0.1208, |
|
"reward": 0.47601281851530075, |
|
"reward_std": 0.8072874061763287, |
|
"rewards/cosine_scaled_reward": -0.09532693400979042, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3090.5000610351562, |
|
"epoch": 0.1422857142857143, |
|
"grad_norm": 0.011510615237057209, |
|
"kl": 0.0003566741943359375, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": -0.0075, |
|
"reward": -0.1677398905158043, |
|
"reward_std": 0.7362043038010597, |
|
"rewards/cosine_scaled_reward": -0.271369943395257, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2831.6666870117188, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.012945250608026981, |
|
"kl": 0.000339508056640625, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.0464, |
|
"reward": 0.08572675287723541, |
|
"reward_std": 0.4463801756501198, |
|
"rewards/cosine_scaled_reward": -0.10296999663114548, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2145.7084045410156, |
|
"epoch": 0.14342857142857143, |
|
"grad_norm": 0.02382800169289112, |
|
"kl": 0.00037860870361328125, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.1758, |
|
"reward": 0.016892731189727783, |
|
"reward_std": 0.4844237770885229, |
|
"rewards/cosine_scaled_reward": -0.2832203172147274, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2960.5416870117188, |
|
"epoch": 0.144, |
|
"grad_norm": 0.021183207631111145, |
|
"kl": 0.0004506111145019531, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": 0.1659, |
|
"reward": -0.04246724583208561, |
|
"reward_std": 0.6862606927752495, |
|
"rewards/cosine_scaled_reward": -0.18790028244256973, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2839.4166870117188, |
|
"epoch": 0.14457142857142857, |
|
"grad_norm": 0.015497619286179543, |
|
"kl": 0.0003509521484375, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": 0.0335, |
|
"reward": -0.19672805070877075, |
|
"reward_std": 0.37255218997597694, |
|
"rewards/cosine_scaled_reward": -0.24419735372066498, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2932.6666717529297, |
|
"epoch": 0.14514285714285713, |
|
"grad_norm": 0.014029218815267086, |
|
"kl": 0.0002608299255371094, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": 0.1358, |
|
"reward": 0.564534567296505, |
|
"reward_std": 0.7931031864136457, |
|
"rewards/cosine_scaled_reward": 0.03226728364825249, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2817.3333740234375, |
|
"epoch": 0.1457142857142857, |
|
"grad_norm": 0.02292151190340519, |
|
"kl": 0.0003943443298339844, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0033, |
|
"reward": 0.7924370467662811, |
|
"reward_std": 0.5020520761609077, |
|
"rewards/cosine_scaled_reward": 0.16705189645290375, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3312.5000610351562, |
|
"epoch": 0.1462857142857143, |
|
"grad_norm": 0.024594873189926147, |
|
"kl": 0.00034236907958984375, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.1007, |
|
"reward": -0.39764899387955666, |
|
"reward_std": 0.4253292456269264, |
|
"rewards/cosine_scaled_reward": -0.302991159260273, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2919.0833740234375, |
|
"epoch": 0.14685714285714285, |
|
"grad_norm": 0.026216557249426842, |
|
"kl": 0.0004639625549316406, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.2025, |
|
"reward": -0.20567850768566132, |
|
"reward_std": 0.6880289539694786, |
|
"rewards/cosine_scaled_reward": -0.2695059161633253, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2133.8333435058594, |
|
"epoch": 0.14742857142857144, |
|
"grad_norm": 0.03468165546655655, |
|
"kl": 0.0003616809844970703, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.1963, |
|
"reward": 0.30517828464508057, |
|
"reward_std": 0.593671128153801, |
|
"rewards/cosine_scaled_reward": -0.1390775376930833, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3469.2916870117188, |
|
"epoch": 0.148, |
|
"grad_norm": 0.02112772688269615, |
|
"kl": 0.0003986358642578125, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.0331, |
|
"reward": -0.01782984286546707, |
|
"reward_std": 0.8413332737982273, |
|
"rewards/cosine_scaled_reward": -0.09224824234843254, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3433.7083740234375, |
|
"epoch": 0.14857142857142858, |
|
"grad_norm": 0.014742922969162464, |
|
"kl": 0.000308990478515625, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": 0.0383, |
|
"reward": 0.31882617622613907, |
|
"reward_std": 0.15408218186348677, |
|
"rewards/cosine_scaled_reward": 0.03441305831074715, |
|
"rewards/format_reward": 0.25, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3461.291748046875, |
|
"epoch": 0.14914285714285713, |
|
"grad_norm": 0.014058803208172321, |
|
"kl": 0.00042438507080078125, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0671, |
|
"reward": -0.2942545488476753, |
|
"reward_std": 0.612834420055151, |
|
"rewards/cosine_scaled_reward": -0.2096272725611925, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3319.7083740234375, |
|
"epoch": 0.14971428571428572, |
|
"grad_norm": 0.013714558444917202, |
|
"kl": 0.0003371238708496094, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.104, |
|
"reward": 0.14693566597998142, |
|
"reward_std": 0.5481071509420872, |
|
"rewards/cosine_scaled_reward": -0.11403219401836395, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2869.166717529297, |
|
"epoch": 0.15028571428571427, |
|
"grad_norm": 0.04431964457035065, |
|
"kl": 0.0004782676696777344, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.2064, |
|
"reward": -0.06633574888110161, |
|
"reward_std": 0.24844567105174065, |
|
"rewards/cosine_scaled_reward": -0.17900121491402388, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3500.7500610351562, |
|
"epoch": 0.15085714285714286, |
|
"grad_norm": 0.011211477220058441, |
|
"kl": 0.00043582916259765625, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.0411, |
|
"reward": -0.3998759835958481, |
|
"reward_std": 0.430798327550292, |
|
"rewards/cosine_scaled_reward": -0.24160464480519295, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3510.6250610351562, |
|
"epoch": 0.15142857142857144, |
|
"grad_norm": 0.015797875821590424, |
|
"kl": 0.0004782676696777344, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0301, |
|
"reward": 0.03237064555287361, |
|
"reward_std": 0.7703893817961216, |
|
"rewards/cosine_scaled_reward": -0.06714800372719765, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2977.3333740234375, |
|
"epoch": 0.152, |
|
"grad_norm": 0.02703353762626648, |
|
"kl": 0.000396728515625, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.0458, |
|
"reward": -0.099398122751154, |
|
"reward_std": 0.3674992090091109, |
|
"rewards/cosine_scaled_reward": -0.21636574110016227, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2061.0833740234375, |
|
"epoch": 0.15257142857142858, |
|
"grad_norm": 0.0498061366379261, |
|
"kl": 0.0005326271057128906, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.1127, |
|
"reward": 0.3922936078161001, |
|
"reward_std": 0.6864228155463934, |
|
"rewards/cosine_scaled_reward": -0.13718653097748756, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2333.4583435058594, |
|
"epoch": 0.15314285714285714, |
|
"grad_norm": 0.028823453933000565, |
|
"kl": 0.0005085468292236328, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.186, |
|
"reward": 0.22259462717920542, |
|
"reward_std": 0.5068696048110723, |
|
"rewards/cosine_scaled_reward": -0.15953603573143482, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3267.3333740234375, |
|
"epoch": 0.15371428571428572, |
|
"grad_norm": 0.027852864935994148, |
|
"kl": 0.0002601146697998047, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.1001, |
|
"reward": -0.2786406707018614, |
|
"reward_std": 0.3324854364618659, |
|
"rewards/cosine_scaled_reward": -0.22265366930514574, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2147.2083740234375, |
|
"epoch": 0.15428571428571428, |
|
"grad_norm": 0.025380106642842293, |
|
"kl": 0.000537872314453125, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": 0.1412, |
|
"reward": 0.5648728758096695, |
|
"reward_std": 0.5805850811302662, |
|
"rewards/cosine_scaled_reward": 0.011603094637393951, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2535.7500610351562, |
|
"epoch": 0.15485714285714286, |
|
"grad_norm": 0.04058884456753731, |
|
"kl": 0.0005526542663574219, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0614, |
|
"reward": 0.19145794212818146, |
|
"reward_std": 0.5669713392853737, |
|
"rewards/cosine_scaled_reward": -0.17510437592864037, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2707.7500610351562, |
|
"epoch": 0.15542857142857142, |
|
"grad_norm": 0.02463456057012081, |
|
"kl": 0.0004875659942626953, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": -0.0531, |
|
"reward": 0.32263614796102047, |
|
"reward_std": 0.8555684071034193, |
|
"rewards/cosine_scaled_reward": -0.10951526463031769, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3203.8750610351562, |
|
"epoch": 0.156, |
|
"grad_norm": 0.021756043657660484, |
|
"kl": 0.0005960464477539062, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.0959, |
|
"reward": -0.3803365007042885, |
|
"reward_std": 0.4917794167995453, |
|
"rewards/cosine_scaled_reward": -0.31516825407743454, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3559.375, |
|
"epoch": 0.15657142857142858, |
|
"grad_norm": 0.014902282506227493, |
|
"kl": 0.0003399848937988281, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.014, |
|
"reward": -0.4870794676244259, |
|
"reward_std": 0.32457295805215836, |
|
"rewards/cosine_scaled_reward": -0.26437306217849255, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2296.2083435058594, |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.019861867651343346, |
|
"kl": 0.0003972053527832031, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": -0.0306, |
|
"reward": 0.20605197548866272, |
|
"reward_std": 0.666698768734932, |
|
"rewards/cosine_scaled_reward": -0.14697400853037834, |
|
"rewards/format_reward": 0.5, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2113.9583892822266, |
|
"epoch": 0.15771428571428572, |
|
"grad_norm": 0.02660396508872509, |
|
"kl": 0.00037980079650878906, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.189, |
|
"reward": 0.8675991147756577, |
|
"reward_std": 0.5296240104362369, |
|
"rewards/cosine_scaled_reward": 0.12129955366253853, |
|
"rewards/format_reward": 0.625, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.2916870117188, |
|
"epoch": 0.15828571428571428, |
|
"grad_norm": 0.014423711225390434, |
|
"kl": 0.0004062652587890625, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.0398, |
|
"reward": 0.7772083282470703, |
|
"reward_std": 0.564259335398674, |
|
"rewards/cosine_scaled_reward": 0.15943749248981476, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2655.375030517578, |
|
"epoch": 0.15885714285714286, |
|
"grad_norm": 0.018322305753827095, |
|
"kl": 0.0004067420959472656, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.1151, |
|
"reward": -0.03639080002903938, |
|
"reward_std": 0.4445139616727829, |
|
"rewards/cosine_scaled_reward": -0.24736207351088524, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2705.1250610351562, |
|
"epoch": 0.15942857142857142, |
|
"grad_norm": 0.03632061555981636, |
|
"kl": 0.000385284423828125, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.135, |
|
"reward": 0.6486790850758553, |
|
"reward_std": 0.898019090294838, |
|
"rewards/cosine_scaled_reward": 0.05350620858371258, |
|
"rewards/format_reward": 0.5416666828095913, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2446.2083435058594, |
|
"epoch": 0.16, |
|
"grad_norm": 0.013082285411655903, |
|
"kl": 0.000347137451171875, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": -0.0428, |
|
"reward": 0.37989859376102686, |
|
"reward_std": 0.29897986352443695, |
|
"rewards/cosine_scaled_reward": -0.060050718020647764, |
|
"rewards/format_reward": 0.5, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2319.2083435058594, |
|
"epoch": 0.16057142857142856, |
|
"grad_norm": 0.045566096901893616, |
|
"kl": 0.0005693435668945312, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.1853, |
|
"reward": 0.5609744489192963, |
|
"reward_std": 0.6304305791854858, |
|
"rewards/cosine_scaled_reward": 0.05132052768021822, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3381.8750610351562, |
|
"epoch": 0.16114285714285714, |
|
"grad_norm": 0.02684849686920643, |
|
"kl": 0.0007226467132568359, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.0416, |
|
"reward": 0.4639171026647091, |
|
"reward_std": 0.9287250991910696, |
|
"rewards/cosine_scaled_reward": 0.08612522296607494, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2586.375030517578, |
|
"epoch": 0.16171428571428573, |
|
"grad_norm": 0.0842081606388092, |
|
"kl": 0.000537872314453125, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.2945, |
|
"reward": -0.23938731849193573, |
|
"reward_std": 0.5152188688516617, |
|
"rewards/cosine_scaled_reward": -0.30719365179538727, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2352.8750610351562, |
|
"epoch": 0.16228571428571428, |
|
"grad_norm": 0.026272239163517952, |
|
"kl": 0.00044083595275878906, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.186, |
|
"reward": 1.0214342884719372, |
|
"reward_std": 1.2180421352386475, |
|
"rewards/cosine_scaled_reward": 0.2398838261142373, |
|
"rewards/format_reward": 0.5416666753590107, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3469.25, |
|
"epoch": 0.16285714285714287, |
|
"grad_norm": 0.018204184249043465, |
|
"kl": 0.0004673004150390625, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": 0.063, |
|
"reward": -0.13403620570898056, |
|
"reward_std": 0.6568610742688179, |
|
"rewards/cosine_scaled_reward": -0.12951810285449028, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2795.166748046875, |
|
"epoch": 0.16342857142857142, |
|
"grad_norm": 0.013621930964291096, |
|
"kl": 0.0003037452697753906, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": -0.039, |
|
"reward": 1.5945213325321674, |
|
"reward_std": 0.5471338629722595, |
|
"rewards/cosine_scaled_reward": 0.4222606960684061, |
|
"rewards/format_reward": 0.75, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2879.833465576172, |
|
"epoch": 0.164, |
|
"grad_norm": 0.01253263745456934, |
|
"kl": 0.0004277229309082031, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.0411, |
|
"reward": 0.21084657812025398, |
|
"reward_std": 1.322334498167038, |
|
"rewards/cosine_scaled_reward": -0.12374338880181313, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3213.2916870117188, |
|
"epoch": 0.16457142857142856, |
|
"grad_norm": 0.024232791736721992, |
|
"kl": 0.0003829002380371094, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0237, |
|
"reward": -0.3808862268924713, |
|
"reward_std": 0.41033722274005413, |
|
"rewards/cosine_scaled_reward": -0.294609775301069, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2690.750045776367, |
|
"epoch": 0.16514285714285715, |
|
"grad_norm": 0.053029682487249374, |
|
"kl": 0.00045490264892578125, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.1226, |
|
"reward": -0.21079005533829331, |
|
"reward_std": 0.3236595541238785, |
|
"rewards/cosine_scaled_reward": -0.29289503768086433, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2208.916717529297, |
|
"epoch": 0.1657142857142857, |
|
"grad_norm": 0.03633257746696472, |
|
"kl": 0.0009098052978515625, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": 0.0996, |
|
"reward": 0.3243631422519684, |
|
"reward_std": 0.7893142104148865, |
|
"rewards/cosine_scaled_reward": -0.17115176958031952, |
|
"rewards/format_reward": 0.6666666828095913, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3469.4166870117188, |
|
"epoch": 0.1662857142857143, |
|
"grad_norm": 0.013034985400736332, |
|
"kl": 0.00041294097900390625, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0095, |
|
"reward": -0.2475043497979641, |
|
"reward_std": 0.5379906464368105, |
|
"rewards/cosine_scaled_reward": -0.227918840944767, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2733.541748046875, |
|
"epoch": 0.16685714285714287, |
|
"grad_norm": 0.023631447926163673, |
|
"kl": 0.00041866302490234375, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.0501, |
|
"reward": -0.05327422299887985, |
|
"reward_std": 0.6902021616697311, |
|
"rewards/cosine_scaled_reward": -0.29747044667601585, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1974.3333587646484, |
|
"epoch": 0.16742857142857143, |
|
"grad_norm": 0.025896325707435608, |
|
"kl": 0.0005052089691162109, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": -0.0503, |
|
"reward": 1.0392636209726334, |
|
"reward_std": 0.7821149528026581, |
|
"rewards/cosine_scaled_reward": 0.16546513326466084, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3210.666748046875, |
|
"epoch": 0.168, |
|
"grad_norm": 0.016182757914066315, |
|
"kl": 0.0005517005920410156, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": 0.1227, |
|
"reward": -0.3081187531352043, |
|
"reward_std": 0.9994445107877254, |
|
"rewards/cosine_scaled_reward": -0.27905938774347305, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3051.1251220703125, |
|
"epoch": 0.16857142857142857, |
|
"grad_norm": 0.021791962906718254, |
|
"kl": 0.0004928112030029297, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": 0.1472, |
|
"reward": -0.0736542553640902, |
|
"reward_std": 0.4684867039322853, |
|
"rewards/cosine_scaled_reward": -0.20349380746483803, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3008.375030517578, |
|
"epoch": 0.16914285714285715, |
|
"grad_norm": 0.044715363532304764, |
|
"kl": 0.0005588531494140625, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.2406, |
|
"reward": -0.45781777799129486, |
|
"reward_std": 0.3647055197507143, |
|
"rewards/cosine_scaled_reward": -0.33307556062936783, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3294.1666870117188, |
|
"epoch": 0.1697142857142857, |
|
"grad_norm": 0.01236710138618946, |
|
"kl": 0.00040149688720703125, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0176, |
|
"reward": 0.3674662681296468, |
|
"reward_std": 0.8067609528079629, |
|
"rewards/cosine_scaled_reward": 0.017066428757971153, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2191.2084045410156, |
|
"epoch": 0.1702857142857143, |
|
"grad_norm": 0.02793304994702339, |
|
"kl": 0.0004506111145019531, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.2163, |
|
"reward": 0.3552871508290991, |
|
"reward_std": 0.6271071489900351, |
|
"rewards/cosine_scaled_reward": -0.07235642522573471, |
|
"rewards/format_reward": 0.5, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2937.0833740234375, |
|
"epoch": 0.17085714285714285, |
|
"grad_norm": 0.02537981979548931, |
|
"kl": 0.0006856918334960938, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.0741, |
|
"reward": 0.014573439490050077, |
|
"reward_std": 0.6083418875932693, |
|
"rewards/cosine_scaled_reward": -0.20104661397635937, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2810.375, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.015299368649721146, |
|
"kl": 0.00049591064453125, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.0634, |
|
"reward": -0.10026557371020317, |
|
"reward_std": 0.45571645349264145, |
|
"rewards/cosine_scaled_reward": -0.23763278499245644, |
|
"rewards/format_reward": 0.375, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2589.4583435058594, |
|
"epoch": 0.172, |
|
"grad_norm": 0.03871985152363777, |
|
"kl": 0.000347137451171875, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.1152, |
|
"reward": 0.054252732545137405, |
|
"reward_std": 0.2450561560690403, |
|
"rewards/cosine_scaled_reward": -0.1812069695442915, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2865.7083435058594, |
|
"epoch": 0.17257142857142857, |
|
"grad_norm": 0.02540568634867668, |
|
"kl": 0.0004420280456542969, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.0172, |
|
"reward": 0.15939845889806747, |
|
"reward_std": 0.35454079881310463, |
|
"rewards/cosine_scaled_reward": -0.04530075564980507, |
|
"rewards/format_reward": 0.25, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3046.25, |
|
"epoch": 0.17314285714285715, |
|
"grad_norm": 0.014560963958501816, |
|
"kl": 0.0003390312194824219, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0832, |
|
"reward": 0.007433712482452393, |
|
"reward_std": 0.2396685965359211, |
|
"rewards/cosine_scaled_reward": -0.1212831512093544, |
|
"rewards/format_reward": 0.25, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3481.0416870117188, |
|
"epoch": 0.1737142857142857, |
|
"grad_norm": 0.016006356105208397, |
|
"kl": 0.00033545494079589844, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": 0.0346, |
|
"reward": -0.22439849376678467, |
|
"reward_std": 0.6627911329269409, |
|
"rewards/cosine_scaled_reward": -0.17469927296042442, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2904.5418090820312, |
|
"epoch": 0.1742857142857143, |
|
"grad_norm": 0.025142712518572807, |
|
"kl": 0.00030517578125, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.1243, |
|
"reward": 0.8032534047961235, |
|
"reward_std": 0.8733390048146248, |
|
"rewards/cosine_scaled_reward": 0.13079336285591125, |
|
"rewards/format_reward": 0.5416666753590107, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2188.0833587646484, |
|
"epoch": 0.17485714285714285, |
|
"grad_norm": 0.03312807157635689, |
|
"kl": 0.0004315376281738281, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.0318, |
|
"reward": 0.7741055563092232, |
|
"reward_std": 0.8599487096071243, |
|
"rewards/cosine_scaled_reward": 0.0745527446269989, |
|
"rewards/format_reward": 0.625, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3403.7500610351562, |
|
"epoch": 0.17542857142857143, |
|
"grad_norm": 0.01129495445638895, |
|
"kl": 0.0003752708435058594, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0876, |
|
"reward": -0.2145760916173458, |
|
"reward_std": 0.883152648806572, |
|
"rewards/cosine_scaled_reward": -0.2739547099918127, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2245.3333435058594, |
|
"epoch": 0.176, |
|
"grad_norm": 0.02003965899348259, |
|
"kl": 0.00042700767517089844, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.1665, |
|
"reward": 0.9610237777233124, |
|
"reward_std": 0.924444355070591, |
|
"rewards/cosine_scaled_reward": 0.1888452209532261, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.17657142857142857, |
|
"grad_norm": 0.015420272946357727, |
|
"kl": 0.0005145072937011719, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.0, |
|
"reward": -0.4918478289619088, |
|
"reward_std": 0.18744987901300192, |
|
"rewards/cosine_scaled_reward": -0.24592391401529312, |
|
"rewards/format_reward": 0.0, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.17714285714285713, |
|
"grad_norm": 0.011936242692172527, |
|
"kl": 0.00030040740966796875, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0, |
|
"reward": -0.5715262293815613, |
|
"reward_std": 0.19434408470988274, |
|
"rewards/cosine_scaled_reward": -0.28576310351490974, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3039.9583435058594, |
|
"epoch": 0.1777142857142857, |
|
"grad_norm": 0.013624753803014755, |
|
"kl": 0.0003497600555419922, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": -0.0761, |
|
"reward": 0.008271858096122742, |
|
"reward_std": 0.42439935728907585, |
|
"rewards/cosine_scaled_reward": -0.12086406722664833, |
|
"rewards/format_reward": 0.25, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1782857142857143, |
|
"grad_norm": 0.014998997561633587, |
|
"kl": 0.0006074905395507812, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.0, |
|
"reward": -0.5124310553073883, |
|
"reward_std": 0.2232176773250103, |
|
"rewards/cosine_scaled_reward": -0.25621553510427475, |
|
"rewards/format_reward": 0.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2576.7916870117188, |
|
"epoch": 0.17885714285714285, |
|
"grad_norm": 0.028986027464270592, |
|
"kl": 0.0004107952117919922, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": 0.1646, |
|
"reward": 0.38414837792515755, |
|
"reward_std": 0.8268394228070974, |
|
"rewards/cosine_scaled_reward": -0.03709249012172222, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2342.250030517578, |
|
"epoch": 0.17942857142857144, |
|
"grad_norm": 0.07766366004943848, |
|
"kl": 0.0004897117614746094, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": 0.2676, |
|
"reward": 1.206177432090044, |
|
"reward_std": 1.3236718773841858, |
|
"rewards/cosine_scaled_reward": 0.29058872163295746, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2941.8333740234375, |
|
"epoch": 0.18, |
|
"grad_norm": 0.015173373743891716, |
|
"kl": 0.00037097930908203125, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": -0.0539, |
|
"reward": -0.3268696665763855, |
|
"reward_std": 0.39195265993475914, |
|
"rewards/cosine_scaled_reward": -0.33010151237249374, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3459.75, |
|
"epoch": 0.18057142857142858, |
|
"grad_norm": 0.01420869305729866, |
|
"kl": 0.0004048347473144531, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0212, |
|
"reward": -0.1601060489192605, |
|
"reward_std": 0.7861558832228184, |
|
"rewards/cosine_scaled_reward": -0.16338635561987758, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2739.7083740234375, |
|
"epoch": 0.18114285714285713, |
|
"grad_norm": 0.02297414094209671, |
|
"kl": 0.002285003662109375, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": -0.0329, |
|
"reward": 0.5244220271706581, |
|
"reward_std": 0.9719415307044983, |
|
"rewards/cosine_scaled_reward": 0.0330443549901247, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2666.5416870117188, |
|
"epoch": 0.18171428571428572, |
|
"grad_norm": 0.022833170369267464, |
|
"kl": 0.0004475116729736328, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": 0.1241, |
|
"reward": 0.19690169394016266, |
|
"reward_std": 0.6580867804586887, |
|
"rewards/cosine_scaled_reward": -0.19321582466363907, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2698.3750610351562, |
|
"epoch": 0.18228571428571427, |
|
"grad_norm": 0.034383926540613174, |
|
"kl": 0.0003724098205566406, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.2278, |
|
"reward": 0.6201122887432575, |
|
"reward_std": 0.6080379486083984, |
|
"rewards/cosine_scaled_reward": 0.06005614344030619, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2862.3750610351562, |
|
"epoch": 0.18285714285714286, |
|
"grad_norm": 0.0345999151468277, |
|
"kl": 0.00040721893310546875, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": -0.0318, |
|
"reward": 0.12236133217811584, |
|
"reward_std": 0.6523439809679985, |
|
"rewards/cosine_scaled_reward": -0.10548599809408188, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3358.125, |
|
"epoch": 0.18342857142857144, |
|
"grad_norm": 0.013232111930847168, |
|
"kl": 0.0003991127014160156, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0404, |
|
"reward": -0.11832981812767684, |
|
"reward_std": 0.42844806239008904, |
|
"rewards/cosine_scaled_reward": -0.16333157755434513, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3566.4166870117188, |
|
"epoch": 0.184, |
|
"grad_norm": 0.016278283670544624, |
|
"kl": 0.00032711029052734375, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0102, |
|
"reward": -0.4220607131719589, |
|
"reward_std": 0.5171524062752724, |
|
"rewards/cosine_scaled_reward": -0.23186369240283966, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3138.6666870117188, |
|
"epoch": 0.18457142857142858, |
|
"grad_norm": 0.013172892853617668, |
|
"kl": 0.0003600120544433594, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0537, |
|
"reward": 0.05053871381096542, |
|
"reward_std": 0.8983977390453219, |
|
"rewards/cosine_scaled_reward": -0.16223064810037613, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3010.291748046875, |
|
"epoch": 0.18514285714285714, |
|
"grad_norm": 0.025909846648573875, |
|
"kl": 0.00040721893310546875, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.0602, |
|
"reward": 0.5590743962675333, |
|
"reward_std": 0.8683347813785076, |
|
"rewards/cosine_scaled_reward": 0.029537230730056763, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2910.000030517578, |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.015413357876241207, |
|
"kl": 0.0004220008850097656, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": 0.0317, |
|
"reward": -0.0408908948302269, |
|
"reward_std": 0.5930759366601706, |
|
"rewards/cosine_scaled_reward": -0.20794545486569405, |
|
"rewards/format_reward": 0.375, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2532.166748046875, |
|
"epoch": 0.18628571428571428, |
|
"grad_norm": 0.030060861259698868, |
|
"kl": 0.0005965232849121094, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0679, |
|
"reward": 0.6265107244253159, |
|
"reward_std": 0.9718786887824535, |
|
"rewards/cosine_scaled_reward": 0.06325538456439972, |
|
"rewards/format_reward": 0.5, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3148.875, |
|
"epoch": 0.18685714285714286, |
|
"grad_norm": 0.041659917682409286, |
|
"kl": 0.0004248619079589844, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.1537, |
|
"reward": -0.35396782122552395, |
|
"reward_std": 0.2636511065065861, |
|
"rewards/cosine_scaled_reward": -0.2603172492235899, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2493.916717529297, |
|
"epoch": 0.18742857142857142, |
|
"grad_norm": 0.020772553980350494, |
|
"kl": 0.0006198883056640625, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.1175, |
|
"reward": 0.18326607067137957, |
|
"reward_std": 0.49889209493994713, |
|
"rewards/cosine_scaled_reward": -0.11670029908418655, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3093.291748046875, |
|
"epoch": 0.188, |
|
"grad_norm": 0.011293930932879448, |
|
"kl": 0.0003113746643066406, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0669, |
|
"reward": 0.5585142355412245, |
|
"reward_std": 1.4061576128005981, |
|
"rewards/cosine_scaled_reward": 0.09175711218267679, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2978.7916870117188, |
|
"epoch": 0.18857142857142858, |
|
"grad_norm": 0.05254025384783745, |
|
"kl": 0.000415802001953125, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.2134, |
|
"reward": 0.02450428158044815, |
|
"reward_std": 0.6984777390025556, |
|
"rewards/cosine_scaled_reward": -0.11274785548448563, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2632.4583587646484, |
|
"epoch": 0.18914285714285714, |
|
"grad_norm": 0.020709240809082985, |
|
"kl": 0.000537872314453125, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0682, |
|
"reward": 0.45295886788517237, |
|
"reward_std": 0.2939260210841894, |
|
"rewards/cosine_scaled_reward": 0.018146060872823, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2125.0416870117188, |
|
"epoch": 0.18971428571428572, |
|
"grad_norm": 0.019701264798641205, |
|
"kl": 0.00028228759765625, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.198, |
|
"reward": 0.22587934881448746, |
|
"reward_std": 0.6166221983730793, |
|
"rewards/cosine_scaled_reward": -0.15789367514662445, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2881.3333740234375, |
|
"epoch": 0.19028571428571428, |
|
"grad_norm": 0.019184157252311707, |
|
"kl": 0.0005030632019042969, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": 0.0184, |
|
"reward": -0.2152223140001297, |
|
"reward_std": 0.38293132930994034, |
|
"rewards/cosine_scaled_reward": -0.25344450399279594, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3178.7083435058594, |
|
"epoch": 0.19085714285714286, |
|
"grad_norm": 0.014846621081233025, |
|
"kl": 0.000293731689453125, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.1053, |
|
"reward": 0.18489570170640945, |
|
"reward_std": 1.0315047651529312, |
|
"rewards/cosine_scaled_reward": -0.03255215287208557, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2227.750030517578, |
|
"epoch": 0.19142857142857142, |
|
"grad_norm": 0.022946475073695183, |
|
"kl": 0.00047016143798828125, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0083, |
|
"reward": 0.32345347106456757, |
|
"reward_std": 0.5401728432625532, |
|
"rewards/cosine_scaled_reward": -0.1924399547278881, |
|
"rewards/format_reward": 0.7083333395421505, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.9166870117188, |
|
"epoch": 0.192, |
|
"grad_norm": 0.029810767620801926, |
|
"kl": 0.0006122589111328125, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.1395, |
|
"reward": 0.37340210494585335, |
|
"reward_std": 0.7611111477017403, |
|
"rewards/cosine_scaled_reward": -0.08413228765130043, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2755.2500228881836, |
|
"epoch": 0.19257142857142856, |
|
"grad_norm": 0.030772754922509193, |
|
"kl": 0.0004162788391113281, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0926, |
|
"reward": 0.6443270817399025, |
|
"reward_std": 0.9492446109652519, |
|
"rewards/cosine_scaled_reward": 0.09299685433506966, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2727.2083435058594, |
|
"epoch": 0.19314285714285714, |
|
"grad_norm": 0.019305266439914703, |
|
"kl": 0.0005497932434082031, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0282, |
|
"reward": 0.6710007563233376, |
|
"reward_std": 1.21895881742239, |
|
"rewards/cosine_scaled_reward": 0.08550036698579788, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.19371428571428573, |
|
"grad_norm": 0.01511505339294672, |
|
"kl": 0.0004024505615234375, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0, |
|
"reward": -0.610577579587698, |
|
"reward_std": 0.23075975850224495, |
|
"rewards/cosine_scaled_reward": -0.305288789793849, |
|
"rewards/format_reward": 0.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2202.125015258789, |
|
"epoch": 0.19428571428571428, |
|
"grad_norm": 0.02869362384080887, |
|
"kl": 0.0005574226379394531, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.0131, |
|
"reward": 0.296954870223999, |
|
"reward_std": 0.4164566658437252, |
|
"rewards/cosine_scaled_reward": -0.10152260027825832, |
|
"rewards/format_reward": 0.5, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1960.1667175292969, |
|
"epoch": 0.19485714285714287, |
|
"grad_norm": 0.05206461623311043, |
|
"kl": 0.00039577484130859375, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.268, |
|
"reward": 0.24809112399816513, |
|
"reward_std": 0.6666374318301678, |
|
"rewards/cosine_scaled_reward": -0.23012111335992813, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2733.4166717529297, |
|
"epoch": 0.19542857142857142, |
|
"grad_norm": 0.028285467997193336, |
|
"kl": 0.000484466552734375, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.1274, |
|
"reward": 0.2500569764524698, |
|
"reward_std": 0.38465849310159683, |
|
"rewards/cosine_scaled_reward": -0.062471505254507065, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2453.500030517578, |
|
"epoch": 0.196, |
|
"grad_norm": 0.03303263336420059, |
|
"kl": 0.0004892349243164062, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.2808, |
|
"reward": -0.11763790249824524, |
|
"reward_std": 0.5739464350044727, |
|
"rewards/cosine_scaled_reward": -0.30881896358914673, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2497.2083435058594, |
|
"epoch": 0.19657142857142856, |
|
"grad_norm": 0.05244883522391319, |
|
"kl": 0.00046515464782714844, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": 0.0989, |
|
"reward": 0.34542886912822723, |
|
"reward_std": 0.9526717662811279, |
|
"rewards/cosine_scaled_reward": -0.03561893478035927, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1832.7500305175781, |
|
"epoch": 0.19714285714285715, |
|
"grad_norm": 0.041937097907066345, |
|
"kl": 0.0008487701416015625, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": -0.1656, |
|
"reward": 1.009265385568142, |
|
"reward_std": 0.40681467205286026, |
|
"rewards/cosine_scaled_reward": 0.15046602487564087, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1977142857142857, |
|
"grad_norm": 0.011569972150027752, |
|
"kl": 0.000385284423828125, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0, |
|
"reward": -0.31521460227668285, |
|
"reward_std": 0.19862121110782027, |
|
"rewards/cosine_scaled_reward": -0.19927397277206182, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2886.0416870117188, |
|
"epoch": 0.1982857142857143, |
|
"grad_norm": 0.013902098871767521, |
|
"kl": 0.00035190582275390625, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": -0.0006, |
|
"reward": 0.06203071027994156, |
|
"reward_std": 0.5102610923349857, |
|
"rewards/cosine_scaled_reward": -0.11481798812747002, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2697.4166870117188, |
|
"epoch": 0.19885714285714284, |
|
"grad_norm": 0.028411809355020523, |
|
"kl": 0.0004925727844238281, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": -0.1683, |
|
"reward": 0.16876617819070816, |
|
"reward_std": 0.468828896060586, |
|
"rewards/cosine_scaled_reward": -0.16561690717935562, |
|
"rewards/format_reward": 0.5, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3510.6666870117188, |
|
"epoch": 0.19942857142857143, |
|
"grad_norm": 0.012191089801490307, |
|
"kl": 0.00038242340087890625, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": 0.0276, |
|
"reward": -0.3254171907901764, |
|
"reward_std": 0.6530030593276024, |
|
"rewards/cosine_scaled_reward": -0.2460419312119484, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2198.541717529297, |
|
"epoch": 0.2, |
|
"grad_norm": 0.01535298302769661, |
|
"kl": 0.0005538463592529297, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.0941, |
|
"reward": 1.250352792441845, |
|
"reward_std": 0.6242426857352257, |
|
"rewards/cosine_scaled_reward": 0.2710097096860409, |
|
"rewards/format_reward": 0.7083333395421505, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2659.791748046875, |
|
"epoch": 0.20057142857142857, |
|
"grad_norm": 0.017833461984992027, |
|
"kl": 0.0005612373352050781, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0465, |
|
"reward": 0.1603565402328968, |
|
"reward_std": 0.8732819259166718, |
|
"rewards/cosine_scaled_reward": -0.1698217373341322, |
|
"rewards/format_reward": 0.5, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2180.500015258789, |
|
"epoch": 0.20114285714285715, |
|
"grad_norm": 0.01808895543217659, |
|
"kl": 0.0004558563232421875, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.0133, |
|
"reward": 0.2774962969124317, |
|
"reward_std": 0.7046881169080734, |
|
"rewards/cosine_scaled_reward": -0.15291852178052068, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2483.750030517578, |
|
"epoch": 0.2017142857142857, |
|
"grad_norm": 0.023005694150924683, |
|
"kl": 0.0004525184631347656, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": 0.2053, |
|
"reward": 0.28026173263788223, |
|
"reward_std": 0.7538251765072346, |
|
"rewards/cosine_scaled_reward": -0.13070247694849968, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2603.5833435058594, |
|
"epoch": 0.2022857142857143, |
|
"grad_norm": 0.019903168082237244, |
|
"kl": 0.0005669593811035156, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.098, |
|
"reward": 0.14617812633514404, |
|
"reward_std": 0.5162135027348995, |
|
"rewards/cosine_scaled_reward": -0.15607764571905136, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2432.875030517578, |
|
"epoch": 0.20285714285714285, |
|
"grad_norm": 0.02829565852880478, |
|
"kl": 0.0006160736083984375, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": 0.0099, |
|
"reward": 0.19245748221874237, |
|
"reward_std": 0.2699956987053156, |
|
"rewards/cosine_scaled_reward": -0.1329379379749298, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2468.8334197998047, |
|
"epoch": 0.20342857142857143, |
|
"grad_norm": 0.02010870911180973, |
|
"kl": 0.0003266334533691406, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0836, |
|
"reward": 1.1018516272306442, |
|
"reward_std": 0.832906199619174, |
|
"rewards/cosine_scaled_reward": 0.2800924628973007, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2738.2083740234375, |
|
"epoch": 0.204, |
|
"grad_norm": 0.02142561413347721, |
|
"kl": 0.0004112720489501953, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": 0.0696, |
|
"reward": -0.34853553399443626, |
|
"reward_std": 0.1840939112007618, |
|
"rewards/cosine_scaled_reward": -0.3617677837610245, |
|
"rewards/format_reward": 0.375, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2853.416717529297, |
|
"epoch": 0.20457142857142857, |
|
"grad_norm": 0.016107890754938126, |
|
"kl": 0.0005478858947753906, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": 0.0663, |
|
"reward": 0.6284266784787178, |
|
"reward_std": 0.72141382843256, |
|
"rewards/cosine_scaled_reward": 0.10587997734546661, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2372.000045776367, |
|
"epoch": 0.20514285714285715, |
|
"grad_norm": 0.021992484107613564, |
|
"kl": 0.0005311965942382812, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.086, |
|
"reward": 0.6230212822556496, |
|
"reward_std": 0.8500233590602875, |
|
"rewards/cosine_scaled_reward": -0.021822698414325714, |
|
"rewards/format_reward": 0.6666666828095913, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3455.2500610351562, |
|
"epoch": 0.2057142857142857, |
|
"grad_norm": 0.030560219660401344, |
|
"kl": 0.00031185150146484375, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.0487, |
|
"reward": -0.10205069184303284, |
|
"reward_std": 0.8343977108597755, |
|
"rewards/cosine_scaled_reward": -0.13435868825763464, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2516.9583740234375, |
|
"epoch": 0.2062857142857143, |
|
"grad_norm": 0.016366608440876007, |
|
"kl": 0.0004210472106933594, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0794, |
|
"reward": 0.12586134672164917, |
|
"reward_std": 0.6533399596810341, |
|
"rewards/cosine_scaled_reward": -0.18706931918859482, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3351.7916870117188, |
|
"epoch": 0.20685714285714285, |
|
"grad_norm": 0.01776767335832119, |
|
"kl": 0.0003039836883544922, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": 0.0928, |
|
"reward": 0.29054381139576435, |
|
"reward_std": 1.0099711641669273, |
|
"rewards/cosine_scaled_reward": -0.04222810734063387, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2496.8750610351562, |
|
"epoch": 0.20742857142857143, |
|
"grad_norm": 0.03273333981633186, |
|
"kl": 0.0006189346313476562, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": 0.2348, |
|
"reward": 0.34358200430870056, |
|
"reward_std": 0.7383445575833321, |
|
"rewards/cosine_scaled_reward": -0.1407090239226818, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3452.375, |
|
"epoch": 0.208, |
|
"grad_norm": 0.014953386969864368, |
|
"kl": 0.00035858154296875, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.043, |
|
"reward": -0.4569111131131649, |
|
"reward_std": 0.34297534823417664, |
|
"rewards/cosine_scaled_reward": -0.2701222151517868, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2785.7083740234375, |
|
"epoch": 0.20857142857142857, |
|
"grad_norm": 0.02793431654572487, |
|
"kl": 0.0006036758422851562, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": 0.0765, |
|
"reward": 0.11223252862691879, |
|
"reward_std": 0.619122963398695, |
|
"rewards/cosine_scaled_reward": -0.21471708547323942, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3409.541748046875, |
|
"epoch": 0.20914285714285713, |
|
"grad_norm": 0.01248230878263712, |
|
"kl": 0.0003552436828613281, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0491, |
|
"reward": -0.3326341025531292, |
|
"reward_std": 0.3651573769748211, |
|
"rewards/cosine_scaled_reward": -0.2288170587271452, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2808.4583587646484, |
|
"epoch": 0.20971428571428571, |
|
"grad_norm": 0.028179485350847244, |
|
"kl": 0.0006113052368164062, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.0589, |
|
"reward": 0.1167896268889308, |
|
"reward_std": 1.1129950881004333, |
|
"rewards/cosine_scaled_reward": -0.10827185213565826, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2660.791748046875, |
|
"epoch": 0.2102857142857143, |
|
"grad_norm": 0.0340777263045311, |
|
"kl": 0.0005307197570800781, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.1111, |
|
"reward": 1.4332922995090485, |
|
"reward_std": 1.385080635547638, |
|
"rewards/cosine_scaled_reward": 0.34164613112807274, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3233.3750610351562, |
|
"epoch": 0.21085714285714285, |
|
"grad_norm": 0.013750105164945126, |
|
"kl": 0.0003342628479003906, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0717, |
|
"reward": 0.05912124365568161, |
|
"reward_std": 0.874366108328104, |
|
"rewards/cosine_scaled_reward": -0.09543937258422375, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3452.4166870117188, |
|
"epoch": 0.21142857142857144, |
|
"grad_norm": 0.012770992703735828, |
|
"kl": 0.0004000663757324219, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": 0.0281, |
|
"reward": 0.15689724683761597, |
|
"reward_std": 0.2785376161336899, |
|
"rewards/cosine_scaled_reward": -0.04655133932828903, |
|
"rewards/format_reward": 0.25, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2499.791717529297, |
|
"epoch": 0.212, |
|
"grad_norm": 0.018871566280722618, |
|
"kl": 0.00035071372985839844, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.1363, |
|
"reward": 0.5506950244307518, |
|
"reward_std": 0.9160189777612686, |
|
"rewards/cosine_scaled_reward": 0.025347519665956497, |
|
"rewards/format_reward": 0.5000000037252903, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3505.2083740234375, |
|
"epoch": 0.21257142857142858, |
|
"grad_norm": 0.01206301525235176, |
|
"kl": 0.00035262107849121094, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0086, |
|
"reward": -0.2348268087953329, |
|
"reward_std": 0.3814601432532072, |
|
"rewards/cosine_scaled_reward": -0.1799134025350213, |
|
"rewards/format_reward": 0.125, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2949.166717529297, |
|
"epoch": 0.21314285714285713, |
|
"grad_norm": 0.01700667478144169, |
|
"kl": 0.0003724098205566406, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": -0.0476, |
|
"reward": 0.22861511493101716, |
|
"reward_std": 0.5081553272902966, |
|
"rewards/cosine_scaled_reward": -0.031525759026408195, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3467.0000610351562, |
|
"epoch": 0.21371428571428572, |
|
"grad_norm": 0.017147116363048553, |
|
"kl": 0.00045490264892578125, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.0457, |
|
"reward": -0.4217256158590317, |
|
"reward_std": 0.6266062669456005, |
|
"rewards/cosine_scaled_reward": -0.31502948701381683, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2722.375, |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.02914683148264885, |
|
"kl": 0.0004086494445800781, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.1972, |
|
"reward": 0.23988548666238785, |
|
"reward_std": 0.4570600874722004, |
|
"rewards/cosine_scaled_reward": -0.06755725666880608, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3282.5834350585938, |
|
"epoch": 0.21485714285714286, |
|
"grad_norm": 0.01625804975628853, |
|
"kl": 0.00047779083251953125, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0483, |
|
"reward": 0.39857788383960724, |
|
"reward_std": 0.9685629121959209, |
|
"rewards/cosine_scaled_reward": -0.00904439389705658, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2434.125, |
|
"epoch": 0.21542857142857144, |
|
"grad_norm": 0.034063227474689484, |
|
"kl": 0.0004830360412597656, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": 0.164, |
|
"reward": 0.0973532497882843, |
|
"reward_std": 0.7279782295227051, |
|
"rewards/cosine_scaled_reward": -0.22215671092271805, |
|
"rewards/format_reward": 0.5416666753590107, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2981.2083435058594, |
|
"epoch": 0.216, |
|
"grad_norm": 0.02709144353866577, |
|
"kl": 0.0005640983581542969, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.2001, |
|
"reward": -0.3218484600074589, |
|
"reward_std": 0.3952440693974495, |
|
"rewards/cosine_scaled_reward": -0.28592423163354397, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3029.7083740234375, |
|
"epoch": 0.21657142857142858, |
|
"grad_norm": 0.020819807425141335, |
|
"kl": 0.0030651092529296875, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": -0.0893, |
|
"reward": -0.21052202675491571, |
|
"reward_std": 0.38894602842628956, |
|
"rewards/cosine_scaled_reward": -0.2719276868738234, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3011.0833740234375, |
|
"epoch": 0.21714285714285714, |
|
"grad_norm": 0.015882771462202072, |
|
"kl": 0.0006201267242431641, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.0272, |
|
"reward": 0.18160124588757753, |
|
"reward_std": 1.1701444238424301, |
|
"rewards/cosine_scaled_reward": -0.09669936696445802, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3127.2500610351562, |
|
"epoch": 0.21771428571428572, |
|
"grad_norm": 0.07095064222812653, |
|
"kl": 0.0007042884826660156, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.1667, |
|
"reward": -0.5090018883347511, |
|
"reward_std": 0.36618487909436226, |
|
"rewards/cosine_scaled_reward": -0.35866761952638626, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2951.2916870117188, |
|
"epoch": 0.21828571428571428, |
|
"grad_norm": 0.03872789442539215, |
|
"kl": 0.0005259513854980469, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.1543, |
|
"reward": -0.07253091287566349, |
|
"reward_std": 0.7700534537434578, |
|
"rewards/cosine_scaled_reward": -0.20293213427066803, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2982.2916870117188, |
|
"epoch": 0.21885714285714286, |
|
"grad_norm": 0.014369679614901543, |
|
"kl": 0.0005116462707519531, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": -0.0014, |
|
"reward": 0.13170818611979485, |
|
"reward_std": 0.41740766912698746, |
|
"rewards/cosine_scaled_reward": -0.059145910665392876, |
|
"rewards/format_reward": 0.25, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3228.625, |
|
"epoch": 0.21942857142857142, |
|
"grad_norm": 0.012754120863974094, |
|
"kl": 0.0003314018249511719, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": -0.0007, |
|
"reward": 0.17525914683938026, |
|
"reward_std": 0.6218334436416626, |
|
"rewards/cosine_scaled_reward": -0.09987044055014849, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2379.2916717529297, |
|
"epoch": 0.22, |
|
"grad_norm": 0.020193729549646378, |
|
"kl": 0.0006628036499023438, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": -0.0018, |
|
"reward": -0.0569206103682518, |
|
"reward_std": 0.6073889955878258, |
|
"rewards/cosine_scaled_reward": -0.257626973092556, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3001.1666870117188, |
|
"epoch": 0.22057142857142858, |
|
"grad_norm": 0.018432218581438065, |
|
"kl": 0.0004229545593261719, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0952, |
|
"reward": 0.21215322148054838, |
|
"reward_std": 0.888201154768467, |
|
"rewards/cosine_scaled_reward": -0.12309006974101067, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1933.2083740234375, |
|
"epoch": 0.22114285714285714, |
|
"grad_norm": 0.05453366041183472, |
|
"kl": 0.0005445480346679688, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.1831, |
|
"reward": 1.217309720814228, |
|
"reward_std": 0.4588906615972519, |
|
"rewards/cosine_scaled_reward": 0.33782152086496353, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1593.1667175292969, |
|
"epoch": 0.22171428571428572, |
|
"grad_norm": 0.026233471930027008, |
|
"kl": 0.0007166862487792969, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": -0.0644, |
|
"reward": 1.6789040267467499, |
|
"reward_std": 1.0171207189559937, |
|
"rewards/cosine_scaled_reward": 0.33945199474692345, |
|
"rewards/format_reward": 1.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1572.4583587646484, |
|
"epoch": 0.22228571428571428, |
|
"grad_norm": 0.02512061409652233, |
|
"kl": 0.0004048347473144531, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": 0.0294, |
|
"reward": 1.3811066150665283, |
|
"reward_std": 0.3954271301627159, |
|
"rewards/cosine_scaled_reward": 0.3155532553792, |
|
"rewards/format_reward": 0.75, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3255.9583740234375, |
|
"epoch": 0.22285714285714286, |
|
"grad_norm": 0.03513728827238083, |
|
"kl": 0.0005030632019042969, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.1731, |
|
"reward": -0.12103257514536381, |
|
"reward_std": 0.8534305840730667, |
|
"rewards/cosine_scaled_reward": -0.16468296200037003, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2981.8333740234375, |
|
"epoch": 0.22342857142857142, |
|
"grad_norm": 0.03934527188539505, |
|
"kl": 0.0004153251647949219, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.1821, |
|
"reward": -0.06651334711932577, |
|
"reward_std": 0.2963850498199463, |
|
"rewards/cosine_scaled_reward": -0.2207566797733307, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3223.0833740234375, |
|
"epoch": 0.224, |
|
"grad_norm": 0.013558811508119106, |
|
"kl": 0.00038242340087890625, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": 0.0057, |
|
"reward": -0.2705407738685608, |
|
"reward_std": 0.2837679469957948, |
|
"rewards/cosine_scaled_reward": -0.2602703794836998, |
|
"rewards/format_reward": 0.25, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3398.0000610351562, |
|
"epoch": 0.22457142857142856, |
|
"grad_norm": 0.012919829227030277, |
|
"kl": 0.0003833770751953125, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.0292, |
|
"reward": -0.32555074989795685, |
|
"reward_std": 0.6584090702235699, |
|
"rewards/cosine_scaled_reward": -0.2669420391321182, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2330.0833587646484, |
|
"epoch": 0.22514285714285714, |
|
"grad_norm": 0.03346144035458565, |
|
"kl": 0.0005555152893066406, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.167, |
|
"reward": 0.23031404614448547, |
|
"reward_std": 0.7982805892825127, |
|
"rewards/cosine_scaled_reward": -0.11400966346263885, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3490.75, |
|
"epoch": 0.2257142857142857, |
|
"grad_norm": 0.016392916440963745, |
|
"kl": 0.0003719329833984375, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.0408, |
|
"reward": -0.04605567455291748, |
|
"reward_std": 0.8931695856153965, |
|
"rewards/cosine_scaled_reward": -0.10636119358241558, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2741.9166870117188, |
|
"epoch": 0.22628571428571428, |
|
"grad_norm": 0.012639901600778103, |
|
"kl": 0.0002758502960205078, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0119, |
|
"reward": 0.37444762885570526, |
|
"reward_std": 0.4292972981929779, |
|
"rewards/cosine_scaled_reward": -0.06277619302272797, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2962.8333435058594, |
|
"epoch": 0.22685714285714287, |
|
"grad_norm": 0.01602632738649845, |
|
"kl": 0.00043392181396484375, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": -0.0317, |
|
"reward": 0.5750939613208175, |
|
"reward_std": 0.7307236031629145, |
|
"rewards/cosine_scaled_reward": 0.05838030157610774, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2762.5833740234375, |
|
"epoch": 0.22742857142857142, |
|
"grad_norm": 0.016584103927016258, |
|
"kl": 0.00033473968505859375, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": 0.0113, |
|
"reward": 0.8510713949799538, |
|
"reward_std": 0.8986274972558022, |
|
"rewards/cosine_scaled_reward": 0.1338690184056759, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3111.7916870117188, |
|
"epoch": 0.228, |
|
"grad_norm": 0.04136970639228821, |
|
"kl": 0.0006189346313476562, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0911, |
|
"reward": -0.29911787807941437, |
|
"reward_std": 0.36130358278751373, |
|
"rewards/cosine_scaled_reward": -0.2537256069481373, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1176.083381652832, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.032378729432821274, |
|
"kl": 0.0008146762847900391, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": -0.0498, |
|
"reward": 1.2029592096805573, |
|
"reward_std": 0.8494270741939545, |
|
"rewards/cosine_scaled_reward": 0.10147958248853683, |
|
"rewards/format_reward": 1.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2217.2083892822266, |
|
"epoch": 0.22914285714285715, |
|
"grad_norm": 0.03296418488025665, |
|
"kl": 0.0005393028259277344, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0836, |
|
"reward": 0.29183474462479353, |
|
"reward_std": 0.5516791455447674, |
|
"rewards/cosine_scaled_reward": -0.1874159649014473, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2966.0833740234375, |
|
"epoch": 0.2297142857142857, |
|
"grad_norm": 0.023311948403716087, |
|
"kl": 0.0004687309265136719, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": -0.0487, |
|
"reward": -0.14803513139486313, |
|
"reward_std": 0.5541299842298031, |
|
"rewards/cosine_scaled_reward": -0.21985089778900146, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2138.375015258789, |
|
"epoch": 0.2302857142857143, |
|
"grad_norm": 0.0190444178879261, |
|
"kl": 0.0005555152893066406, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.1245, |
|
"reward": 0.7670382708311081, |
|
"reward_std": 0.880347341299057, |
|
"rewards/cosine_scaled_reward": 0.05018577980808914, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2700.1666717529297, |
|
"epoch": 0.23085714285714284, |
|
"grad_norm": 0.02195879817008972, |
|
"kl": 0.0004572868347167969, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": 0.1354, |
|
"reward": -0.03147786110639572, |
|
"reward_std": 0.5353209748864174, |
|
"rewards/cosine_scaled_reward": -0.22407225891947746, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2581.000045776367, |
|
"epoch": 0.23142857142857143, |
|
"grad_norm": 0.01655156910419464, |
|
"kl": 0.00037384033203125, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.063, |
|
"reward": 0.6779801677912474, |
|
"reward_std": 0.5696643739938736, |
|
"rewards/cosine_scaled_reward": 0.08899005688726902, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3439.5000610351562, |
|
"epoch": 0.232, |
|
"grad_norm": 0.02102508395910263, |
|
"kl": 0.0006380081176757812, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.0608, |
|
"reward": -0.2339201234281063, |
|
"reward_std": 0.7671289071440697, |
|
"rewards/cosine_scaled_reward": -0.22112673026276752, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2525.6666870117188, |
|
"epoch": 0.23257142857142857, |
|
"grad_norm": 0.016638562083244324, |
|
"kl": 0.00039958953857421875, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0268, |
|
"reward": 0.5109657794237137, |
|
"reward_std": 0.6168239414691925, |
|
"rewards/cosine_scaled_reward": 0.04714955762028694, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2517.6250610351562, |
|
"epoch": 0.23314285714285715, |
|
"grad_norm": 0.024556465446949005, |
|
"kl": 0.0006203651428222656, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.1592, |
|
"reward": 0.06803740188479424, |
|
"reward_std": 0.8579247817397118, |
|
"rewards/cosine_scaled_reward": -0.19514797255396843, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.2916870117188, |
|
"epoch": 0.2337142857142857, |
|
"grad_norm": 0.029170790687203407, |
|
"kl": 0.0004508495330810547, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0397, |
|
"reward": 0.8809500015340745, |
|
"reward_std": 0.6774098351597786, |
|
"rewards/cosine_scaled_reward": 0.12797498516738415, |
|
"rewards/format_reward": 0.625, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3492.8333740234375, |
|
"epoch": 0.2342857142857143, |
|
"grad_norm": 0.011575359851121902, |
|
"kl": 0.0004987716674804688, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": -0.0028, |
|
"reward": -0.09124118834733963, |
|
"reward_std": 0.882003229111433, |
|
"rewards/cosine_scaled_reward": -0.14978726860135794, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3483.4583740234375, |
|
"epoch": 0.23485714285714285, |
|
"grad_norm": 0.013307969085872173, |
|
"kl": 0.00041484832763671875, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.038, |
|
"reward": 0.0876331478357315, |
|
"reward_std": 0.9785483591258526, |
|
"rewards/cosine_scaled_reward": -0.06035009026527405, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3483.0, |
|
"epoch": 0.23542857142857143, |
|
"grad_norm": 0.014635894447565079, |
|
"kl": 0.0004057884216308594, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": 0.0558, |
|
"reward": -0.7163440883159637, |
|
"reward_std": 0.2162969596683979, |
|
"rewards/cosine_scaled_reward": -0.37900539487600327, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2679.916778564453, |
|
"epoch": 0.236, |
|
"grad_norm": 0.013161610811948776, |
|
"kl": 0.00030541419982910156, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.0978, |
|
"reward": 0.9060661401599646, |
|
"reward_std": 0.8712828233838081, |
|
"rewards/cosine_scaled_reward": 0.11969973146915436, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2817.625030517578, |
|
"epoch": 0.23657142857142857, |
|
"grad_norm": 0.022518867626786232, |
|
"kl": 0.00038623809814453125, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0441, |
|
"reward": 0.13390246778726578, |
|
"reward_std": 0.6470340602099895, |
|
"rewards/cosine_scaled_reward": -0.09971543587744236, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2841.2083740234375, |
|
"epoch": 0.23714285714285716, |
|
"grad_norm": 0.023023054003715515, |
|
"kl": 0.0006165504455566406, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.0876, |
|
"reward": -0.059880852699279785, |
|
"reward_std": 0.4681435003876686, |
|
"rewards/cosine_scaled_reward": -0.19660709938034415, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3561.75, |
|
"epoch": 0.2377142857142857, |
|
"grad_norm": 0.018553022295236588, |
|
"kl": 0.0003795623779296875, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0128, |
|
"reward": -0.2655714526772499, |
|
"reward_std": 0.3128821440041065, |
|
"rewards/cosine_scaled_reward": -0.15361905843019485, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2401.7917098999023, |
|
"epoch": 0.2382857142857143, |
|
"grad_norm": 0.026138195767998695, |
|
"kl": 0.0005283355712890625, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0481, |
|
"reward": 0.4319583922624588, |
|
"reward_std": 0.5039754528552294, |
|
"rewards/cosine_scaled_reward": -0.013187475502490997, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2618.4166717529297, |
|
"epoch": 0.23885714285714285, |
|
"grad_norm": 0.03529448062181473, |
|
"kl": 0.0008215904235839844, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.053, |
|
"reward": -0.38310980424284935, |
|
"reward_std": 0.3161798268556595, |
|
"rewards/cosine_scaled_reward": -0.3790549263358116, |
|
"rewards/format_reward": 0.375, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.2083435058594, |
|
"epoch": 0.23942857142857144, |
|
"grad_norm": 0.015204512514173985, |
|
"kl": 0.0004143714904785156, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": 0.0434, |
|
"reward": -0.06527332402765751, |
|
"reward_std": 0.3376622749492526, |
|
"rewards/cosine_scaled_reward": -0.2618033364415169, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1704.7916870117188, |
|
"epoch": 0.24, |
|
"grad_norm": 0.05218733474612236, |
|
"kl": 0.0005950927734375, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.249, |
|
"reward": 0.7783117964863777, |
|
"reward_std": 0.45935374312102795, |
|
"rewards/cosine_scaled_reward": 0.03498924896121025, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3204.5, |
|
"epoch": 0.24057142857142857, |
|
"grad_norm": 0.01363091915845871, |
|
"kl": 0.0003876686096191406, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0017, |
|
"reward": -0.22349218279123306, |
|
"reward_std": 0.22306939586997032, |
|
"rewards/cosine_scaled_reward": -0.23674608767032623, |
|
"rewards/format_reward": 0.25, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24114285714285713, |
|
"grad_norm": 0.010621090419590473, |
|
"kl": 0.0003581047058105469, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0, |
|
"reward": -0.5770844966173172, |
|
"reward_std": 0.2044766042381525, |
|
"rewards/cosine_scaled_reward": -0.288542240858078, |
|
"rewards/format_reward": 0.0, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2312.250030517578, |
|
"epoch": 0.24171428571428571, |
|
"grad_norm": 0.02375810779631138, |
|
"kl": 0.0005865097045898438, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": -0.0847, |
|
"reward": 0.414031776599586, |
|
"reward_std": 0.732084047049284, |
|
"rewards/cosine_scaled_reward": -0.10548414289951324, |
|
"rewards/format_reward": 0.6250000037252903, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3128.5833740234375, |
|
"epoch": 0.2422857142857143, |
|
"grad_norm": 0.01551869697868824, |
|
"kl": 0.00045108795166015625, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0483, |
|
"reward": 0.01406601071357727, |
|
"reward_std": 0.8138113915920258, |
|
"rewards/cosine_scaled_reward": -0.1388003290630877, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 0.011893173679709435, |
|
"kl": 0.00039386749267578125, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": 0.0, |
|
"reward": -0.5272135511040688, |
|
"reward_std": 0.4391126446425915, |
|
"rewards/cosine_scaled_reward": -0.2844401001930237, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3331.7916870117188, |
|
"epoch": 0.24342857142857144, |
|
"grad_norm": 0.011359743773937225, |
|
"kl": 0.0003376007080078125, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.015, |
|
"reward": 0.13842247426509857, |
|
"reward_std": 0.2540343776345253, |
|
"rewards/cosine_scaled_reward": -0.05578881502151489, |
|
"rewards/format_reward": 0.25, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3101.5834350585938, |
|
"epoch": 0.244, |
|
"grad_norm": 0.01774391531944275, |
|
"kl": 0.00032806396484375, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": 0.0951, |
|
"reward": 0.436421200633049, |
|
"reward_std": 0.9749777019023895, |
|
"rewards/cosine_scaled_reward": -0.05262273037806153, |
|
"rewards/format_reward": 0.5416666828095913, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2517.2083740234375, |
|
"epoch": 0.24457142857142858, |
|
"grad_norm": 0.018922699615359306, |
|
"kl": 0.0004782676696777344, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": 0.0171, |
|
"reward": 1.0143605917692184, |
|
"reward_std": 0.9533870965242386, |
|
"rewards/cosine_scaled_reward": 0.17384696938097477, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3139.4166870117188, |
|
"epoch": 0.24514285714285713, |
|
"grad_norm": 0.01879842020571232, |
|
"kl": 0.0005502700805664062, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0225, |
|
"reward": -0.2566852793097496, |
|
"reward_std": 0.44601357355713844, |
|
"rewards/cosine_scaled_reward": -0.23250930570065975, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3440.3333740234375, |
|
"epoch": 0.24571428571428572, |
|
"grad_norm": 0.012092203833162785, |
|
"kl": 0.0003752708435058594, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0602, |
|
"reward": -0.08506331103853881, |
|
"reward_std": 0.7580065792426467, |
|
"rewards/cosine_scaled_reward": -0.18836499378085136, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24628571428571427, |
|
"grad_norm": 0.009651312604546547, |
|
"kl": 0.000339508056640625, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0, |
|
"reward": -0.5135565400123596, |
|
"reward_std": 0.17573323473334312, |
|
"rewards/cosine_scaled_reward": -0.2776115983724594, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2914.6250610351562, |
|
"epoch": 0.24685714285714286, |
|
"grad_norm": 0.04963121563196182, |
|
"kl": 0.00048542022705078125, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0778, |
|
"reward": 0.22040988504886627, |
|
"reward_std": 0.7857857123017311, |
|
"rewards/cosine_scaled_reward": -0.05646173283457756, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1992.5417022705078, |
|
"epoch": 0.24742857142857144, |
|
"grad_norm": 0.018762821331620216, |
|
"kl": 0.0004487037658691406, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": 0.0176, |
|
"reward": 0.7027051514014602, |
|
"reward_std": 0.5680601857602596, |
|
"rewards/cosine_scaled_reward": -0.002814119216054678, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3400.2500610351562, |
|
"epoch": 0.248, |
|
"grad_norm": 0.014792956411838531, |
|
"kl": 0.000438690185546875, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.0359, |
|
"reward": -0.11403081566095352, |
|
"reward_std": 0.695421889424324, |
|
"rewards/cosine_scaled_reward": -0.14034874364733696, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2794.375, |
|
"epoch": 0.24857142857142858, |
|
"grad_norm": 0.015310428105294704, |
|
"kl": 0.0004611015319824219, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": 0.0315, |
|
"reward": 0.5071351528167725, |
|
"reward_std": 0.44683452136814594, |
|
"rewards/cosine_scaled_reward": 0.10773422196507454, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2730.250030517578, |
|
"epoch": 0.24914285714285714, |
|
"grad_norm": 0.03859826177358627, |
|
"kl": 0.000392913818359375, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.1139, |
|
"reward": 0.25591667648404837, |
|
"reward_std": 0.7985352799296379, |
|
"rewards/cosine_scaled_reward": -0.08037500828504562, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1703.7083435058594, |
|
"epoch": 0.24971428571428572, |
|
"grad_norm": 0.035596735775470734, |
|
"kl": 0.0003857612609863281, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.1267, |
|
"reward": 0.5573078580200672, |
|
"reward_std": 0.6633748337626457, |
|
"rewards/cosine_scaled_reward": -0.11717941612005234, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2871.3333435058594, |
|
"epoch": 0.2502857142857143, |
|
"grad_norm": 0.012692431919276714, |
|
"kl": 0.000335693359375, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": 0.0717, |
|
"reward": -0.2980673983693123, |
|
"reward_std": 0.15785099938511848, |
|
"rewards/cosine_scaled_reward": -0.29486703872680664, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2338.7083435058594, |
|
"epoch": 0.25085714285714283, |
|
"grad_norm": 0.01559723261743784, |
|
"kl": 0.0004553794860839844, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.0277, |
|
"reward": 0.19210883975028992, |
|
"reward_std": 0.7682934515178204, |
|
"rewards/cosine_scaled_reward": -0.17477891221642494, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1833.625015258789, |
|
"epoch": 0.25142857142857145, |
|
"grad_norm": 0.02628200314939022, |
|
"kl": 0.0005588531494140625, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": -0.0343, |
|
"reward": 0.7311004251241684, |
|
"reward_std": 0.6230124272406101, |
|
"rewards/cosine_scaled_reward": -0.009449809789657593, |
|
"rewards/format_reward": 0.75, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.252, |
|
"grad_norm": 0.011055848561227322, |
|
"kl": 0.0003528594970703125, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0, |
|
"reward": -0.23345018550753593, |
|
"reward_std": 0.553386427462101, |
|
"rewards/cosine_scaled_reward": -0.15839176578447223, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2781.4583435058594, |
|
"epoch": 0.25257142857142856, |
|
"grad_norm": 0.040766630321741104, |
|
"kl": 0.0006265640258789062, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": 0.0573, |
|
"reward": 0.07009226828813553, |
|
"reward_std": 0.8544792495667934, |
|
"rewards/cosine_scaled_reward": -0.13162054121494293, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3411.25, |
|
"epoch": 0.25314285714285717, |
|
"grad_norm": 0.013510222546756268, |
|
"kl": 0.0003809928894042969, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.085, |
|
"reward": -0.4627658315002918, |
|
"reward_std": 0.3685727119445801, |
|
"rewards/cosine_scaled_reward": -0.3355495557188988, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3350.0833740234375, |
|
"epoch": 0.2537142857142857, |
|
"grad_norm": 0.017425937578082085, |
|
"kl": 0.00029087066650390625, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.109, |
|
"reward": -0.536733441054821, |
|
"reward_std": 0.5179216116666794, |
|
"rewards/cosine_scaled_reward": -0.3308667168021202, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2777.4583587646484, |
|
"epoch": 0.2542857142857143, |
|
"grad_norm": 0.01394882146269083, |
|
"kl": 0.0004124641418457031, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0398, |
|
"reward": 0.6419320218265057, |
|
"reward_std": 1.290092408657074, |
|
"rewards/cosine_scaled_reward": 0.11263268813490868, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2162.0000610351562, |
|
"epoch": 0.25485714285714284, |
|
"grad_norm": 0.025396212935447693, |
|
"kl": 0.0004458427429199219, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.2263, |
|
"reward": 0.2729286514222622, |
|
"reward_std": 0.4925660863518715, |
|
"rewards/cosine_scaled_reward": -0.2177023496478796, |
|
"rewards/format_reward": 0.7083333395421505, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2784.250045776367, |
|
"epoch": 0.25542857142857145, |
|
"grad_norm": 0.01916358806192875, |
|
"kl": 0.00040149688720703125, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": 0.0456, |
|
"reward": -0.28101276885718107, |
|
"reward_std": 0.27371450141072273, |
|
"rewards/cosine_scaled_reward": -0.28633972629904747, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3356.5833740234375, |
|
"epoch": 0.256, |
|
"grad_norm": 0.012381333857774734, |
|
"kl": 0.0004825592041015625, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.0587, |
|
"reward": 0.8448215499520302, |
|
"reward_std": 0.715430673211813, |
|
"rewards/cosine_scaled_reward": 0.2557440847158432, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2940.3334350585938, |
|
"epoch": 0.25657142857142856, |
|
"grad_norm": 0.017700130119919777, |
|
"kl": 0.0002903938293457031, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": 0.1691, |
|
"reward": 0.6478632241487503, |
|
"reward_std": 1.2038164585828781, |
|
"rewards/cosine_scaled_reward": 0.11559828370809555, |
|
"rewards/format_reward": 0.4166666753590107, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1642.2084197998047, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.01771002635359764, |
|
"kl": 0.0003179311752319336, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.1028, |
|
"reward": 0.930170651525259, |
|
"reward_std": 0.5905403085052967, |
|
"rewards/cosine_scaled_reward": 0.027585337636992335, |
|
"rewards/format_reward": 0.875, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3493.4583740234375, |
|
"epoch": 0.25771428571428573, |
|
"grad_norm": 0.015439066104590893, |
|
"kl": 0.00055694580078125, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0217, |
|
"reward": -0.7016485892236233, |
|
"reward_std": 0.41242050286382437, |
|
"rewards/cosine_scaled_reward": -0.3924909606575966, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2162.375015258789, |
|
"epoch": 0.2582857142857143, |
|
"grad_norm": 0.031103147193789482, |
|
"kl": 0.0004916191101074219, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.1222, |
|
"reward": 0.33962953090667725, |
|
"reward_std": 0.5016037877649069, |
|
"rewards/cosine_scaled_reward": -0.14268524572253227, |
|
"rewards/format_reward": 0.625, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2658.2500228881836, |
|
"epoch": 0.25885714285714284, |
|
"grad_norm": 0.02379673905670643, |
|
"kl": 0.0007123947143554688, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.0397, |
|
"reward": -0.1027427650988102, |
|
"reward_std": 0.6555835595354438, |
|
"rewards/cosine_scaled_reward": -0.2180380504578352, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2904.4583740234375, |
|
"epoch": 0.25942857142857145, |
|
"grad_norm": 0.024318231269717216, |
|
"kl": 0.0005178451538085938, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.1233, |
|
"reward": 0.12724535167217255, |
|
"reward_std": 0.690078116953373, |
|
"rewards/cosine_scaled_reward": -0.10304398089647293, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3007.0833740234375, |
|
"epoch": 0.26, |
|
"grad_norm": 0.014805259183049202, |
|
"kl": 0.00030303001403808594, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0556, |
|
"reward": 0.07904787175357342, |
|
"reward_std": 0.41673495434224606, |
|
"rewards/cosine_scaled_reward": -0.12714274739846587, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3369.75, |
|
"epoch": 0.26057142857142856, |
|
"grad_norm": 0.01267288252711296, |
|
"kl": 0.000545501708984375, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0697, |
|
"reward": -0.19169889390468597, |
|
"reward_std": 0.5537064597010612, |
|
"rewards/cosine_scaled_reward": -0.15834945812821388, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2750.4166870117188, |
|
"epoch": 0.2611428571428571, |
|
"grad_norm": 0.01913038082420826, |
|
"kl": 0.0005252361297607422, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.0638, |
|
"reward": -0.025101646780967712, |
|
"reward_std": 0.8123595081269741, |
|
"rewards/cosine_scaled_reward": -0.17921748850494623, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.291732788086, |
|
"epoch": 0.26171428571428573, |
|
"grad_norm": 0.018483366817235947, |
|
"kl": 0.0005779266357421875, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": 0.1314, |
|
"reward": 0.18389775604009628, |
|
"reward_std": 0.6026048362255096, |
|
"rewards/cosine_scaled_reward": -0.11638446152210236, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2847.2083435058594, |
|
"epoch": 0.2622857142857143, |
|
"grad_norm": 0.019928766414523125, |
|
"kl": 0.0004062652587890625, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.116, |
|
"reward": 0.056805893778800964, |
|
"reward_std": 0.897519065067172, |
|
"rewards/cosine_scaled_reward": -0.15909705124795437, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2855.7083740234375, |
|
"epoch": 0.26285714285714284, |
|
"grad_norm": 0.01562991738319397, |
|
"kl": 0.0004324913024902344, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.011, |
|
"reward": 0.4357723630964756, |
|
"reward_std": 0.7417406067252159, |
|
"rewards/cosine_scaled_reward": 0.030386213213205338, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2806.1666870117188, |
|
"epoch": 0.2634285714285714, |
|
"grad_norm": 0.022745100781321526, |
|
"kl": 0.0005688667297363281, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0915, |
|
"reward": -0.35859447717666626, |
|
"reward_std": 0.3101446107029915, |
|
"rewards/cosine_scaled_reward": -0.32513057440519333, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3287.166748046875, |
|
"epoch": 0.264, |
|
"grad_norm": 0.014005818404257298, |
|
"kl": 0.0002605915069580078, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.1217, |
|
"reward": -0.07925862073898315, |
|
"reward_std": 0.570786502212286, |
|
"rewards/cosine_scaled_reward": -0.12296264991164207, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3048.8750610351562, |
|
"epoch": 0.26457142857142857, |
|
"grad_norm": 0.026238108053803444, |
|
"kl": 0.0003509521484375, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.1634, |
|
"reward": -0.2891614316031337, |
|
"reward_std": 0.6285391822457314, |
|
"rewards/cosine_scaled_reward": -0.2695807181298733, |
|
"rewards/format_reward": 0.25, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2917.9583435058594, |
|
"epoch": 0.2651428571428571, |
|
"grad_norm": 0.014083731919527054, |
|
"kl": 0.00032520294189453125, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": 0.0497, |
|
"reward": 0.23886509239673615, |
|
"reward_std": 0.5035260319709778, |
|
"rewards/cosine_scaled_reward": -0.08890077471733093, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3480.75, |
|
"epoch": 0.26571428571428574, |
|
"grad_norm": 0.01106889545917511, |
|
"kl": 0.0002949237823486328, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.022, |
|
"reward": 0.06554645299911499, |
|
"reward_std": 0.4903480280190706, |
|
"rewards/cosine_scaled_reward": -0.0505601167678833, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.875030517578, |
|
"epoch": 0.2662857142857143, |
|
"grad_norm": 0.017086246982216835, |
|
"kl": 0.0003123283386230469, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0943, |
|
"reward": -0.1813066378235817, |
|
"reward_std": 0.2922391891479492, |
|
"rewards/cosine_scaled_reward": -0.31981998309493065, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3453.916748046875, |
|
"epoch": 0.26685714285714285, |
|
"grad_norm": 0.013130133971571922, |
|
"kl": 0.0003361701965332031, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.0385, |
|
"reward": 0.055451929569244385, |
|
"reward_std": 0.7496693283319473, |
|
"rewards/cosine_scaled_reward": -0.1181073747575283, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2965.5416870117188, |
|
"epoch": 0.2674285714285714, |
|
"grad_norm": 0.02877657674252987, |
|
"kl": 0.0003726482391357422, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.1355, |
|
"reward": 0.2859138697385788, |
|
"reward_std": 0.8501931093633175, |
|
"rewards/cosine_scaled_reward": -0.0445430725812912, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3065.666717529297, |
|
"epoch": 0.268, |
|
"grad_norm": 0.021241569891572, |
|
"kl": 0.0003745555877685547, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": -0.0209, |
|
"reward": -0.22108882665634155, |
|
"reward_std": 0.5584007576107979, |
|
"rewards/cosine_scaled_reward": -0.2772110812366009, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2091.041717529297, |
|
"epoch": 0.26857142857142857, |
|
"grad_norm": 0.043246157467365265, |
|
"kl": 0.00055694580078125, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.1196, |
|
"reward": 1.0320170223712921, |
|
"reward_std": 1.0782769918441772, |
|
"rewards/cosine_scaled_reward": 0.2035085055977106, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2538.291717529297, |
|
"epoch": 0.26914285714285713, |
|
"grad_norm": 0.016464348882436752, |
|
"kl": 0.00033664703369140625, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0476, |
|
"reward": 0.2932426920160651, |
|
"reward_std": 0.5623490735888481, |
|
"rewards/cosine_scaled_reward": -0.1450453530997038, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3552.5833740234375, |
|
"epoch": 0.26971428571428574, |
|
"grad_norm": 0.0214456245303154, |
|
"kl": 0.00042819976806640625, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": 0.0135, |
|
"reward": -0.35011430410668254, |
|
"reward_std": 0.3236675038933754, |
|
"rewards/cosine_scaled_reward": -0.23755715577863157, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2889.4166717529297, |
|
"epoch": 0.2702857142857143, |
|
"grad_norm": 0.02036861516535282, |
|
"kl": 0.0003829002380371094, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": 0.0107, |
|
"reward": -0.004865109920501709, |
|
"reward_std": 0.6773473080247641, |
|
"rewards/cosine_scaled_reward": -0.14826588705182076, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2525.9583740234375, |
|
"epoch": 0.27085714285714285, |
|
"grad_norm": 0.013575663790106773, |
|
"kl": 0.0002644062042236328, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": -0.0323, |
|
"reward": 0.5381094664335251, |
|
"reward_std": 0.44665071181952953, |
|
"rewards/cosine_scaled_reward": -0.0017786095850169659, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2851.9583740234375, |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.020593978464603424, |
|
"kl": 0.0005025863647460938, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.0314, |
|
"reward": 0.17490556836128235, |
|
"reward_std": 0.6029777117073536, |
|
"rewards/cosine_scaled_reward": -0.12088055908679962, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.272, |
|
"grad_norm": 0.01853923127055168, |
|
"kl": 0.000408172607421875, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0, |
|
"reward": -0.6839941293001175, |
|
"reward_std": 0.23570294678211212, |
|
"rewards/cosine_scaled_reward": -0.34199706465005875, |
|
"rewards/format_reward": 0.0, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3223.4166870117188, |
|
"epoch": 0.2725714285714286, |
|
"grad_norm": 0.02092069201171398, |
|
"kl": 0.00046443939208984375, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.1149, |
|
"reward": -0.4160095602273941, |
|
"reward_std": 0.3700854703783989, |
|
"rewards/cosine_scaled_reward": -0.33300479501485825, |
|
"rewards/format_reward": 0.25, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.27314285714285713, |
|
"grad_norm": 0.015490886755287647, |
|
"kl": 0.0003581047058105469, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0, |
|
"reward": -0.35857730358839035, |
|
"reward_std": 0.1991352178156376, |
|
"rewards/cosine_scaled_reward": -0.17928865179419518, |
|
"rewards/format_reward": 0.0, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2192.2917098999023, |
|
"epoch": 0.2737142857142857, |
|
"grad_norm": 0.025250233709812164, |
|
"kl": 0.0003457069396972656, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.2067, |
|
"reward": 1.0132533311843872, |
|
"reward_std": 0.5927854059264064, |
|
"rewards/cosine_scaled_reward": 0.21495997160673141, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3373.9583740234375, |
|
"epoch": 0.2742857142857143, |
|
"grad_norm": 0.023761438205838203, |
|
"kl": 0.0004482269287109375, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.0815, |
|
"reward": 0.2368101328611374, |
|
"reward_std": 0.7100772261619568, |
|
"rewards/cosine_scaled_reward": -0.006594939972274005, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3024.125, |
|
"epoch": 0.27485714285714286, |
|
"grad_norm": 0.01686985418200493, |
|
"kl": 0.0006651878356933594, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0839, |
|
"reward": -0.0927225798368454, |
|
"reward_std": 0.6664447784423828, |
|
"rewards/cosine_scaled_reward": -0.21302797086536884, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2792.125030517578, |
|
"epoch": 0.2754285714285714, |
|
"grad_norm": 0.016433361917734146, |
|
"kl": 0.00051116943359375, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": 0.0005, |
|
"reward": -0.018994301557540894, |
|
"reward_std": 0.6018916461616755, |
|
"rewards/cosine_scaled_reward": -0.19699716940522194, |
|
"rewards/format_reward": 0.375, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2972.1250610351562, |
|
"epoch": 0.276, |
|
"grad_norm": 0.031187044456601143, |
|
"kl": 0.0005083084106445312, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.215, |
|
"reward": -0.3035236857831478, |
|
"reward_std": 0.6242531836032867, |
|
"rewards/cosine_scaled_reward": -0.2975951712578535, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2626.541717529297, |
|
"epoch": 0.2765714285714286, |
|
"grad_norm": 0.03856087848544121, |
|
"kl": 0.0005278587341308594, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.0923, |
|
"reward": -0.07003412395715714, |
|
"reward_std": 0.5723829306662083, |
|
"rewards/cosine_scaled_reward": -0.26418372616171837, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3163.291748046875, |
|
"epoch": 0.27714285714285714, |
|
"grad_norm": 0.023002101108431816, |
|
"kl": 0.0003609657287597656, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.1445, |
|
"reward": -0.1619274765253067, |
|
"reward_std": 0.6520496867597103, |
|
"rewards/cosine_scaled_reward": -0.18513040244579315, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2786.500045776367, |
|
"epoch": 0.2777142857142857, |
|
"grad_norm": 0.04367635026574135, |
|
"kl": 0.00084686279296875, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.1118, |
|
"reward": -0.10202455054968596, |
|
"reward_std": 0.5869803428649902, |
|
"rewards/cosine_scaled_reward": -0.19684561155736446, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2069.625045776367, |
|
"epoch": 0.2782857142857143, |
|
"grad_norm": 0.01725645735859871, |
|
"kl": 0.0004436969757080078, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.0339, |
|
"reward": 0.9145368824247271, |
|
"reward_std": 0.6427567200735211, |
|
"rewards/cosine_scaled_reward": 0.1656017464119941, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3266.9583740234375, |
|
"epoch": 0.27885714285714286, |
|
"grad_norm": 0.020370880141854286, |
|
"kl": 0.0003273487091064453, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.069, |
|
"reward": 0.08849874883890152, |
|
"reward_std": 0.9649087898433208, |
|
"rewards/cosine_scaled_reward": -0.10158396512269974, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2884.7500610351562, |
|
"epoch": 0.2794285714285714, |
|
"grad_norm": 0.031119707971811295, |
|
"kl": 0.0006618499755859375, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.1788, |
|
"reward": -0.1438409616239369, |
|
"reward_std": 0.752083495259285, |
|
"rewards/cosine_scaled_reward": -0.2802538275718689, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2443.8333740234375, |
|
"epoch": 0.28, |
|
"grad_norm": 0.033809904009103775, |
|
"kl": 0.0004324913024902344, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.1043, |
|
"reward": 0.5690474957227707, |
|
"reward_std": 0.7256521657109261, |
|
"rewards/cosine_scaled_reward": 0.034523727372288704, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2658.3750610351562, |
|
"epoch": 0.2805714285714286, |
|
"grad_norm": 0.02354743331670761, |
|
"kl": 0.0004076957702636719, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.0933, |
|
"reward": 0.611915085464716, |
|
"reward_std": 0.9881070479750633, |
|
"rewards/cosine_scaled_reward": 0.05595753900706768, |
|
"rewards/format_reward": 0.5000000037252903, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2955.7083740234375, |
|
"epoch": 0.28114285714285714, |
|
"grad_norm": 0.017025692388415337, |
|
"kl": 0.000514984130859375, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.1604, |
|
"reward": -0.22188673401251435, |
|
"reward_std": 0.46732087805867195, |
|
"rewards/cosine_scaled_reward": -0.27761003375053406, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2733.125045776367, |
|
"epoch": 0.2817142857142857, |
|
"grad_norm": 0.025227824226021767, |
|
"kl": 0.0007300376892089844, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.051, |
|
"reward": 0.24700819700956345, |
|
"reward_std": 0.9385927617549896, |
|
"rewards/cosine_scaled_reward": -0.12649590522050858, |
|
"rewards/format_reward": 0.5, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2702.0416870117188, |
|
"epoch": 0.2822857142857143, |
|
"grad_norm": 0.024555031210184097, |
|
"kl": 0.0006690025329589844, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.071, |
|
"reward": 0.8110056445002556, |
|
"reward_std": 1.0802773237228394, |
|
"rewards/cosine_scaled_reward": 0.15550284087657928, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.28285714285714286, |
|
"grad_norm": 0.017543919384479523, |
|
"kl": 0.00031828880310058594, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": 0.0, |
|
"reward": -0.29333774745464325, |
|
"reward_std": 0.505841463804245, |
|
"rewards/cosine_scaled_reward": -0.18833555281162262, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1680.4167175292969, |
|
"epoch": 0.2834285714285714, |
|
"grad_norm": 0.0482969656586647, |
|
"kl": 0.0008435249328613281, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.1698, |
|
"reward": 0.8347217477858067, |
|
"reward_std": 0.6243661791086197, |
|
"rewards/cosine_scaled_reward": -0.06180577352643013, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3413.5000610351562, |
|
"epoch": 0.284, |
|
"grad_norm": 0.010940664447844028, |
|
"kl": 0.000286102294921875, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.0685, |
|
"reward": -0.3512016786262393, |
|
"reward_std": 0.293385605327785, |
|
"rewards/cosine_scaled_reward": -0.23810084303840995, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2852.9166717529297, |
|
"epoch": 0.2845714285714286, |
|
"grad_norm": 0.031826820224523544, |
|
"kl": 0.0005040168762207031, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0531, |
|
"reward": -0.09286746755242348, |
|
"reward_std": 0.689836498349905, |
|
"rewards/cosine_scaled_reward": -0.19226707890629768, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2774.666748046875, |
|
"epoch": 0.28514285714285714, |
|
"grad_norm": 0.015145834535360336, |
|
"kl": 0.0004925727844238281, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.047, |
|
"reward": 1.141416186466813, |
|
"reward_std": 1.473642259836197, |
|
"rewards/cosine_scaled_reward": 0.2582080829888582, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2512.625030517578, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.045064449310302734, |
|
"kl": 0.0005974769592285156, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": -0.0824, |
|
"reward": -0.023780837655067444, |
|
"reward_std": 0.46510184183716774, |
|
"rewards/cosine_scaled_reward": -0.2827237620949745, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.07169770698851426, |
|
"train_runtime": 25124.3049, |
|
"train_samples_per_second": 0.478, |
|
"train_steps_per_second": 0.02 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|