|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2857142857142857, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3068.5000610351562, |
|
"epoch": 0.0005714285714285715, |
|
"grad_norm": 0.013173403218388557, |
|
"kl": 0.0005006790161132812, |
|
"learning_rate": 0.0, |
|
"loss": -0.0242, |
|
"reward": 0.200983926653862, |
|
"reward_std": 0.24425111338496208, |
|
"rewards/cosine_scaled_reward": -0.0453413650393486, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2930.9583740234375, |
|
"epoch": 0.001142857142857143, |
|
"grad_norm": 0.043274302035570145, |
|
"kl": 0.0003731250762939453, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.2092, |
|
"reward": -0.28063105791807175, |
|
"reward_std": 0.29903180059045553, |
|
"rewards/cosine_scaled_reward": -0.28614887595176697, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2402.4584045410156, |
|
"epoch": 0.0017142857142857142, |
|
"grad_norm": 0.027086207643151283, |
|
"kl": 0.0004477500915527344, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.1282, |
|
"reward": 0.3940318487584591, |
|
"reward_std": 0.7995570376515388, |
|
"rewards/cosine_scaled_reward": -0.0946507640182972, |
|
"rewards/format_reward": 0.5833333507180214, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2621.1666870117188, |
|
"epoch": 0.002285714285714286, |
|
"grad_norm": 0.017406703904271126, |
|
"kl": 0.00045299530029296875, |
|
"learning_rate": 6e-06, |
|
"loss": 0.0693, |
|
"reward": 0.26569247245788574, |
|
"reward_std": 0.6448228061199188, |
|
"rewards/cosine_scaled_reward": -0.13798709958791733, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2960.4583435058594, |
|
"epoch": 0.002857142857142857, |
|
"grad_norm": 0.03813392296433449, |
|
"kl": 0.0005469322204589844, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.2296, |
|
"reward": -0.4634598270058632, |
|
"reward_std": 0.31902989000082016, |
|
"rewards/cosine_scaled_reward": -0.3358965665102005, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2929.625, |
|
"epoch": 0.0034285714285714284, |
|
"grad_norm": 0.09496363252401352, |
|
"kl": 0.000408172607421875, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2546, |
|
"reward": -0.23965296894311905, |
|
"reward_std": 0.21214338019490242, |
|
"rewards/cosine_scaled_reward": -0.24482648819684982, |
|
"rewards/format_reward": 0.25, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2929.125030517578, |
|
"epoch": 0.004, |
|
"grad_norm": 0.021513408049941063, |
|
"kl": 0.0005333423614501953, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.154, |
|
"reward": 0.022021979675628245, |
|
"reward_std": 0.5914725549519062, |
|
"rewards/cosine_scaled_reward": -0.13482235372066498, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3008.8333435058594, |
|
"epoch": 0.004571428571428572, |
|
"grad_norm": 0.014135723002254963, |
|
"kl": 0.0004029273986816406, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": -0.0025, |
|
"reward": -0.6439514700323343, |
|
"reward_std": 0.10124612040817738, |
|
"rewards/cosine_scaled_reward": -0.4469757154583931, |
|
"rewards/format_reward": 0.25, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2343.9166870117188, |
|
"epoch": 0.005142857142857143, |
|
"grad_norm": 0.022239407524466515, |
|
"kl": 0.0003972053527832031, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": -0.0438, |
|
"reward": 0.5440307706594467, |
|
"reward_std": 0.46696411445736885, |
|
"rewards/cosine_scaled_reward": 0.001182064414024353, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3273.166748046875, |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 0.012372066266834736, |
|
"kl": 0.0002987384796142578, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.044, |
|
"reward": 0.2406943179666996, |
|
"reward_std": 1.128564938902855, |
|
"rewards/cosine_scaled_reward": -0.04631950333714485, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2470.7500610351562, |
|
"epoch": 0.006285714285714286, |
|
"grad_norm": 0.02754840813577175, |
|
"kl": 0.0005016326904296875, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1079, |
|
"reward": 0.0395459933206439, |
|
"reward_std": 0.7187126167118549, |
|
"rewards/cosine_scaled_reward": -0.251060351729393, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2851.4583740234375, |
|
"epoch": 0.006857142857142857, |
|
"grad_norm": 0.032746948301792145, |
|
"kl": 0.0005283355712890625, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": -0.0402, |
|
"reward": 0.5950729995965958, |
|
"reward_std": 0.7282880395650864, |
|
"rewards/cosine_scaled_reward": 0.005869843065738678, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2740.9584350585938, |
|
"epoch": 0.0074285714285714285, |
|
"grad_norm": 0.016786912456154823, |
|
"kl": 0.0004119873046875, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.062, |
|
"reward": 0.3663429766893387, |
|
"reward_std": 0.7826206907629967, |
|
"rewards/cosine_scaled_reward": -0.0459951926022768, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2583.166778564453, |
|
"epoch": 0.008, |
|
"grad_norm": 0.01807275228202343, |
|
"kl": 0.0003781318664550781, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.0601, |
|
"reward": 0.6402689702808857, |
|
"reward_std": 0.9116464108228683, |
|
"rewards/cosine_scaled_reward": 0.09096781723201275, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2415.166748046875, |
|
"epoch": 0.008571428571428572, |
|
"grad_norm": 0.016410216689109802, |
|
"kl": 0.000415802001953125, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.0876, |
|
"reward": 0.17176377028226852, |
|
"reward_std": 0.5884413756430149, |
|
"rewards/cosine_scaled_reward": -0.20578479953110218, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2822.416748046875, |
|
"epoch": 0.009142857142857144, |
|
"grad_norm": 0.01920630969107151, |
|
"kl": 0.0003960132598876953, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1401, |
|
"reward": 0.46021055802702904, |
|
"reward_std": 0.6382475309073925, |
|
"rewards/cosine_scaled_reward": 0.0009385794401168823, |
|
"rewards/format_reward": 0.4583333544433117, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2660.875030517578, |
|
"epoch": 0.009714285714285713, |
|
"grad_norm": 0.03571460023522377, |
|
"kl": 0.0005893707275390625, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.2357, |
|
"reward": 0.0063078440725803375, |
|
"reward_std": 0.5483059138059616, |
|
"rewards/cosine_scaled_reward": -0.18434608541429043, |
|
"rewards/format_reward": 0.375, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3228.125, |
|
"epoch": 0.010285714285714285, |
|
"grad_norm": 0.011813213117420673, |
|
"kl": 0.0003924369812011719, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": -0.0546, |
|
"reward": 0.039575088769197464, |
|
"reward_std": 0.8113678842782974, |
|
"rewards/cosine_scaled_reward": -0.12604578211903572, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2784.2916870117188, |
|
"epoch": 0.010857142857142857, |
|
"grad_norm": 0.01843290589749813, |
|
"kl": 0.0005035400390625, |
|
"learning_rate": 3.6e-05, |
|
"loss": -0.0056, |
|
"reward": 0.0836619883775711, |
|
"reward_std": 0.4685993976891041, |
|
"rewards/cosine_scaled_reward": -0.16650232672691345, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1598.4583587646484, |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 0.02082478627562523, |
|
"kl": 0.0005908012390136719, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.115, |
|
"reward": 0.6086829677224159, |
|
"reward_std": 0.7450617477297783, |
|
"rewards/cosine_scaled_reward": -0.09149187058210373, |
|
"rewards/format_reward": 0.7916666679084301, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1688.2500305175781, |
|
"epoch": 0.012, |
|
"grad_norm": 0.06570505350828171, |
|
"kl": 0.0012292861938476562, |
|
"learning_rate": 4e-05, |
|
"loss": 0.2063, |
|
"reward": 0.12941228225827217, |
|
"reward_std": 0.2382342591881752, |
|
"rewards/cosine_scaled_reward": -0.26862720027565956, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2881.0416870117188, |
|
"epoch": 0.012571428571428572, |
|
"grad_norm": 0.01590500958263874, |
|
"kl": 0.0003497600555419922, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.1198, |
|
"reward": 0.8034647405147552, |
|
"reward_std": 1.4150425791740417, |
|
"rewards/cosine_scaled_reward": 0.15173236466944218, |
|
"rewards/format_reward": 0.5000000186264515, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2695.3333740234375, |
|
"epoch": 0.013142857142857144, |
|
"grad_norm": 0.014568633399903774, |
|
"kl": 0.0006999969482421875, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.0306, |
|
"reward": 0.31954750418663025, |
|
"reward_std": 0.7953172847628593, |
|
"rewards/cosine_scaled_reward": -0.09022624790668488, |
|
"rewards/format_reward": 0.5000000037252903, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2814.2083740234375, |
|
"epoch": 0.013714285714285714, |
|
"grad_norm": 0.03579765558242798, |
|
"kl": 0.0009121894836425781, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.0315, |
|
"reward": 0.5563870035111904, |
|
"reward_std": 0.46287257969379425, |
|
"rewards/cosine_scaled_reward": -0.013473168015480042, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1512.7083587646484, |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.023476263508200645, |
|
"kl": 0.0010466575622558594, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.0897, |
|
"reward": 0.7631387338042259, |
|
"reward_std": 0.3562787361443043, |
|
"rewards/cosine_scaled_reward": 0.006569338962435722, |
|
"rewards/format_reward": 0.75, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2746.4583740234375, |
|
"epoch": 0.014857142857142857, |
|
"grad_norm": 0.015517197549343109, |
|
"kl": 0.0005793571472167969, |
|
"learning_rate": 5e-05, |
|
"loss": 0.1146, |
|
"reward": 0.2103739231824875, |
|
"reward_std": 0.9847119301557541, |
|
"rewards/cosine_scaled_reward": -0.08231302350759506, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3263.041748046875, |
|
"epoch": 0.015428571428571429, |
|
"grad_norm": 0.019443849101662636, |
|
"kl": 0.0005698204040527344, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.0483, |
|
"reward": -0.21550959814339876, |
|
"reward_std": 0.47869637608528137, |
|
"rewards/cosine_scaled_reward": -0.2119214665144682, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2655.8750915527344, |
|
"epoch": 0.016, |
|
"grad_norm": 0.02436124160885811, |
|
"kl": 0.00157928466796875, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": -0.0163, |
|
"reward": 0.45750702917575836, |
|
"reward_std": 0.7698620781302452, |
|
"rewards/cosine_scaled_reward": -0.06291317194700241, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2933.5, |
|
"epoch": 0.01657142857142857, |
|
"grad_norm": 0.014681576751172543, |
|
"kl": 0.0005736351013183594, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": -0.014, |
|
"reward": 0.10475081205368042, |
|
"reward_std": 0.3902103342115879, |
|
"rewards/cosine_scaled_reward": -0.07262461073696613, |
|
"rewards/format_reward": 0.25, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 0.011684375815093517, |
|
"kl": 0.0007815361022949219, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.0, |
|
"reward": -0.3875264674425125, |
|
"reward_std": 0.43698976188898087, |
|
"rewards/cosine_scaled_reward": -0.21459656301885843, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2217.291717529297, |
|
"epoch": 0.017714285714285714, |
|
"grad_norm": 0.036056023091077805, |
|
"kl": 0.0011777877807617188, |
|
"learning_rate": 6e-05, |
|
"loss": 0.2175, |
|
"reward": 0.9864542707800865, |
|
"reward_std": 0.9503698498010635, |
|
"rewards/cosine_scaled_reward": 0.18072709161788225, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2517.25, |
|
"epoch": 0.018285714285714287, |
|
"grad_norm": 0.02313651144504547, |
|
"kl": 0.0033006668090820312, |
|
"learning_rate": 6.2e-05, |
|
"loss": -0.1335, |
|
"reward": 0.9981129616498947, |
|
"reward_std": 0.3932240381836891, |
|
"rewards/cosine_scaled_reward": 0.20738982781767845, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.018857142857142857, |
|
"grad_norm": 0.012460554018616676, |
|
"kl": 0.0009255409240722656, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.0, |
|
"reward": -0.6963627487421036, |
|
"reward_std": 0.18985886126756668, |
|
"rewards/cosine_scaled_reward": -0.3481813669204712, |
|
"rewards/format_reward": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3006.1250610351562, |
|
"epoch": 0.019428571428571427, |
|
"grad_norm": 0.01616235077381134, |
|
"kl": 0.0015659332275390625, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.0083, |
|
"reward": 0.1573825404047966, |
|
"reward_std": 0.7217601127922535, |
|
"rewards/cosine_scaled_reward": -0.1296420693397522, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3014.7083435058594, |
|
"epoch": 0.02, |
|
"grad_norm": 0.027833612635731697, |
|
"kl": 0.0010890960693359375, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.1738, |
|
"reward": -0.5526116490364075, |
|
"reward_std": 0.2741971779614687, |
|
"rewards/cosine_scaled_reward": -0.38047249615192413, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2914.1250610351562, |
|
"epoch": 0.02057142857142857, |
|
"grad_norm": 0.023610979318618774, |
|
"kl": 0.001861572265625, |
|
"learning_rate": 7e-05, |
|
"loss": 0.2468, |
|
"reward": 0.184011185541749, |
|
"reward_std": 0.8658745139837265, |
|
"rewards/cosine_scaled_reward": -0.07466107979416847, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3503.5416870117188, |
|
"epoch": 0.021142857142857144, |
|
"grad_norm": 0.010844520293176174, |
|
"kl": 0.0012454986572265625, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.0386, |
|
"reward": -0.049630362540483475, |
|
"reward_std": 0.8300461787730455, |
|
"rewards/cosine_scaled_reward": -0.08731517381966114, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3236.1250610351562, |
|
"epoch": 0.021714285714285714, |
|
"grad_norm": 0.013169029727578163, |
|
"kl": 0.0030126571655273438, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.0504, |
|
"reward": -0.19106721878051758, |
|
"reward_std": 0.9097858145833015, |
|
"rewards/cosine_scaled_reward": -0.241366945207119, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1753.4167175292969, |
|
"epoch": 0.022285714285714287, |
|
"grad_norm": 0.04312436282634735, |
|
"kl": 0.0025959014892578125, |
|
"learning_rate": 7.6e-05, |
|
"loss": -0.2639, |
|
"reward": 0.9211674332618713, |
|
"reward_std": 0.7600763663649559, |
|
"rewards/cosine_scaled_reward": 0.04391703475266695, |
|
"rewards/format_reward": 0.8333333358168602, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3123.0416870117188, |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 0.015262431465089321, |
|
"kl": 0.001953125, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.0771, |
|
"reward": -0.2396223871037364, |
|
"reward_std": 0.45411088317632675, |
|
"rewards/cosine_scaled_reward": -0.28647788520902395, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3089.75, |
|
"epoch": 0.023428571428571427, |
|
"grad_norm": 0.017734840512275696, |
|
"kl": 0.0012683868408203125, |
|
"learning_rate": 8e-05, |
|
"loss": -0.0673, |
|
"reward": -0.05341312289237976, |
|
"reward_std": 0.23775821551680565, |
|
"rewards/cosine_scaled_reward": -0.15170655399560928, |
|
"rewards/format_reward": 0.25, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2459.5417098999023, |
|
"epoch": 0.024, |
|
"grad_norm": 0.01839015819132328, |
|
"kl": 0.0012769699096679688, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.0185, |
|
"reward": 0.4259261190891266, |
|
"reward_std": 0.39659431390464306, |
|
"rewards/cosine_scaled_reward": -0.016203660517930984, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3215.0833740234375, |
|
"epoch": 0.02457142857142857, |
|
"grad_norm": 0.021077867597341537, |
|
"kl": 0.0025310516357421875, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.1354, |
|
"reward": -0.06841922923922539, |
|
"reward_std": 0.7763218209147453, |
|
"rewards/cosine_scaled_reward": -0.13837626948952675, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3144.9583435058594, |
|
"epoch": 0.025142857142857144, |
|
"grad_norm": 0.014874313957989216, |
|
"kl": 0.0016632080078125, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.1193, |
|
"reward": -0.1355942115187645, |
|
"reward_std": 0.4834160730242729, |
|
"rewards/cosine_scaled_reward": -0.15113045647740364, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3175.5833740234375, |
|
"epoch": 0.025714285714285714, |
|
"grad_norm": 0.016986120492219925, |
|
"kl": 0.0015850067138671875, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.1528, |
|
"reward": -0.4366232231259346, |
|
"reward_std": 0.5260147228837013, |
|
"rewards/cosine_scaled_reward": -0.343311607837677, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3461.3333740234375, |
|
"epoch": 0.026285714285714287, |
|
"grad_norm": 0.012634013779461384, |
|
"kl": 0.0028676986694335938, |
|
"learning_rate": 9e-05, |
|
"loss": 0.0237, |
|
"reward": -0.06645508855581284, |
|
"reward_std": 1.0512375514954329, |
|
"rewards/cosine_scaled_reward": -0.17906087916344404, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2675.4166717529297, |
|
"epoch": 0.026857142857142857, |
|
"grad_norm": 0.016832780092954636, |
|
"kl": 0.0008749961853027344, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.1006, |
|
"reward": 0.34809515066444874, |
|
"reward_std": 0.2990557327866554, |
|
"rewards/cosine_scaled_reward": -0.013452455401420593, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1616.666748046875, |
|
"epoch": 0.027428571428571427, |
|
"grad_norm": 0.02106091007590294, |
|
"kl": 0.0061321258544921875, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.0979, |
|
"reward": 1.2005126923322678, |
|
"reward_std": 0.6015892848372459, |
|
"rewards/cosine_scaled_reward": 0.20442297495901585, |
|
"rewards/format_reward": 0.7916666679084301, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2405.3333435058594, |
|
"epoch": 0.028, |
|
"grad_norm": 0.02687126025557518, |
|
"kl": 0.009845733642578125, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.0241, |
|
"reward": 0.5416064560413361, |
|
"reward_std": 0.4324432276189327, |
|
"rewards/cosine_scaled_reward": 0.02080322802066803, |
|
"rewards/format_reward": 0.5, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3412.2083740234375, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.013700881972908974, |
|
"kl": 0.00302886962890625, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.0663, |
|
"reward": -0.06471916288137436, |
|
"reward_std": 0.6379625052213669, |
|
"rewards/cosine_scaled_reward": -0.11569291353225708, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2844.5833435058594, |
|
"epoch": 0.029142857142857144, |
|
"grad_norm": 0.01361958496272564, |
|
"kl": 0.0013275146484375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0377, |
|
"reward": 0.20726051926612854, |
|
"reward_std": 0.1396642979234457, |
|
"rewards/cosine_scaled_reward": -0.021369755268096924, |
|
"rewards/format_reward": 0.25, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3466.9166870117188, |
|
"epoch": 0.029714285714285714, |
|
"grad_norm": 0.01215248741209507, |
|
"kl": 0.0009393692016601562, |
|
"learning_rate": 9.999890338174276e-05, |
|
"loss": 0.0278, |
|
"reward": -0.11257268488407135, |
|
"reward_std": 0.8305042590945959, |
|
"rewards/cosine_scaled_reward": -0.18128634430468082, |
|
"rewards/format_reward": 0.25, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3201.0, |
|
"epoch": 0.030285714285714287, |
|
"grad_norm": 0.014383244328200817, |
|
"kl": 0.0026397705078125, |
|
"learning_rate": 9.999561358041869e-05, |
|
"loss": 0.0744, |
|
"reward": 0.2842802293598652, |
|
"reward_std": 0.6465214220806956, |
|
"rewards/cosine_scaled_reward": 0.01714009791612625, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3046.0416870117188, |
|
"epoch": 0.030857142857142857, |
|
"grad_norm": 0.03346686065196991, |
|
"kl": 0.005756378173828125, |
|
"learning_rate": 9.999013075636805e-05, |
|
"loss": -0.0486, |
|
"reward": -0.3102136142551899, |
|
"reward_std": 0.20892462320625782, |
|
"rewards/cosine_scaled_reward": -0.28010681085288525, |
|
"rewards/format_reward": 0.25, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3255.0000610351562, |
|
"epoch": 0.03142857142857143, |
|
"grad_norm": 0.02068556286394596, |
|
"kl": 0.0030732154846191406, |
|
"learning_rate": 9.998245517681595e-05, |
|
"loss": -0.0891, |
|
"reward": 0.15274354815483093, |
|
"reward_std": 0.8250106833875179, |
|
"rewards/cosine_scaled_reward": -0.09029489010572433, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2653.7084197998047, |
|
"epoch": 0.032, |
|
"grad_norm": 0.03398514911532402, |
|
"kl": 0.0039196014404296875, |
|
"learning_rate": 9.997258721585931e-05, |
|
"loss": 0.028, |
|
"reward": 0.31957897171378136, |
|
"reward_std": 0.8959189355373383, |
|
"rewards/cosine_scaled_reward": -0.06937719509005547, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2849.3750915527344, |
|
"epoch": 0.03257142857142857, |
|
"grad_norm": 0.012109542265534401, |
|
"kl": 0.0045604705810546875, |
|
"learning_rate": 9.996052735444863e-05, |
|
"loss": -0.0042, |
|
"reward": 0.1344001293182373, |
|
"reward_std": 0.45969852432608604, |
|
"rewards/cosine_scaled_reward": -0.12029992416501045, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3181.0833740234375, |
|
"epoch": 0.03314285714285714, |
|
"grad_norm": 0.021295132115483284, |
|
"kl": 0.002410888671875, |
|
"learning_rate": 9.994627618036454e-05, |
|
"loss": 0.1335, |
|
"reward": -0.20644971216097474, |
|
"reward_std": 0.9107521660625935, |
|
"rewards/cosine_scaled_reward": -0.22822486609220505, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3010.4584350585938, |
|
"epoch": 0.03371428571428572, |
|
"grad_norm": 0.022170057520270348, |
|
"kl": 0.0033636093139648438, |
|
"learning_rate": 9.992983438818914e-05, |
|
"loss": 0.1363, |
|
"reward": 0.24811836518347263, |
|
"reward_std": 1.1761204116046429, |
|
"rewards/cosine_scaled_reward": -0.06344081088900566, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 0.011233195662498474, |
|
"kl": 0.005175590515136719, |
|
"learning_rate": 9.991120277927223e-05, |
|
"loss": 0.0002, |
|
"reward": -0.3428353890776634, |
|
"reward_std": 0.3860394358634949, |
|
"rewards/cosine_scaled_reward": -0.1714176945388317, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2401.5416870117188, |
|
"epoch": 0.03485714285714286, |
|
"grad_norm": 0.031769707798957825, |
|
"kl": 0.002063751220703125, |
|
"learning_rate": 9.989038226169209e-05, |
|
"loss": 0.2605, |
|
"reward": 0.41955555975437164, |
|
"reward_std": 0.7642397582530975, |
|
"rewards/cosine_scaled_reward": 0.0014444328844547272, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2725.625, |
|
"epoch": 0.03542857142857143, |
|
"grad_norm": 0.018359214067459106, |
|
"kl": 0.002758026123046875, |
|
"learning_rate": 9.986737385021142e-05, |
|
"loss": 0.0482, |
|
"reward": -0.2772537413984537, |
|
"reward_std": 0.23343749158084393, |
|
"rewards/cosine_scaled_reward": -0.3469602093100548, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3515.2500610351562, |
|
"epoch": 0.036, |
|
"grad_norm": 0.017092403024435043, |
|
"kl": 0.0034360885620117188, |
|
"learning_rate": 9.98421786662277e-05, |
|
"loss": 0.0286, |
|
"reward": 0.15998955629765987, |
|
"reward_std": 0.7633470920845866, |
|
"rewards/cosine_scaled_reward": 0.017494780011475086, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3171.2916870117188, |
|
"epoch": 0.036571428571428574, |
|
"grad_norm": 0.025074612349271774, |
|
"kl": 0.0031719207763671875, |
|
"learning_rate": 9.981479793771866e-05, |
|
"loss": 0.0007, |
|
"reward": -0.23104018531739712, |
|
"reward_std": 0.39066051598638296, |
|
"rewards/cosine_scaled_reward": -0.21968677546828985, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3346.8750610351562, |
|
"epoch": 0.037142857142857144, |
|
"grad_norm": 0.01631852798163891, |
|
"kl": 0.003975868225097656, |
|
"learning_rate": 9.97852329991824e-05, |
|
"loss": 0.0989, |
|
"reward": -0.2601096876896918, |
|
"reward_std": 0.43978679180145264, |
|
"rewards/cosine_scaled_reward": -0.19255484640598297, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2645.6251220703125, |
|
"epoch": 0.037714285714285714, |
|
"grad_norm": 0.027915285900235176, |
|
"kl": 0.0030145645141601562, |
|
"learning_rate": 9.97534852915723e-05, |
|
"loss": 0.096, |
|
"reward": 0.6423284709453583, |
|
"reward_std": 1.0060622096061707, |
|
"rewards/cosine_scaled_reward": -0.012169107794761658, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2729.0833587646484, |
|
"epoch": 0.038285714285714284, |
|
"grad_norm": 0.023593856021761894, |
|
"kl": 0.0039539337158203125, |
|
"learning_rate": 9.971955636222684e-05, |
|
"loss": -0.0287, |
|
"reward": 0.26382073760032654, |
|
"reward_std": 0.7153124492615461, |
|
"rewards/cosine_scaled_reward": -0.07642300426959991, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.038857142857142854, |
|
"grad_norm": 0.009784531779587269, |
|
"kl": 0.0015583038330078125, |
|
"learning_rate": 9.968344786479416e-05, |
|
"loss": 0.0001, |
|
"reward": -0.4307379722595215, |
|
"reward_std": 0.28678057435899973, |
|
"rewards/cosine_scaled_reward": -0.21536898985505104, |
|
"rewards/format_reward": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2913.1250610351562, |
|
"epoch": 0.03942857142857143, |
|
"grad_norm": 0.08137527108192444, |
|
"kl": 0.0056610107421875, |
|
"learning_rate": 9.964516155915151e-05, |
|
"loss": 0.0838, |
|
"reward": 0.05851218104362488, |
|
"reward_std": 0.8442177847027779, |
|
"rewards/cosine_scaled_reward": -0.1790772434324026, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2989.6666870117188, |
|
"epoch": 0.04, |
|
"grad_norm": 0.03494952991604805, |
|
"kl": 0.008270263671875, |
|
"learning_rate": 9.960469931131939e-05, |
|
"loss": 0.2326, |
|
"reward": 0.0155078349635005, |
|
"reward_std": 0.7885254546999931, |
|
"rewards/cosine_scaled_reward": -0.1380794234573841, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2028.7917022705078, |
|
"epoch": 0.04057142857142857, |
|
"grad_norm": 0.037250399589538574, |
|
"kl": 0.00527191162109375, |
|
"learning_rate": 9.956206309337068e-05, |
|
"loss": 0.0821, |
|
"reward": 0.9818168096244335, |
|
"reward_std": 0.5084632188081741, |
|
"rewards/cosine_scaled_reward": 0.1575750857591629, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3102.7083435058594, |
|
"epoch": 0.04114285714285714, |
|
"grad_norm": 0.01361783780157566, |
|
"kl": 0.006103515625, |
|
"learning_rate": 9.951725498333448e-05, |
|
"loss": -0.0301, |
|
"reward": 0.15624341368675232, |
|
"reward_std": 0.18956233747303486, |
|
"rewards/cosine_scaled_reward": -0.04687829315662384, |
|
"rewards/format_reward": 0.25, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2701.8333587646484, |
|
"epoch": 0.04171428571428572, |
|
"grad_norm": 0.01851752959191799, |
|
"kl": 0.006786346435546875, |
|
"learning_rate": 9.947027716509488e-05, |
|
"loss": 0.158, |
|
"reward": 0.3348941504955292, |
|
"reward_std": 0.7048965748399496, |
|
"rewards/cosine_scaled_reward": -0.12421960011124611, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3003.041717529297, |
|
"epoch": 0.04228571428571429, |
|
"grad_norm": 0.018196003511548042, |
|
"kl": 0.005214691162109375, |
|
"learning_rate": 9.942113192828445e-05, |
|
"loss": 0.091, |
|
"reward": 0.34735431522130966, |
|
"reward_std": 0.8543844074010849, |
|
"rewards/cosine_scaled_reward": -0.013822849839925766, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3122.8334350585938, |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.015949510037899017, |
|
"kl": 0.00733184814453125, |
|
"learning_rate": 9.936982166817273e-05, |
|
"loss": 0.1108, |
|
"reward": 0.1362277865409851, |
|
"reward_std": 0.9558060020208359, |
|
"rewards/cosine_scaled_reward": -0.14021944627165794, |
|
"rewards/format_reward": 0.4166666753590107, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3223.375, |
|
"epoch": 0.04342857142857143, |
|
"grad_norm": 0.014545021578669548, |
|
"kl": 0.011676788330078125, |
|
"learning_rate": 9.931634888554937e-05, |
|
"loss": -0.0729, |
|
"reward": -0.3659635931253433, |
|
"reward_std": 0.43001827597618103, |
|
"rewards/cosine_scaled_reward": -0.2663151305168867, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2802.0000610351562, |
|
"epoch": 0.044, |
|
"grad_norm": 0.05182776227593422, |
|
"kl": 0.013092041015625, |
|
"learning_rate": 9.926071618660238e-05, |
|
"loss": 0.2806, |
|
"reward": -0.06057482771575451, |
|
"reward_std": 0.6179361715912819, |
|
"rewards/cosine_scaled_reward": -0.19695409014821053, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3281.7083740234375, |
|
"epoch": 0.044571428571428574, |
|
"grad_norm": 0.029229266569018364, |
|
"kl": 0.00809478759765625, |
|
"learning_rate": 9.920292628279099e-05, |
|
"loss": 0.0812, |
|
"reward": -0.0839165486395359, |
|
"reward_std": 0.9277310892939568, |
|
"rewards/cosine_scaled_reward": -0.16695828922092915, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3018.125, |
|
"epoch": 0.045142857142857144, |
|
"grad_norm": 0.014639424160122871, |
|
"kl": 0.02259063720703125, |
|
"learning_rate": 9.914298199071362e-05, |
|
"loss": -0.0578, |
|
"reward": 0.25495412945747375, |
|
"reward_std": 0.8909239619970322, |
|
"rewards/cosine_scaled_reward": -0.03918960690498352, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3345.375, |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.01726081222295761, |
|
"kl": 0.0069427490234375, |
|
"learning_rate": 9.908088623197048e-05, |
|
"loss": 0.0662, |
|
"reward": -0.11315303295850754, |
|
"reward_std": 0.4808522164821625, |
|
"rewards/cosine_scaled_reward": -0.20240984484553337, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3315.2916870117188, |
|
"epoch": 0.046285714285714284, |
|
"grad_norm": 0.013985719531774521, |
|
"kl": 0.00591278076171875, |
|
"learning_rate": 9.901664203302126e-05, |
|
"loss": 0.0775, |
|
"reward": 0.5611200910061598, |
|
"reward_std": 0.5030479682609439, |
|
"rewards/cosine_scaled_reward": 0.11389338690787554, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3138.1666870117188, |
|
"epoch": 0.046857142857142854, |
|
"grad_norm": 0.015917882323265076, |
|
"kl": 0.00797271728515625, |
|
"learning_rate": 9.895025252503756e-05, |
|
"loss": -0.0315, |
|
"reward": 0.07941232621669769, |
|
"reward_std": 0.241426233202219, |
|
"rewards/cosine_scaled_reward": -0.08529385924339294, |
|
"rewards/format_reward": 0.25, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1649.9167175292969, |
|
"epoch": 0.04742857142857143, |
|
"grad_norm": 0.20445036888122559, |
|
"kl": 0.146575927734375, |
|
"learning_rate": 9.888172094375034e-05, |
|
"loss": 0.1238, |
|
"reward": 1.3232550099492073, |
|
"reward_std": 0.27204739674925804, |
|
"rewards/cosine_scaled_reward": 0.28662747144699097, |
|
"rewards/format_reward": 0.75, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2547.8333740234375, |
|
"epoch": 0.048, |
|
"grad_norm": 0.055599115788936615, |
|
"kl": 0.03204345703125, |
|
"learning_rate": 9.881105062929221e-05, |
|
"loss": 0.03, |
|
"reward": 0.06362025253474712, |
|
"reward_std": 0.6657480411231518, |
|
"rewards/cosine_scaled_reward": -0.2181899007409811, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2876.8333587646484, |
|
"epoch": 0.04857142857142857, |
|
"grad_norm": 0.015016643330454826, |
|
"kl": 0.0128631591796875, |
|
"learning_rate": 9.87382450260346e-05, |
|
"loss": 0.0261, |
|
"reward": -0.039747525937855244, |
|
"reward_std": 0.38030122220516205, |
|
"rewards/cosine_scaled_reward": -0.1657070992514491, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3386.7501220703125, |
|
"epoch": 0.04914285714285714, |
|
"grad_norm": 0.014947664923965931, |
|
"kl": 0.01012420654296875, |
|
"learning_rate": 9.866330768241984e-05, |
|
"loss": 0.0714, |
|
"reward": 0.804126501083374, |
|
"reward_std": 1.6738584637641907, |
|
"rewards/cosine_scaled_reward": 0.1728965789079666, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3348.625, |
|
"epoch": 0.04971428571428571, |
|
"grad_norm": 0.012593540363013744, |
|
"kl": 0.0147857666015625, |
|
"learning_rate": 9.858624225078841e-05, |
|
"loss": 0.0384, |
|
"reward": 0.16258125752210617, |
|
"reward_std": 0.6240727119147778, |
|
"rewards/cosine_scaled_reward": -0.043709371238946915, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2490.041732788086, |
|
"epoch": 0.05028571428571429, |
|
"grad_norm": 0.04205311834812164, |
|
"kl": 0.0225677490234375, |
|
"learning_rate": 9.850705248720069e-05, |
|
"loss": 0.2146, |
|
"reward": -0.07761363685131073, |
|
"reward_std": 0.6357230395078659, |
|
"rewards/cosine_scaled_reward": -0.26797350379638374, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2835.8333740234375, |
|
"epoch": 0.05085714285714286, |
|
"grad_norm": 0.03906717151403427, |
|
"kl": 0.016937255859375, |
|
"learning_rate": 9.842574225125401e-05, |
|
"loss": 0.1663, |
|
"reward": 0.05134725570678711, |
|
"reward_std": 0.8167771622538567, |
|
"rewards/cosine_scaled_reward": -0.1618263851851225, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2103.2084350585938, |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.019287196919322014, |
|
"kl": 0.00624847412109375, |
|
"learning_rate": 9.834231550589462e-05, |
|
"loss": 0.1045, |
|
"reward": 1.3975676447153091, |
|
"reward_std": 1.155922506004572, |
|
"rewards/cosine_scaled_reward": 0.28211713768541813, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3305.3333740234375, |
|
"epoch": 0.052, |
|
"grad_norm": 0.016830939799547195, |
|
"kl": 0.01747894287109375, |
|
"learning_rate": 9.825677631722435e-05, |
|
"loss": 0.0694, |
|
"reward": 0.1266888901591301, |
|
"reward_std": 0.7699784189462662, |
|
"rewards/cosine_scaled_reward": -0.08248887583613396, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2862.5, |
|
"epoch": 0.052571428571428575, |
|
"grad_norm": 0.019136304035782814, |
|
"kl": 0.0141448974609375, |
|
"learning_rate": 9.816912885430258e-05, |
|
"loss": 0.0305, |
|
"reward": 0.02380932867527008, |
|
"reward_std": 0.44232284277677536, |
|
"rewards/cosine_scaled_reward": -0.11309535056352615, |
|
"rewards/format_reward": 0.25, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2409.75, |
|
"epoch": 0.053142857142857144, |
|
"grad_norm": 0.015891728922724724, |
|
"kl": 0.014984130859375, |
|
"learning_rate": 9.807937738894303e-05, |
|
"loss": 0.0508, |
|
"reward": 0.3022146672010422, |
|
"reward_std": 0.6315838098526001, |
|
"rewards/cosine_scaled_reward": -0.0988927073776722, |
|
"rewards/format_reward": 0.5, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3115.666748046875, |
|
"epoch": 0.053714285714285714, |
|
"grad_norm": 0.018753819167613983, |
|
"kl": 0.010223388671875, |
|
"learning_rate": 9.798752629550546e-05, |
|
"loss": 0.0866, |
|
"reward": 0.1527155190706253, |
|
"reward_std": 0.8529080599546432, |
|
"rewards/cosine_scaled_reward": -0.11114224418997765, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2407.5833740234375, |
|
"epoch": 0.054285714285714284, |
|
"grad_norm": 0.033838678151369095, |
|
"kl": 0.01202392578125, |
|
"learning_rate": 9.789358005068262e-05, |
|
"loss": 0.1269, |
|
"reward": 0.42500831093639135, |
|
"reward_std": 0.9461558535695076, |
|
"rewards/cosine_scaled_reward": -0.05832919664680958, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3434.2916870117188, |
|
"epoch": 0.054857142857142854, |
|
"grad_norm": 0.012105435132980347, |
|
"kl": 0.023162841796875, |
|
"learning_rate": 9.779754323328192e-05, |
|
"loss": 0.0246, |
|
"reward": -0.11048224568367004, |
|
"reward_std": 0.7371259145438671, |
|
"rewards/cosine_scaled_reward": -0.13857445027679205, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3034.5416870117188, |
|
"epoch": 0.05542857142857143, |
|
"grad_norm": 0.025039825588464737, |
|
"kl": 0.02886962890625, |
|
"learning_rate": 9.769942052400235e-05, |
|
"loss": 0.1097, |
|
"reward": -0.30599308758974075, |
|
"reward_std": 0.44477224349975586, |
|
"rewards/cosine_scaled_reward": -0.3196632117033005, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1659.4166717529297, |
|
"epoch": 0.056, |
|
"grad_norm": 0.01933540217578411, |
|
"kl": 0.01055145263671875, |
|
"learning_rate": 9.759921670520634e-05, |
|
"loss": -0.0341, |
|
"reward": 1.2363078743219376, |
|
"reward_std": 1.0749643743038177, |
|
"rewards/cosine_scaled_reward": 0.22232059203088284, |
|
"rewards/format_reward": 0.7916666679084301, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2495.750030517578, |
|
"epoch": 0.05657142857142857, |
|
"grad_norm": 0.04390771687030792, |
|
"kl": 0.01910400390625, |
|
"learning_rate": 9.749693666068664e-05, |
|
"loss": 0.2446, |
|
"reward": 0.6843666434288025, |
|
"reward_std": 1.0362943559885025, |
|
"rewards/cosine_scaled_reward": 0.13385000079870224, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2179.5833740234375, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.021462595090270042, |
|
"kl": 0.02783203125, |
|
"learning_rate": 9.739258537542835e-05, |
|
"loss": 0.0284, |
|
"reward": 0.4209151156246662, |
|
"reward_std": 0.6625542566180229, |
|
"rewards/cosine_scaled_reward": -0.06037578295217827, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3205.5, |
|
"epoch": 0.05771428571428571, |
|
"grad_norm": 0.024794792756438255, |
|
"kl": 0.028961181640625, |
|
"learning_rate": 9.728616793536588e-05, |
|
"loss": 0.1078, |
|
"reward": -0.20108163356781006, |
|
"reward_std": 0.7786883413791656, |
|
"rewards/cosine_scaled_reward": -0.22554081678390503, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3427.8333740234375, |
|
"epoch": 0.05828571428571429, |
|
"grad_norm": 0.013528184965252876, |
|
"kl": 0.0247802734375, |
|
"learning_rate": 9.717768952713513e-05, |
|
"loss": 0.0313, |
|
"reward": -0.5099735148251057, |
|
"reward_std": 0.49693995993584394, |
|
"rewards/cosine_scaled_reward": -0.29665342532098293, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3009.0000610351562, |
|
"epoch": 0.05885714285714286, |
|
"grad_norm": 0.06049313023686409, |
|
"kl": 0.0323486328125, |
|
"learning_rate": 9.706715543782064e-05, |
|
"loss": 0.2486, |
|
"reward": -0.3947155475616455, |
|
"reward_std": 0.49045583233237267, |
|
"rewards/cosine_scaled_reward": -0.32235776633024216, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3463.875, |
|
"epoch": 0.05942857142857143, |
|
"grad_norm": 0.024880580604076385, |
|
"kl": 0.041290283203125, |
|
"learning_rate": 9.695457105469806e-05, |
|
"loss": 0.0448, |
|
"reward": -0.31599661335349083, |
|
"reward_std": 0.5560531467199326, |
|
"rewards/cosine_scaled_reward": -0.19966497272253036, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3122.7083740234375, |
|
"epoch": 0.06, |
|
"grad_norm": 0.021345539018511772, |
|
"kl": 0.037017822265625, |
|
"learning_rate": 9.683994186497132e-05, |
|
"loss": 0.0791, |
|
"reward": -0.42670758813619614, |
|
"reward_std": 0.3747115731239319, |
|
"rewards/cosine_scaled_reward": -0.33835379779338837, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2970.7500610351562, |
|
"epoch": 0.060571428571428575, |
|
"grad_norm": 0.07319632917642593, |
|
"kl": 0.04351806640625, |
|
"learning_rate": 9.672327345550543e-05, |
|
"loss": -0.0454, |
|
"reward": -0.012075750157237053, |
|
"reward_std": 0.6432481557130814, |
|
"rewards/cosine_scaled_reward": -0.1727045476436615, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2791.0833587646484, |
|
"epoch": 0.061142857142857145, |
|
"grad_norm": 0.022539552301168442, |
|
"kl": 0.0341339111328125, |
|
"learning_rate": 9.66045715125541e-05, |
|
"loss": -0.0835, |
|
"reward": -0.19219213724136353, |
|
"reward_std": 0.35477501153945923, |
|
"rewards/cosine_scaled_reward": -0.26276274397969246, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2680.8750610351562, |
|
"epoch": 0.061714285714285715, |
|
"grad_norm": 0.04677839204668999, |
|
"kl": 0.0345611572265625, |
|
"learning_rate": 9.648384182148252e-05, |
|
"loss": 0.1531, |
|
"reward": -0.10541854053735733, |
|
"reward_std": 0.3359173368662596, |
|
"rewards/cosine_scaled_reward": -0.26104260981082916, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2972.1666870117188, |
|
"epoch": 0.062285714285714285, |
|
"grad_norm": 0.09794782847166061, |
|
"kl": 0.041717529296875, |
|
"learning_rate": 9.636109026648555e-05, |
|
"loss": 0.1218, |
|
"reward": -0.25983521342277527, |
|
"reward_std": 0.6902979081496596, |
|
"rewards/cosine_scaled_reward": -0.2549176011234522, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2938.0000610351562, |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 0.05397583171725273, |
|
"kl": 0.044769287109375, |
|
"learning_rate": 9.623632283030079e-05, |
|
"loss": 0.1955, |
|
"reward": -0.3282480388879776, |
|
"reward_std": 0.45391279086470604, |
|
"rewards/cosine_scaled_reward": -0.3099573701620102, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2632.5416870117188, |
|
"epoch": 0.06342857142857143, |
|
"grad_norm": 0.03481624647974968, |
|
"kl": 0.0413818359375, |
|
"learning_rate": 9.610954559391703e-05, |
|
"loss": 0.0666, |
|
"reward": -0.10429460182785988, |
|
"reward_std": 0.4938749596476555, |
|
"rewards/cosine_scaled_reward": -0.2188139706850052, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3193.0, |
|
"epoch": 0.064, |
|
"grad_norm": 0.04939614608883858, |
|
"kl": 0.0479736328125, |
|
"learning_rate": 9.598076473627798e-05, |
|
"loss": 0.1431, |
|
"reward": 0.014449171721935272, |
|
"reward_std": 0.7652652338147163, |
|
"rewards/cosine_scaled_reward": -0.15944208949804306, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3056.7916870117188, |
|
"epoch": 0.06457142857142857, |
|
"grad_norm": 0.023673003539443016, |
|
"kl": 0.062255859375, |
|
"learning_rate": 9.58499865339809e-05, |
|
"loss": 0.0594, |
|
"reward": -0.11021074652671814, |
|
"reward_std": 0.6375727728009224, |
|
"rewards/cosine_scaled_reward": -0.22177204675972462, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2781.5833435058594, |
|
"epoch": 0.06514285714285714, |
|
"grad_norm": 0.023715665563941002, |
|
"kl": 0.0440216064453125, |
|
"learning_rate": 9.571721736097089e-05, |
|
"loss": 0.1791, |
|
"reward": 0.14624399319291115, |
|
"reward_std": 0.6526099145412445, |
|
"rewards/cosine_scaled_reward": -0.17687800526618958, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2856.5833740234375, |
|
"epoch": 0.06571428571428571, |
|
"grad_norm": 0.030268969014286995, |
|
"kl": 0.0810546875, |
|
"learning_rate": 9.558246368823013e-05, |
|
"loss": 0.0175, |
|
"reward": -0.05208939407020807, |
|
"reward_std": 0.41136957705020905, |
|
"rewards/cosine_scaled_reward": -0.192711366340518, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2494.666748046875, |
|
"epoch": 0.06628571428571428, |
|
"grad_norm": 0.12291796505451202, |
|
"kl": 0.063690185546875, |
|
"learning_rate": 9.544573208346253e-05, |
|
"loss": 0.2623, |
|
"reward": -0.11325077712535858, |
|
"reward_std": 0.706281989812851, |
|
"rewards/cosine_scaled_reward": -0.3066254146397114, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3124.9166870117188, |
|
"epoch": 0.06685714285714285, |
|
"grad_norm": 0.021119866520166397, |
|
"kl": 0.07550048828125, |
|
"learning_rate": 9.530702921077358e-05, |
|
"loss": -0.0377, |
|
"reward": 0.1393863447010517, |
|
"reward_std": 0.1535858940333128, |
|
"rewards/cosine_scaled_reward": -0.05530684255063534, |
|
"rewards/format_reward": 0.25, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2799.3333587646484, |
|
"epoch": 0.06742857142857143, |
|
"grad_norm": 0.04394465312361717, |
|
"kl": 0.0882568359375, |
|
"learning_rate": 9.516636183034565e-05, |
|
"loss": 0.0588, |
|
"reward": -0.04012691602110863, |
|
"reward_std": 0.7108660526573658, |
|
"rewards/cosine_scaled_reward": -0.18673011288046837, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2137.750015258789, |
|
"epoch": 0.068, |
|
"grad_norm": 0.02675880491733551, |
|
"kl": 0.09320068359375, |
|
"learning_rate": 9.50237367981084e-05, |
|
"loss": 0.0715, |
|
"reward": 0.5401680022478104, |
|
"reward_std": 0.37584915198385715, |
|
"rewards/cosine_scaled_reward": 0.02008398249745369, |
|
"rewards/format_reward": 0.5, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3519.0, |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.027520829811692238, |
|
"kl": 0.113037109375, |
|
"learning_rate": 9.487916106540466e-05, |
|
"loss": 0.0444, |
|
"reward": -0.6826352626085281, |
|
"reward_std": 0.5236286884173751, |
|
"rewards/cosine_scaled_reward": -0.36215095594525337, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3382.125, |
|
"epoch": 0.06914285714285714, |
|
"grad_norm": 0.04471747577190399, |
|
"kl": 0.1441650390625, |
|
"learning_rate": 9.473264167865173e-05, |
|
"loss": 0.0948, |
|
"reward": -0.5512382835149765, |
|
"reward_std": 0.39517808333039284, |
|
"rewards/cosine_scaled_reward": -0.31728582084178925, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1638.5000305175781, |
|
"epoch": 0.06971428571428571, |
|
"grad_norm": 0.11067837476730347, |
|
"kl": 0.04302978515625, |
|
"learning_rate": 9.458418577899775e-05, |
|
"loss": 0.2051, |
|
"reward": 1.2672786926850677, |
|
"reward_std": 0.5117088668048382, |
|
"rewards/cosine_scaled_reward": 0.17530599236488342, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1879.1250610351562, |
|
"epoch": 0.07028571428571428, |
|
"grad_norm": 0.05848046392202377, |
|
"kl": 0.074951171875, |
|
"learning_rate": 9.443380060197387e-05, |
|
"loss": 0.071, |
|
"reward": 0.7721037119626999, |
|
"reward_std": 0.8204567953944206, |
|
"rewards/cosine_scaled_reward": 0.03188517317175865, |
|
"rewards/format_reward": 0.7083333395421505, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3159.0000610351562, |
|
"epoch": 0.07085714285714285, |
|
"grad_norm": 0.0826815739274025, |
|
"kl": 0.1474609375, |
|
"learning_rate": 9.428149347714143e-05, |
|
"loss": 0.0821, |
|
"reward": 0.13420870155096054, |
|
"reward_std": 0.6483585238456726, |
|
"rewards/cosine_scaled_reward": -0.09956231713294983, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2804.8750610351562, |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.04677790403366089, |
|
"kl": 0.19384765625, |
|
"learning_rate": 9.412727182773487e-05, |
|
"loss": 0.078, |
|
"reward": 0.2003941610455513, |
|
"reward_std": 0.6851903721690178, |
|
"rewards/cosine_scaled_reward": -0.10813625901937485, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2025.3750762939453, |
|
"epoch": 0.072, |
|
"grad_norm": 0.1587214171886444, |
|
"kl": 0.119873046875, |
|
"learning_rate": 9.397114317029975e-05, |
|
"loss": 0.2926, |
|
"reward": 0.33202146738767624, |
|
"reward_std": 0.9008842334151268, |
|
"rewards/cosine_scaled_reward": -0.16732261329889297, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2622.2083740234375, |
|
"epoch": 0.07257142857142856, |
|
"grad_norm": 0.10209250450134277, |
|
"kl": 0.20458984375, |
|
"learning_rate": 9.381311511432659e-05, |
|
"loss": 0.103, |
|
"reward": -0.008071951568126678, |
|
"reward_std": 0.6357720643281937, |
|
"rewards/cosine_scaled_reward": -0.23320263996720314, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3402.666748046875, |
|
"epoch": 0.07314285714285715, |
|
"grad_norm": 0.03874148055911064, |
|
"kl": 0.31787109375, |
|
"learning_rate": 9.36531953618799e-05, |
|
"loss": 0.0657, |
|
"reward": -0.03658340871334076, |
|
"reward_std": 0.6458300985395908, |
|
"rewards/cosine_scaled_reward": -0.12245837599039078, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2692.041748046875, |
|
"epoch": 0.07371428571428572, |
|
"grad_norm": 0.08207596093416214, |
|
"kl": 0.26953125, |
|
"learning_rate": 9.349139170722281e-05, |
|
"loss": 0.0873, |
|
"reward": 0.14036400616168976, |
|
"reward_std": 0.8220642358064651, |
|
"rewards/cosine_scaled_reward": -0.13815134391188622, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2693.1250915527344, |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 0.2660582959651947, |
|
"kl": 0.309326171875, |
|
"learning_rate": 9.332771203643715e-05, |
|
"loss": 0.3145, |
|
"reward": -0.03846167027950287, |
|
"reward_std": 0.6097433939576149, |
|
"rewards/cosine_scaled_reward": -0.20673084072768688, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2838.5833740234375, |
|
"epoch": 0.07485714285714286, |
|
"grad_norm": 0.22927673161029816, |
|
"kl": 0.42578125, |
|
"learning_rate": 9.316216432703917e-05, |
|
"loss": 0.2716, |
|
"reward": -0.1591216754168272, |
|
"reward_std": 0.5838810279965401, |
|
"rewards/cosine_scaled_reward": -0.3087275102734566, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3145.9583740234375, |
|
"epoch": 0.07542857142857143, |
|
"grad_norm": 0.24140125513076782, |
|
"kl": 0.49658203125, |
|
"learning_rate": 9.299475664759069e-05, |
|
"loss": 0.2328, |
|
"reward": -0.22540412843227386, |
|
"reward_std": 0.49389100819826126, |
|
"rewards/cosine_scaled_reward": -0.2168687330558896, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2801.666748046875, |
|
"epoch": 0.076, |
|
"grad_norm": 0.1175323873758316, |
|
"kl": 0.6162109375, |
|
"learning_rate": 9.28254971573058e-05, |
|
"loss": 0.1208, |
|
"reward": 0.2671317234635353, |
|
"reward_std": 0.9439200237393379, |
|
"rewards/cosine_scaled_reward": -0.09560081362724304, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2603.3750610351562, |
|
"epoch": 0.07657142857142857, |
|
"grad_norm": 0.08401333540678024, |
|
"kl": 0.5513916015625, |
|
"learning_rate": 9.265439410565329e-05, |
|
"loss": 0.1291, |
|
"reward": -0.025265276432037354, |
|
"reward_std": 0.7744887471199036, |
|
"rewards/cosine_scaled_reward": -0.26263265684247017, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2280.8333587646484, |
|
"epoch": 0.07714285714285714, |
|
"grad_norm": 0.1608603298664093, |
|
"kl": 0.74853515625, |
|
"learning_rate": 9.248145583195448e-05, |
|
"loss": 0.2022, |
|
"reward": 0.48708484787493944, |
|
"reward_std": 0.7106279134750366, |
|
"rewards/cosine_scaled_reward": -0.1939575858414173, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1642.2083435058594, |
|
"epoch": 0.07771428571428571, |
|
"grad_norm": 0.13642534613609314, |
|
"kl": 0.98046875, |
|
"learning_rate": 9.230669076497688e-05, |
|
"loss": 0.1375, |
|
"reward": 0.5342673538252711, |
|
"reward_std": 0.8357692211866379, |
|
"rewards/cosine_scaled_reward": -0.14953299798071384, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2910.70849609375, |
|
"epoch": 0.07828571428571429, |
|
"grad_norm": 0.15120375156402588, |
|
"kl": 0.921875, |
|
"learning_rate": 9.213010742252328e-05, |
|
"loss": 0.2046, |
|
"reward": 0.15343802922870964, |
|
"reward_std": 0.9799866378307343, |
|
"rewards/cosine_scaled_reward": -0.19411433674395084, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2308.5000610351562, |
|
"epoch": 0.07885714285714286, |
|
"grad_norm": 0.11914081871509552, |
|
"kl": 1.169921875, |
|
"learning_rate": 9.195171441101669e-05, |
|
"loss": 0.2146, |
|
"reward": 0.3029663562774658, |
|
"reward_std": 0.5679962188005447, |
|
"rewards/cosine_scaled_reward": -0.2235168293118477, |
|
"rewards/format_reward": 0.7500000223517418, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2387.750030517578, |
|
"epoch": 0.07942857142857143, |
|
"grad_norm": 0.17026901245117188, |
|
"kl": 1.244140625, |
|
"learning_rate": 9.177152042508078e-05, |
|
"loss": 0.1786, |
|
"reward": 0.3169918926432729, |
|
"reward_std": 0.630860798060894, |
|
"rewards/cosine_scaled_reward": -0.11233741417527199, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2226.2083740234375, |
|
"epoch": 0.08, |
|
"grad_norm": 0.19778449833393097, |
|
"kl": 1.255859375, |
|
"learning_rate": 9.158953424711625e-05, |
|
"loss": 0.0049, |
|
"reward": 0.10223929863423109, |
|
"reward_std": 0.4931130036711693, |
|
"rewards/cosine_scaled_reward": -0.3238803520798683, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1775.416732788086, |
|
"epoch": 0.08057142857142857, |
|
"grad_norm": 0.741427481174469, |
|
"kl": 1.2548828125, |
|
"learning_rate": 9.140576474687264e-05, |
|
"loss": 0.0466, |
|
"reward": 0.4277765303850174, |
|
"reward_std": 0.6418062597513199, |
|
"rewards/cosine_scaled_reward": -0.18194507574662566, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1896.6250610351562, |
|
"epoch": 0.08114285714285714, |
|
"grad_norm": 0.16323894262313843, |
|
"kl": 1.263671875, |
|
"learning_rate": 9.122022088101614e-05, |
|
"loss": 0.1192, |
|
"reward": 0.3693200536072254, |
|
"reward_std": 0.6720193177461624, |
|
"rewards/cosine_scaled_reward": -0.23200665414333344, |
|
"rewards/format_reward": 0.8333333730697632, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2213.4583740234375, |
|
"epoch": 0.08171428571428571, |
|
"grad_norm": 0.18504098057746887, |
|
"kl": 1.0888671875, |
|
"learning_rate": 9.1032911692693e-05, |
|
"loss": 0.2741, |
|
"reward": 0.8721700385212898, |
|
"reward_std": 0.6731484234333038, |
|
"rewards/cosine_scaled_reward": -0.04308167099952698, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2104.666748046875, |
|
"epoch": 0.08228571428571428, |
|
"grad_norm": 0.08342334628105164, |
|
"kl": 1.12890625, |
|
"learning_rate": 9.084384631108883e-05, |
|
"loss": 0.1517, |
|
"reward": 0.19961576722562313, |
|
"reward_std": 0.7329239994287491, |
|
"rewards/cosine_scaled_reward": -0.27519211545586586, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1690.6666870117188, |
|
"epoch": 0.08285714285714285, |
|
"grad_norm": 0.18242701888084412, |
|
"kl": 1.08984375, |
|
"learning_rate": 9.065303395098359e-05, |
|
"loss": 0.0593, |
|
"reward": 0.619764544069767, |
|
"reward_std": 0.8105409517884254, |
|
"rewards/cosine_scaled_reward": -0.14845106936991215, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2374.416748046875, |
|
"epoch": 0.08342857142857144, |
|
"grad_norm": 0.21408452093601227, |
|
"kl": 1.177734375, |
|
"learning_rate": 9.046048391230248e-05, |
|
"loss": 0.0769, |
|
"reward": 0.3972553052008152, |
|
"reward_std": 0.6599168851971626, |
|
"rewards/cosine_scaled_reward": -0.21803902462124825, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1720.8333892822266, |
|
"epoch": 0.084, |
|
"grad_norm": 0.11365830153226852, |
|
"kl": 0.884765625, |
|
"learning_rate": 9.02662055796628e-05, |
|
"loss": 0.0353, |
|
"reward": 0.5338674746453762, |
|
"reward_std": 0.8632927983999252, |
|
"rewards/cosine_scaled_reward": -0.10806626826524734, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1343.6250305175781, |
|
"epoch": 0.08457142857142858, |
|
"grad_norm": 0.17277319729328156, |
|
"kl": 0.734375, |
|
"learning_rate": 9.007020842191635e-05, |
|
"loss": 0.1714, |
|
"reward": 0.5662912502884865, |
|
"reward_std": 0.7500499784946442, |
|
"rewards/cosine_scaled_reward": -0.13352105021476746, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1988.5417785644531, |
|
"epoch": 0.08514285714285715, |
|
"grad_norm": 0.1171557605266571, |
|
"kl": 1.1796875, |
|
"learning_rate": 8.987250199168808e-05, |
|
"loss": 0.0848, |
|
"reward": 0.35671424493193626, |
|
"reward_std": 0.49780040234327316, |
|
"rewards/cosine_scaled_reward": -0.17580955289304256, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1752.0000915527344, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.14017713069915771, |
|
"kl": 0.96435546875, |
|
"learning_rate": 8.967309592491052e-05, |
|
"loss": -0.0065, |
|
"reward": 0.5877555161714554, |
|
"reward_std": 0.7223709151148796, |
|
"rewards/cosine_scaled_reward": -0.16445559449493885, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2179.4583740234375, |
|
"epoch": 0.08628571428571429, |
|
"grad_norm": 0.0961502268910408, |
|
"kl": 0.9990234375, |
|
"learning_rate": 8.947199994035401e-05, |
|
"loss": 0.116, |
|
"reward": 0.4838414415717125, |
|
"reward_std": 0.39460384100675583, |
|
"rewards/cosine_scaled_reward": -0.19557928666472435, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1957.2084350585938, |
|
"epoch": 0.08685714285714285, |
|
"grad_norm": 0.21524538099765778, |
|
"kl": 0.92041015625, |
|
"learning_rate": 8.926922383915316e-05, |
|
"loss": 0.2778, |
|
"reward": 1.0773675739765167, |
|
"reward_std": 1.0207276940345764, |
|
"rewards/cosine_scaled_reward": 0.14285043627023697, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2126.166748046875, |
|
"epoch": 0.08742857142857142, |
|
"grad_norm": 0.18447129428386688, |
|
"kl": 0.875, |
|
"learning_rate": 8.906477750432904e-05, |
|
"loss": 0.2488, |
|
"reward": 0.905040979385376, |
|
"reward_std": 0.779262512922287, |
|
"rewards/cosine_scaled_reward": -0.02664615958929062, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2594.4584350585938, |
|
"epoch": 0.088, |
|
"grad_norm": 0.1024523600935936, |
|
"kl": 1.0654296875, |
|
"learning_rate": 8.885867090030761e-05, |
|
"loss": 0.0841, |
|
"reward": 0.5415095314383507, |
|
"reward_std": 0.7814789414405823, |
|
"rewards/cosine_scaled_reward": -0.10424522310495377, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2428.166748046875, |
|
"epoch": 0.08857142857142856, |
|
"grad_norm": 0.07103646546602249, |
|
"kl": 0.88232421875, |
|
"learning_rate": 8.865091407243394e-05, |
|
"loss": 0.0955, |
|
"reward": 0.2886502370238304, |
|
"reward_std": 0.6928622350096703, |
|
"rewards/cosine_scaled_reward": -0.2098415493965149, |
|
"rewards/format_reward": 0.7083333507180214, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1767.3334045410156, |
|
"epoch": 0.08914285714285715, |
|
"grad_norm": 0.15649987757205963, |
|
"kl": 0.688720703125, |
|
"learning_rate": 8.844151714648274e-05, |
|
"loss": 0.0392, |
|
"reward": 0.8681200444698334, |
|
"reward_std": 0.5367059111595154, |
|
"rewards/cosine_scaled_reward": -0.024273302406072617, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2410.5833740234375, |
|
"epoch": 0.08971428571428572, |
|
"grad_norm": 0.12421420216560364, |
|
"kl": 1.00390625, |
|
"learning_rate": 8.823049032816479e-05, |
|
"loss": 0.1328, |
|
"reward": 0.6280432939529419, |
|
"reward_std": 0.6566349938511848, |
|
"rewards/cosine_scaled_reward": -0.10264504700899124, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2807.6666870117188, |
|
"epoch": 0.09028571428571429, |
|
"grad_norm": 0.1208081841468811, |
|
"kl": 1.0380859375, |
|
"learning_rate": 8.801784390262944e-05, |
|
"loss": 0.0709, |
|
"reward": 0.4343430995941162, |
|
"reward_std": 0.4182932637631893, |
|
"rewards/cosine_scaled_reward": -0.2203284539282322, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2652.791778564453, |
|
"epoch": 0.09085714285714286, |
|
"grad_norm": 0.17998459935188293, |
|
"kl": 0.9296875, |
|
"learning_rate": 8.780358823396352e-05, |
|
"loss": 0.0068, |
|
"reward": 0.2543765474110842, |
|
"reward_std": 0.7230948582291603, |
|
"rewards/cosine_scaled_reward": -0.1853117246646434, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2881.916748046875, |
|
"epoch": 0.09142857142857143, |
|
"grad_norm": 0.06644508242607117, |
|
"kl": 0.955078125, |
|
"learning_rate": 8.758773376468606e-05, |
|
"loss": 0.1401, |
|
"reward": 0.3839067495428026, |
|
"reward_std": 0.6370702758431435, |
|
"rewards/cosine_scaled_reward": -0.2247133031487465, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3114.4584350585938, |
|
"epoch": 0.092, |
|
"grad_norm": 0.11562149226665497, |
|
"kl": 0.9951171875, |
|
"learning_rate": 8.73702910152393e-05, |
|
"loss": 0.067, |
|
"reward": 0.24938225746154785, |
|
"reward_std": 0.6529880091547966, |
|
"rewards/cosine_scaled_reward": -0.1669755440670997, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2711.0834350585938, |
|
"epoch": 0.09257142857142857, |
|
"grad_norm": 0.06730871647596359, |
|
"kl": 0.8408203125, |
|
"learning_rate": 8.715127058347615e-05, |
|
"loss": 0.0675, |
|
"reward": 0.2713778093457222, |
|
"reward_std": 0.6517889946699142, |
|
"rewards/cosine_scaled_reward": -0.21847776509821415, |
|
"rewards/format_reward": 0.7083333507180214, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2407.3334350585938, |
|
"epoch": 0.09314285714285714, |
|
"grad_norm": 0.12514488399028778, |
|
"kl": 0.7216796875, |
|
"learning_rate": 8.693068314414344e-05, |
|
"loss": 0.1883, |
|
"reward": 0.5021318048238754, |
|
"reward_std": 0.5789435803890228, |
|
"rewards/cosine_scaled_reward": -0.18643410876393318, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2688.0833740234375, |
|
"epoch": 0.09371428571428571, |
|
"grad_norm": 0.1626950055360794, |
|
"kl": 0.791015625, |
|
"learning_rate": 8.670853944836176e-05, |
|
"loss": 0.2289, |
|
"reward": 0.17175179324112833, |
|
"reward_std": 0.4885940235108137, |
|
"rewards/cosine_scaled_reward": -0.24745745211839676, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2521.666717529297, |
|
"epoch": 0.09428571428571429, |
|
"grad_norm": 0.27727189660072327, |
|
"kl": 0.711669921875, |
|
"learning_rate": 8.648485032310145e-05, |
|
"loss": 0.1999, |
|
"reward": 0.3786450959742069, |
|
"reward_std": 0.6580736637115479, |
|
"rewards/cosine_scaled_reward": -0.20651079155504704, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3203.416748046875, |
|
"epoch": 0.09485714285714286, |
|
"grad_norm": 0.11694037914276123, |
|
"kl": 0.9208984375, |
|
"learning_rate": 8.625962667065488e-05, |
|
"loss": 0.0458, |
|
"reward": 0.5847738608717918, |
|
"reward_std": 0.642599880695343, |
|
"rewards/cosine_scaled_reward": -0.12427974189631641, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2589.2084045410156, |
|
"epoch": 0.09542857142857143, |
|
"grad_norm": 0.09123259037733078, |
|
"kl": 0.6767578125, |
|
"learning_rate": 8.603287946810515e-05, |
|
"loss": 0.1078, |
|
"reward": 0.7590624950826168, |
|
"reward_std": 1.0844984501600266, |
|
"rewards/cosine_scaled_reward": 0.025364567525684834, |
|
"rewards/format_reward": 0.7083333544433117, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2705.375030517578, |
|
"epoch": 0.096, |
|
"grad_norm": 0.2029721587896347, |
|
"kl": 0.70361328125, |
|
"learning_rate": 8.5804619766791e-05, |
|
"loss": 0.2496, |
|
"reward": 0.5522685013711452, |
|
"reward_std": 0.6528475284576416, |
|
"rewards/cosine_scaled_reward": -0.09886575862765312, |
|
"rewards/format_reward": 0.7500000223517418, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2472.041793823242, |
|
"epoch": 0.09657142857142857, |
|
"grad_norm": 0.10081025213003159, |
|
"kl": 0.72509765625, |
|
"learning_rate": 8.557485869176826e-05, |
|
"loss": 0.1258, |
|
"reward": 0.5760277360677719, |
|
"reward_std": 0.8965695053339005, |
|
"rewards/cosine_scaled_reward": -0.1494861477985978, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2640.9584350585938, |
|
"epoch": 0.09714285714285714, |
|
"grad_norm": 0.06490536034107208, |
|
"kl": 0.6435546875, |
|
"learning_rate": 8.534360744126755e-05, |
|
"loss": 0.1257, |
|
"reward": 0.5584607645869255, |
|
"reward_std": 0.6074354127049446, |
|
"rewards/cosine_scaled_reward": -0.15826964005827904, |
|
"rewards/format_reward": 0.875, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2400.916732788086, |
|
"epoch": 0.09771428571428571, |
|
"grad_norm": 0.10735026746988297, |
|
"kl": 0.585693359375, |
|
"learning_rate": 8.511087728614862e-05, |
|
"loss": 0.0496, |
|
"reward": 0.3593628406524658, |
|
"reward_std": 0.6010008379817009, |
|
"rewards/cosine_scaled_reward": -0.1953185722231865, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1632.1250610351562, |
|
"epoch": 0.09828571428571428, |
|
"grad_norm": 0.5536447167396545, |
|
"kl": 0.45166015625, |
|
"learning_rate": 8.487667956935088e-05, |
|
"loss": 0.4406, |
|
"reward": 0.30203498154878616, |
|
"reward_std": 0.40830350667238235, |
|
"rewards/cosine_scaled_reward": -0.2448158636689186, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1648.2500305175781, |
|
"epoch": 0.09885714285714285, |
|
"grad_norm": 0.41387858986854553, |
|
"kl": 0.41180419921875, |
|
"learning_rate": 8.464102570534061e-05, |
|
"loss": 0.3343, |
|
"reward": 0.7647038325667381, |
|
"reward_std": 0.6418692655861378, |
|
"rewards/cosine_scaled_reward": -0.05514809489250183, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2472.3334350585938, |
|
"epoch": 0.09942857142857142, |
|
"grad_norm": 0.18969063460826874, |
|
"kl": 0.8681640625, |
|
"learning_rate": 8.440392717955476e-05, |
|
"loss": 0.2013, |
|
"reward": 0.21441331086680293, |
|
"reward_std": 0.5078516378998756, |
|
"rewards/cosine_scaled_reward": -0.26779336854815483, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2739.4583740234375, |
|
"epoch": 0.1, |
|
"grad_norm": 0.16940070688724518, |
|
"kl": 0.9853515625, |
|
"learning_rate": 8.416539554784089e-05, |
|
"loss": 0.2297, |
|
"reward": 0.642971895635128, |
|
"reward_std": 1.0668489187955856, |
|
"rewards/cosine_scaled_reward": -0.011847391724586487, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1711.041732788086, |
|
"epoch": 0.10057142857142858, |
|
"grad_norm": 0.1455065906047821, |
|
"kl": 0.763671875, |
|
"learning_rate": 8.392544243589427e-05, |
|
"loss": 0.0957, |
|
"reward": 0.7176935896277428, |
|
"reward_std": 0.4694051705300808, |
|
"rewards/cosine_scaled_reward": -0.07865320146083832, |
|
"rewards/format_reward": 0.875, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1795.0417022705078, |
|
"epoch": 0.10114285714285715, |
|
"grad_norm": 0.3457939028739929, |
|
"kl": 0.7919921875, |
|
"learning_rate": 8.368407953869104e-05, |
|
"loss": 0.3018, |
|
"reward": 0.7449050601571798, |
|
"reward_std": 0.9370853900909424, |
|
"rewards/cosine_scaled_reward": -0.04421415273100138, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2875.7500610351562, |
|
"epoch": 0.10171428571428572, |
|
"grad_norm": 0.12909594178199768, |
|
"kl": 1.1767578125, |
|
"learning_rate": 8.34413186199183e-05, |
|
"loss": 0.1559, |
|
"reward": 0.2506026141345501, |
|
"reward_std": 0.4546685218811035, |
|
"rewards/cosine_scaled_reward": -0.249698705971241, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1944.0417175292969, |
|
"epoch": 0.10228571428571429, |
|
"grad_norm": 0.2241043746471405, |
|
"kl": 0.85870361328125, |
|
"learning_rate": 8.319717151140073e-05, |
|
"loss": 0.2912, |
|
"reward": 1.0774075190420263, |
|
"reward_std": 0.8805922865867615, |
|
"rewards/cosine_scaled_reward": 0.12203706055879593, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2280.8334350585938, |
|
"epoch": 0.10285714285714286, |
|
"grad_norm": 0.20919553935527802, |
|
"kl": 1.177734375, |
|
"learning_rate": 8.295165011252397e-05, |
|
"loss": 0.2332, |
|
"reward": 0.11883920338004827, |
|
"reward_std": 0.5948300361633301, |
|
"rewards/cosine_scaled_reward": -0.27391375228762627, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2596.416732788086, |
|
"epoch": 0.10342857142857143, |
|
"grad_norm": 0.18407326936721802, |
|
"kl": 0.93310546875, |
|
"learning_rate": 8.270476638965462e-05, |
|
"loss": 0.1702, |
|
"reward": 0.45228124409914017, |
|
"reward_std": 0.7070431187748909, |
|
"rewards/cosine_scaled_reward": -0.0863594114780426, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1908.9167175292969, |
|
"epoch": 0.104, |
|
"grad_norm": 0.3261708617210388, |
|
"kl": 1.4140625, |
|
"learning_rate": 8.245653237555706e-05, |
|
"loss": 0.1104, |
|
"reward": 0.36228151875548065, |
|
"reward_std": 0.8484360724687576, |
|
"rewards/cosine_scaled_reward": -0.1730259107425809, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2503.2084045410156, |
|
"epoch": 0.10457142857142857, |
|
"grad_norm": 0.2822038531303406, |
|
"kl": 1.244140625, |
|
"learning_rate": 8.220696016880688e-05, |
|
"loss": 0.3006, |
|
"reward": -0.05789138190448284, |
|
"reward_std": 0.6969511806964874, |
|
"rewards/cosine_scaled_reward": -0.2997790314257145, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1496.3750305175781, |
|
"epoch": 0.10514285714285715, |
|
"grad_norm": 0.2194892019033432, |
|
"kl": 0.9404296875, |
|
"learning_rate": 8.195606193320136e-05, |
|
"loss": 0.0467, |
|
"reward": 0.7817502450197935, |
|
"reward_std": 0.814917117357254, |
|
"rewards/cosine_scaled_reward": -0.004958219826221466, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1257.041732788086, |
|
"epoch": 0.10571428571428572, |
|
"grad_norm": 0.3073634207248688, |
|
"kl": 0.860107421875, |
|
"learning_rate": 8.170384989716657e-05, |
|
"loss": 0.1978, |
|
"reward": 1.0940335839986801, |
|
"reward_std": 0.3842375408858061, |
|
"rewards/cosine_scaled_reward": 0.06785010732710361, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1938.2500762939453, |
|
"epoch": 0.10628571428571429, |
|
"grad_norm": 0.16985982656478882, |
|
"kl": 0.8951416015625, |
|
"learning_rate": 8.14503363531613e-05, |
|
"loss": 0.1556, |
|
"reward": 0.7376667633652687, |
|
"reward_std": 0.8680930733680725, |
|
"rewards/cosine_scaled_reward": -0.0061665866523981094, |
|
"rewards/format_reward": 0.7500000223517418, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1639.0833740234375, |
|
"epoch": 0.10685714285714286, |
|
"grad_norm": 0.5398156642913818, |
|
"kl": 0.888671875, |
|
"learning_rate": 8.119553365707803e-05, |
|
"loss": 0.5061, |
|
"reward": 0.03231507260352373, |
|
"reward_std": 0.5724686309695244, |
|
"rewards/cosine_scaled_reward": -0.29634247720241547, |
|
"rewards/format_reward": 0.6250000223517418, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2887.6250610351562, |
|
"epoch": 0.10742857142857143, |
|
"grad_norm": 0.19194325804710388, |
|
"kl": 1.197265625, |
|
"learning_rate": 8.09394542276407e-05, |
|
"loss": 0.2126, |
|
"reward": -0.3554415591061115, |
|
"reward_std": 0.7380103319883347, |
|
"rewards/cosine_scaled_reward": -0.3235541209578514, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3177.5833740234375, |
|
"epoch": 0.108, |
|
"grad_norm": 0.13098645210266113, |
|
"kl": 1.07421875, |
|
"learning_rate": 8.068211054579944e-05, |
|
"loss": 0.1842, |
|
"reward": -0.6138647869229317, |
|
"reward_std": 0.514356566593051, |
|
"rewards/cosine_scaled_reward": -0.36943238973617554, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2187.791717529297, |
|
"epoch": 0.10857142857142857, |
|
"grad_norm": 0.13339217007160187, |
|
"kl": 0.84423828125, |
|
"learning_rate": 8.042351515412221e-05, |
|
"loss": 0.1347, |
|
"reward": -0.0517389252781868, |
|
"reward_std": 0.4778178557753563, |
|
"rewards/cosine_scaled_reward": -0.2550361379981041, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1908.4583587646484, |
|
"epoch": 0.10914285714285714, |
|
"grad_norm": 0.11566094309091568, |
|
"kl": 0.6126708984375, |
|
"learning_rate": 8.016368065618361e-05, |
|
"loss": 0.179, |
|
"reward": 0.044671330600976944, |
|
"reward_std": 0.2592208320274949, |
|
"rewards/cosine_scaled_reward": -0.2693310081958771, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1878.2500839233398, |
|
"epoch": 0.10971428571428571, |
|
"grad_norm": 0.170388862490654, |
|
"kl": 0.915771484375, |
|
"learning_rate": 7.99026197159505e-05, |
|
"loss": 0.2365, |
|
"reward": 0.5707967132329941, |
|
"reward_std": 0.27280137967318296, |
|
"rewards/cosine_scaled_reward": 0.035398345440626144, |
|
"rewards/format_reward": 0.5, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2798.2500610351562, |
|
"epoch": 0.11028571428571429, |
|
"grad_norm": 0.1819745898246765, |
|
"kl": 0.9326171875, |
|
"learning_rate": 7.964034505716477e-05, |
|
"loss": 0.2348, |
|
"reward": -0.5285695753991604, |
|
"reward_std": 0.5819516181945801, |
|
"rewards/cosine_scaled_reward": -0.36845147609710693, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2043.166732788086, |
|
"epoch": 0.11085714285714286, |
|
"grad_norm": 0.20014306902885437, |
|
"kl": 0.966796875, |
|
"learning_rate": 7.93768694627233e-05, |
|
"loss": 0.2299, |
|
"reward": 0.49893204495310783, |
|
"reward_std": 0.8825281783938408, |
|
"rewards/cosine_scaled_reward": 0.06196599453687668, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1638.791732788086, |
|
"epoch": 0.11142857142857143, |
|
"grad_norm": 0.37110522389411926, |
|
"kl": 0.8349609375, |
|
"learning_rate": 7.911220577405484e-05, |
|
"loss": 0.1974, |
|
"reward": 0.5442861206829548, |
|
"reward_std": 0.8290757983922958, |
|
"rewards/cosine_scaled_reward": -0.01952361688017845, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1842.208381652832, |
|
"epoch": 0.112, |
|
"grad_norm": 0.11744756251573563, |
|
"kl": 0.5994873046875, |
|
"learning_rate": 7.884636689049423e-05, |
|
"loss": 0.1054, |
|
"reward": 0.4689091891050339, |
|
"reward_std": 0.5096632726490498, |
|
"rewards/cosine_scaled_reward": -0.03637874871492386, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2997.8750610351562, |
|
"epoch": 0.11257142857142857, |
|
"grad_norm": 0.16055168211460114, |
|
"kl": 1.0537109375, |
|
"learning_rate": 7.857936576865357e-05, |
|
"loss": 0.1749, |
|
"reward": -0.3116486147046089, |
|
"reward_std": 0.7263150736689568, |
|
"rewards/cosine_scaled_reward": -0.23915765061974525, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2395.2500915527344, |
|
"epoch": 0.11314285714285714, |
|
"grad_norm": 0.17808175086975098, |
|
"kl": 0.9326171875, |
|
"learning_rate": 7.831121542179087e-05, |
|
"loss": 0.1602, |
|
"reward": 0.15749725699424744, |
|
"reward_std": 0.8375851400196552, |
|
"rewards/cosine_scaled_reward": -0.10875139385461807, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2222.7084350585938, |
|
"epoch": 0.11371428571428571, |
|
"grad_norm": 0.29520052671432495, |
|
"kl": 1.09716796875, |
|
"learning_rate": 7.804192891917572e-05, |
|
"loss": 0.3001, |
|
"reward": -0.023016322404146194, |
|
"reward_std": 0.7964953854680061, |
|
"rewards/cosine_scaled_reward": -0.17817485332489014, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2596.916748046875, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.17907942831516266, |
|
"kl": 1.1689453125, |
|
"learning_rate": 7.777151938545237e-05, |
|
"loss": 0.1107, |
|
"reward": -0.19867387227714062, |
|
"reward_std": 0.3830955922603607, |
|
"rewards/cosine_scaled_reward": -0.16183693706989288, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2207.291717529297, |
|
"epoch": 0.11485714285714285, |
|
"grad_norm": 0.2985213100910187, |
|
"kl": 0.92431640625, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.2348, |
|
"reward": 0.06418987736105919, |
|
"reward_std": 0.6583275869488716, |
|
"rewards/cosine_scaled_reward": -0.15540507063269615, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2214.291732788086, |
|
"epoch": 0.11542857142857142, |
|
"grad_norm": 0.31010299921035767, |
|
"kl": 1.043212890625, |
|
"learning_rate": 7.72273839962904e-05, |
|
"loss": 0.0332, |
|
"reward": 0.3318791128695011, |
|
"reward_std": 0.5672735497355461, |
|
"rewards/cosine_scaled_reward": -0.04239379055798054, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2259.041778564453, |
|
"epoch": 0.116, |
|
"grad_norm": 0.21049495041370392, |
|
"kl": 1.07421875, |
|
"learning_rate": 7.695368466124298e-05, |
|
"loss": 0.1093, |
|
"reward": 0.018525540828704834, |
|
"reward_std": 0.5658747386187315, |
|
"rewards/cosine_scaled_reward": -0.15740390308201313, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1843.2084197998047, |
|
"epoch": 0.11657142857142858, |
|
"grad_norm": 0.14651866257190704, |
|
"kl": 0.8848876953125, |
|
"learning_rate": 7.667891533457719e-05, |
|
"loss": 0.0457, |
|
"reward": 0.1518230028450489, |
|
"reward_std": 0.3332018107175827, |
|
"rewards/cosine_scaled_reward": -0.15325517859309912, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2268.2916870117188, |
|
"epoch": 0.11714285714285715, |
|
"grad_norm": 0.2668810486793518, |
|
"kl": 0.9375, |
|
"learning_rate": 7.64030894081624e-05, |
|
"loss": 0.3148, |
|
"reward": -0.03797488193958998, |
|
"reward_std": 0.48975174129009247, |
|
"rewards/cosine_scaled_reward": -0.24815411865711212, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2495.541748046875, |
|
"epoch": 0.11771428571428572, |
|
"grad_norm": 0.2810802459716797, |
|
"kl": 1.427734375, |
|
"learning_rate": 7.612622032536509e-05, |
|
"loss": 0.2099, |
|
"reward": -0.0652031796053052, |
|
"reward_std": 0.4952133148908615, |
|
"rewards/cosine_scaled_reward": -0.1992682572454214, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2103.4584045410156, |
|
"epoch": 0.11828571428571429, |
|
"grad_norm": 0.9285241365432739, |
|
"kl": 1.095703125, |
|
"learning_rate": 7.58483215803938e-05, |
|
"loss": 0.1823, |
|
"reward": 0.27077991724945605, |
|
"reward_std": 0.5788689702749252, |
|
"rewards/cosine_scaled_reward": -0.09377672243863344, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1963.5000915527344, |
|
"epoch": 0.11885714285714286, |
|
"grad_norm": 0.7166336178779602, |
|
"kl": 1.20263671875, |
|
"learning_rate": 7.556940671764125e-05, |
|
"loss": 0.5024, |
|
"reward": 0.5436707381159067, |
|
"reward_std": 0.9241102710366249, |
|
"rewards/cosine_scaled_reward": 0.06350202858448029, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1874.7917098999023, |
|
"epoch": 0.11942857142857143, |
|
"grad_norm": 0.2397138923406601, |
|
"kl": 0.9052734375, |
|
"learning_rate": 7.52894893310244e-05, |
|
"loss": 0.2168, |
|
"reward": 0.7744720131158829, |
|
"reward_std": 0.30723479902371764, |
|
"rewards/cosine_scaled_reward": 0.1372359935194254, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1800.0833587646484, |
|
"epoch": 0.12, |
|
"grad_norm": 0.4871465861797333, |
|
"kl": 1.25341796875, |
|
"learning_rate": 7.500858306332173e-05, |
|
"loss": 0.1891, |
|
"reward": 0.04496807977557182, |
|
"reward_std": 0.5449698269367218, |
|
"rewards/cosine_scaled_reward": -0.22751596197485924, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1229.3333587646484, |
|
"epoch": 0.12057142857142857, |
|
"grad_norm": 0.41425731778144836, |
|
"kl": 0.82958984375, |
|
"learning_rate": 7.472670160550849e-05, |
|
"loss": 0.22, |
|
"reward": 1.015406172722578, |
|
"reward_std": 0.726126566529274, |
|
"rewards/cosine_scaled_reward": 0.19520305842161179, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3078.3751220703125, |
|
"epoch": 0.12114285714285715, |
|
"grad_norm": 0.33234572410583496, |
|
"kl": 1.435546875, |
|
"learning_rate": 7.444385869608922e-05, |
|
"loss": 0.1149, |
|
"reward": -0.23881859704852104, |
|
"reward_std": 0.4091813191771507, |
|
"rewards/cosine_scaled_reward": -0.22357597202062607, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2063.0834350585938, |
|
"epoch": 0.12171428571428572, |
|
"grad_norm": 0.21360896527767181, |
|
"kl": 1.5478515625, |
|
"learning_rate": 7.416006812042828e-05, |
|
"loss": 0.2844, |
|
"reward": 0.1532426355406642, |
|
"reward_std": 0.6949435919523239, |
|
"rewards/cosine_scaled_reward": -0.11087869806215167, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1602.3333740234375, |
|
"epoch": 0.12228571428571429, |
|
"grad_norm": 0.7172893285751343, |
|
"kl": 1.461669921875, |
|
"learning_rate": 7.387534371007797e-05, |
|
"loss": 0.2138, |
|
"reward": 0.2475161775946617, |
|
"reward_std": 0.23651241697371006, |
|
"rewards/cosine_scaled_reward": -0.14707526192069054, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2274.416717529297, |
|
"epoch": 0.12285714285714286, |
|
"grad_norm": 1.329139232635498, |
|
"kl": 2.357421875, |
|
"learning_rate": 7.358969934210438e-05, |
|
"loss": 0.1054, |
|
"reward": -0.20811030641198158, |
|
"reward_std": 0.7684681564569473, |
|
"rewards/cosine_scaled_reward": -0.18738848436623812, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2036.7083740234375, |
|
"epoch": 0.12342857142857143, |
|
"grad_norm": 0.3380753993988037, |
|
"kl": 1.658203125, |
|
"learning_rate": 7.330314893841101e-05, |
|
"loss": 0.282, |
|
"reward": 0.2936302299494855, |
|
"reward_std": 0.956149123609066, |
|
"rewards/cosine_scaled_reward": 0.0009817667305469513, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2718.3334350585938, |
|
"epoch": 0.124, |
|
"grad_norm": 0.37306633591651917, |
|
"kl": 1.537109375, |
|
"learning_rate": 7.301570646506028e-05, |
|
"loss": 0.0517, |
|
"reward": -0.10302772559225559, |
|
"reward_std": 0.4567326009273529, |
|
"rewards/cosine_scaled_reward": -0.15568053536117077, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2313.4583740234375, |
|
"epoch": 0.12457142857142857, |
|
"grad_norm": 0.3978230953216553, |
|
"kl": 1.35546875, |
|
"learning_rate": 7.27273859315928e-05, |
|
"loss": 0.2971, |
|
"reward": 0.18653920199722052, |
|
"reward_std": 0.5482294261455536, |
|
"rewards/cosine_scaled_reward": -0.1567304128257092, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1831.2083740234375, |
|
"epoch": 0.12514285714285714, |
|
"grad_norm": 0.5213355422019958, |
|
"kl": 1.0810546875, |
|
"learning_rate": 7.243820139034464e-05, |
|
"loss": 0.0582, |
|
"reward": 0.3415638351580128, |
|
"reward_std": 0.6902635730803013, |
|
"rewards/cosine_scaled_reward": -0.14171809703111649, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2493.541748046875, |
|
"epoch": 0.12571428571428572, |
|
"grad_norm": 0.08655526489019394, |
|
"kl": 0.808837890625, |
|
"learning_rate": 7.214816693576235e-05, |
|
"loss": 0.0852, |
|
"reward": 0.891080267727375, |
|
"reward_std": 0.9232172593474388, |
|
"rewards/cosine_scaled_reward": 0.11220681853592396, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2407.5000610351562, |
|
"epoch": 0.12628571428571428, |
|
"grad_norm": 0.1971471756696701, |
|
"kl": 1.009765625, |
|
"learning_rate": 7.185729670371605e-05, |
|
"loss": 0.266, |
|
"reward": 0.38966307416558266, |
|
"reward_std": 0.9727390855550766, |
|
"rewards/cosine_scaled_reward": -0.03433513268828392, |
|
"rewards/format_reward": 0.4583333469927311, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2275.500030517578, |
|
"epoch": 0.12685714285714286, |
|
"grad_norm": 0.23895929753780365, |
|
"kl": 0.55322265625, |
|
"learning_rate": 7.156560487081053e-05, |
|
"loss": 0.2723, |
|
"reward": 0.5348676145076752, |
|
"reward_std": 0.4934211131185293, |
|
"rewards/cosine_scaled_reward": -0.12839954253286123, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2512.2083740234375, |
|
"epoch": 0.12742857142857142, |
|
"grad_norm": 0.09032748639583588, |
|
"kl": 0.7138671875, |
|
"learning_rate": 7.127310565369415e-05, |
|
"loss": 0.031, |
|
"reward": 0.257486991584301, |
|
"reward_std": 0.4581068307161331, |
|
"rewards/cosine_scaled_reward": -0.246256522834301, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3016.2501220703125, |
|
"epoch": 0.128, |
|
"grad_norm": 0.08650817722082138, |
|
"kl": 0.82421875, |
|
"learning_rate": 7.097981330836617e-05, |
|
"loss": 0.0687, |
|
"reward": 0.2571569848805666, |
|
"reward_std": 0.7694349959492683, |
|
"rewards/cosine_scaled_reward": -0.07975485920906067, |
|
"rewards/format_reward": 0.4166666753590107, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2969.541717529297, |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.07500854134559631, |
|
"kl": 0.8349609375, |
|
"learning_rate": 7.068574212948169e-05, |
|
"loss": 0.1226, |
|
"reward": 0.1223123692907393, |
|
"reward_std": 0.6229890622198582, |
|
"rewards/cosine_scaled_reward": -0.14717714861035347, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3157.5834350585938, |
|
"epoch": 0.12914285714285714, |
|
"grad_norm": 0.09925863891839981, |
|
"kl": 0.79296875, |
|
"learning_rate": 7.03909064496551e-05, |
|
"loss": 0.0567, |
|
"reward": 0.4053786303848028, |
|
"reward_std": 1.0473359823226929, |
|
"rewards/cosine_scaled_reward": -0.04731069877743721, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2760.791732788086, |
|
"epoch": 0.12971428571428573, |
|
"grad_norm": 0.12794190645217896, |
|
"kl": 0.614013671875, |
|
"learning_rate": 7.009532063876149e-05, |
|
"loss": 0.0846, |
|
"reward": 0.35397324431687593, |
|
"reward_std": 0.9577793553471565, |
|
"rewards/cosine_scaled_reward": -0.09384672529995441, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2348.7083740234375, |
|
"epoch": 0.13028571428571428, |
|
"grad_norm": 0.3130943179130554, |
|
"kl": 0.71826171875, |
|
"learning_rate": 6.979899910323624e-05, |
|
"loss": 0.296, |
|
"reward": 0.4632652625441551, |
|
"reward_std": 0.8859903812408447, |
|
"rewards/cosine_scaled_reward": -0.018367409706115723, |
|
"rewards/format_reward": 0.5000000186264515, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3352.916748046875, |
|
"epoch": 0.13085714285714287, |
|
"grad_norm": 0.046103257685899734, |
|
"kl": 0.796875, |
|
"learning_rate": 6.9501956285373e-05, |
|
"loss": 0.1272, |
|
"reward": 0.5160791212692857, |
|
"reward_std": 0.8618348892778158, |
|
"rewards/cosine_scaled_reward": 0.008039550390094519, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3535.4583740234375, |
|
"epoch": 0.13142857142857142, |
|
"grad_norm": 0.10873083025217056, |
|
"kl": 0.8486328125, |
|
"learning_rate": 6.920420666261962e-05, |
|
"loss": 0.0469, |
|
"reward": -0.26604770543053746, |
|
"reward_std": 0.4407772123813629, |
|
"rewards/cosine_scaled_reward": -0.2580238524824381, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3418.5000610351562, |
|
"epoch": 0.132, |
|
"grad_norm": 0.09764978289604187, |
|
"kl": 0.9033203125, |
|
"learning_rate": 6.890576474687263e-05, |
|
"loss": 0.0588, |
|
"reward": -0.12345289438962936, |
|
"reward_std": 0.5395884811878204, |
|
"rewards/cosine_scaled_reward": -0.20755979791283607, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2774.2083740234375, |
|
"epoch": 0.13257142857142856, |
|
"grad_norm": 0.09208109974861145, |
|
"kl": 0.7041015625, |
|
"learning_rate": 6.860664508377001e-05, |
|
"loss": 0.1257, |
|
"reward": 0.3880004594102502, |
|
"reward_std": 0.6920578330755234, |
|
"rewards/cosine_scaled_reward": -0.03516644984483719, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3413.4166870117188, |
|
"epoch": 0.13314285714285715, |
|
"grad_norm": 0.09384088963270187, |
|
"kl": 0.7041015625, |
|
"learning_rate": 6.83068622519821e-05, |
|
"loss": 0.021, |
|
"reward": -0.12629218865185976, |
|
"reward_std": 0.5478029847145081, |
|
"rewards/cosine_scaled_reward": -0.33397944271564484, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3346.6250610351562, |
|
"epoch": 0.1337142857142857, |
|
"grad_norm": 0.04808826744556427, |
|
"kl": 0.6845703125, |
|
"learning_rate": 6.800643086250122e-05, |
|
"loss": 0.0825, |
|
"reward": -0.1951084854081273, |
|
"reward_std": 0.4496830254793167, |
|
"rewards/cosine_scaled_reward": -0.28505424316972494, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2900.666717529297, |
|
"epoch": 0.13428571428571429, |
|
"grad_norm": 0.1497814655303955, |
|
"kl": 0.55810546875, |
|
"learning_rate": 6.770536555792944e-05, |
|
"loss": -0.0319, |
|
"reward": 0.5402148407883942, |
|
"reward_std": 0.37184665352106094, |
|
"rewards/cosine_scaled_reward": -0.04239258915185928, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2875.666748046875, |
|
"epoch": 0.13485714285714287, |
|
"grad_norm": 0.0731302872300148, |
|
"kl": 0.69921875, |
|
"learning_rate": 6.740368101176496e-05, |
|
"loss": 0.0621, |
|
"reward": 0.16314180195331573, |
|
"reward_std": 0.5131651610136032, |
|
"rewards/cosine_scaled_reward": -0.2517624497413635, |
|
"rewards/format_reward": 0.6666666828095913, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2531.0000610351562, |
|
"epoch": 0.13542857142857143, |
|
"grad_norm": 0.07892952114343643, |
|
"kl": 0.5634765625, |
|
"learning_rate": 6.710139192768695e-05, |
|
"loss": 0.0984, |
|
"reward": 0.2294897036626935, |
|
"reward_std": 0.3668659031391144, |
|
"rewards/cosine_scaled_reward": -0.1560884891077876, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3519.3333740234375, |
|
"epoch": 0.136, |
|
"grad_norm": 0.048439182341098785, |
|
"kl": 0.55322265625, |
|
"learning_rate": 6.679851303883892e-05, |
|
"loss": 0.0406, |
|
"reward": -0.1855611428618431, |
|
"reward_std": 0.3151257839053869, |
|
"rewards/cosine_scaled_reward": -0.21778057888150215, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3434.3333740234375, |
|
"epoch": 0.13657142857142857, |
|
"grad_norm": 0.037434171885252, |
|
"kl": 0.53125, |
|
"learning_rate": 6.649505910711058e-05, |
|
"loss": 0.044, |
|
"reward": 0.21306656673550606, |
|
"reward_std": 0.5599532704800367, |
|
"rewards/cosine_scaled_reward": -0.08096674270927906, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3486.4583740234375, |
|
"epoch": 0.13714285714285715, |
|
"grad_norm": 0.06022648140788078, |
|
"kl": 0.6474609375, |
|
"learning_rate": 6.619104492241848e-05, |
|
"loss": 0.0605, |
|
"reward": -0.09005486592650414, |
|
"reward_std": 0.6559914350509644, |
|
"rewards/cosine_scaled_reward": -0.19086076319217682, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3027.166748046875, |
|
"epoch": 0.1377142857142857, |
|
"grad_norm": 0.1721280962228775, |
|
"kl": 0.5634765625, |
|
"learning_rate": 6.588648530198504e-05, |
|
"loss": 0.1883, |
|
"reward": 0.39998156833462417, |
|
"reward_std": 0.9498686045408249, |
|
"rewards/cosine_scaled_reward": 0.01249077171087265, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3415.041748046875, |
|
"epoch": 0.1382857142857143, |
|
"grad_norm": 0.053598903119564056, |
|
"kl": 0.5390625, |
|
"learning_rate": 6.558139508961655e-05, |
|
"loss": 0.0471, |
|
"reward": -0.3487744452431798, |
|
"reward_std": 0.31119491159915924, |
|
"rewards/cosine_scaled_reward": -0.29938721284270287, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3389.9166870117188, |
|
"epoch": 0.13885714285714285, |
|
"grad_norm": 0.05878981202840805, |
|
"kl": 0.58056640625, |
|
"learning_rate": 6.527578915497951e-05, |
|
"loss": 0.0654, |
|
"reward": -0.28902174066752195, |
|
"reward_std": 0.33487619645893574, |
|
"rewards/cosine_scaled_reward": -0.20701087033376098, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3095.2083435058594, |
|
"epoch": 0.13942857142857143, |
|
"grad_norm": 0.06762240827083588, |
|
"kl": 0.4462890625, |
|
"learning_rate": 6.496968239287605e-05, |
|
"loss": 0.0881, |
|
"reward": 0.176816888153553, |
|
"reward_std": 0.6285420805215836, |
|
"rewards/cosine_scaled_reward": -0.14075824059545994, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3193.4584350585938, |
|
"epoch": 0.14, |
|
"grad_norm": 0.055704742670059204, |
|
"kl": 0.67333984375, |
|
"learning_rate": 6.466308972251785e-05, |
|
"loss": 0.0928, |
|
"reward": 0.09640493569895625, |
|
"reward_std": 0.6325432863086462, |
|
"rewards/cosine_scaled_reward": -0.11846420541405678, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3142.9584350585938, |
|
"epoch": 0.14057142857142857, |
|
"grad_norm": 0.03989458829164505, |
|
"kl": 0.431884765625, |
|
"learning_rate": 6.435602608679918e-05, |
|
"loss": 0.0134, |
|
"reward": 0.4728468209505081, |
|
"reward_std": 0.7001243010163307, |
|
"rewards/cosine_scaled_reward": 0.007256772369146347, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3543.9166870117188, |
|
"epoch": 0.14114285714285715, |
|
"grad_norm": 0.05796092376112938, |
|
"kl": 0.5068359375, |
|
"learning_rate": 6.404850645156841e-05, |
|
"loss": 0.0442, |
|
"reward": -0.33712251763790846, |
|
"reward_std": 0.5263936333358288, |
|
"rewards/cosine_scaled_reward": -0.21022793650627136, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2386.4166870117188, |
|
"epoch": 0.1417142857142857, |
|
"grad_norm": 0.12801125645637512, |
|
"kl": 0.49365234375, |
|
"learning_rate": 6.374054580489874e-05, |
|
"loss": 0.2453, |
|
"reward": 0.6528540402650833, |
|
"reward_std": 0.6541059017181396, |
|
"rewards/cosine_scaled_reward": 0.03476031869649887, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3535.3751220703125, |
|
"epoch": 0.1422857142857143, |
|
"grad_norm": 0.045454155653715134, |
|
"kl": 0.61328125, |
|
"learning_rate": 6.343215915635762e-05, |
|
"loss": 0.0356, |
|
"reward": -0.27654687594622374, |
|
"reward_std": 0.4039493198506534, |
|
"rewards/cosine_scaled_reward": -0.2632734435610473, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3574.875, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.06252045184373856, |
|
"kl": 0.53173828125, |
|
"learning_rate": 6.31233615362752e-05, |
|
"loss": 0.0221, |
|
"reward": -0.2779444120824337, |
|
"reward_std": 0.6321102194488049, |
|
"rewards/cosine_scaled_reward": -0.20147221349179745, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3158.9166870117188, |
|
"epoch": 0.14342857142857143, |
|
"grad_norm": 0.092299684882164, |
|
"kl": 0.59228515625, |
|
"learning_rate": 6.281416799501188e-05, |
|
"loss": 0.0785, |
|
"reward": 0.22740934044122696, |
|
"reward_std": 0.7193168550729752, |
|
"rewards/cosine_scaled_reward": -0.09462865814566612, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3144.5000610351562, |
|
"epoch": 0.144, |
|
"grad_norm": 0.07481088489294052, |
|
"kl": 0.62548828125, |
|
"learning_rate": 6.250459360222461e-05, |
|
"loss": 0.0965, |
|
"reward": -0.2660303530283272, |
|
"reward_std": 0.5256764851510525, |
|
"rewards/cosine_scaled_reward": -0.3205151781439781, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3226.5416870117188, |
|
"epoch": 0.14457142857142857, |
|
"grad_norm": 0.1071605458855629, |
|
"kl": 0.5869140625, |
|
"learning_rate": 6.219465344613258e-05, |
|
"loss": 0.0028, |
|
"reward": -0.21571965515613556, |
|
"reward_std": 0.7407341748476028, |
|
"rewards/cosine_scaled_reward": -0.2745265010744333, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3164.25, |
|
"epoch": 0.14514285714285713, |
|
"grad_norm": 0.07004442811012268, |
|
"kl": 0.6025390625, |
|
"learning_rate": 6.188436263278172e-05, |
|
"loss": 0.0488, |
|
"reward": -0.4852742440998554, |
|
"reward_std": 0.4384588450193405, |
|
"rewards/cosine_scaled_reward": -0.38847045600414276, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3331.75, |
|
"epoch": 0.1457142857142857, |
|
"grad_norm": 0.07318566739559174, |
|
"kl": 0.70068359375, |
|
"learning_rate": 6.157373628530852e-05, |
|
"loss": 0.0552, |
|
"reward": -0.26235504634678364, |
|
"reward_std": 0.5362003445625305, |
|
"rewards/cosine_scaled_reward": -0.23534419387578964, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3578.0833740234375, |
|
"epoch": 0.1462857142857143, |
|
"grad_norm": 0.08398139476776123, |
|
"kl": 0.5537109375, |
|
"learning_rate": 6.126278954320295e-05, |
|
"loss": 0.0253, |
|
"reward": -0.49905257299542427, |
|
"reward_std": 0.4306683763861656, |
|
"rewards/cosine_scaled_reward": -0.31202628277242184, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3329.2500610351562, |
|
"epoch": 0.14685714285714285, |
|
"grad_norm": 0.06330101937055588, |
|
"kl": 0.54541015625, |
|
"learning_rate": 6.095153756157051e-05, |
|
"loss": 0.0436, |
|
"reward": -0.006688140332698822, |
|
"reward_std": 0.640820337459445, |
|
"rewards/cosine_scaled_reward": -0.10751075111329556, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2738.2083740234375, |
|
"epoch": 0.14742857142857144, |
|
"grad_norm": 0.08145475387573242, |
|
"kl": 0.60498046875, |
|
"learning_rate": 6.06399955103937e-05, |
|
"loss": 0.2035, |
|
"reward": -0.2508790194988251, |
|
"reward_std": 0.4752666652202606, |
|
"rewards/cosine_scaled_reward": -0.2504395004361868, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3575.5416870117188, |
|
"epoch": 0.148, |
|
"grad_norm": 0.0701456069946289, |
|
"kl": 0.49853515625, |
|
"learning_rate": 6.032817857379256e-05, |
|
"loss": 0.023, |
|
"reward": -0.540546678006649, |
|
"reward_std": 0.3376801423728466, |
|
"rewards/cosine_scaled_reward": -0.3327733352780342, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3471.7083740234375, |
|
"epoch": 0.14857142857142858, |
|
"grad_norm": 0.06758905947208405, |
|
"kl": 0.5185546875, |
|
"learning_rate": 6.001610194928464e-05, |
|
"loss": 0.0384, |
|
"reward": -0.6148700146004558, |
|
"reward_std": 0.25194124691188335, |
|
"rewards/cosine_scaled_reward": -0.3491016775369644, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.14914285714285713, |
|
"grad_norm": 0.10130701959133148, |
|
"kl": 0.428466796875, |
|
"learning_rate": 5.970378084704441e-05, |
|
"loss": 0.0172, |
|
"reward": -0.6753373667597771, |
|
"reward_std": 0.3040266986936331, |
|
"rewards/cosine_scaled_reward": -0.37933534011244774, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3520.8333740234375, |
|
"epoch": 0.14971428571428572, |
|
"grad_norm": 0.05124128237366676, |
|
"kl": 0.4736328125, |
|
"learning_rate": 5.9391230489161734e-05, |
|
"loss": 0.0285, |
|
"reward": -0.4647176805883646, |
|
"reward_std": 0.43572363816201687, |
|
"rewards/cosine_scaled_reward": -0.25319216772913933, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3579.2916870117188, |
|
"epoch": 0.15028571428571427, |
|
"grad_norm": 0.07916761189699173, |
|
"kl": 0.34814453125, |
|
"learning_rate": 5.907846610890012e-05, |
|
"loss": 0.0166, |
|
"reward": -0.4248216481646523, |
|
"reward_std": 0.35384376160800457, |
|
"rewards/cosine_scaled_reward": -0.23324416455579922, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.15085714285714286, |
|
"grad_norm": 0.04631157964468002, |
|
"kl": 0.36572265625, |
|
"learning_rate": 5.876550294995421e-05, |
|
"loss": 0.0146, |
|
"reward": -0.2554018050432205, |
|
"reward_std": 0.4412471568211913, |
|
"rewards/cosine_scaled_reward": -0.16936755925416946, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3527.416748046875, |
|
"epoch": 0.15142857142857144, |
|
"grad_norm": 0.04877844080328941, |
|
"kl": 0.343017578125, |
|
"learning_rate": 5.8452356265706845e-05, |
|
"loss": 0.0137, |
|
"reward": -0.30376535654067993, |
|
"reward_std": 0.33788828179240227, |
|
"rewards/cosine_scaled_reward": -0.25604934617877007, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3505.2916870117188, |
|
"epoch": 0.152, |
|
"grad_norm": 0.05829893797636032, |
|
"kl": 0.3427734375, |
|
"learning_rate": 5.813904131848564e-05, |
|
"loss": 0.0447, |
|
"reward": -0.27650847285985947, |
|
"reward_std": 0.3842029310762882, |
|
"rewards/cosine_scaled_reward": -0.17992090061306953, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2713.416748046875, |
|
"epoch": 0.15257142857142858, |
|
"grad_norm": 0.06513495743274689, |
|
"kl": 0.4794921875, |
|
"learning_rate": 5.782557337881911e-05, |
|
"loss": 0.0555, |
|
"reward": 0.09612545743584633, |
|
"reward_std": 0.7547842487692833, |
|
"rewards/cosine_scaled_reward": -0.22277061268687248, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3205.4583740234375, |
|
"epoch": 0.15314285714285714, |
|
"grad_norm": 0.053954530507326126, |
|
"kl": 0.392578125, |
|
"learning_rate": 5.751196772469237e-05, |
|
"loss": 0.0786, |
|
"reward": -0.2341192662715912, |
|
"reward_std": 0.6096795275807381, |
|
"rewards/cosine_scaled_reward": -0.2628929764032364, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3527.5, |
|
"epoch": 0.15371428571428572, |
|
"grad_norm": 0.09402066469192505, |
|
"kl": 0.4326171875, |
|
"learning_rate": 5.719823964080261e-05, |
|
"loss": 0.018, |
|
"reward": -0.7792681828141212, |
|
"reward_std": 0.16622583265416324, |
|
"rewards/cosine_scaled_reward": -0.4313007518649101, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2934.125, |
|
"epoch": 0.15428571428571428, |
|
"grad_norm": 0.08574939519166946, |
|
"kl": 0.3681640625, |
|
"learning_rate": 5.688440441781399e-05, |
|
"loss": -0.0006, |
|
"reward": 0.4598322659730911, |
|
"reward_std": 0.5430615171790123, |
|
"rewards/cosine_scaled_reward": 0.02158279437571764, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3205.2916870117188, |
|
"epoch": 0.15485714285714286, |
|
"grad_norm": 0.09958425909280777, |
|
"kl": 0.35693359375, |
|
"learning_rate": 5.657047735161256e-05, |
|
"loss": 0.1366, |
|
"reward": -0.3448881134390831, |
|
"reward_std": 0.23123590275645256, |
|
"rewards/cosine_scaled_reward": -0.25577738881111145, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2611.3334197998047, |
|
"epoch": 0.15542857142857142, |
|
"grad_norm": 0.03576628491282463, |
|
"kl": 0.36773681640625, |
|
"learning_rate": 5.6256473742560614e-05, |
|
"loss": 0.1024, |
|
"reward": 0.011561892926692963, |
|
"reward_std": 0.3712347708642483, |
|
"rewards/cosine_scaled_reward": -0.26505238376557827, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3323.2500610351562, |
|
"epoch": 0.156, |
|
"grad_norm": 0.06556365638971329, |
|
"kl": 0.4912109375, |
|
"learning_rate": 5.594240889475107e-05, |
|
"loss": 0.0166, |
|
"reward": -0.4284048527479172, |
|
"reward_std": 0.520957512781024, |
|
"rewards/cosine_scaled_reward": -0.25586907658725977, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.15657142857142858, |
|
"grad_norm": 0.03841016814112663, |
|
"kl": 0.299072265625, |
|
"learning_rate": 5.5628298115261545e-05, |
|
"loss": 0.012, |
|
"reward": -0.7124607469886541, |
|
"reward_std": 0.3880233308300376, |
|
"rewards/cosine_scaled_reward": -0.37706371024250984, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3276.7916870117188, |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.07038947939872742, |
|
"kl": 0.271728515625, |
|
"learning_rate": 5.5314156713408275e-05, |
|
"loss": -0.0357, |
|
"reward": 0.1667029708623886, |
|
"reward_std": 0.4187759216874838, |
|
"rewards/cosine_scaled_reward": -0.041648514568805695, |
|
"rewards/format_reward": 0.25, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3108.1250610351562, |
|
"epoch": 0.15771428571428572, |
|
"grad_norm": 0.046657588332891464, |
|
"kl": 0.3857421875, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.0822, |
|
"reward": -0.2098342329263687, |
|
"reward_std": 0.7249854430556297, |
|
"rewards/cosine_scaled_reward": -0.2299171146005392, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3534.1666870117188, |
|
"epoch": 0.15828571428571428, |
|
"grad_norm": 0.045905984938144684, |
|
"kl": 0.3125, |
|
"learning_rate": 5.468584328659173e-05, |
|
"loss": 0.0342, |
|
"reward": -0.498178094625473, |
|
"reward_std": 0.19802542310208082, |
|
"rewards/cosine_scaled_reward": -0.2490890473127365, |
|
"rewards/format_reward": 0.0, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3167.0416870117188, |
|
"epoch": 0.15885714285714286, |
|
"grad_norm": 0.10243193805217743, |
|
"kl": 0.2861328125, |
|
"learning_rate": 5.4371701884738466e-05, |
|
"loss": 0.1193, |
|
"reward": -0.42556126043200493, |
|
"reward_std": 0.3831705767661333, |
|
"rewards/cosine_scaled_reward": -0.2961139716207981, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3515.5833740234375, |
|
"epoch": 0.15942857142857142, |
|
"grad_norm": 0.046917758882045746, |
|
"kl": 0.27294921875, |
|
"learning_rate": 5.405759110524894e-05, |
|
"loss": 0.0402, |
|
"reward": -0.1426243856549263, |
|
"reward_std": 0.6854961533099413, |
|
"rewards/cosine_scaled_reward": -0.1754788588732481, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3225.9583740234375, |
|
"epoch": 0.16, |
|
"grad_norm": 0.05153701454401016, |
|
"kl": 0.27978515625, |
|
"learning_rate": 5.374352625743941e-05, |
|
"loss": -0.0109, |
|
"reward": -0.4667184352874756, |
|
"reward_std": 0.24931692145764828, |
|
"rewards/cosine_scaled_reward": -0.3583592250943184, |
|
"rewards/format_reward": 0.25, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3036.5834350585938, |
|
"epoch": 0.16057142857142856, |
|
"grad_norm": 0.06122641637921333, |
|
"kl": 0.36865234375, |
|
"learning_rate": 5.342952264838747e-05, |
|
"loss": 0.0865, |
|
"reward": 0.47959111630916595, |
|
"reward_std": 0.37914127111434937, |
|
"rewards/cosine_scaled_reward": 0.010628907009959221, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3430.25, |
|
"epoch": 0.16114285714285714, |
|
"grad_norm": 0.05524813011288643, |
|
"kl": 0.24755859375, |
|
"learning_rate": 5.311559558218603e-05, |
|
"loss": 0.035, |
|
"reward": 0.04076346941292286, |
|
"reward_std": 0.4443469550460577, |
|
"rewards/cosine_scaled_reward": -0.08378491457551718, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3506.1250610351562, |
|
"epoch": 0.16171428571428573, |
|
"grad_norm": 0.03167405351996422, |
|
"kl": 0.360107421875, |
|
"learning_rate": 5.28017603591974e-05, |
|
"loss": 0.047, |
|
"reward": -0.20051036775112152, |
|
"reward_std": 0.7524923011660576, |
|
"rewards/cosine_scaled_reward": -0.24608852714300156, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2803.375, |
|
"epoch": 0.16228571428571428, |
|
"grad_norm": 0.0590810589492321, |
|
"kl": 0.223876953125, |
|
"learning_rate": 5.248803227530763e-05, |
|
"loss": 0.1235, |
|
"reward": -0.12046916782855988, |
|
"reward_std": 0.4706810973584652, |
|
"rewards/cosine_scaled_reward": -0.18523459136486053, |
|
"rewards/format_reward": 0.25, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3491.4166870117188, |
|
"epoch": 0.16285714285714287, |
|
"grad_norm": 0.09945899248123169, |
|
"kl": 0.260009765625, |
|
"learning_rate": 5.2174426621180906e-05, |
|
"loss": 0.0514, |
|
"reward": -0.5829556360840797, |
|
"reward_std": 0.20706506725400686, |
|
"rewards/cosine_scaled_reward": -0.33314448967576027, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2982.8750610351562, |
|
"epoch": 0.16342857142857142, |
|
"grad_norm": 0.18956300616264343, |
|
"kl": 0.47900390625, |
|
"learning_rate": 5.186095868151436e-05, |
|
"loss": 0.0642, |
|
"reward": 0.18999171257019043, |
|
"reward_std": 0.8487022221088409, |
|
"rewards/cosine_scaled_reward": -0.11333749070763588, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2949.4584045410156, |
|
"epoch": 0.164, |
|
"grad_norm": 0.0727335512638092, |
|
"kl": 0.43017578125, |
|
"learning_rate": 5.154764373429316e-05, |
|
"loss": 0.0658, |
|
"reward": 0.3310265392065048, |
|
"reward_std": 0.8132697474211454, |
|
"rewards/cosine_scaled_reward": -0.021986715495586395, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.16457142857142856, |
|
"grad_norm": 0.05997053533792496, |
|
"kl": 0.27587890625, |
|
"learning_rate": 5.1234497050045814e-05, |
|
"loss": 0.011, |
|
"reward": -0.5673756748437881, |
|
"reward_std": 0.2968660295009613, |
|
"rewards/cosine_scaled_reward": -0.3045211657881737, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.16514285714285715, |
|
"grad_norm": 0.06633277982473373, |
|
"kl": 0.30517578125, |
|
"learning_rate": 5.0921533891099905e-05, |
|
"loss": 0.0122, |
|
"reward": -0.6890946179628372, |
|
"reward_std": 0.15950028970837593, |
|
"rewards/cosine_scaled_reward": -0.3445473089814186, |
|
"rewards/format_reward": 0.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3561.7916870117188, |
|
"epoch": 0.1657142857142857, |
|
"grad_norm": 0.040217384696006775, |
|
"kl": 0.274169921875, |
|
"learning_rate": 5.0608769510838284e-05, |
|
"loss": 0.0149, |
|
"reward": -0.15157588943839073, |
|
"reward_std": 0.3089019572362304, |
|
"rewards/cosine_scaled_reward": -0.1382879503071308, |
|
"rewards/format_reward": 0.125, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1662857142857143, |
|
"grad_norm": 0.07168910652399063, |
|
"kl": 0.314453125, |
|
"learning_rate": 5.0296219152955604e-05, |
|
"loss": 0.0126, |
|
"reward": -0.4608482411131263, |
|
"reward_std": 0.1107195196673274, |
|
"rewards/cosine_scaled_reward": -0.23042412823997438, |
|
"rewards/format_reward": 0.0, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3516.125, |
|
"epoch": 0.16685714285714287, |
|
"grad_norm": 0.04790537431836128, |
|
"kl": 0.427734375, |
|
"learning_rate": 4.998389805071536e-05, |
|
"loss": 0.0372, |
|
"reward": -0.40456270426511765, |
|
"reward_std": 0.47755980491638184, |
|
"rewards/cosine_scaled_reward": -0.2647813465446234, |
|
"rewards/format_reward": 0.125, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3265.7500610351562, |
|
"epoch": 0.16742857142857143, |
|
"grad_norm": 0.050319403409957886, |
|
"kl": 0.43408203125, |
|
"learning_rate": 4.9671821426207455e-05, |
|
"loss": 0.0514, |
|
"reward": 0.42125577852129936, |
|
"reward_std": 0.8351590689271688, |
|
"rewards/cosine_scaled_reward": 0.04396123066544533, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3466.5833740234375, |
|
"epoch": 0.168, |
|
"grad_norm": 0.05146944150328636, |
|
"kl": 0.3720703125, |
|
"learning_rate": 4.936000448960631e-05, |
|
"loss": 0.029, |
|
"reward": -0.271098967641592, |
|
"reward_std": 0.37742302753031254, |
|
"rewards/cosine_scaled_reward": -0.19804948195815086, |
|
"rewards/format_reward": 0.125, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3577.125, |
|
"epoch": 0.16857142857142857, |
|
"grad_norm": 0.06401149928569794, |
|
"kl": 0.409423828125, |
|
"learning_rate": 4.904846243842949e-05, |
|
"loss": 0.0203, |
|
"reward": -0.46894958056509495, |
|
"reward_std": 0.1929320227354765, |
|
"rewards/cosine_scaled_reward": -0.2553081316873431, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3085.9166870117188, |
|
"epoch": 0.16914285714285715, |
|
"grad_norm": 3.077998161315918, |
|
"kl": 10.78662109375, |
|
"learning_rate": 4.873721045679707e-05, |
|
"loss": 0.2368, |
|
"reward": -0.45518723130226135, |
|
"reward_std": 0.34384622564539313, |
|
"rewards/cosine_scaled_reward": -0.3109269514679909, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3572.3333740234375, |
|
"epoch": 0.1697142857142857, |
|
"grad_norm": 0.029030799865722656, |
|
"kl": 0.24560546875, |
|
"learning_rate": 4.842626371469149e-05, |
|
"loss": 0.0127, |
|
"reward": -0.0029055774211883545, |
|
"reward_std": 0.9067392088472843, |
|
"rewards/cosine_scaled_reward": -0.14728612639009953, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3539.1666870117188, |
|
"epoch": 0.1702857142857143, |
|
"grad_norm": 0.03600252792239189, |
|
"kl": 0.282470703125, |
|
"learning_rate": 4.811563736721829e-05, |
|
"loss": 0.0161, |
|
"reward": -0.2380085289478302, |
|
"reward_std": 0.7847420740872622, |
|
"rewards/cosine_scaled_reward": -0.2231709435582161, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.75, |
|
"epoch": 0.17085714285714285, |
|
"grad_norm": 0.04998674616217613, |
|
"kl": 0.34716796875, |
|
"learning_rate": 4.780534655386744e-05, |
|
"loss": 0.0751, |
|
"reward": -0.060495853424072266, |
|
"reward_std": 0.3135654963552952, |
|
"rewards/cosine_scaled_reward": -0.09274792857468128, |
|
"rewards/format_reward": 0.125, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3197.041748046875, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.04269924387335777, |
|
"kl": 0.295654296875, |
|
"learning_rate": 4.74954063977754e-05, |
|
"loss": 0.0324, |
|
"reward": -0.26061324402689934, |
|
"reward_std": 0.29013217613101006, |
|
"rewards/cosine_scaled_reward": -0.3178066350519657, |
|
"rewards/format_reward": 0.375, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3308.5, |
|
"epoch": 0.172, |
|
"grad_norm": 0.061065103858709335, |
|
"kl": 0.324951171875, |
|
"learning_rate": 4.718583200498814e-05, |
|
"loss": 0.0757, |
|
"reward": -0.6140761077404022, |
|
"reward_std": 0.3115503266453743, |
|
"rewards/cosine_scaled_reward": -0.3903713934123516, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2942.25, |
|
"epoch": 0.17257142857142857, |
|
"grad_norm": 0.0670550987124443, |
|
"kl": 0.328369140625, |
|
"learning_rate": 4.687663846372481e-05, |
|
"loss": 0.0489, |
|
"reward": -0.023671671748161316, |
|
"reward_std": 0.2200901247560978, |
|
"rewards/cosine_scaled_reward": -0.15766918659210205, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3216.2916870117188, |
|
"epoch": 0.17314285714285715, |
|
"grad_norm": 0.12610213458538055, |
|
"kl": 0.3369140625, |
|
"learning_rate": 4.6567840843642384e-05, |
|
"loss": 0.0942, |
|
"reward": 0.028468750417232513, |
|
"reward_std": 0.4689778573811054, |
|
"rewards/cosine_scaled_reward": -0.08993229269981384, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3306.7500610351562, |
|
"epoch": 0.1737142857142857, |
|
"grad_norm": 0.04432929679751396, |
|
"kl": 0.35595703125, |
|
"learning_rate": 4.6259454195101274e-05, |
|
"loss": 0.0864, |
|
"reward": -0.35086609423160553, |
|
"reward_std": 0.286039125174284, |
|
"rewards/cosine_scaled_reward": -0.2795997243374586, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1742857142857143, |
|
"grad_norm": 0.04625676944851875, |
|
"kl": 0.45068359375, |
|
"learning_rate": 4.5951493548431603e-05, |
|
"loss": 0.018, |
|
"reward": -0.6459367321804166, |
|
"reward_std": 0.4608680563978851, |
|
"rewards/cosine_scaled_reward": -0.36463503539562225, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2516.0833435058594, |
|
"epoch": 0.17485714285714285, |
|
"grad_norm": 0.11857768148183823, |
|
"kl": 0.3681640625, |
|
"learning_rate": 4.564397391320084e-05, |
|
"loss": 0.0883, |
|
"reward": 0.5657532401382923, |
|
"reward_std": 0.4573374604806304, |
|
"rewards/cosine_scaled_reward": 0.032876621931791306, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.17542857142857143, |
|
"grad_norm": 0.07965458929538727, |
|
"kl": 0.306884765625, |
|
"learning_rate": 4.5336910277482156e-05, |
|
"loss": 0.0123, |
|
"reward": -0.8762749880552292, |
|
"reward_std": 0.08273854246363044, |
|
"rewards/cosine_scaled_reward": -0.4381374940276146, |
|
"rewards/format_reward": 0.0, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2825.1666870117188, |
|
"epoch": 0.176, |
|
"grad_norm": 0.0597933791577816, |
|
"kl": 0.296142578125, |
|
"learning_rate": 4.503031760712397e-05, |
|
"loss": -0.0245, |
|
"reward": 0.3207015171647072, |
|
"reward_std": 0.7405253425240517, |
|
"rewards/cosine_scaled_reward": -0.006315924227237701, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3479.8333740234375, |
|
"epoch": 0.17657142857142857, |
|
"grad_norm": 0.05388667806982994, |
|
"kl": 0.378173828125, |
|
"learning_rate": 4.47242108450205e-05, |
|
"loss": 0.0445, |
|
"reward": -0.20779564417898655, |
|
"reward_std": 0.20239176135510206, |
|
"rewards/cosine_scaled_reward": -0.16639782628044486, |
|
"rewards/format_reward": 0.125, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.17714285714285713, |
|
"grad_norm": 0.0844094380736351, |
|
"kl": 0.225830078125, |
|
"learning_rate": 4.4418604910383456e-05, |
|
"loss": 0.009, |
|
"reward": -0.8865186870098114, |
|
"reward_std": 0.09305100329220295, |
|
"rewards/cosine_scaled_reward": -0.4432593658566475, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3314.3333740234375, |
|
"epoch": 0.1777142857142857, |
|
"grad_norm": 0.05528863146901131, |
|
"kl": 0.37060546875, |
|
"learning_rate": 4.411351469801496e-05, |
|
"loss": 0.0433, |
|
"reward": -0.24768365547060966, |
|
"reward_std": 0.7055571936070919, |
|
"rewards/cosine_scaled_reward": -0.24884184449911118, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3523.1666870117188, |
|
"epoch": 0.1782857142857143, |
|
"grad_norm": 0.04280995950102806, |
|
"kl": 0.3310546875, |
|
"learning_rate": 4.380895507758155e-05, |
|
"loss": 0.0242, |
|
"reward": -0.381958182901144, |
|
"reward_std": 0.25565229170024395, |
|
"rewards/cosine_scaled_reward": -0.2326457593590021, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2733.875, |
|
"epoch": 0.17885714285714285, |
|
"grad_norm": 0.06423232704401016, |
|
"kl": 0.25390625, |
|
"learning_rate": 4.3504940892889434e-05, |
|
"loss": -0.0337, |
|
"reward": 0.036944784224033356, |
|
"reward_std": 0.9110874142497778, |
|
"rewards/cosine_scaled_reward": -0.14819425716996193, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3313.0416870117188, |
|
"epoch": 0.17942857142857144, |
|
"grad_norm": 0.07672199606895447, |
|
"kl": 0.4697265625, |
|
"learning_rate": 4.3201486961161094e-05, |
|
"loss": 0.0886, |
|
"reward": -0.19889018312096596, |
|
"reward_std": 0.8323519490659237, |
|
"rewards/cosine_scaled_reward": -0.22444510459899902, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3345.0833740234375, |
|
"epoch": 0.18, |
|
"grad_norm": 0.08122944086790085, |
|
"kl": 0.54150390625, |
|
"learning_rate": 4.289860807231305e-05, |
|
"loss": 0.0666, |
|
"reward": -0.32951921597123146, |
|
"reward_std": 0.3918669559061527, |
|
"rewards/cosine_scaled_reward": -0.20642626285552979, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.18057142857142858, |
|
"grad_norm": 0.04830171540379524, |
|
"kl": 0.234619140625, |
|
"learning_rate": 4.259631898823504e-05, |
|
"loss": 0.0094, |
|
"reward": -0.5842305850237608, |
|
"reward_std": 0.18226368352770805, |
|
"rewards/cosine_scaled_reward": -0.292115299962461, |
|
"rewards/format_reward": 0.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3418.375, |
|
"epoch": 0.18114285714285713, |
|
"grad_norm": 0.07951097935438156, |
|
"kl": 0.27880859375, |
|
"learning_rate": 4.229463444207056e-05, |
|
"loss": 0.0673, |
|
"reward": 1.2218952178955078e-06, |
|
"reward_std": 0.9312711171805859, |
|
"rewards/cosine_scaled_reward": -0.10416605323553085, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3141.750030517578, |
|
"epoch": 0.18171428571428572, |
|
"grad_norm": 0.05983598157763481, |
|
"kl": 0.32373046875, |
|
"learning_rate": 4.1993569137498776e-05, |
|
"loss": 0.1124, |
|
"reward": -0.1722477674484253, |
|
"reward_std": 0.665809502825141, |
|
"rewards/cosine_scaled_reward": -0.23195721581578255, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3271.9584350585938, |
|
"epoch": 0.18228571428571427, |
|
"grad_norm": 0.05965941771864891, |
|
"kl": 0.322265625, |
|
"learning_rate": 4.1693137748017916e-05, |
|
"loss": 0.0532, |
|
"reward": 0.3015955649316311, |
|
"reward_std": 0.40980809181928635, |
|
"rewards/cosine_scaled_reward": -0.0783689022064209, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3099.1666870117188, |
|
"epoch": 0.18285714285714286, |
|
"grad_norm": 0.08215689659118652, |
|
"kl": 0.227294921875, |
|
"learning_rate": 4.1393354916230006e-05, |
|
"loss": -0.0244, |
|
"reward": 0.0645943135023117, |
|
"reward_std": 0.3934327196329832, |
|
"rewards/cosine_scaled_reward": -0.09270285069942474, |
|
"rewards/format_reward": 0.25, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3561.2083740234375, |
|
"epoch": 0.18342857142857144, |
|
"grad_norm": 0.0447857603430748, |
|
"kl": 0.33349609375, |
|
"learning_rate": 4.109423525312738e-05, |
|
"loss": 0.0264, |
|
"reward": -0.6538648195564747, |
|
"reward_std": 0.1470047291368246, |
|
"rewards/cosine_scaled_reward": -0.32693241722881794, |
|
"rewards/format_reward": 0.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.184, |
|
"grad_norm": 0.0734485313296318, |
|
"kl": 0.2578125, |
|
"learning_rate": 4.079579333738039e-05, |
|
"loss": 0.0103, |
|
"reward": -0.3972032852470875, |
|
"reward_std": 0.285872345790267, |
|
"rewards/cosine_scaled_reward": -0.21943498216569424, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3581.2500610351562, |
|
"epoch": 0.18457142857142858, |
|
"grad_norm": 0.03482038900256157, |
|
"kl": 0.299072265625, |
|
"learning_rate": 4.049804371462701e-05, |
|
"loss": 0.0135, |
|
"reward": -0.3774775490164757, |
|
"reward_std": 0.3965078853070736, |
|
"rewards/cosine_scaled_reward": -0.25123877450823784, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3514.4166870117188, |
|
"epoch": 0.18514285714285714, |
|
"grad_norm": 0.06719670444726944, |
|
"kl": 0.283935546875, |
|
"learning_rate": 4.0201000896763766e-05, |
|
"loss": 0.0481, |
|
"reward": -0.4587059337645769, |
|
"reward_std": 0.5845797648653388, |
|
"rewards/cosine_scaled_reward": -0.3126862980425358, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2928.7916717529297, |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.03646574914455414, |
|
"kl": 0.281982421875, |
|
"learning_rate": 3.9904679361238525e-05, |
|
"loss": 0.0589, |
|
"reward": -0.3672609478235245, |
|
"reward_std": 0.27671839017421007, |
|
"rewards/cosine_scaled_reward": -0.3086304762400687, |
|
"rewards/format_reward": 0.25, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2766.958335876465, |
|
"epoch": 0.18628571428571428, |
|
"grad_norm": 0.052393004298210144, |
|
"kl": 0.26123046875, |
|
"learning_rate": 3.960909355034491e-05, |
|
"loss": 0.0333, |
|
"reward": 0.0963091105222702, |
|
"reward_std": 0.6041255034506321, |
|
"rewards/cosine_scaled_reward": -0.1393454596400261, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3330.875, |
|
"epoch": 0.18685714285714286, |
|
"grad_norm": 0.04328332468867302, |
|
"kl": 0.28857421875, |
|
"learning_rate": 3.9314257870518325e-05, |
|
"loss": 0.0722, |
|
"reward": -0.4075555991148576, |
|
"reward_std": 0.20874548598658293, |
|
"rewards/cosine_scaled_reward": -0.24544446932850406, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2778.7916870117188, |
|
"epoch": 0.18742857142857142, |
|
"grad_norm": 0.045754820108413696, |
|
"kl": 0.18963623046875, |
|
"learning_rate": 3.902018669163384e-05, |
|
"loss": 0.0114, |
|
"reward": -0.1832116525620222, |
|
"reward_std": 0.5292581329122186, |
|
"rewards/cosine_scaled_reward": -0.27910582162439823, |
|
"rewards/format_reward": 0.375, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3457.6666870117188, |
|
"epoch": 0.188, |
|
"grad_norm": 0.0493839792907238, |
|
"kl": 0.36279296875, |
|
"learning_rate": 3.872689434630585e-05, |
|
"loss": 0.0534, |
|
"reward": -0.25419315695762634, |
|
"reward_std": 0.5991159714758396, |
|
"rewards/cosine_scaled_reward": -0.23126322403550148, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3402.1666870117188, |
|
"epoch": 0.18857142857142858, |
|
"grad_norm": 0.10224326699972153, |
|
"kl": 0.270751953125, |
|
"learning_rate": 3.843439512918949e-05, |
|
"loss": 0.0955, |
|
"reward": -0.6764328852295876, |
|
"reward_std": 0.2557358928024769, |
|
"rewards/cosine_scaled_reward": -0.3590497747063637, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2853.291717529297, |
|
"epoch": 0.18914285714285714, |
|
"grad_norm": 0.06347351521253586, |
|
"kl": 0.278076171875, |
|
"learning_rate": 3.814270329628396e-05, |
|
"loss": 0.1003, |
|
"reward": 0.17456289008259773, |
|
"reward_std": 0.23762040957808495, |
|
"rewards/cosine_scaled_reward": -0.07938522100448608, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3217.2083740234375, |
|
"epoch": 0.18971428571428572, |
|
"grad_norm": 0.08903349936008453, |
|
"kl": 0.453369140625, |
|
"learning_rate": 3.785183306423768e-05, |
|
"loss": 0.0416, |
|
"reward": -0.23864453844726086, |
|
"reward_std": 0.47026437893509865, |
|
"rewards/cosine_scaled_reward": -0.2859889483079314, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3260.7083740234375, |
|
"epoch": 0.19028571428571428, |
|
"grad_norm": 0.07358856499195099, |
|
"kl": 0.28271484375, |
|
"learning_rate": 3.756179860965538e-05, |
|
"loss": 0.0064, |
|
"reward": -0.38410256803035736, |
|
"reward_std": 0.468995469622314, |
|
"rewards/cosine_scaled_reward": -0.3170512933284044, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.19085714285714286, |
|
"grad_norm": 0.05134163424372673, |
|
"kl": 0.31591796875, |
|
"learning_rate": 3.7272614068407205e-05, |
|
"loss": 0.0127, |
|
"reward": -0.6644355654716492, |
|
"reward_std": 0.20398546569049358, |
|
"rewards/cosine_scaled_reward": -0.3322177827358246, |
|
"rewards/format_reward": 0.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2896.4583740234375, |
|
"epoch": 0.19142857142857142, |
|
"grad_norm": 0.15725326538085938, |
|
"kl": 0.28125, |
|
"learning_rate": 3.698429353493974e-05, |
|
"loss": 0.193, |
|
"reward": -0.16589602641761303, |
|
"reward_std": 0.39419085811823606, |
|
"rewards/cosine_scaled_reward": -0.2496146857738495, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3211.2916870117188, |
|
"epoch": 0.192, |
|
"grad_norm": 0.04808633401989937, |
|
"kl": 0.306396484375, |
|
"learning_rate": 3.6696851061589e-05, |
|
"loss": 0.0454, |
|
"reward": 0.04493848606944084, |
|
"reward_std": 0.9861778020858765, |
|
"rewards/cosine_scaled_reward": -0.12336408998817205, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2941.4166717529297, |
|
"epoch": 0.19257142857142856, |
|
"grad_norm": 0.1766992211341858, |
|
"kl": 0.259521484375, |
|
"learning_rate": 3.6410300657895626e-05, |
|
"loss": -0.0732, |
|
"reward": 0.014326110482215881, |
|
"reward_std": 0.17792627471499145, |
|
"rewards/cosine_scaled_reward": -0.11783694475889206, |
|
"rewards/format_reward": 0.25, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2701.75, |
|
"epoch": 0.19314285714285714, |
|
"grad_norm": 0.175528883934021, |
|
"kl": 0.356689453125, |
|
"learning_rate": 3.6124656289922034e-05, |
|
"loss": 0.1389, |
|
"reward": 0.20876749278977513, |
|
"reward_std": 0.7206097654998302, |
|
"rewards/cosine_scaled_reward": -0.08311627432703972, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3575.0, |
|
"epoch": 0.19371428571428573, |
|
"grad_norm": 0.036193348467350006, |
|
"kl": 0.239501953125, |
|
"learning_rate": 3.583993187957173e-05, |
|
"loss": 0.0088, |
|
"reward": -0.37282892875373363, |
|
"reward_std": 0.49462850391864777, |
|
"rewards/cosine_scaled_reward": -0.2280811471864581, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2417.3333587646484, |
|
"epoch": 0.19428571428571428, |
|
"grad_norm": 0.13114678859710693, |
|
"kl": 0.198974609375, |
|
"learning_rate": 3.5556141303910795e-05, |
|
"loss": 0.158, |
|
"reward": 0.1652292013168335, |
|
"reward_std": 0.36829282343387604, |
|
"rewards/cosine_scaled_reward": -0.16738539934158325, |
|
"rewards/format_reward": 0.5, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2132.500030517578, |
|
"epoch": 0.19485714285714287, |
|
"grad_norm": 0.14688146114349365, |
|
"kl": 0.278076171875, |
|
"learning_rate": 3.5273298394491515e-05, |
|
"loss": 0.2139, |
|
"reward": 0.1563252117484808, |
|
"reward_std": 0.2888243719935417, |
|
"rewards/cosine_scaled_reward": -0.23433741927146912, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2552.0416870117188, |
|
"epoch": 0.19542857142857142, |
|
"grad_norm": 0.08089858293533325, |
|
"kl": 0.278076171875, |
|
"learning_rate": 3.499141693667828e-05, |
|
"loss": -0.0431, |
|
"reward": 0.3262595981359482, |
|
"reward_std": 0.29629753855988383, |
|
"rewards/cosine_scaled_reward": -0.02437019906938076, |
|
"rewards/format_reward": 0.375, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.000045776367, |
|
"epoch": 0.196, |
|
"grad_norm": 0.08294232934713364, |
|
"kl": 0.3438720703125, |
|
"learning_rate": 3.4710510668975624e-05, |
|
"loss": -0.009, |
|
"reward": 0.140800341963768, |
|
"reward_std": 0.30971661023795605, |
|
"rewards/cosine_scaled_reward": -0.0754331611096859, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3287.0000610351562, |
|
"epoch": 0.19657142857142856, |
|
"grad_norm": 0.07776912301778793, |
|
"kl": 0.28564453125, |
|
"learning_rate": 3.443059328235878e-05, |
|
"loss": 0.0645, |
|
"reward": -0.010377008467912674, |
|
"reward_std": 0.8143371604382992, |
|
"rewards/cosine_scaled_reward": -0.13018852844834328, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2312.4166870117188, |
|
"epoch": 0.19714285714285715, |
|
"grad_norm": 0.5134606957435608, |
|
"kl": 0.472900390625, |
|
"learning_rate": 3.415167841960624e-05, |
|
"loss": -0.0386, |
|
"reward": 0.507152209058404, |
|
"reward_std": 0.5029881596565247, |
|
"rewards/cosine_scaled_reward": -0.01725723221898079, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1977142857142857, |
|
"grad_norm": 0.032926544547080994, |
|
"kl": 0.202880859375, |
|
"learning_rate": 3.387377967463493e-05, |
|
"loss": 0.0081, |
|
"reward": -0.500700056552887, |
|
"reward_std": 0.22377115488052368, |
|
"rewards/cosine_scaled_reward": -0.2503500273451209, |
|
"rewards/format_reward": 0.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3344.166748046875, |
|
"epoch": 0.1982857142857143, |
|
"grad_norm": 0.06091040372848511, |
|
"kl": 0.4443359375, |
|
"learning_rate": 3.359691059183761e-05, |
|
"loss": 0.0376, |
|
"reward": -0.43880724161863327, |
|
"reward_std": 0.5309533849358559, |
|
"rewards/cosine_scaled_reward": -0.2819036263972521, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2690.125045776367, |
|
"epoch": 0.19885714285714284, |
|
"grad_norm": 0.15880349278450012, |
|
"kl": 0.33056640625, |
|
"learning_rate": 3.3321084665422807e-05, |
|
"loss": 0.0083, |
|
"reward": -0.3709861980751157, |
|
"reward_std": 0.3275773096829653, |
|
"rewards/cosine_scaled_reward": -0.3938264362514019, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.19942857142857143, |
|
"grad_norm": 0.04959436133503914, |
|
"kl": 0.341796875, |
|
"learning_rate": 3.304631533875703e-05, |
|
"loss": 0.0137, |
|
"reward": -0.4646516516804695, |
|
"reward_std": 0.5127328485250473, |
|
"rewards/cosine_scaled_reward": -0.25315913930535316, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2546.4583740234375, |
|
"epoch": 0.2, |
|
"grad_norm": 0.08026839792728424, |
|
"kl": 0.325927734375, |
|
"learning_rate": 3.2772616003709614e-05, |
|
"loss": 0.1587, |
|
"reward": 0.4196392893791199, |
|
"reward_std": 0.3610045984387398, |
|
"rewards/cosine_scaled_reward": -0.01934703439474106, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3361.7083740234375, |
|
"epoch": 0.20057142857142857, |
|
"grad_norm": 0.036866363137960434, |
|
"kl": 0.319091796875, |
|
"learning_rate": 3.250000000000001e-05, |
|
"loss": 0.0212, |
|
"reward": -0.3742051422595978, |
|
"reward_std": 0.3687475919723511, |
|
"rewards/cosine_scaled_reward": -0.3329359143972397, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.4166717529297, |
|
"epoch": 0.20114285714285715, |
|
"grad_norm": 0.1847529113292694, |
|
"kl": 0.2890625, |
|
"learning_rate": 3.222848061454764e-05, |
|
"loss": 0.1266, |
|
"reward": -0.06669257394969463, |
|
"reward_std": 0.5095125660300255, |
|
"rewards/cosine_scaled_reward": -0.13751295860856771, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2834.916717529297, |
|
"epoch": 0.2017142857142857, |
|
"grad_norm": 0.2114069163799286, |
|
"kl": 0.1923828125, |
|
"learning_rate": 3.195807108082429e-05, |
|
"loss": 0.1295, |
|
"reward": 0.1316913142800331, |
|
"reward_std": 0.5846901014447212, |
|
"rewards/cosine_scaled_reward": -0.12165435403585434, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3222.8750610351562, |
|
"epoch": 0.2022857142857143, |
|
"grad_norm": 0.08327220380306244, |
|
"kl": 0.41943359375, |
|
"learning_rate": 3.168878457820915e-05, |
|
"loss": 0.1263, |
|
"reward": -0.2932204008102417, |
|
"reward_std": 0.5368962581269443, |
|
"rewards/cosine_scaled_reward": -0.2924435433524195, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2713.541717529297, |
|
"epoch": 0.20285714285714285, |
|
"grad_norm": 0.0538349449634552, |
|
"kl": 0.21435546875, |
|
"learning_rate": 3.1420634231346445e-05, |
|
"loss": 0.0415, |
|
"reward": 0.24344536662101746, |
|
"reward_std": 0.3011304475367069, |
|
"rewards/cosine_scaled_reward": -0.08661067485809326, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2710.7916870117188, |
|
"epoch": 0.20342857142857143, |
|
"grad_norm": 0.0692262351512909, |
|
"kl": 0.243896484375, |
|
"learning_rate": 3.1153633109505784e-05, |
|
"loss": 0.0255, |
|
"reward": 0.8570400476455688, |
|
"reward_std": 0.7603759318590164, |
|
"rewards/cosine_scaled_reward": 0.17851997911930084, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3192.1250610351562, |
|
"epoch": 0.204, |
|
"grad_norm": 0.09190493822097778, |
|
"kl": 0.352783203125, |
|
"learning_rate": 3.088779422594514e-05, |
|
"loss": -0.008, |
|
"reward": -0.4519059807062149, |
|
"reward_std": 0.3100459352135658, |
|
"rewards/cosine_scaled_reward": -0.39261969178915024, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2996.1666870117188, |
|
"epoch": 0.20457142857142857, |
|
"grad_norm": 0.07240985333919525, |
|
"kl": 0.216064453125, |
|
"learning_rate": 3.062313053727671e-05, |
|
"loss": -0.0162, |
|
"reward": 0.539523258805275, |
|
"reward_std": 0.5989483781158924, |
|
"rewards/cosine_scaled_reward": 0.10309496521949768, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2649.0000610351562, |
|
"epoch": 0.20514285714285715, |
|
"grad_norm": 0.13966155052185059, |
|
"kl": 0.431640625, |
|
"learning_rate": 3.0359654942835248e-05, |
|
"loss": 0.1862, |
|
"reward": -0.014458760619163513, |
|
"reward_std": 0.6829442456364632, |
|
"rewards/cosine_scaled_reward": -0.1530627132160589, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3582.5416870117188, |
|
"epoch": 0.2057142857142857, |
|
"grad_norm": 0.05827893316745758, |
|
"kl": 0.3515625, |
|
"learning_rate": 3.0097380284049527e-05, |
|
"loss": 0.0144, |
|
"reward": -0.40928656980395317, |
|
"reward_std": 0.5495708473026752, |
|
"rewards/cosine_scaled_reward": -0.30880994349718094, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3319.6666870117188, |
|
"epoch": 0.2062857142857143, |
|
"grad_norm": 0.07192689180374146, |
|
"kl": 0.47998046875, |
|
"learning_rate": 2.98363193438164e-05, |
|
"loss": 0.085, |
|
"reward": -0.28407811373472214, |
|
"reward_std": 0.5651301890611649, |
|
"rewards/cosine_scaled_reward": -0.20453906618058681, |
|
"rewards/format_reward": 0.125, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3341.8750610351562, |
|
"epoch": 0.20685714285714285, |
|
"grad_norm": 0.04843948781490326, |
|
"kl": 0.41845703125, |
|
"learning_rate": 2.9576484845877794e-05, |
|
"loss": 0.0374, |
|
"reward": -0.03159081190824509, |
|
"reward_std": 0.49874855391681194, |
|
"rewards/cosine_scaled_reward": -0.11996208503842354, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3009.625030517578, |
|
"epoch": 0.20742857142857143, |
|
"grad_norm": 0.08178125321865082, |
|
"kl": 0.314453125, |
|
"learning_rate": 2.931788945420058e-05, |
|
"loss": -0.019, |
|
"reward": 0.15126367658376694, |
|
"reward_std": 0.3955496810376644, |
|
"rewards/cosine_scaled_reward": -0.0910348454490304, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2873.0833435058594, |
|
"epoch": 0.208, |
|
"grad_norm": 0.03954126313328743, |
|
"kl": 0.21868896484375, |
|
"learning_rate": 2.906054577235931e-05, |
|
"loss": 0.0505, |
|
"reward": -0.4637039601802826, |
|
"reward_std": 0.08979167556390166, |
|
"rewards/cosine_scaled_reward": -0.35685197822749615, |
|
"rewards/format_reward": 0.25, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3243.7916870117188, |
|
"epoch": 0.20857142857142857, |
|
"grad_norm": 0.050534144043922424, |
|
"kl": 0.343994140625, |
|
"learning_rate": 2.880446634292199e-05, |
|
"loss": 0.0642, |
|
"reward": -0.2382236891426146, |
|
"reward_std": 0.3782954253256321, |
|
"rewards/cosine_scaled_reward": -0.2649451866745949, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.20914285714285713, |
|
"grad_norm": 0.038890041410923004, |
|
"kl": 0.322021484375, |
|
"learning_rate": 2.854966364683872e-05, |
|
"loss": 0.0129, |
|
"reward": -0.7458789497613907, |
|
"reward_std": 0.2065310850739479, |
|
"rewards/cosine_scaled_reward": -0.39377281069755554, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2880.2083435058594, |
|
"epoch": 0.20971428571428571, |
|
"grad_norm": 0.15094637870788574, |
|
"kl": 0.240478515625, |
|
"learning_rate": 2.829615010283344e-05, |
|
"loss": 0.0793, |
|
"reward": 0.01062861829996109, |
|
"reward_std": 0.17077413201332092, |
|
"rewards/cosine_scaled_reward": -0.11968569085001945, |
|
"rewards/format_reward": 0.25, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2936.0, |
|
"epoch": 0.2102857142857143, |
|
"grad_norm": 0.03814147785305977, |
|
"kl": 0.2568359375, |
|
"learning_rate": 2.8043938066798646e-05, |
|
"loss": 0.0351, |
|
"reward": -0.06382668018341064, |
|
"reward_std": 0.13458664249628782, |
|
"rewards/cosine_scaled_reward": -0.15691335499286652, |
|
"rewards/format_reward": 0.25, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2985.25, |
|
"epoch": 0.21085714285714285, |
|
"grad_norm": 0.2742946147918701, |
|
"kl": 0.27294921875, |
|
"learning_rate": 2.7793039831193136e-05, |
|
"loss": -0.0924, |
|
"reward": -0.20023366808891296, |
|
"reward_std": 0.6205369587987661, |
|
"rewards/cosine_scaled_reward": -0.2459501512348652, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3553.2916870117188, |
|
"epoch": 0.21142857142857144, |
|
"grad_norm": 0.03684834763407707, |
|
"kl": 0.23046875, |
|
"learning_rate": 2.754346762444296e-05, |
|
"loss": 0.0207, |
|
"reward": -0.3706485256552696, |
|
"reward_std": 0.5973577741533518, |
|
"rewards/cosine_scaled_reward": -0.2478242591023445, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2341.5416870117188, |
|
"epoch": 0.212, |
|
"grad_norm": 0.0773259699344635, |
|
"kl": 0.28411865234375, |
|
"learning_rate": 2.729523361034538e-05, |
|
"loss": 0.0904, |
|
"reward": 0.24746574461460114, |
|
"reward_std": 0.3304491974413395, |
|
"rewards/cosine_scaled_reward": -0.10543380305171013, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3566.2916870117188, |
|
"epoch": 0.21257142857142858, |
|
"grad_norm": 0.05479148030281067, |
|
"kl": 0.364990234375, |
|
"learning_rate": 2.7048349887476037e-05, |
|
"loss": 0.0218, |
|
"reward": -0.6281777173280716, |
|
"reward_std": 0.3108227998018265, |
|
"rewards/cosine_scaled_reward": -0.3557555228471756, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2977.125030517578, |
|
"epoch": 0.21314285714285713, |
|
"grad_norm": 0.11434542387723923, |
|
"kl": 0.2933349609375, |
|
"learning_rate": 2.6802828488599297e-05, |
|
"loss": -0.0334, |
|
"reward": -0.07653629779815674, |
|
"reward_std": 0.2992605846375227, |
|
"rewards/cosine_scaled_reward": -0.18410149216651917, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3578.0833740234375, |
|
"epoch": 0.21371428571428572, |
|
"grad_norm": 0.053493838757276535, |
|
"kl": 0.28466796875, |
|
"learning_rate": 2.6558681380081713e-05, |
|
"loss": 0.0125, |
|
"reward": -0.40124300494790077, |
|
"reward_std": 0.2405980322510004, |
|
"rewards/cosine_scaled_reward": -0.24228817224502563, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2983.7500610351562, |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.05206667259335518, |
|
"kl": 0.406494140625, |
|
"learning_rate": 2.6315920461308964e-05, |
|
"loss": 0.0623, |
|
"reward": 0.096424276009202, |
|
"reward_std": 0.6552168540656567, |
|
"rewards/cosine_scaled_reward": -0.13928786292672157, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3149.7500610351562, |
|
"epoch": 0.21485714285714286, |
|
"grad_norm": 0.05979221314191818, |
|
"kl": 0.3193359375, |
|
"learning_rate": 2.6074557564105727e-05, |
|
"loss": 0.0525, |
|
"reward": 0.052399429492652416, |
|
"reward_std": 0.826238114386797, |
|
"rewards/cosine_scaled_reward": -0.16130028385668993, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2414.3334045410156, |
|
"epoch": 0.21542857142857144, |
|
"grad_norm": 0.2308046966791153, |
|
"kl": 0.3388671875, |
|
"learning_rate": 2.5834604452159112e-05, |
|
"loss": 0.2314, |
|
"reward": 0.3543909564614296, |
|
"reward_std": 0.49069568142294884, |
|
"rewards/cosine_scaled_reward": -0.07280451618134975, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3316.25, |
|
"epoch": 0.216, |
|
"grad_norm": 0.06222301721572876, |
|
"kl": 0.40576171875, |
|
"learning_rate": 2.5596072820445254e-05, |
|
"loss": 0.0485, |
|
"reward": -0.5630598217248917, |
|
"reward_std": 0.21512744203209877, |
|
"rewards/cosine_scaled_reward": -0.3648632522672415, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3449.0833740234375, |
|
"epoch": 0.21657142857142858, |
|
"grad_norm": 0.053267695009708405, |
|
"kl": 0.39697265625, |
|
"learning_rate": 2.5358974294659375e-05, |
|
"loss": 0.0307, |
|
"reward": -0.44570785015821457, |
|
"reward_std": 0.24237919226288795, |
|
"rewards/cosine_scaled_reward": -0.285353927873075, |
|
"rewards/format_reward": 0.125, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3125.2500610351562, |
|
"epoch": 0.21714285714285714, |
|
"grad_norm": 0.06081575155258179, |
|
"kl": 0.37646484375, |
|
"learning_rate": 2.5123320430649133e-05, |
|
"loss": 0.078, |
|
"reward": 0.06776145473122597, |
|
"reward_std": 0.9119222313165665, |
|
"rewards/cosine_scaled_reward": -0.13278593588620424, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3297.25, |
|
"epoch": 0.21771428571428572, |
|
"grad_norm": 0.05626344308257103, |
|
"kl": 0.541015625, |
|
"learning_rate": 2.4889122713851394e-05, |
|
"loss": 0.0828, |
|
"reward": 0.005039989948272705, |
|
"reward_std": 0.6351946890354156, |
|
"rewards/cosine_scaled_reward": -0.03914666548371315, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3272.9166870117188, |
|
"epoch": 0.21828571428571428, |
|
"grad_norm": 0.06031295284628868, |
|
"kl": 0.32666015625, |
|
"learning_rate": 2.4656392558732464e-05, |
|
"loss": 0.0317, |
|
"reward": -0.2794642001390457, |
|
"reward_std": 0.666092368774116, |
|
"rewards/cosine_scaled_reward": -0.26473210006952286, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2630.250045776367, |
|
"epoch": 0.21885714285714286, |
|
"grad_norm": 0.4203470051288605, |
|
"kl": 0.2860107421875, |
|
"learning_rate": 2.442514130823177e-05, |
|
"loss": -0.1659, |
|
"reward": 0.09679129719734192, |
|
"reward_std": 0.6322224270552397, |
|
"rewards/cosine_scaled_reward": -0.1599376993253827, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.21942857142857142, |
|
"grad_norm": 0.1070617288351059, |
|
"kl": 0.404052734375, |
|
"learning_rate": 2.4195380233209008e-05, |
|
"loss": 0.0162, |
|
"reward": -0.7048551961779594, |
|
"reward_std": 0.05190820666030049, |
|
"rewards/cosine_scaled_reward": -0.3524275906383991, |
|
"rewards/format_reward": 0.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2689.666748046875, |
|
"epoch": 0.22, |
|
"grad_norm": 0.07103494554758072, |
|
"kl": 0.529296875, |
|
"learning_rate": 2.396712053189486e-05, |
|
"loss": 0.0642, |
|
"reward": 0.18483632430434227, |
|
"reward_std": 0.6015055403113365, |
|
"rewards/cosine_scaled_reward": -0.09508185088634491, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3165.875, |
|
"epoch": 0.22057142857142858, |
|
"grad_norm": 0.05382478982210159, |
|
"kl": 0.44775390625, |
|
"learning_rate": 2.374037332934512e-05, |
|
"loss": 0.0748, |
|
"reward": -0.6691429018974304, |
|
"reward_std": 0.313931992277503, |
|
"rewards/cosine_scaled_reward": -0.3970714509487152, |
|
"rewards/format_reward": 0.125, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1973.0000762939453, |
|
"epoch": 0.22114285714285714, |
|
"grad_norm": 0.46731579303741455, |
|
"kl": 0.2667236328125, |
|
"learning_rate": 2.3515149676898555e-05, |
|
"loss": 0.3138, |
|
"reward": 0.6846924126148224, |
|
"reward_std": 0.5325267240405083, |
|
"rewards/cosine_scaled_reward": 0.0506795197725296, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2146.6250610351562, |
|
"epoch": 0.22171428571428572, |
|
"grad_norm": 0.06577088683843613, |
|
"kl": 0.330596923828125, |
|
"learning_rate": 2.329146055163824e-05, |
|
"loss": 0.0908, |
|
"reward": 1.1133306175470352, |
|
"reward_std": 0.9890650920569897, |
|
"rewards/cosine_scaled_reward": 0.24416528269648552, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2011.7083587646484, |
|
"epoch": 0.22228571428571428, |
|
"grad_norm": 0.07788607478141785, |
|
"kl": 0.350677490234375, |
|
"learning_rate": 2.306931685585657e-05, |
|
"loss": 0.1926, |
|
"reward": 0.8576178252696991, |
|
"reward_std": 0.9910986423492432, |
|
"rewards/cosine_scaled_reward": 0.1371422311058268, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3186.5416870117188, |
|
"epoch": 0.22285714285714286, |
|
"grad_norm": 0.07300270348787308, |
|
"kl": 0.46630859375, |
|
"learning_rate": 2.284872941652386e-05, |
|
"loss": 0.1027, |
|
"reward": -0.17644105851650238, |
|
"reward_std": 0.3873226083815098, |
|
"rewards/cosine_scaled_reward": -0.1923871971666813, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3476.5833740234375, |
|
"epoch": 0.22342857142857142, |
|
"grad_norm": 0.03151053190231323, |
|
"kl": 0.219482421875, |
|
"learning_rate": 2.2629708984760708e-05, |
|
"loss": 0.0435, |
|
"reward": 0.03934659995138645, |
|
"reward_std": 0.4940398707985878, |
|
"rewards/cosine_scaled_reward": -0.08449336793273687, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3302.0833740234375, |
|
"epoch": 0.224, |
|
"grad_norm": 0.07931151986122131, |
|
"kl": 0.357666015625, |
|
"learning_rate": 2.2412266235313975e-05, |
|
"loss": -0.0285, |
|
"reward": -0.5167839005589485, |
|
"reward_std": 0.20752658136188984, |
|
"rewards/cosine_scaled_reward": -0.30005861073732376, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3422.0, |
|
"epoch": 0.22457142857142856, |
|
"grad_norm": 0.05402982234954834, |
|
"kl": 0.3740234375, |
|
"learning_rate": 2.219641176603649e-05, |
|
"loss": 0.0462, |
|
"reward": -0.3995904391631484, |
|
"reward_std": 0.182258821092546, |
|
"rewards/cosine_scaled_reward": -0.2622952158562839, |
|
"rewards/format_reward": 0.125, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3126.6250610351562, |
|
"epoch": 0.22514285714285714, |
|
"grad_norm": 0.08397813886404037, |
|
"kl": 0.39794921875, |
|
"learning_rate": 2.198215609737056e-05, |
|
"loss": 0.1138, |
|
"reward": -0.33128097280859947, |
|
"reward_std": 0.3852621605619788, |
|
"rewards/cosine_scaled_reward": -0.24897384084761143, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2954.5833740234375, |
|
"epoch": 0.2257142857142857, |
|
"grad_norm": 0.06730625778436661, |
|
"kl": 0.3223876953125, |
|
"learning_rate": 2.1769509671835224e-05, |
|
"loss": 0.1452, |
|
"reward": 0.2924303896725178, |
|
"reward_std": 0.7555544227361679, |
|
"rewards/cosine_scaled_reward": -0.06211814656853676, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3396.6666870117188, |
|
"epoch": 0.22628571428571428, |
|
"grad_norm": 0.06294995546340942, |
|
"kl": 0.42138671875, |
|
"learning_rate": 2.1558482853517257e-05, |
|
"loss": 0.0685, |
|
"reward": -0.32847850024700165, |
|
"reward_std": 0.589644180610776, |
|
"rewards/cosine_scaled_reward": -0.2892392612993717, |
|
"rewards/format_reward": 0.25, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3145.2916870117188, |
|
"epoch": 0.22685714285714287, |
|
"grad_norm": 0.04051867127418518, |
|
"kl": 0.35986328125, |
|
"learning_rate": 2.1349085927566073e-05, |
|
"loss": 0.0434, |
|
"reward": -0.27365291118621826, |
|
"reward_std": 0.8279965240508318, |
|
"rewards/cosine_scaled_reward": -0.2618264742195606, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3471.4583740234375, |
|
"epoch": 0.22742857142857142, |
|
"grad_norm": 0.07548023015260696, |
|
"kl": 0.53759765625, |
|
"learning_rate": 2.114132909969241e-05, |
|
"loss": 0.0366, |
|
"reward": -0.2639296054840088, |
|
"reward_std": 0.5751975458115339, |
|
"rewards/cosine_scaled_reward": -0.2152981460094452, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3529.4583740234375, |
|
"epoch": 0.228, |
|
"grad_norm": 0.03485243022441864, |
|
"kl": 0.294921875, |
|
"learning_rate": 2.093522249567097e-05, |
|
"loss": 0.0234, |
|
"reward": -0.439264640212059, |
|
"reward_std": 0.28215121757239103, |
|
"rewards/cosine_scaled_reward": -0.28213231824338436, |
|
"rewards/format_reward": 0.125, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2100.125030517578, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.2823559045791626, |
|
"kl": 0.52685546875, |
|
"learning_rate": 2.0730776160846853e-05, |
|
"loss": 0.2694, |
|
"reward": 0.5646544303745031, |
|
"reward_std": 1.1762162446975708, |
|
"rewards/cosine_scaled_reward": -0.05100613087415695, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3157.3750610351562, |
|
"epoch": 0.22914285714285715, |
|
"grad_norm": 0.09117607027292252, |
|
"kl": 0.65234375, |
|
"learning_rate": 2.0528000059645997e-05, |
|
"loss": 0.0588, |
|
"reward": -0.3910463247448206, |
|
"reward_std": 0.2949160858988762, |
|
"rewards/cosine_scaled_reward": -0.2580231502652168, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2805.333335876465, |
|
"epoch": 0.2297142857142857, |
|
"grad_norm": 0.06091153621673584, |
|
"kl": 0.24639892578125, |
|
"learning_rate": 2.0326904075089492e-05, |
|
"loss": 0.0546, |
|
"reward": -0.23423780500888824, |
|
"reward_std": 0.1788835395127535, |
|
"rewards/cosine_scaled_reward": -0.24211889691650867, |
|
"rewards/format_reward": 0.25, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2704.3750915527344, |
|
"epoch": 0.2302857142857143, |
|
"grad_norm": 0.11689075827598572, |
|
"kl": 0.18212890625, |
|
"learning_rate": 2.0127498008311922e-05, |
|
"loss": 0.0923, |
|
"reward": 0.480072483420372, |
|
"reward_std": 0.8732938468456268, |
|
"rewards/cosine_scaled_reward": -0.0516304369084537, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3054.666717529297, |
|
"epoch": 0.23085714285714284, |
|
"grad_norm": 0.06528580188751221, |
|
"kl": 0.48193359375, |
|
"learning_rate": 1.9929791578083658e-05, |
|
"loss": 0.0749, |
|
"reward": -0.4229290783405304, |
|
"reward_std": 0.344521377235651, |
|
"rewards/cosine_scaled_reward": -0.2947978749871254, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2730.2084350585938, |
|
"epoch": 0.23142857142857143, |
|
"grad_norm": 0.07100927829742432, |
|
"kl": 0.482421875, |
|
"learning_rate": 1.9733794420337214e-05, |
|
"loss": 0.0353, |
|
"reward": 0.5909395664930344, |
|
"reward_std": 0.5787490289658308, |
|
"rewards/cosine_scaled_reward": 0.10796979814767838, |
|
"rewards/format_reward": 0.375, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3417.9583740234375, |
|
"epoch": 0.232, |
|
"grad_norm": 0.03272836655378342, |
|
"kl": 0.313232421875, |
|
"learning_rate": 1.9539516087697518e-05, |
|
"loss": 0.0321, |
|
"reward": -0.10836548218503594, |
|
"reward_std": 0.4936055834405124, |
|
"rewards/cosine_scaled_reward": -0.1583494134247303, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3065.791748046875, |
|
"epoch": 0.23257142857142857, |
|
"grad_norm": 0.17790015041828156, |
|
"kl": 0.3212890625, |
|
"learning_rate": 1.9346966049016424e-05, |
|
"loss": 0.105, |
|
"reward": 0.5387177914381027, |
|
"reward_std": 1.1512526031583548, |
|
"rewards/cosine_scaled_reward": 0.061025530099868774, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2239.2083587646484, |
|
"epoch": 0.23314285714285715, |
|
"grad_norm": 0.25460880994796753, |
|
"kl": 0.249755859375, |
|
"learning_rate": 1.915615368891117e-05, |
|
"loss": 0.1841, |
|
"reward": 0.13613080978393555, |
|
"reward_std": 0.5948553457856178, |
|
"rewards/cosine_scaled_reward": -0.16110125556588173, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3243.8750610351562, |
|
"epoch": 0.2337142857142857, |
|
"grad_norm": 0.22207611799240112, |
|
"kl": 0.44970703125, |
|
"learning_rate": 1.8967088307307003e-05, |
|
"loss": 0.1602, |
|
"reward": -0.5169526115059853, |
|
"reward_std": 0.28472404927015305, |
|
"rewards/cosine_scaled_reward": -0.3209763169288635, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2342857142857143, |
|
"grad_norm": 0.05123216286301613, |
|
"kl": 0.356201171875, |
|
"learning_rate": 1.877977911898387e-05, |
|
"loss": 0.0143, |
|
"reward": -0.671182170510292, |
|
"reward_std": 0.408594099804759, |
|
"rewards/cosine_scaled_reward": -0.3564244285225868, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3441.5833740234375, |
|
"epoch": 0.23485714285714285, |
|
"grad_norm": 0.06985009461641312, |
|
"kl": 0.530029296875, |
|
"learning_rate": 1.8594235253127375e-05, |
|
"loss": 0.0742, |
|
"reward": -0.38147830381058156, |
|
"reward_std": 0.40077026188373566, |
|
"rewards/cosine_scaled_reward": -0.274072487722151, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3580.75, |
|
"epoch": 0.23542857142857143, |
|
"grad_norm": 0.06430774182081223, |
|
"kl": 0.380859375, |
|
"learning_rate": 1.8410465752883758e-05, |
|
"loss": 0.0169, |
|
"reward": -0.6020461395382881, |
|
"reward_std": 0.310219619423151, |
|
"rewards/cosine_scaled_reward": -0.36352307721972466, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3413.5833740234375, |
|
"epoch": 0.236, |
|
"grad_norm": 0.0643596202135086, |
|
"kl": 0.56591796875, |
|
"learning_rate": 1.822847957491922e-05, |
|
"loss": 0.069, |
|
"reward": -0.19847530126571655, |
|
"reward_std": 0.632585421204567, |
|
"rewards/cosine_scaled_reward": -0.22423765808343887, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2909.4583435058594, |
|
"epoch": 0.23657142857142857, |
|
"grad_norm": 0.17475536465644836, |
|
"kl": 0.317138671875, |
|
"learning_rate": 1.804828558898332e-05, |
|
"loss": 0.2022, |
|
"reward": -0.07582948263734579, |
|
"reward_std": 0.3062135260552168, |
|
"rewards/cosine_scaled_reward": -0.1420813980512321, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2919.2083435058594, |
|
"epoch": 0.23714285714285716, |
|
"grad_norm": 0.08638226240873337, |
|
"kl": 0.40966796875, |
|
"learning_rate": 1.7869892577476724e-05, |
|
"loss": 0.0055, |
|
"reward": -0.26949138939380646, |
|
"reward_std": 0.3594799619168043, |
|
"rewards/cosine_scaled_reward": -0.30141236586496234, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3522.125, |
|
"epoch": 0.2377142857142857, |
|
"grad_norm": 0.045446183532476425, |
|
"kl": 0.417724609375, |
|
"learning_rate": 1.769330923502313e-05, |
|
"loss": 0.0411, |
|
"reward": -0.4672236889600754, |
|
"reward_std": 0.2726159645244479, |
|
"rewards/cosine_scaled_reward": -0.25444517843425274, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.2916870117188, |
|
"epoch": 0.2382857142857143, |
|
"grad_norm": 0.14400292932987213, |
|
"kl": 0.3536376953125, |
|
"learning_rate": 1.7518544168045525e-05, |
|
"loss": 0.2068, |
|
"reward": 0.2651242660358548, |
|
"reward_std": 0.6696367170661688, |
|
"rewards/cosine_scaled_reward": -0.034104532562196255, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2557.5833435058594, |
|
"epoch": 0.23885714285714285, |
|
"grad_norm": 0.14127807319164276, |
|
"kl": 0.41259765625, |
|
"learning_rate": 1.734560589434673e-05, |
|
"loss": 0.0174, |
|
"reward": 0.01780000329017639, |
|
"reward_std": 0.4921893812716007, |
|
"rewards/cosine_scaled_reward": -0.178600013256073, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2538.0833740234375, |
|
"epoch": 0.23942857142857144, |
|
"grad_norm": 0.26915180683135986, |
|
"kl": 0.7177734375, |
|
"learning_rate": 1.7174502842694213e-05, |
|
"loss": 0.0781, |
|
"reward": 0.22315247356891632, |
|
"reward_std": 0.7611993253231049, |
|
"rewards/cosine_scaled_reward": -0.2009237576276064, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2058.4583740234375, |
|
"epoch": 0.24, |
|
"grad_norm": 0.13916106522083282, |
|
"kl": 0.48828125, |
|
"learning_rate": 1.7005243352409334e-05, |
|
"loss": 0.1651, |
|
"reward": 0.5351078482344747, |
|
"reward_std": 0.43670096062123775, |
|
"rewards/cosine_scaled_reward": 0.017553903628140688, |
|
"rewards/format_reward": 0.5, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2917.625030517578, |
|
"epoch": 0.24057142857142857, |
|
"grad_norm": 0.26665350794792175, |
|
"kl": 0.361328125, |
|
"learning_rate": 1.6837835672960835e-05, |
|
"loss": 0.2609, |
|
"reward": -0.19462932646274567, |
|
"reward_std": 0.6805716454982758, |
|
"rewards/cosine_scaled_reward": -0.24314800277352333, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3578.875, |
|
"epoch": 0.24114285714285713, |
|
"grad_norm": 0.050200000405311584, |
|
"kl": 0.3662109375, |
|
"learning_rate": 1.6672287963562855e-05, |
|
"loss": 0.0176, |
|
"reward": -0.7312059998512268, |
|
"reward_std": 0.228121904656291, |
|
"rewards/cosine_scaled_reward": -0.3864363357424736, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2685.8750610351562, |
|
"epoch": 0.24171428571428571, |
|
"grad_norm": 0.2088881880044937, |
|
"kl": 0.58447265625, |
|
"learning_rate": 1.6508608292777204e-05, |
|
"loss": 0.0649, |
|
"reward": 0.13651205599308014, |
|
"reward_std": 0.7265305370092392, |
|
"rewards/cosine_scaled_reward": -0.14007731387391686, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2770.7916870117188, |
|
"epoch": 0.2422857142857143, |
|
"grad_norm": 0.6906270384788513, |
|
"kl": 0.63232421875, |
|
"learning_rate": 1.63468046381201e-05, |
|
"loss": 0.1834, |
|
"reward": -0.13689884543418884, |
|
"reward_std": 0.6126798801124096, |
|
"rewards/cosine_scaled_reward": -0.17261607944965363, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3473.125, |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 0.1417115479707718, |
|
"kl": 0.53515625, |
|
"learning_rate": 1.6186884885673413e-05, |
|
"loss": -0.0243, |
|
"reward": -0.4850518964231014, |
|
"reward_std": 0.24936811439692974, |
|
"rewards/cosine_scaled_reward": -0.26335928216576576, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3285.8333740234375, |
|
"epoch": 0.24342857142857144, |
|
"grad_norm": 0.06952951848506927, |
|
"kl": 0.5224609375, |
|
"learning_rate": 1.602885682970026e-05, |
|
"loss": 0.0646, |
|
"reward": -0.4608178175985813, |
|
"reward_std": 0.5314787924289703, |
|
"rewards/cosine_scaled_reward": -0.2929089143872261, |
|
"rewards/format_reward": 0.1250000037252903, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3526.9583740234375, |
|
"epoch": 0.244, |
|
"grad_norm": 0.03886076807975769, |
|
"kl": 0.365478515625, |
|
"learning_rate": 1.5872728172265147e-05, |
|
"loss": 0.0222, |
|
"reward": -0.21462566033005714, |
|
"reward_std": 0.7559888269752264, |
|
"rewards/cosine_scaled_reward": -0.21147949434816837, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2593.666748046875, |
|
"epoch": 0.24457142857142858, |
|
"grad_norm": 0.4337615370750427, |
|
"kl": 0.6572265625, |
|
"learning_rate": 1.5718506522858573e-05, |
|
"loss": 0.3042, |
|
"reward": 0.1845724955201149, |
|
"reward_std": 0.9847190231084824, |
|
"rewards/cosine_scaled_reward": -0.11604708188679069, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3424.1251220703125, |
|
"epoch": 0.24514285714285713, |
|
"grad_norm": 0.05527598410844803, |
|
"kl": 0.55224609375, |
|
"learning_rate": 1.556619939802615e-05, |
|
"loss": 0.085, |
|
"reward": -0.2955315187573433, |
|
"reward_std": 0.5266687069088221, |
|
"rewards/cosine_scaled_reward": -0.27276574447751045, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3065.5000610351562, |
|
"epoch": 0.24571428571428572, |
|
"grad_norm": 0.15154457092285156, |
|
"kl": 0.68212890625, |
|
"learning_rate": 1.5415814221002267e-05, |
|
"loss": 0.0947, |
|
"reward": -0.09890948422253132, |
|
"reward_std": 0.46929389610886574, |
|
"rewards/cosine_scaled_reward": -0.19528808258473873, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3439.2916870117188, |
|
"epoch": 0.24628571428571427, |
|
"grad_norm": 0.0829787403345108, |
|
"kl": 0.6484375, |
|
"learning_rate": 1.526735832134829e-05, |
|
"loss": 0.066, |
|
"reward": -0.6627758890390396, |
|
"reward_std": 0.37856999412178993, |
|
"rewards/cosine_scaled_reward": -0.3730546161532402, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2820.7916870117188, |
|
"epoch": 0.24685714285714286, |
|
"grad_norm": 0.22786198556423187, |
|
"kl": 0.685546875, |
|
"learning_rate": 1.5120838934595339e-05, |
|
"loss": 0.0229, |
|
"reward": -0.5289145242422819, |
|
"reward_std": 0.3426254317164421, |
|
"rewards/cosine_scaled_reward": -0.32695727050304413, |
|
"rewards/format_reward": 0.125, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2546.3334045410156, |
|
"epoch": 0.24742857142857144, |
|
"grad_norm": 0.13099390268325806, |
|
"kl": 0.751953125, |
|
"learning_rate": 1.4976263201891614e-05, |
|
"loss": 0.1944, |
|
"reward": 0.3029240146279335, |
|
"reward_std": 0.840803325176239, |
|
"rewards/cosine_scaled_reward": -0.05687133315950632, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3548.6250610351562, |
|
"epoch": 0.248, |
|
"grad_norm": 0.14448414742946625, |
|
"kl": 0.51416015625, |
|
"learning_rate": 1.4833638169654352e-05, |
|
"loss": 0.0237, |
|
"reward": -0.5506420657038689, |
|
"reward_std": 0.24051987566053867, |
|
"rewards/cosine_scaled_reward": -0.35865436494350433, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2912.666778564453, |
|
"epoch": 0.24857142857142858, |
|
"grad_norm": 0.11906195431947708, |
|
"kl": 0.55078125, |
|
"learning_rate": 1.469297078922642e-05, |
|
"loss": 0.114, |
|
"reward": 0.029549360275268555, |
|
"reward_std": 0.4572529271245003, |
|
"rewards/cosine_scaled_reward": -0.17272532731294632, |
|
"rewards/format_reward": 0.375, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2759.25, |
|
"epoch": 0.24914285714285714, |
|
"grad_norm": 0.09241659194231033, |
|
"kl": 0.4913330078125, |
|
"learning_rate": 1.4554267916537495e-05, |
|
"loss": 0.0232, |
|
"reward": 0.08335928618907928, |
|
"reward_std": 0.35814575105905533, |
|
"rewards/cosine_scaled_reward": -0.08332037925720215, |
|
"rewards/format_reward": 0.25, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1934.3750305175781, |
|
"epoch": 0.24971428571428572, |
|
"grad_norm": 0.2781325876712799, |
|
"kl": 0.41302490234375, |
|
"learning_rate": 1.4417536311769886e-05, |
|
"loss": 0.1265, |
|
"reward": 0.7840927466750145, |
|
"reward_std": 0.7154708206653595, |
|
"rewards/cosine_scaled_reward": 0.0795463752001524, |
|
"rewards/format_reward": 0.6250000037252903, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3043.4583435058594, |
|
"epoch": 0.2502857142857143, |
|
"grad_norm": 0.16331548988819122, |
|
"kl": 0.56005859375, |
|
"learning_rate": 1.428278263902913e-05, |
|
"loss": 0.1218, |
|
"reward": -0.6011191233992577, |
|
"reward_std": 0.2519193133339286, |
|
"rewards/cosine_scaled_reward": -0.40472622215747833, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2903.5001220703125, |
|
"epoch": 0.25085714285714283, |
|
"grad_norm": 0.1437004804611206, |
|
"kl": 0.55859375, |
|
"learning_rate": 1.4150013466019115e-05, |
|
"loss": 0.0634, |
|
"reward": 0.2986115887761116, |
|
"reward_std": 0.6229725033044815, |
|
"rewards/cosine_scaled_reward": -0.121527548879385, |
|
"rewards/format_reward": 0.5416666753590107, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1969.4167175292969, |
|
"epoch": 0.25142857142857145, |
|
"grad_norm": 0.4684412479400635, |
|
"kl": 0.38134765625, |
|
"learning_rate": 1.4019235263722036e-05, |
|
"loss": 0.279, |
|
"reward": 0.1754133328795433, |
|
"reward_std": 0.7903597727417946, |
|
"rewards/cosine_scaled_reward": -0.1622933349572122, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3259.2083740234375, |
|
"epoch": 0.252, |
|
"grad_norm": 0.09937479346990585, |
|
"kl": 0.54443359375, |
|
"learning_rate": 1.389045440608296e-05, |
|
"loss": -0.0214, |
|
"reward": -0.5238807797431946, |
|
"reward_std": 0.32005127891898155, |
|
"rewards/cosine_scaled_reward": -0.3452737405896187, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2888.125045776367, |
|
"epoch": 0.25257142857142856, |
|
"grad_norm": 0.2833991050720215, |
|
"kl": 0.44384765625, |
|
"learning_rate": 1.3763677169699218e-05, |
|
"loss": 0.1149, |
|
"reward": 0.1393699049949646, |
|
"reward_std": 0.8629260342568159, |
|
"rewards/cosine_scaled_reward": -0.1386483833193779, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3266.125, |
|
"epoch": 0.25314285714285717, |
|
"grad_norm": 0.08092425763607025, |
|
"kl": 0.47119140625, |
|
"learning_rate": 1.3638909733514454e-05, |
|
"loss": 0.0795, |
|
"reward": -0.44480053149163723, |
|
"reward_std": 0.36030059307813644, |
|
"rewards/cosine_scaled_reward": -0.3057336136698723, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3183.3333740234375, |
|
"epoch": 0.2537142857142857, |
|
"grad_norm": 0.07417810708284378, |
|
"kl": 0.58544921875, |
|
"learning_rate": 1.3516158178517482e-05, |
|
"loss": 0.0659, |
|
"reward": -0.37381474673748016, |
|
"reward_std": 0.2710421346127987, |
|
"rewards/cosine_scaled_reward": -0.2910740412771702, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2618.6250915527344, |
|
"epoch": 0.2542857142857143, |
|
"grad_norm": 0.2676498293876648, |
|
"kl": 0.697265625, |
|
"learning_rate": 1.3395428487445916e-05, |
|
"loss": 0.2333, |
|
"reward": 0.09787814319133759, |
|
"reward_std": 0.7350487858057022, |
|
"rewards/cosine_scaled_reward": -0.1385609395802021, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2239.875015258789, |
|
"epoch": 0.25485714285714284, |
|
"grad_norm": 0.23302024602890015, |
|
"kl": 0.339599609375, |
|
"learning_rate": 1.3276726544494572e-05, |
|
"loss": -0.0018, |
|
"reward": -0.09462682902812958, |
|
"reward_std": 0.2530629448592663, |
|
"rewards/cosine_scaled_reward": -0.3181467577815056, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3186.875, |
|
"epoch": 0.25542857142857145, |
|
"grad_norm": 0.0788385272026062, |
|
"kl": 0.4404296875, |
|
"learning_rate": 1.3160058135028691e-05, |
|
"loss": 0.0481, |
|
"reward": -0.03993312269449234, |
|
"reward_std": 0.49259845726192, |
|
"rewards/cosine_scaled_reward": -0.16579990461468697, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2922.8333740234375, |
|
"epoch": 0.256, |
|
"grad_norm": 0.11522118002176285, |
|
"kl": 0.56005859375, |
|
"learning_rate": 1.3045428945301954e-05, |
|
"loss": 0.0533, |
|
"reward": 0.417447566986084, |
|
"reward_std": 0.5856805425137281, |
|
"rewards/cosine_scaled_reward": -0.020442910492420197, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3091.5833740234375, |
|
"epoch": 0.25657142857142856, |
|
"grad_norm": 0.10368100553750992, |
|
"kl": 0.77734375, |
|
"learning_rate": 1.2932844562179353e-05, |
|
"loss": 0.0535, |
|
"reward": -0.07481794245541096, |
|
"reward_std": 0.8016606643795967, |
|
"rewards/cosine_scaled_reward": -0.2457423061132431, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1368.8750305175781, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.3712880611419678, |
|
"kl": 0.75482177734375, |
|
"learning_rate": 1.2822310472864884e-05, |
|
"loss": 0.1738, |
|
"reward": 1.4089849265292287, |
|
"reward_std": 0.5784200113266706, |
|
"rewards/cosine_scaled_reward": 0.3294924534857273, |
|
"rewards/format_reward": 0.7500000074505806, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2171.8333435058594, |
|
"epoch": 0.25771428571428573, |
|
"grad_norm": 0.4426482021808624, |
|
"kl": 0.4508056640625, |
|
"learning_rate": 1.2713832064634126e-05, |
|
"loss": 0.1503, |
|
"reward": 0.019651681184768677, |
|
"reward_std": 0.3091997979208827, |
|
"rewards/cosine_scaled_reward": -0.19850750174373388, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2205.750045776367, |
|
"epoch": 0.2582857142857143, |
|
"grad_norm": 0.4167480170726776, |
|
"kl": 0.468017578125, |
|
"learning_rate": 1.260741462457165e-05, |
|
"loss": 0.2359, |
|
"reward": 0.14184805005788803, |
|
"reward_std": 0.5776838436722755, |
|
"rewards/cosine_scaled_reward": -0.17907597869634628, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2234.9166717529297, |
|
"epoch": 0.25885714285714284, |
|
"grad_norm": 0.39677053689956665, |
|
"kl": 0.59765625, |
|
"learning_rate": 1.2503063339313356e-05, |
|
"loss": 0.3304, |
|
"reward": 0.13150884211063385, |
|
"reward_std": 0.5827006474137306, |
|
"rewards/cosine_scaled_reward": -0.16341226734220982, |
|
"rewards/format_reward": 0.4583333544433117, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2502.4166870117188, |
|
"epoch": 0.25942857142857145, |
|
"grad_norm": 0.6984990835189819, |
|
"kl": 0.47119140625, |
|
"learning_rate": 1.240078329479367e-05, |
|
"loss": 0.1735, |
|
"reward": 0.2351670740172267, |
|
"reward_std": 0.4651456903666258, |
|
"rewards/cosine_scaled_reward": -0.04908313835039735, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3326.166748046875, |
|
"epoch": 0.26, |
|
"grad_norm": 0.19114693999290466, |
|
"kl": 1.01171875, |
|
"learning_rate": 1.2300579475997657e-05, |
|
"loss": 0.0617, |
|
"reward": -0.29883327800780535, |
|
"reward_std": 0.41828643530607224, |
|
"rewards/cosine_scaled_reward": -0.23274997994303703, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3502.4166870117188, |
|
"epoch": 0.26057142857142856, |
|
"grad_norm": 0.055889781564474106, |
|
"kl": 0.406005859375, |
|
"learning_rate": 1.2202456766718093e-05, |
|
"loss": 0.0569, |
|
"reward": -0.4835352450609207, |
|
"reward_std": 0.2959635443985462, |
|
"rewards/cosine_scaled_reward": -0.2834343034774065, |
|
"rewards/format_reward": 0.0833333358168602, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2922.7083435058594, |
|
"epoch": 0.2611428571428571, |
|
"grad_norm": 0.2059084177017212, |
|
"kl": 0.687744140625, |
|
"learning_rate": 1.210641994931739e-05, |
|
"loss": 0.217, |
|
"reward": -0.24368640035390854, |
|
"reward_std": 0.2636729357764125, |
|
"rewards/cosine_scaled_reward": -0.30934320390224457, |
|
"rewards/format_reward": 0.375, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2602.9166870117188, |
|
"epoch": 0.26171428571428573, |
|
"grad_norm": 0.18258771300315857, |
|
"kl": 0.5390625, |
|
"learning_rate": 1.2012473704494538e-05, |
|
"loss": 0.178, |
|
"reward": 0.1497629238292575, |
|
"reward_std": 0.5888768993318081, |
|
"rewards/cosine_scaled_reward": -0.09178520552814007, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3391.0833740234375, |
|
"epoch": 0.2622857142857143, |
|
"grad_norm": 0.10063629597425461, |
|
"kl": 0.71728515625, |
|
"learning_rate": 1.1920622611056975e-05, |
|
"loss": 0.0528, |
|
"reward": -0.36289872229099274, |
|
"reward_std": 0.5369972474873066, |
|
"rewards/cosine_scaled_reward": -0.26478270068764687, |
|
"rewards/format_reward": 0.1666666679084301, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2814.000030517578, |
|
"epoch": 0.26285714285714284, |
|
"grad_norm": 0.06488575041294098, |
|
"kl": 0.550048828125, |
|
"learning_rate": 1.1830871145697413e-05, |
|
"loss": 0.0445, |
|
"reward": 0.6600381471216679, |
|
"reward_std": 0.4823018051683903, |
|
"rewards/cosine_scaled_reward": 0.10085240937769413, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2659.00008392334, |
|
"epoch": 0.2634285714285714, |
|
"grad_norm": 0.19850417971611023, |
|
"kl": 0.284912109375, |
|
"learning_rate": 1.174322368277565e-05, |
|
"loss": 0.1124, |
|
"reward": 0.20981285348534584, |
|
"reward_std": 0.5238317660987377, |
|
"rewards/cosine_scaled_reward": -0.10342691093683243, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3503.0834350585938, |
|
"epoch": 0.264, |
|
"grad_norm": 0.06738544255495071, |
|
"kl": 0.6259765625, |
|
"learning_rate": 1.1657684494105387e-05, |
|
"loss": 0.0494, |
|
"reward": -0.18034307146444917, |
|
"reward_std": 0.8387964870780706, |
|
"rewards/cosine_scaled_reward": -0.2776715336367488, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3274.916748046875, |
|
"epoch": 0.26457142857142857, |
|
"grad_norm": 0.16633926331996918, |
|
"kl": 0.7099609375, |
|
"learning_rate": 1.1574257748745986e-05, |
|
"loss": -0.0402, |
|
"reward": -0.3673575446009636, |
|
"reward_std": 0.31967577897012234, |
|
"rewards/cosine_scaled_reward": -0.204512108117342, |
|
"rewards/format_reward": 0.0416666679084301, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2274.0416717529297, |
|
"epoch": 0.2651428571428571, |
|
"grad_norm": 0.23727242648601532, |
|
"kl": 0.6483154296875, |
|
"learning_rate": 1.149294751279933e-05, |
|
"loss": 0.033, |
|
"reward": 0.15764057636260986, |
|
"reward_std": 0.6667953277938068, |
|
"rewards/cosine_scaled_reward": -0.21284636482596397, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3351.291748046875, |
|
"epoch": 0.26571428571428574, |
|
"grad_norm": 0.11981618404388428, |
|
"kl": 0.8818359375, |
|
"learning_rate": 1.1413757749211602e-05, |
|
"loss": 0.0629, |
|
"reward": 0.07552861422300339, |
|
"reward_std": 0.9423349946737289, |
|
"rewards/cosine_scaled_reward": -0.17056902311742306, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2843.5834350585938, |
|
"epoch": 0.2662857142857143, |
|
"grad_norm": 0.1698676198720932, |
|
"kl": 0.6044921875, |
|
"learning_rate": 1.133669231758016e-05, |
|
"loss": 0.2029, |
|
"reward": -0.21855482331011444, |
|
"reward_std": 0.6860118061304092, |
|
"rewards/cosine_scaled_reward": -0.3384440951049328, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3427.8751220703125, |
|
"epoch": 0.26685714285714285, |
|
"grad_norm": 0.17002388834953308, |
|
"kl": 0.90283203125, |
|
"learning_rate": 1.1261754973965422e-05, |
|
"loss": 0.0599, |
|
"reward": -0.3653205633163452, |
|
"reward_std": 0.37883180007338524, |
|
"rewards/cosine_scaled_reward": -0.2868269607424736, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2550.291748046875, |
|
"epoch": 0.2674285714285714, |
|
"grad_norm": 1.2961100339889526, |
|
"kl": 0.5009765625, |
|
"learning_rate": 1.1188949370707787e-05, |
|
"loss": 0.1446, |
|
"reward": 0.2884400337934494, |
|
"reward_std": 0.706204243004322, |
|
"rewards/cosine_scaled_reward": -0.10577998217195272, |
|
"rewards/format_reward": 0.5000000223517418, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2284.125045776367, |
|
"epoch": 0.268, |
|
"grad_norm": 0.5664176940917969, |
|
"kl": 0.47509765625, |
|
"learning_rate": 1.1118279056249655e-05, |
|
"loss": 0.174, |
|
"reward": -0.08516758680343628, |
|
"reward_std": 0.30671944469213486, |
|
"rewards/cosine_scaled_reward": -0.31341712176799774, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1342.0416870117188, |
|
"epoch": 0.26857142857142857, |
|
"grad_norm": 0.16639195382595062, |
|
"kl": 0.525146484375, |
|
"learning_rate": 1.1049747474962445e-05, |
|
"loss": 0.2126, |
|
"reward": 1.3647146373987198, |
|
"reward_std": 0.4275565594434738, |
|
"rewards/cosine_scaled_reward": 0.2865240015089512, |
|
"rewards/format_reward": 0.7916666679084301, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2648.3334045410156, |
|
"epoch": 0.26914285714285713, |
|
"grad_norm": 0.10152919590473175, |
|
"kl": 0.640625, |
|
"learning_rate": 1.0983357966978745e-05, |
|
"loss": 0.1102, |
|
"reward": 0.0945998802781105, |
|
"reward_std": 0.5440156869590282, |
|
"rewards/cosine_scaled_reward": -0.1818667290499434, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2823.0833740234375, |
|
"epoch": 0.26971428571428574, |
|
"grad_norm": 0.28343844413757324, |
|
"kl": 0.8359375, |
|
"learning_rate": 1.0919113768029518e-05, |
|
"loss": 0.0423, |
|
"reward": -0.11053501442074776, |
|
"reward_std": 0.406174935400486, |
|
"rewards/cosine_scaled_reward": -0.3052675127983093, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3113.0833740234375, |
|
"epoch": 0.2702857142857143, |
|
"grad_norm": 0.07717534154653549, |
|
"kl": 0.66162109375, |
|
"learning_rate": 1.0857018009286382e-05, |
|
"loss": 0.0757, |
|
"reward": -0.2859737928956747, |
|
"reward_std": 0.5037247315049171, |
|
"rewards/cosine_scaled_reward": -0.24715356901288033, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1836.0000305175781, |
|
"epoch": 0.27085714285714285, |
|
"grad_norm": 0.3636644780635834, |
|
"kl": 0.83648681640625, |
|
"learning_rate": 1.0797073717209014e-05, |
|
"loss": 0.0671, |
|
"reward": 0.6108469665050507, |
|
"reward_std": 0.3949619419872761, |
|
"rewards/cosine_scaled_reward": 0.034590087831020355, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2542.0416870117188, |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.17265525460243225, |
|
"kl": 0.6083984375, |
|
"learning_rate": 1.0739283813397639e-05, |
|
"loss": 0.0681, |
|
"reward": 0.41628449596464634, |
|
"reward_std": 0.8513908386230469, |
|
"rewards/cosine_scaled_reward": -0.04185774736106396, |
|
"rewards/format_reward": 0.5000000037252903, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3087.166748046875, |
|
"epoch": 0.272, |
|
"grad_norm": 0.09109177440404892, |
|
"kl": 0.896484375, |
|
"learning_rate": 1.0683651114450641e-05, |
|
"loss": 0.1429, |
|
"reward": -0.14520969986915588, |
|
"reward_std": 0.3790069818496704, |
|
"rewards/cosine_scaled_reward": -0.28093818202614784, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2237.4583587646484, |
|
"epoch": 0.2725714285714286, |
|
"grad_norm": 0.200510174036026, |
|
"kl": 1.012451171875, |
|
"learning_rate": 1.0630178331827282e-05, |
|
"loss": 0.1417, |
|
"reward": -0.027669966220855713, |
|
"reward_std": 0.3241183590143919, |
|
"rewards/cosine_scaled_reward": -0.28466833382844925, |
|
"rewards/format_reward": 0.5416666753590107, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3540.7916870117188, |
|
"epoch": 0.27314285714285713, |
|
"grad_norm": 0.23702482879161835, |
|
"kl": 0.743408203125, |
|
"learning_rate": 1.0578868071715544e-05, |
|
"loss": 0.0225, |
|
"reward": -0.5009803473949432, |
|
"reward_std": 0.35418499261140823, |
|
"rewards/cosine_scaled_reward": -0.3546568304300308, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2298.375045776367, |
|
"epoch": 0.2737142857142857, |
|
"grad_norm": 0.6971262693405151, |
|
"kl": 1.15380859375, |
|
"learning_rate": 1.0529722834905126e-05, |
|
"loss": 0.2907, |
|
"reward": 0.7373159751296043, |
|
"reward_std": 0.6764814406633377, |
|
"rewards/cosine_scaled_reward": 0.13949130102992058, |
|
"rewards/format_reward": 0.4583333469927311, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2324.8334045410156, |
|
"epoch": 0.2742857142857143, |
|
"grad_norm": 0.17469684779644012, |
|
"kl": 0.61767578125, |
|
"learning_rate": 1.0482745016665526e-05, |
|
"loss": 0.097, |
|
"reward": 0.40352728590369225, |
|
"reward_std": 0.8156778272241354, |
|
"rewards/cosine_scaled_reward": -0.08990303543396294, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2138.0417098999023, |
|
"epoch": 0.27485714285714286, |
|
"grad_norm": 0.292651891708374, |
|
"kl": 1.777099609375, |
|
"learning_rate": 1.0437936906629336e-05, |
|
"loss": 0.2001, |
|
"reward": 0.35360522009432316, |
|
"reward_std": 0.4059207197278738, |
|
"rewards/cosine_scaled_reward": -0.05236405599862337, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2493.6666870117188, |
|
"epoch": 0.2754285714285714, |
|
"grad_norm": 0.4282706081867218, |
|
"kl": 0.57958984375, |
|
"learning_rate": 1.0395300688680626e-05, |
|
"loss": 0.0375, |
|
"reward": 0.8709183055907488, |
|
"reward_std": 0.7465209662914276, |
|
"rewards/cosine_scaled_reward": 0.16462579369544983, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1954.4167175292969, |
|
"epoch": 0.276, |
|
"grad_norm": 0.4806761145591736, |
|
"kl": 1.107421875, |
|
"learning_rate": 1.0354838440848503e-05, |
|
"loss": 0.1812, |
|
"reward": 0.1631168033927679, |
|
"reward_std": 0.6468721106648445, |
|
"rewards/cosine_scaled_reward": -0.12677491828799248, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1815.0000534057617, |
|
"epoch": 0.2765714285714286, |
|
"grad_norm": 0.7567575573921204, |
|
"kl": 1.1226806640625, |
|
"learning_rate": 1.0316552135205838e-05, |
|
"loss": 0.1186, |
|
"reward": 0.24124560877680779, |
|
"reward_std": 0.4737013475969434, |
|
"rewards/cosine_scaled_reward": -0.19187720585614443, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2469.4166870117188, |
|
"epoch": 0.27714285714285714, |
|
"grad_norm": 0.1286521553993225, |
|
"kl": 1.216796875, |
|
"learning_rate": 1.0280443637773165e-05, |
|
"loss": 0.1977, |
|
"reward": 0.19717933982610703, |
|
"reward_std": 0.7559288740158081, |
|
"rewards/cosine_scaled_reward": -0.08891034871339798, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1957.083396911621, |
|
"epoch": 0.2777142857142857, |
|
"grad_norm": 0.1778564304113388, |
|
"kl": 0.53155517578125, |
|
"learning_rate": 1.0246514708427702e-05, |
|
"loss": 0.0623, |
|
"reward": 0.32581200636923313, |
|
"reward_std": 0.43175826454535127, |
|
"rewards/cosine_scaled_reward": -0.14959400426596403, |
|
"rewards/format_reward": 0.6250000037252903, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1288.9166717529297, |
|
"epoch": 0.2782857142857143, |
|
"grad_norm": 0.5399029850959778, |
|
"kl": 0.40069580078125, |
|
"learning_rate": 1.0214767000817597e-05, |
|
"loss": 0.1209, |
|
"reward": 0.9641948640346527, |
|
"reward_std": 0.6063492856919765, |
|
"rewards/cosine_scaled_reward": 0.0862640580162406, |
|
"rewards/format_reward": 0.7916666679084301, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3372.9166870117188, |
|
"epoch": 0.27885714285714286, |
|
"grad_norm": 0.09737123548984528, |
|
"kl": 0.74609375, |
|
"learning_rate": 1.0185202062281336e-05, |
|
"loss": 0.0528, |
|
"reward": -0.11789792403578758, |
|
"reward_std": 0.5601080972701311, |
|
"rewards/cosine_scaled_reward": -0.20478228479623795, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1862.458381652832, |
|
"epoch": 0.2794285714285714, |
|
"grad_norm": 0.1760631799697876, |
|
"kl": 0.479736328125, |
|
"learning_rate": 1.0157821333772305e-05, |
|
"loss": 0.1488, |
|
"reward": 0.6090323962271214, |
|
"reward_std": 0.577803835272789, |
|
"rewards/cosine_scaled_reward": -0.028817158192396164, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1804.5000610351562, |
|
"epoch": 0.28, |
|
"grad_norm": 1.0395656824111938, |
|
"kl": 1.10107421875, |
|
"learning_rate": 1.0132626149788591e-05, |
|
"loss": 0.3023, |
|
"reward": 0.177335936576128, |
|
"reward_std": 0.549927618354559, |
|
"rewards/cosine_scaled_reward": -0.18216537311673164, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2520.666717529297, |
|
"epoch": 0.2805714285714286, |
|
"grad_norm": 0.45748019218444824, |
|
"kl": 0.405517578125, |
|
"learning_rate": 1.0109617738307912e-05, |
|
"loss": 0.2743, |
|
"reward": 0.12348990142345428, |
|
"reward_std": 0.5433773789554834, |
|
"rewards/cosine_scaled_reward": -0.12575505301356316, |
|
"rewards/format_reward": 0.375, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2241.416748046875, |
|
"epoch": 0.28114285714285714, |
|
"grad_norm": 1.151473045349121, |
|
"kl": 1.1015625, |
|
"learning_rate": 1.008879722072778e-05, |
|
"loss": 0.2926, |
|
"reward": 0.06369444611482322, |
|
"reward_std": 0.7478420436382294, |
|
"rewards/cosine_scaled_reward": -0.23898612521588802, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2081.333335876465, |
|
"epoch": 0.2817142857142857, |
|
"grad_norm": 0.27526819705963135, |
|
"kl": 0.312744140625, |
|
"learning_rate": 1.0070165611810856e-05, |
|
"loss": 0.0627, |
|
"reward": 0.5283693410456181, |
|
"reward_std": 0.6566501557826996, |
|
"rewards/cosine_scaled_reward": -0.006648639217019081, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1102.6250381469727, |
|
"epoch": 0.2822857142857143, |
|
"grad_norm": 0.5913681387901306, |
|
"kl": 0.3724365234375, |
|
"learning_rate": 1.0053723819635471e-05, |
|
"loss": 0.0625, |
|
"reward": 1.7323738783597946, |
|
"reward_std": 0.44539252668619156, |
|
"rewards/cosine_scaled_reward": 0.36618690751492977, |
|
"rewards/format_reward": 1.0, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.2916870117188, |
|
"epoch": 0.28285714285714286, |
|
"grad_norm": 0.32760554552078247, |
|
"kl": 0.64306640625, |
|
"learning_rate": 1.0039472645551373e-05, |
|
"loss": 0.0609, |
|
"reward": -0.2853744365274906, |
|
"reward_std": 0.2332111056894064, |
|
"rewards/cosine_scaled_reward": -0.246853890363127, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1245.208366394043, |
|
"epoch": 0.2834285714285714, |
|
"grad_norm": 0.2870532274246216, |
|
"kl": 0.65533447265625, |
|
"learning_rate": 1.0027412784140691e-05, |
|
"loss": 0.0608, |
|
"reward": 0.566844031214714, |
|
"reward_std": 0.43196946009993553, |
|
"rewards/cosine_scaled_reward": -0.07074464811012149, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2885.2500610351562, |
|
"epoch": 0.284, |
|
"grad_norm": 0.5022923350334167, |
|
"kl": 0.97265625, |
|
"learning_rate": 1.0017544823184056e-05, |
|
"loss": 0.2029, |
|
"reward": -0.30719401501119137, |
|
"reward_std": 0.604552611708641, |
|
"rewards/cosine_scaled_reward": -0.32026369124650955, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2770.5833587646484, |
|
"epoch": 0.2845714285714286, |
|
"grad_norm": 0.0918683186173439, |
|
"kl": 0.5137939453125, |
|
"learning_rate": 1.0009869243631953e-05, |
|
"loss": 0.0071, |
|
"reward": 0.11931697279214859, |
|
"reward_std": 0.7490174844861031, |
|
"rewards/cosine_scaled_reward": -0.1278415024280548, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1951.8750305175781, |
|
"epoch": 0.28514285714285714, |
|
"grad_norm": 0.9043524265289307, |
|
"kl": 0.7900390625, |
|
"learning_rate": 1.000438641958131e-05, |
|
"loss": 0.343, |
|
"reward": 0.27463794499635696, |
|
"reward_std": 0.7951376140117645, |
|
"rewards/cosine_scaled_reward": -0.1960143893957138, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1526.2916870117188, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.7896002531051636, |
|
"kl": 1.0877685546875, |
|
"learning_rate": 1.0001096618257236e-05, |
|
"loss": 0.0515, |
|
"reward": 0.6481724679470062, |
|
"reward_std": 0.6312408894300461, |
|
"rewards/cosine_scaled_reward": -0.07174711301922798, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.09381090864279394, |
|
"train_runtime": 25053.7653, |
|
"train_samples_per_second": 0.479, |
|
"train_steps_per_second": 0.02 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|