OpenRS-RLoRA-LoftQ-R32-4 / trainer_state.json
colinpannikkat's picture
Model save
529bd45 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2857142857142857,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 3068.5000610351562,
"epoch": 0.0005714285714285715,
"grad_norm": 0.013173403218388557,
"kl": 0.0005006790161132812,
"learning_rate": 0.0,
"loss": -0.0242,
"reward": 0.200983926653862,
"reward_std": 0.24425111338496208,
"rewards/cosine_scaled_reward": -0.0453413650393486,
"rewards/format_reward": 0.2916666679084301,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2930.9583740234375,
"epoch": 0.001142857142857143,
"grad_norm": 0.043274302035570145,
"kl": 0.0003731250762939453,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2092,
"reward": -0.28063105791807175,
"reward_std": 0.29903180059045553,
"rewards/cosine_scaled_reward": -0.28614887595176697,
"rewards/format_reward": 0.291666679084301,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2402.4584045410156,
"epoch": 0.0017142857142857142,
"grad_norm": 0.027086207643151283,
"kl": 0.0004477500915527344,
"learning_rate": 4.000000000000001e-06,
"loss": 0.1282,
"reward": 0.3940318487584591,
"reward_std": 0.7995570376515388,
"rewards/cosine_scaled_reward": -0.0946507640182972,
"rewards/format_reward": 0.5833333507180214,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2621.1666870117188,
"epoch": 0.002285714285714286,
"grad_norm": 0.017406703904271126,
"kl": 0.00045299530029296875,
"learning_rate": 6e-06,
"loss": 0.0693,
"reward": 0.26569247245788574,
"reward_std": 0.6448228061199188,
"rewards/cosine_scaled_reward": -0.13798709958791733,
"rewards/format_reward": 0.541666679084301,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2960.4583435058594,
"epoch": 0.002857142857142857,
"grad_norm": 0.03813392296433449,
"kl": 0.0005469322204589844,
"learning_rate": 8.000000000000001e-06,
"loss": 0.2296,
"reward": -0.4634598270058632,
"reward_std": 0.31902989000082016,
"rewards/cosine_scaled_reward": -0.3358965665102005,
"rewards/format_reward": 0.2083333432674408,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2929.625,
"epoch": 0.0034285714285714284,
"grad_norm": 0.09496363252401352,
"kl": 0.000408172607421875,
"learning_rate": 1e-05,
"loss": 0.2546,
"reward": -0.23965296894311905,
"reward_std": 0.21214338019490242,
"rewards/cosine_scaled_reward": -0.24482648819684982,
"rewards/format_reward": 0.25,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 2929.125030517578,
"epoch": 0.004,
"grad_norm": 0.021513408049941063,
"kl": 0.0005333423614501953,
"learning_rate": 1.2e-05,
"loss": 0.154,
"reward": 0.022021979675628245,
"reward_std": 0.5914725549519062,
"rewards/cosine_scaled_reward": -0.13482235372066498,
"rewards/format_reward": 0.291666679084301,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 3008.8333435058594,
"epoch": 0.004571428571428572,
"grad_norm": 0.014135723002254963,
"kl": 0.0004029273986816406,
"learning_rate": 1.4000000000000001e-05,
"loss": -0.0025,
"reward": -0.6439514700323343,
"reward_std": 0.10124612040817738,
"rewards/cosine_scaled_reward": -0.4469757154583931,
"rewards/format_reward": 0.25,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2343.9166870117188,
"epoch": 0.005142857142857143,
"grad_norm": 0.022239407524466515,
"kl": 0.0003972053527832031,
"learning_rate": 1.6000000000000003e-05,
"loss": -0.0438,
"reward": 0.5440307706594467,
"reward_std": 0.46696411445736885,
"rewards/cosine_scaled_reward": 0.001182064414024353,
"rewards/format_reward": 0.5416666679084301,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 3273.166748046875,
"epoch": 0.005714285714285714,
"grad_norm": 0.012372066266834736,
"kl": 0.0002987384796142578,
"learning_rate": 1.8e-05,
"loss": 0.044,
"reward": 0.2406943179666996,
"reward_std": 1.128564938902855,
"rewards/cosine_scaled_reward": -0.04631950333714485,
"rewards/format_reward": 0.3333333469927311,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2470.7500610351562,
"epoch": 0.006285714285714286,
"grad_norm": 0.02754840813577175,
"kl": 0.0005016326904296875,
"learning_rate": 2e-05,
"loss": 0.1079,
"reward": 0.0395459933206439,
"reward_std": 0.7187126167118549,
"rewards/cosine_scaled_reward": -0.251060351729393,
"rewards/format_reward": 0.5416666716337204,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2851.4583740234375,
"epoch": 0.006857142857142857,
"grad_norm": 0.032746948301792145,
"kl": 0.0005283355712890625,
"learning_rate": 2.2000000000000003e-05,
"loss": -0.0402,
"reward": 0.5950729995965958,
"reward_std": 0.7282880395650864,
"rewards/cosine_scaled_reward": 0.005869843065738678,
"rewards/format_reward": 0.5833333469927311,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2740.9584350585938,
"epoch": 0.0074285714285714285,
"grad_norm": 0.016786912456154823,
"kl": 0.0004119873046875,
"learning_rate": 2.4e-05,
"loss": 0.062,
"reward": 0.3663429766893387,
"reward_std": 0.7826206907629967,
"rewards/cosine_scaled_reward": -0.0459951926022768,
"rewards/format_reward": 0.4583333395421505,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2583.166778564453,
"epoch": 0.008,
"grad_norm": 0.01807275228202343,
"kl": 0.0003781318664550781,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.0601,
"reward": 0.6402689702808857,
"reward_std": 0.9116464108228683,
"rewards/cosine_scaled_reward": 0.09096781723201275,
"rewards/format_reward": 0.4583333358168602,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2415.166748046875,
"epoch": 0.008571428571428572,
"grad_norm": 0.016410216689109802,
"kl": 0.000415802001953125,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.0876,
"reward": 0.17176377028226852,
"reward_std": 0.5884413756430149,
"rewards/cosine_scaled_reward": -0.20578479953110218,
"rewards/format_reward": 0.5833333432674408,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 2822.416748046875,
"epoch": 0.009142857142857144,
"grad_norm": 0.01920630969107151,
"kl": 0.0003960132598876953,
"learning_rate": 3e-05,
"loss": 0.1401,
"reward": 0.46021055802702904,
"reward_std": 0.6382475309073925,
"rewards/cosine_scaled_reward": 0.0009385794401168823,
"rewards/format_reward": 0.4583333544433117,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2660.875030517578,
"epoch": 0.009714285714285713,
"grad_norm": 0.03571460023522377,
"kl": 0.0005893707275390625,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.2357,
"reward": 0.0063078440725803375,
"reward_std": 0.5483059138059616,
"rewards/cosine_scaled_reward": -0.18434608541429043,
"rewards/format_reward": 0.375,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3228.125,
"epoch": 0.010285714285714285,
"grad_norm": 0.011813213117420673,
"kl": 0.0003924369812011719,
"learning_rate": 3.4000000000000007e-05,
"loss": -0.0546,
"reward": 0.039575088769197464,
"reward_std": 0.8113678842782974,
"rewards/cosine_scaled_reward": -0.12604578211903572,
"rewards/format_reward": 0.2916666679084301,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2784.2916870117188,
"epoch": 0.010857142857142857,
"grad_norm": 0.01843290589749813,
"kl": 0.0005035400390625,
"learning_rate": 3.6e-05,
"loss": -0.0056,
"reward": 0.0836619883775711,
"reward_std": 0.4685993976891041,
"rewards/cosine_scaled_reward": -0.16650232672691345,
"rewards/format_reward": 0.4166666716337204,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1598.4583587646484,
"epoch": 0.011428571428571429,
"grad_norm": 0.02082478627562523,
"kl": 0.0005908012390136719,
"learning_rate": 3.8e-05,
"loss": 0.115,
"reward": 0.6086829677224159,
"reward_std": 0.7450617477297783,
"rewards/cosine_scaled_reward": -0.09149187058210373,
"rewards/format_reward": 0.7916666679084301,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1688.2500305175781,
"epoch": 0.012,
"grad_norm": 0.06570505350828171,
"kl": 0.0012292861938476562,
"learning_rate": 4e-05,
"loss": 0.2063,
"reward": 0.12941228225827217,
"reward_std": 0.2382342591881752,
"rewards/cosine_scaled_reward": -0.26862720027565956,
"rewards/format_reward": 0.6666666716337204,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 2881.0416870117188,
"epoch": 0.012571428571428572,
"grad_norm": 0.01590500958263874,
"kl": 0.0003497600555419922,
"learning_rate": 4.2e-05,
"loss": 0.1198,
"reward": 0.8034647405147552,
"reward_std": 1.4150425791740417,
"rewards/cosine_scaled_reward": 0.15173236466944218,
"rewards/format_reward": 0.5000000186264515,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2695.3333740234375,
"epoch": 0.013142857142857144,
"grad_norm": 0.014568633399903774,
"kl": 0.0006999969482421875,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0306,
"reward": 0.31954750418663025,
"reward_std": 0.7953172847628593,
"rewards/cosine_scaled_reward": -0.09022624790668488,
"rewards/format_reward": 0.5000000037252903,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2814.2083740234375,
"epoch": 0.013714285714285714,
"grad_norm": 0.03579765558242798,
"kl": 0.0009121894836425781,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0315,
"reward": 0.5563870035111904,
"reward_std": 0.46287257969379425,
"rewards/cosine_scaled_reward": -0.013473168015480042,
"rewards/format_reward": 0.5833333432674408,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1512.7083587646484,
"epoch": 0.014285714285714285,
"grad_norm": 0.023476263508200645,
"kl": 0.0010466575622558594,
"learning_rate": 4.8e-05,
"loss": 0.0897,
"reward": 0.7631387338042259,
"reward_std": 0.3562787361443043,
"rewards/cosine_scaled_reward": 0.006569338962435722,
"rewards/format_reward": 0.75,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2746.4583740234375,
"epoch": 0.014857142857142857,
"grad_norm": 0.015517197549343109,
"kl": 0.0005793571472167969,
"learning_rate": 5e-05,
"loss": 0.1146,
"reward": 0.2103739231824875,
"reward_std": 0.9847119301557541,
"rewards/cosine_scaled_reward": -0.08231302350759506,
"rewards/format_reward": 0.3750000037252903,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3263.041748046875,
"epoch": 0.015428571428571429,
"grad_norm": 0.019443849101662636,
"kl": 0.0005698204040527344,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.0483,
"reward": -0.21550959814339876,
"reward_std": 0.47869637608528137,
"rewards/cosine_scaled_reward": -0.2119214665144682,
"rewards/format_reward": 0.2083333358168602,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2655.8750915527344,
"epoch": 0.016,
"grad_norm": 0.02436124160885811,
"kl": 0.00157928466796875,
"learning_rate": 5.4000000000000005e-05,
"loss": -0.0163,
"reward": 0.45750702917575836,
"reward_std": 0.7698620781302452,
"rewards/cosine_scaled_reward": -0.06291317194700241,
"rewards/format_reward": 0.5833333432674408,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2933.5,
"epoch": 0.01657142857142857,
"grad_norm": 0.014681576751172543,
"kl": 0.0005736351013183594,
"learning_rate": 5.6000000000000006e-05,
"loss": -0.014,
"reward": 0.10475081205368042,
"reward_std": 0.3902103342115879,
"rewards/cosine_scaled_reward": -0.07262461073696613,
"rewards/format_reward": 0.25,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.017142857142857144,
"grad_norm": 0.011684375815093517,
"kl": 0.0007815361022949219,
"learning_rate": 5.8e-05,
"loss": 0.0,
"reward": -0.3875264674425125,
"reward_std": 0.43698976188898087,
"rewards/cosine_scaled_reward": -0.21459656301885843,
"rewards/format_reward": 0.0416666679084301,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2217.291717529297,
"epoch": 0.017714285714285714,
"grad_norm": 0.036056023091077805,
"kl": 0.0011777877807617188,
"learning_rate": 6e-05,
"loss": 0.2175,
"reward": 0.9864542707800865,
"reward_std": 0.9503698498010635,
"rewards/cosine_scaled_reward": 0.18072709161788225,
"rewards/format_reward": 0.6250000111758709,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2517.25,
"epoch": 0.018285714285714287,
"grad_norm": 0.02313651144504547,
"kl": 0.0033006668090820312,
"learning_rate": 6.2e-05,
"loss": -0.1335,
"reward": 0.9981129616498947,
"reward_std": 0.3932240381836891,
"rewards/cosine_scaled_reward": 0.20738982781767845,
"rewards/format_reward": 0.5833333358168602,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.018857142857142857,
"grad_norm": 0.012460554018616676,
"kl": 0.0009255409240722656,
"learning_rate": 6.400000000000001e-05,
"loss": 0.0,
"reward": -0.6963627487421036,
"reward_std": 0.18985886126756668,
"rewards/cosine_scaled_reward": -0.3481813669204712,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 3006.1250610351562,
"epoch": 0.019428571428571427,
"grad_norm": 0.01616235077381134,
"kl": 0.0015659332275390625,
"learning_rate": 6.6e-05,
"loss": 0.0083,
"reward": 0.1573825404047966,
"reward_std": 0.7217601127922535,
"rewards/cosine_scaled_reward": -0.1296420693397522,
"rewards/format_reward": 0.4166666716337204,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3014.7083435058594,
"epoch": 0.02,
"grad_norm": 0.027833612635731697,
"kl": 0.0010890960693359375,
"learning_rate": 6.800000000000001e-05,
"loss": 0.1738,
"reward": -0.5526116490364075,
"reward_std": 0.2741971779614687,
"rewards/cosine_scaled_reward": -0.38047249615192413,
"rewards/format_reward": 0.2083333432674408,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2914.1250610351562,
"epoch": 0.02057142857142857,
"grad_norm": 0.023610979318618774,
"kl": 0.001861572265625,
"learning_rate": 7e-05,
"loss": 0.2468,
"reward": 0.184011185541749,
"reward_std": 0.8658745139837265,
"rewards/cosine_scaled_reward": -0.07466107979416847,
"rewards/format_reward": 0.3333333358168602,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3503.5416870117188,
"epoch": 0.021142857142857144,
"grad_norm": 0.010844520293176174,
"kl": 0.0012454986572265625,
"learning_rate": 7.2e-05,
"loss": 0.0386,
"reward": -0.049630362540483475,
"reward_std": 0.8300461787730455,
"rewards/cosine_scaled_reward": -0.08731517381966114,
"rewards/format_reward": 0.1250000037252903,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3236.1250610351562,
"epoch": 0.021714285714285714,
"grad_norm": 0.013169029727578163,
"kl": 0.0030126571655273438,
"learning_rate": 7.4e-05,
"loss": 0.0504,
"reward": -0.19106721878051758,
"reward_std": 0.9097858145833015,
"rewards/cosine_scaled_reward": -0.241366945207119,
"rewards/format_reward": 0.291666679084301,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1753.4167175292969,
"epoch": 0.022285714285714287,
"grad_norm": 0.04312436282634735,
"kl": 0.0025959014892578125,
"learning_rate": 7.6e-05,
"loss": -0.2639,
"reward": 0.9211674332618713,
"reward_std": 0.7600763663649559,
"rewards/cosine_scaled_reward": 0.04391703475266695,
"rewards/format_reward": 0.8333333358168602,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 3123.0416870117188,
"epoch": 0.022857142857142857,
"grad_norm": 0.015262431465089321,
"kl": 0.001953125,
"learning_rate": 7.800000000000001e-05,
"loss": 0.0771,
"reward": -0.2396223871037364,
"reward_std": 0.45411088317632675,
"rewards/cosine_scaled_reward": -0.28647788520902395,
"rewards/format_reward": 0.3333333432674408,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 3089.75,
"epoch": 0.023428571428571427,
"grad_norm": 0.017734840512275696,
"kl": 0.0012683868408203125,
"learning_rate": 8e-05,
"loss": -0.0673,
"reward": -0.05341312289237976,
"reward_std": 0.23775821551680565,
"rewards/cosine_scaled_reward": -0.15170655399560928,
"rewards/format_reward": 0.25,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2459.5417098999023,
"epoch": 0.024,
"grad_norm": 0.01839015819132328,
"kl": 0.0012769699096679688,
"learning_rate": 8.2e-05,
"loss": 0.0185,
"reward": 0.4259261190891266,
"reward_std": 0.39659431390464306,
"rewards/cosine_scaled_reward": -0.016203660517930984,
"rewards/format_reward": 0.4583333432674408,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3215.0833740234375,
"epoch": 0.02457142857142857,
"grad_norm": 0.021077867597341537,
"kl": 0.0025310516357421875,
"learning_rate": 8.4e-05,
"loss": 0.1354,
"reward": -0.06841922923922539,
"reward_std": 0.7763218209147453,
"rewards/cosine_scaled_reward": -0.13837626948952675,
"rewards/format_reward": 0.2083333358168602,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 3144.9583435058594,
"epoch": 0.025142857142857144,
"grad_norm": 0.014874313957989216,
"kl": 0.0016632080078125,
"learning_rate": 8.6e-05,
"loss": 0.1193,
"reward": -0.1355942115187645,
"reward_std": 0.4834160730242729,
"rewards/cosine_scaled_reward": -0.15113045647740364,
"rewards/format_reward": 0.1666666716337204,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3175.5833740234375,
"epoch": 0.025714285714285714,
"grad_norm": 0.016986120492219925,
"kl": 0.0015850067138671875,
"learning_rate": 8.800000000000001e-05,
"loss": 0.1528,
"reward": -0.4366232231259346,
"reward_std": 0.5260147228837013,
"rewards/cosine_scaled_reward": -0.343311607837677,
"rewards/format_reward": 0.2500000037252903,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 3461.3333740234375,
"epoch": 0.026285714285714287,
"grad_norm": 0.012634013779461384,
"kl": 0.0028676986694335938,
"learning_rate": 9e-05,
"loss": 0.0237,
"reward": -0.06645508855581284,
"reward_std": 1.0512375514954329,
"rewards/cosine_scaled_reward": -0.17906087916344404,
"rewards/format_reward": 0.2916666679084301,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2675.4166717529297,
"epoch": 0.026857142857142857,
"grad_norm": 0.016832780092954636,
"kl": 0.0008749961853027344,
"learning_rate": 9.200000000000001e-05,
"loss": 0.1006,
"reward": 0.34809515066444874,
"reward_std": 0.2990557327866554,
"rewards/cosine_scaled_reward": -0.013452455401420593,
"rewards/format_reward": 0.3750000037252903,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1616.666748046875,
"epoch": 0.027428571428571427,
"grad_norm": 0.02106091007590294,
"kl": 0.0061321258544921875,
"learning_rate": 9.4e-05,
"loss": 0.0979,
"reward": 1.2005126923322678,
"reward_std": 0.6015892848372459,
"rewards/cosine_scaled_reward": 0.20442297495901585,
"rewards/format_reward": 0.7916666679084301,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2405.3333435058594,
"epoch": 0.028,
"grad_norm": 0.02687126025557518,
"kl": 0.009845733642578125,
"learning_rate": 9.6e-05,
"loss": 0.0241,
"reward": 0.5416064560413361,
"reward_std": 0.4324432276189327,
"rewards/cosine_scaled_reward": 0.02080322802066803,
"rewards/format_reward": 0.5,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 3412.2083740234375,
"epoch": 0.02857142857142857,
"grad_norm": 0.013700881972908974,
"kl": 0.00302886962890625,
"learning_rate": 9.8e-05,
"loss": 0.0663,
"reward": -0.06471916288137436,
"reward_std": 0.6379625052213669,
"rewards/cosine_scaled_reward": -0.11569291353225708,
"rewards/format_reward": 0.1666666679084301,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2844.5833435058594,
"epoch": 0.029142857142857144,
"grad_norm": 0.01361958496272564,
"kl": 0.0013275146484375,
"learning_rate": 0.0001,
"loss": 0.0377,
"reward": 0.20726051926612854,
"reward_std": 0.1396642979234457,
"rewards/cosine_scaled_reward": -0.021369755268096924,
"rewards/format_reward": 0.25,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3466.9166870117188,
"epoch": 0.029714285714285714,
"grad_norm": 0.01215248741209507,
"kl": 0.0009393692016601562,
"learning_rate": 9.999890338174276e-05,
"loss": 0.0278,
"reward": -0.11257268488407135,
"reward_std": 0.8305042590945959,
"rewards/cosine_scaled_reward": -0.18128634430468082,
"rewards/format_reward": 0.25,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 3201.0,
"epoch": 0.030285714285714287,
"grad_norm": 0.014383244328200817,
"kl": 0.0026397705078125,
"learning_rate": 9.999561358041869e-05,
"loss": 0.0744,
"reward": 0.2842802293598652,
"reward_std": 0.6465214220806956,
"rewards/cosine_scaled_reward": 0.01714009791612625,
"rewards/format_reward": 0.2500000111758709,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3046.0416870117188,
"epoch": 0.030857142857142857,
"grad_norm": 0.03346686065196991,
"kl": 0.005756378173828125,
"learning_rate": 9.999013075636805e-05,
"loss": -0.0486,
"reward": -0.3102136142551899,
"reward_std": 0.20892462320625782,
"rewards/cosine_scaled_reward": -0.28010681085288525,
"rewards/format_reward": 0.25,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3255.0000610351562,
"epoch": 0.03142857142857143,
"grad_norm": 0.02068556286394596,
"kl": 0.0030732154846191406,
"learning_rate": 9.998245517681595e-05,
"loss": -0.0891,
"reward": 0.15274354815483093,
"reward_std": 0.8250106833875179,
"rewards/cosine_scaled_reward": -0.09029489010572433,
"rewards/format_reward": 0.3333333358168602,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 2653.7084197998047,
"epoch": 0.032,
"grad_norm": 0.03398514911532402,
"kl": 0.0039196014404296875,
"learning_rate": 9.997258721585931e-05,
"loss": 0.028,
"reward": 0.31957897171378136,
"reward_std": 0.8959189355373383,
"rewards/cosine_scaled_reward": -0.06937719509005547,
"rewards/format_reward": 0.4583333358168602,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2849.3750915527344,
"epoch": 0.03257142857142857,
"grad_norm": 0.012109542265534401,
"kl": 0.0045604705810546875,
"learning_rate": 9.996052735444863e-05,
"loss": -0.0042,
"reward": 0.1344001293182373,
"reward_std": 0.45969852432608604,
"rewards/cosine_scaled_reward": -0.12029992416501045,
"rewards/format_reward": 0.3750000037252903,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3181.0833740234375,
"epoch": 0.03314285714285714,
"grad_norm": 0.021295132115483284,
"kl": 0.002410888671875,
"learning_rate": 9.994627618036454e-05,
"loss": 0.1335,
"reward": -0.20644971216097474,
"reward_std": 0.9107521660625935,
"rewards/cosine_scaled_reward": -0.22822486609220505,
"rewards/format_reward": 0.2500000074505806,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 3010.4584350585938,
"epoch": 0.03371428571428572,
"grad_norm": 0.022170057520270348,
"kl": 0.0033636093139648438,
"learning_rate": 9.992983438818914e-05,
"loss": 0.1363,
"reward": 0.24811836518347263,
"reward_std": 1.1761204116046429,
"rewards/cosine_scaled_reward": -0.06344081088900566,
"rewards/format_reward": 0.3750000074505806,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.03428571428571429,
"grad_norm": 0.011233195662498474,
"kl": 0.005175590515136719,
"learning_rate": 9.991120277927223e-05,
"loss": 0.0002,
"reward": -0.3428353890776634,
"reward_std": 0.3860394358634949,
"rewards/cosine_scaled_reward": -0.1714176945388317,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2401.5416870117188,
"epoch": 0.03485714285714286,
"grad_norm": 0.031769707798957825,
"kl": 0.002063751220703125,
"learning_rate": 9.989038226169209e-05,
"loss": 0.2605,
"reward": 0.41955555975437164,
"reward_std": 0.7642397582530975,
"rewards/cosine_scaled_reward": 0.0014444328844547272,
"rewards/format_reward": 0.4166666716337204,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2725.625,
"epoch": 0.03542857142857143,
"grad_norm": 0.018359214067459106,
"kl": 0.002758026123046875,
"learning_rate": 9.986737385021142e-05,
"loss": 0.0482,
"reward": -0.2772537413984537,
"reward_std": 0.23343749158084393,
"rewards/cosine_scaled_reward": -0.3469602093100548,
"rewards/format_reward": 0.4166666716337204,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3515.2500610351562,
"epoch": 0.036,
"grad_norm": 0.017092403024435043,
"kl": 0.0034360885620117188,
"learning_rate": 9.98421786662277e-05,
"loss": 0.0286,
"reward": 0.15998955629765987,
"reward_std": 0.7633470920845866,
"rewards/cosine_scaled_reward": 0.017494780011475086,
"rewards/format_reward": 0.1250000037252903,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 3171.2916870117188,
"epoch": 0.036571428571428574,
"grad_norm": 0.025074612349271774,
"kl": 0.0031719207763671875,
"learning_rate": 9.981479793771866e-05,
"loss": 0.0007,
"reward": -0.23104018531739712,
"reward_std": 0.39066051598638296,
"rewards/cosine_scaled_reward": -0.21968677546828985,
"rewards/format_reward": 0.2083333432674408,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3346.8750610351562,
"epoch": 0.037142857142857144,
"grad_norm": 0.01631852798163891,
"kl": 0.003975868225097656,
"learning_rate": 9.97852329991824e-05,
"loss": 0.0989,
"reward": -0.2601096876896918,
"reward_std": 0.43978679180145264,
"rewards/cosine_scaled_reward": -0.19255484640598297,
"rewards/format_reward": 0.1250000037252903,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2645.6251220703125,
"epoch": 0.037714285714285714,
"grad_norm": 0.027915285900235176,
"kl": 0.0030145645141601562,
"learning_rate": 9.97534852915723e-05,
"loss": 0.096,
"reward": 0.6423284709453583,
"reward_std": 1.0060622096061707,
"rewards/cosine_scaled_reward": -0.012169107794761658,
"rewards/format_reward": 0.6666666716337204,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2729.0833587646484,
"epoch": 0.038285714285714284,
"grad_norm": 0.023593856021761894,
"kl": 0.0039539337158203125,
"learning_rate": 9.971955636222684e-05,
"loss": -0.0287,
"reward": 0.26382073760032654,
"reward_std": 0.7153124492615461,
"rewards/cosine_scaled_reward": -0.07642300426959991,
"rewards/format_reward": 0.4166666716337204,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.038857142857142854,
"grad_norm": 0.009784531779587269,
"kl": 0.0015583038330078125,
"learning_rate": 9.968344786479416e-05,
"loss": 0.0001,
"reward": -0.4307379722595215,
"reward_std": 0.28678057435899973,
"rewards/cosine_scaled_reward": -0.21536898985505104,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2913.1250610351562,
"epoch": 0.03942857142857143,
"grad_norm": 0.08137527108192444,
"kl": 0.0056610107421875,
"learning_rate": 9.964516155915151e-05,
"loss": 0.0838,
"reward": 0.05851218104362488,
"reward_std": 0.8442177847027779,
"rewards/cosine_scaled_reward": -0.1790772434324026,
"rewards/format_reward": 0.4166666679084301,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2989.6666870117188,
"epoch": 0.04,
"grad_norm": 0.03494952991604805,
"kl": 0.008270263671875,
"learning_rate": 9.960469931131939e-05,
"loss": 0.2326,
"reward": 0.0155078349635005,
"reward_std": 0.7885254546999931,
"rewards/cosine_scaled_reward": -0.1380794234573841,
"rewards/format_reward": 0.291666679084301,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2028.7917022705078,
"epoch": 0.04057142857142857,
"grad_norm": 0.037250399589538574,
"kl": 0.00527191162109375,
"learning_rate": 9.956206309337068e-05,
"loss": 0.0821,
"reward": 0.9818168096244335,
"reward_std": 0.5084632188081741,
"rewards/cosine_scaled_reward": 0.1575750857591629,
"rewards/format_reward": 0.6666666716337204,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 3102.7083435058594,
"epoch": 0.04114285714285714,
"grad_norm": 0.01361783780157566,
"kl": 0.006103515625,
"learning_rate": 9.951725498333448e-05,
"loss": -0.0301,
"reward": 0.15624341368675232,
"reward_std": 0.18956233747303486,
"rewards/cosine_scaled_reward": -0.04687829315662384,
"rewards/format_reward": 0.25,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2701.8333587646484,
"epoch": 0.04171428571428572,
"grad_norm": 0.01851752959191799,
"kl": 0.006786346435546875,
"learning_rate": 9.947027716509488e-05,
"loss": 0.158,
"reward": 0.3348941504955292,
"reward_std": 0.7048965748399496,
"rewards/cosine_scaled_reward": -0.12421960011124611,
"rewards/format_reward": 0.5833333432674408,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 3003.041717529297,
"epoch": 0.04228571428571429,
"grad_norm": 0.018196003511548042,
"kl": 0.005214691162109375,
"learning_rate": 9.942113192828445e-05,
"loss": 0.091,
"reward": 0.34735431522130966,
"reward_std": 0.8543844074010849,
"rewards/cosine_scaled_reward": -0.013822849839925766,
"rewards/format_reward": 0.3750000111758709,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 3122.8334350585938,
"epoch": 0.04285714285714286,
"grad_norm": 0.015949510037899017,
"kl": 0.00733184814453125,
"learning_rate": 9.936982166817273e-05,
"loss": 0.1108,
"reward": 0.1362277865409851,
"reward_std": 0.9558060020208359,
"rewards/cosine_scaled_reward": -0.14021944627165794,
"rewards/format_reward": 0.4166666753590107,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3223.375,
"epoch": 0.04342857142857143,
"grad_norm": 0.014545021578669548,
"kl": 0.011676788330078125,
"learning_rate": 9.931634888554937e-05,
"loss": -0.0729,
"reward": -0.3659635931253433,
"reward_std": 0.43001827597618103,
"rewards/cosine_scaled_reward": -0.2663151305168867,
"rewards/format_reward": 0.1666666716337204,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 2802.0000610351562,
"epoch": 0.044,
"grad_norm": 0.05182776227593422,
"kl": 0.013092041015625,
"learning_rate": 9.926071618660238e-05,
"loss": 0.2806,
"reward": -0.06057482771575451,
"reward_std": 0.6179361715912819,
"rewards/cosine_scaled_reward": -0.19695409014821053,
"rewards/format_reward": 0.3333333469927311,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 3281.7083740234375,
"epoch": 0.044571428571428574,
"grad_norm": 0.029229266569018364,
"kl": 0.00809478759765625,
"learning_rate": 9.920292628279099e-05,
"loss": 0.0812,
"reward": -0.0839165486395359,
"reward_std": 0.9277310892939568,
"rewards/cosine_scaled_reward": -0.16695828922092915,
"rewards/format_reward": 0.2500000037252903,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 3018.125,
"epoch": 0.045142857142857144,
"grad_norm": 0.014639424160122871,
"kl": 0.02259063720703125,
"learning_rate": 9.914298199071362e-05,
"loss": -0.0578,
"reward": 0.25495412945747375,
"reward_std": 0.8909239619970322,
"rewards/cosine_scaled_reward": -0.03918960690498352,
"rewards/format_reward": 0.3333333358168602,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3345.375,
"epoch": 0.045714285714285714,
"grad_norm": 0.01726081222295761,
"kl": 0.0069427490234375,
"learning_rate": 9.908088623197048e-05,
"loss": 0.0662,
"reward": -0.11315303295850754,
"reward_std": 0.4808522164821625,
"rewards/cosine_scaled_reward": -0.20240984484553337,
"rewards/format_reward": 0.2916666716337204,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3315.2916870117188,
"epoch": 0.046285714285714284,
"grad_norm": 0.013985719531774521,
"kl": 0.00591278076171875,
"learning_rate": 9.901664203302126e-05,
"loss": 0.0775,
"reward": 0.5611200910061598,
"reward_std": 0.5030479682609439,
"rewards/cosine_scaled_reward": 0.11389338690787554,
"rewards/format_reward": 0.3333333469927311,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 3138.1666870117188,
"epoch": 0.046857142857142854,
"grad_norm": 0.015917882323265076,
"kl": 0.00797271728515625,
"learning_rate": 9.895025252503756e-05,
"loss": -0.0315,
"reward": 0.07941232621669769,
"reward_std": 0.241426233202219,
"rewards/cosine_scaled_reward": -0.08529385924339294,
"rewards/format_reward": 0.25,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1649.9167175292969,
"epoch": 0.04742857142857143,
"grad_norm": 0.20445036888122559,
"kl": 0.146575927734375,
"learning_rate": 9.888172094375034e-05,
"loss": 0.1238,
"reward": 1.3232550099492073,
"reward_std": 0.27204739674925804,
"rewards/cosine_scaled_reward": 0.28662747144699097,
"rewards/format_reward": 0.75,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2547.8333740234375,
"epoch": 0.048,
"grad_norm": 0.055599115788936615,
"kl": 0.03204345703125,
"learning_rate": 9.881105062929221e-05,
"loss": 0.03,
"reward": 0.06362025253474712,
"reward_std": 0.6657480411231518,
"rewards/cosine_scaled_reward": -0.2181899007409811,
"rewards/format_reward": 0.5000000111758709,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2876.8333587646484,
"epoch": 0.04857142857142857,
"grad_norm": 0.015016643330454826,
"kl": 0.0128631591796875,
"learning_rate": 9.87382450260346e-05,
"loss": 0.0261,
"reward": -0.039747525937855244,
"reward_std": 0.38030122220516205,
"rewards/cosine_scaled_reward": -0.1657070992514491,
"rewards/format_reward": 0.2916666679084301,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 3386.7501220703125,
"epoch": 0.04914285714285714,
"grad_norm": 0.014947664923965931,
"kl": 0.01012420654296875,
"learning_rate": 9.866330768241984e-05,
"loss": 0.0714,
"reward": 0.804126501083374,
"reward_std": 1.6738584637641907,
"rewards/cosine_scaled_reward": 0.1728965789079666,
"rewards/format_reward": 0.4583333358168602,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 3348.625,
"epoch": 0.04971428571428571,
"grad_norm": 0.012593540363013744,
"kl": 0.0147857666015625,
"learning_rate": 9.858624225078841e-05,
"loss": 0.0384,
"reward": 0.16258125752210617,
"reward_std": 0.6240727119147778,
"rewards/cosine_scaled_reward": -0.043709371238946915,
"rewards/format_reward": 0.2500000111758709,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2490.041732788086,
"epoch": 0.05028571428571429,
"grad_norm": 0.04205311834812164,
"kl": 0.0225677490234375,
"learning_rate": 9.850705248720069e-05,
"loss": 0.2146,
"reward": -0.07761363685131073,
"reward_std": 0.6357230395078659,
"rewards/cosine_scaled_reward": -0.26797350379638374,
"rewards/format_reward": 0.4583333395421505,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2835.8333740234375,
"epoch": 0.05085714285714286,
"grad_norm": 0.03906717151403427,
"kl": 0.016937255859375,
"learning_rate": 9.842574225125401e-05,
"loss": 0.1663,
"reward": 0.05134725570678711,
"reward_std": 0.8167771622538567,
"rewards/cosine_scaled_reward": -0.1618263851851225,
"rewards/format_reward": 0.3750000111758709,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2103.2084350585938,
"epoch": 0.05142857142857143,
"grad_norm": 0.019287196919322014,
"kl": 0.00624847412109375,
"learning_rate": 9.834231550589462e-05,
"loss": 0.1045,
"reward": 1.3975676447153091,
"reward_std": 1.155922506004572,
"rewards/cosine_scaled_reward": 0.28211713768541813,
"rewards/format_reward": 0.8333333432674408,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3305.3333740234375,
"epoch": 0.052,
"grad_norm": 0.016830939799547195,
"kl": 0.01747894287109375,
"learning_rate": 9.825677631722435e-05,
"loss": 0.0694,
"reward": 0.1266888901591301,
"reward_std": 0.7699784189462662,
"rewards/cosine_scaled_reward": -0.08248887583613396,
"rewards/format_reward": 0.2916666716337204,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2862.5,
"epoch": 0.052571428571428575,
"grad_norm": 0.019136304035782814,
"kl": 0.0141448974609375,
"learning_rate": 9.816912885430258e-05,
"loss": 0.0305,
"reward": 0.02380932867527008,
"reward_std": 0.44232284277677536,
"rewards/cosine_scaled_reward": -0.11309535056352615,
"rewards/format_reward": 0.25,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2409.75,
"epoch": 0.053142857142857144,
"grad_norm": 0.015891728922724724,
"kl": 0.014984130859375,
"learning_rate": 9.807937738894303e-05,
"loss": 0.0508,
"reward": 0.3022146672010422,
"reward_std": 0.6315838098526001,
"rewards/cosine_scaled_reward": -0.0988927073776722,
"rewards/format_reward": 0.5,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 3115.666748046875,
"epoch": 0.053714285714285714,
"grad_norm": 0.018753819167613983,
"kl": 0.010223388671875,
"learning_rate": 9.798752629550546e-05,
"loss": 0.0866,
"reward": 0.1527155190706253,
"reward_std": 0.8529080599546432,
"rewards/cosine_scaled_reward": -0.11114224418997765,
"rewards/format_reward": 0.3750000111758709,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2407.5833740234375,
"epoch": 0.054285714285714284,
"grad_norm": 0.033838678151369095,
"kl": 0.01202392578125,
"learning_rate": 9.789358005068262e-05,
"loss": 0.1269,
"reward": 0.42500831093639135,
"reward_std": 0.9461558535695076,
"rewards/cosine_scaled_reward": -0.05832919664680958,
"rewards/format_reward": 0.5416666716337204,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3434.2916870117188,
"epoch": 0.054857142857142854,
"grad_norm": 0.012105435132980347,
"kl": 0.023162841796875,
"learning_rate": 9.779754323328192e-05,
"loss": 0.0246,
"reward": -0.11048224568367004,
"reward_std": 0.7371259145438671,
"rewards/cosine_scaled_reward": -0.13857445027679205,
"rewards/format_reward": 0.1666666716337204,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 3034.5416870117188,
"epoch": 0.05542857142857143,
"grad_norm": 0.025039825588464737,
"kl": 0.02886962890625,
"learning_rate": 9.769942052400235e-05,
"loss": 0.1097,
"reward": -0.30599308758974075,
"reward_std": 0.44477224349975586,
"rewards/cosine_scaled_reward": -0.3196632117033005,
"rewards/format_reward": 0.3333333432674408,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1659.4166717529297,
"epoch": 0.056,
"grad_norm": 0.01933540217578411,
"kl": 0.01055145263671875,
"learning_rate": 9.759921670520634e-05,
"loss": -0.0341,
"reward": 1.2363078743219376,
"reward_std": 1.0749643743038177,
"rewards/cosine_scaled_reward": 0.22232059203088284,
"rewards/format_reward": 0.7916666679084301,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2495.750030517578,
"epoch": 0.05657142857142857,
"grad_norm": 0.04390771687030792,
"kl": 0.01910400390625,
"learning_rate": 9.749693666068664e-05,
"loss": 0.2446,
"reward": 0.6843666434288025,
"reward_std": 1.0362943559885025,
"rewards/cosine_scaled_reward": 0.13385000079870224,
"rewards/format_reward": 0.4166666716337204,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2179.5833740234375,
"epoch": 0.05714285714285714,
"grad_norm": 0.021462595090270042,
"kl": 0.02783203125,
"learning_rate": 9.739258537542835e-05,
"loss": 0.0284,
"reward": 0.4209151156246662,
"reward_std": 0.6625542566180229,
"rewards/cosine_scaled_reward": -0.06037578295217827,
"rewards/format_reward": 0.5416666679084301,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 3205.5,
"epoch": 0.05771428571428571,
"grad_norm": 0.024794792756438255,
"kl": 0.028961181640625,
"learning_rate": 9.728616793536588e-05,
"loss": 0.1078,
"reward": -0.20108163356781006,
"reward_std": 0.7786883413791656,
"rewards/cosine_scaled_reward": -0.22554081678390503,
"rewards/format_reward": 0.2500000037252903,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 3427.8333740234375,
"epoch": 0.05828571428571429,
"grad_norm": 0.013528184965252876,
"kl": 0.0247802734375,
"learning_rate": 9.717768952713513e-05,
"loss": 0.0313,
"reward": -0.5099735148251057,
"reward_std": 0.49693995993584394,
"rewards/cosine_scaled_reward": -0.29665342532098293,
"rewards/format_reward": 0.0833333358168602,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 3009.0000610351562,
"epoch": 0.05885714285714286,
"grad_norm": 0.06049313023686409,
"kl": 0.0323486328125,
"learning_rate": 9.706715543782064e-05,
"loss": 0.2486,
"reward": -0.3947155475616455,
"reward_std": 0.49045583233237267,
"rewards/cosine_scaled_reward": -0.32235776633024216,
"rewards/format_reward": 0.2500000037252903,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3463.875,
"epoch": 0.05942857142857143,
"grad_norm": 0.024880580604076385,
"kl": 0.041290283203125,
"learning_rate": 9.695457105469806e-05,
"loss": 0.0448,
"reward": -0.31599661335349083,
"reward_std": 0.5560531467199326,
"rewards/cosine_scaled_reward": -0.19966497272253036,
"rewards/format_reward": 0.0833333358168602,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 3122.7083740234375,
"epoch": 0.06,
"grad_norm": 0.021345539018511772,
"kl": 0.037017822265625,
"learning_rate": 9.683994186497132e-05,
"loss": 0.0791,
"reward": -0.42670758813619614,
"reward_std": 0.3747115731239319,
"rewards/cosine_scaled_reward": -0.33835379779338837,
"rewards/format_reward": 0.2500000111758709,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2970.7500610351562,
"epoch": 0.060571428571428575,
"grad_norm": 0.07319632917642593,
"kl": 0.04351806640625,
"learning_rate": 9.672327345550543e-05,
"loss": -0.0454,
"reward": -0.012075750157237053,
"reward_std": 0.6432481557130814,
"rewards/cosine_scaled_reward": -0.1727045476436615,
"rewards/format_reward": 0.3333333358168602,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2791.0833587646484,
"epoch": 0.061142857142857145,
"grad_norm": 0.022539552301168442,
"kl": 0.0341339111328125,
"learning_rate": 9.66045715125541e-05,
"loss": -0.0835,
"reward": -0.19219213724136353,
"reward_std": 0.35477501153945923,
"rewards/cosine_scaled_reward": -0.26276274397969246,
"rewards/format_reward": 0.3333333358168602,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2680.8750610351562,
"epoch": 0.061714285714285715,
"grad_norm": 0.04677839204668999,
"kl": 0.0345611572265625,
"learning_rate": 9.648384182148252e-05,
"loss": 0.1531,
"reward": -0.10541854053735733,
"reward_std": 0.3359173368662596,
"rewards/cosine_scaled_reward": -0.26104260981082916,
"rewards/format_reward": 0.4166666865348816,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 2972.1666870117188,
"epoch": 0.062285714285714285,
"grad_norm": 0.09794782847166061,
"kl": 0.041717529296875,
"learning_rate": 9.636109026648555e-05,
"loss": 0.1218,
"reward": -0.25983521342277527,
"reward_std": 0.6902979081496596,
"rewards/cosine_scaled_reward": -0.2549176011234522,
"rewards/format_reward": 0.2500000111758709,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2938.0000610351562,
"epoch": 0.06285714285714286,
"grad_norm": 0.05397583171725273,
"kl": 0.044769287109375,
"learning_rate": 9.623632283030079e-05,
"loss": 0.1955,
"reward": -0.3282480388879776,
"reward_std": 0.45391279086470604,
"rewards/cosine_scaled_reward": -0.3099573701620102,
"rewards/format_reward": 0.291666679084301,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2632.5416870117188,
"epoch": 0.06342857142857143,
"grad_norm": 0.03481624647974968,
"kl": 0.0413818359375,
"learning_rate": 9.610954559391703e-05,
"loss": 0.0666,
"reward": -0.10429460182785988,
"reward_std": 0.4938749596476555,
"rewards/cosine_scaled_reward": -0.2188139706850052,
"rewards/format_reward": 0.3333333358168602,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 3193.0,
"epoch": 0.064,
"grad_norm": 0.04939614608883858,
"kl": 0.0479736328125,
"learning_rate": 9.598076473627798e-05,
"loss": 0.1431,
"reward": 0.014449171721935272,
"reward_std": 0.7652652338147163,
"rewards/cosine_scaled_reward": -0.15944208949804306,
"rewards/format_reward": 0.3333333395421505,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 3056.7916870117188,
"epoch": 0.06457142857142857,
"grad_norm": 0.023673003539443016,
"kl": 0.062255859375,
"learning_rate": 9.58499865339809e-05,
"loss": 0.0594,
"reward": -0.11021074652671814,
"reward_std": 0.6375727728009224,
"rewards/cosine_scaled_reward": -0.22177204675972462,
"rewards/format_reward": 0.3333333469927311,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 2781.5833435058594,
"epoch": 0.06514285714285714,
"grad_norm": 0.023715665563941002,
"kl": 0.0440216064453125,
"learning_rate": 9.571721736097089e-05,
"loss": 0.1791,
"reward": 0.14624399319291115,
"reward_std": 0.6526099145412445,
"rewards/cosine_scaled_reward": -0.17687800526618958,
"rewards/format_reward": 0.5000000074505806,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2856.5833740234375,
"epoch": 0.06571428571428571,
"grad_norm": 0.030268969014286995,
"kl": 0.0810546875,
"learning_rate": 9.558246368823013e-05,
"loss": 0.0175,
"reward": -0.05208939407020807,
"reward_std": 0.41136957705020905,
"rewards/cosine_scaled_reward": -0.192711366340518,
"rewards/format_reward": 0.3333333358168602,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 2494.666748046875,
"epoch": 0.06628571428571428,
"grad_norm": 0.12291796505451202,
"kl": 0.063690185546875,
"learning_rate": 9.544573208346253e-05,
"loss": 0.2623,
"reward": -0.11325077712535858,
"reward_std": 0.706281989812851,
"rewards/cosine_scaled_reward": -0.3066254146397114,
"rewards/format_reward": 0.5000000074505806,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 3124.9166870117188,
"epoch": 0.06685714285714285,
"grad_norm": 0.021119866520166397,
"kl": 0.07550048828125,
"learning_rate": 9.530702921077358e-05,
"loss": -0.0377,
"reward": 0.1393863447010517,
"reward_std": 0.1535858940333128,
"rewards/cosine_scaled_reward": -0.05530684255063534,
"rewards/format_reward": 0.25,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2799.3333587646484,
"epoch": 0.06742857142857143,
"grad_norm": 0.04394465312361717,
"kl": 0.0882568359375,
"learning_rate": 9.516636183034565e-05,
"loss": 0.0588,
"reward": -0.04012691602110863,
"reward_std": 0.7108660526573658,
"rewards/cosine_scaled_reward": -0.18673011288046837,
"rewards/format_reward": 0.3333333358168602,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2137.750015258789,
"epoch": 0.068,
"grad_norm": 0.02675880491733551,
"kl": 0.09320068359375,
"learning_rate": 9.50237367981084e-05,
"loss": 0.0715,
"reward": 0.5401680022478104,
"reward_std": 0.37584915198385715,
"rewards/cosine_scaled_reward": 0.02008398249745369,
"rewards/format_reward": 0.5,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 3519.0,
"epoch": 0.06857142857142857,
"grad_norm": 0.027520829811692238,
"kl": 0.113037109375,
"learning_rate": 9.487916106540466e-05,
"loss": 0.0444,
"reward": -0.6826352626085281,
"reward_std": 0.5236286884173751,
"rewards/cosine_scaled_reward": -0.36215095594525337,
"rewards/format_reward": 0.0416666679084301,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 3382.125,
"epoch": 0.06914285714285714,
"grad_norm": 0.04471747577190399,
"kl": 0.1441650390625,
"learning_rate": 9.473264167865173e-05,
"loss": 0.0948,
"reward": -0.5512382835149765,
"reward_std": 0.39517808333039284,
"rewards/cosine_scaled_reward": -0.31728582084178925,
"rewards/format_reward": 0.0833333358168602,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1638.5000305175781,
"epoch": 0.06971428571428571,
"grad_norm": 0.11067837476730347,
"kl": 0.04302978515625,
"learning_rate": 9.458418577899775e-05,
"loss": 0.2051,
"reward": 1.2672786926850677,
"reward_std": 0.5117088668048382,
"rewards/cosine_scaled_reward": 0.17530599236488342,
"rewards/format_reward": 0.9166666865348816,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1879.1250610351562,
"epoch": 0.07028571428571428,
"grad_norm": 0.05848046392202377,
"kl": 0.074951171875,
"learning_rate": 9.443380060197387e-05,
"loss": 0.071,
"reward": 0.7721037119626999,
"reward_std": 0.8204567953944206,
"rewards/cosine_scaled_reward": 0.03188517317175865,
"rewards/format_reward": 0.7083333395421505,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 3159.0000610351562,
"epoch": 0.07085714285714285,
"grad_norm": 0.0826815739274025,
"kl": 0.1474609375,
"learning_rate": 9.428149347714143e-05,
"loss": 0.0821,
"reward": 0.13420870155096054,
"reward_std": 0.6483585238456726,
"rewards/cosine_scaled_reward": -0.09956231713294983,
"rewards/format_reward": 0.3333333469927311,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2804.8750610351562,
"epoch": 0.07142857142857142,
"grad_norm": 0.04677790403366089,
"kl": 0.19384765625,
"learning_rate": 9.412727182773487e-05,
"loss": 0.078,
"reward": 0.2003941610455513,
"reward_std": 0.6851903721690178,
"rewards/cosine_scaled_reward": -0.10813625901937485,
"rewards/format_reward": 0.4166666865348816,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 2025.3750762939453,
"epoch": 0.072,
"grad_norm": 0.1587214171886444,
"kl": 0.119873046875,
"learning_rate": 9.397114317029975e-05,
"loss": 0.2926,
"reward": 0.33202146738767624,
"reward_std": 0.9008842334151268,
"rewards/cosine_scaled_reward": -0.16732261329889297,
"rewards/format_reward": 0.6666666716337204,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2622.2083740234375,
"epoch": 0.07257142857142856,
"grad_norm": 0.10209250450134277,
"kl": 0.20458984375,
"learning_rate": 9.381311511432659e-05,
"loss": 0.103,
"reward": -0.008071951568126678,
"reward_std": 0.6357720643281937,
"rewards/cosine_scaled_reward": -0.23320263996720314,
"rewards/format_reward": 0.4583333395421505,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 3402.666748046875,
"epoch": 0.07314285714285715,
"grad_norm": 0.03874148055911064,
"kl": 0.31787109375,
"learning_rate": 9.36531953618799e-05,
"loss": 0.0657,
"reward": -0.03658340871334076,
"reward_std": 0.6458300985395908,
"rewards/cosine_scaled_reward": -0.12245837599039078,
"rewards/format_reward": 0.2083333395421505,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2692.041748046875,
"epoch": 0.07371428571428572,
"grad_norm": 0.08207596093416214,
"kl": 0.26953125,
"learning_rate": 9.349139170722281e-05,
"loss": 0.0873,
"reward": 0.14036400616168976,
"reward_std": 0.8220642358064651,
"rewards/cosine_scaled_reward": -0.13815134391188622,
"rewards/format_reward": 0.4166666679084301,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 2693.1250915527344,
"epoch": 0.07428571428571429,
"grad_norm": 0.2660582959651947,
"kl": 0.309326171875,
"learning_rate": 9.332771203643715e-05,
"loss": 0.3145,
"reward": -0.03846167027950287,
"reward_std": 0.6097433939576149,
"rewards/cosine_scaled_reward": -0.20673084072768688,
"rewards/format_reward": 0.3750000074505806,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 2838.5833740234375,
"epoch": 0.07485714285714286,
"grad_norm": 0.22927673161029816,
"kl": 0.42578125,
"learning_rate": 9.316216432703917e-05,
"loss": 0.2716,
"reward": -0.1591216754168272,
"reward_std": 0.5838810279965401,
"rewards/cosine_scaled_reward": -0.3087275102734566,
"rewards/format_reward": 0.4583333395421505,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 3145.9583740234375,
"epoch": 0.07542857142857143,
"grad_norm": 0.24140125513076782,
"kl": 0.49658203125,
"learning_rate": 9.299475664759069e-05,
"loss": 0.2328,
"reward": -0.22540412843227386,
"reward_std": 0.49389100819826126,
"rewards/cosine_scaled_reward": -0.2168687330558896,
"rewards/format_reward": 0.2083333395421505,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2801.666748046875,
"epoch": 0.076,
"grad_norm": 0.1175323873758316,
"kl": 0.6162109375,
"learning_rate": 9.28254971573058e-05,
"loss": 0.1208,
"reward": 0.2671317234635353,
"reward_std": 0.9439200237393379,
"rewards/cosine_scaled_reward": -0.09560081362724304,
"rewards/format_reward": 0.4583333507180214,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2603.3750610351562,
"epoch": 0.07657142857142857,
"grad_norm": 0.08401333540678024,
"kl": 0.5513916015625,
"learning_rate": 9.265439410565329e-05,
"loss": 0.1291,
"reward": -0.025265276432037354,
"reward_std": 0.7744887471199036,
"rewards/cosine_scaled_reward": -0.26263265684247017,
"rewards/format_reward": 0.5000000074505806,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2280.8333587646484,
"epoch": 0.07714285714285714,
"grad_norm": 0.1608603298664093,
"kl": 0.74853515625,
"learning_rate": 9.248145583195448e-05,
"loss": 0.2022,
"reward": 0.48708484787493944,
"reward_std": 0.7106279134750366,
"rewards/cosine_scaled_reward": -0.1939575858414173,
"rewards/format_reward": 0.8750000149011612,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1642.2083435058594,
"epoch": 0.07771428571428571,
"grad_norm": 0.13642534613609314,
"kl": 0.98046875,
"learning_rate": 9.230669076497688e-05,
"loss": 0.1375,
"reward": 0.5342673538252711,
"reward_std": 0.8357692211866379,
"rewards/cosine_scaled_reward": -0.14953299798071384,
"rewards/format_reward": 0.8333333432674408,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2910.70849609375,
"epoch": 0.07828571428571429,
"grad_norm": 0.15120375156402588,
"kl": 0.921875,
"learning_rate": 9.213010742252328e-05,
"loss": 0.2046,
"reward": 0.15343802922870964,
"reward_std": 0.9799866378307343,
"rewards/cosine_scaled_reward": -0.19411433674395084,
"rewards/format_reward": 0.541666679084301,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2308.5000610351562,
"epoch": 0.07885714285714286,
"grad_norm": 0.11914081871509552,
"kl": 1.169921875,
"learning_rate": 9.195171441101669e-05,
"loss": 0.2146,
"reward": 0.3029663562774658,
"reward_std": 0.5679962188005447,
"rewards/cosine_scaled_reward": -0.2235168293118477,
"rewards/format_reward": 0.7500000223517418,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2387.750030517578,
"epoch": 0.07942857142857143,
"grad_norm": 0.17026901245117188,
"kl": 1.244140625,
"learning_rate": 9.177152042508078e-05,
"loss": 0.1786,
"reward": 0.3169918926432729,
"reward_std": 0.630860798060894,
"rewards/cosine_scaled_reward": -0.11233741417527199,
"rewards/format_reward": 0.541666679084301,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 2226.2083740234375,
"epoch": 0.08,
"grad_norm": 0.19778449833393097,
"kl": 1.255859375,
"learning_rate": 9.158953424711625e-05,
"loss": 0.0049,
"reward": 0.10223929863423109,
"reward_std": 0.4931130036711693,
"rewards/cosine_scaled_reward": -0.3238803520798683,
"rewards/format_reward": 0.7500000149011612,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1775.416732788086,
"epoch": 0.08057142857142857,
"grad_norm": 0.741427481174469,
"kl": 1.2548828125,
"learning_rate": 9.140576474687264e-05,
"loss": 0.0466,
"reward": 0.4277765303850174,
"reward_std": 0.6418062597513199,
"rewards/cosine_scaled_reward": -0.18194507574662566,
"rewards/format_reward": 0.7916666716337204,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1896.6250610351562,
"epoch": 0.08114285714285714,
"grad_norm": 0.16323894262313843,
"kl": 1.263671875,
"learning_rate": 9.122022088101614e-05,
"loss": 0.1192,
"reward": 0.3693200536072254,
"reward_std": 0.6720193177461624,
"rewards/cosine_scaled_reward": -0.23200665414333344,
"rewards/format_reward": 0.8333333730697632,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2213.4583740234375,
"epoch": 0.08171428571428571,
"grad_norm": 0.18504098057746887,
"kl": 1.0888671875,
"learning_rate": 9.1032911692693e-05,
"loss": 0.2741,
"reward": 0.8721700385212898,
"reward_std": 0.6731484234333038,
"rewards/cosine_scaled_reward": -0.04308167099952698,
"rewards/format_reward": 0.9583333432674408,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 2104.666748046875,
"epoch": 0.08228571428571428,
"grad_norm": 0.08342334628105164,
"kl": 1.12890625,
"learning_rate": 9.084384631108883e-05,
"loss": 0.1517,
"reward": 0.19961576722562313,
"reward_std": 0.7329239994287491,
"rewards/cosine_scaled_reward": -0.27519211545586586,
"rewards/format_reward": 0.7500000298023224,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1690.6666870117188,
"epoch": 0.08285714285714285,
"grad_norm": 0.18242701888084412,
"kl": 1.08984375,
"learning_rate": 9.065303395098359e-05,
"loss": 0.0593,
"reward": 0.619764544069767,
"reward_std": 0.8105409517884254,
"rewards/cosine_scaled_reward": -0.14845106936991215,
"rewards/format_reward": 0.9166666865348816,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2374.416748046875,
"epoch": 0.08342857142857144,
"grad_norm": 0.21408452093601227,
"kl": 1.177734375,
"learning_rate": 9.046048391230248e-05,
"loss": 0.0769,
"reward": 0.3972553052008152,
"reward_std": 0.6599168851971626,
"rewards/cosine_scaled_reward": -0.21803902462124825,
"rewards/format_reward": 0.8333333432674408,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1720.8333892822266,
"epoch": 0.084,
"grad_norm": 0.11365830153226852,
"kl": 0.884765625,
"learning_rate": 9.02662055796628e-05,
"loss": 0.0353,
"reward": 0.5338674746453762,
"reward_std": 0.8632927983999252,
"rewards/cosine_scaled_reward": -0.10806626826524734,
"rewards/format_reward": 0.7500000149011612,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1343.6250305175781,
"epoch": 0.08457142857142858,
"grad_norm": 0.17277319729328156,
"kl": 0.734375,
"learning_rate": 9.007020842191635e-05,
"loss": 0.1714,
"reward": 0.5662912502884865,
"reward_std": 0.7500499784946442,
"rewards/cosine_scaled_reward": -0.13352105021476746,
"rewards/format_reward": 0.833333358168602,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1988.5417785644531,
"epoch": 0.08514285714285715,
"grad_norm": 0.1171557605266571,
"kl": 1.1796875,
"learning_rate": 8.987250199168808e-05,
"loss": 0.0848,
"reward": 0.35671424493193626,
"reward_std": 0.49780040234327316,
"rewards/cosine_scaled_reward": -0.17580955289304256,
"rewards/format_reward": 0.7083333432674408,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 1752.0000915527344,
"epoch": 0.08571428571428572,
"grad_norm": 0.14017713069915771,
"kl": 0.96435546875,
"learning_rate": 8.967309592491052e-05,
"loss": -0.0065,
"reward": 0.5877555161714554,
"reward_std": 0.7223709151148796,
"rewards/cosine_scaled_reward": -0.16445559449493885,
"rewards/format_reward": 0.9166666865348816,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2179.4583740234375,
"epoch": 0.08628571428571429,
"grad_norm": 0.0961502268910408,
"kl": 0.9990234375,
"learning_rate": 8.947199994035401e-05,
"loss": 0.116,
"reward": 0.4838414415717125,
"reward_std": 0.39460384100675583,
"rewards/cosine_scaled_reward": -0.19557928666472435,
"rewards/format_reward": 0.8750000149011612,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1957.2084350585938,
"epoch": 0.08685714285714285,
"grad_norm": 0.21524538099765778,
"kl": 0.92041015625,
"learning_rate": 8.926922383915316e-05,
"loss": 0.2778,
"reward": 1.0773675739765167,
"reward_std": 1.0207276940345764,
"rewards/cosine_scaled_reward": 0.14285043627023697,
"rewards/format_reward": 0.7916666865348816,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2126.166748046875,
"epoch": 0.08742857142857142,
"grad_norm": 0.18447129428386688,
"kl": 0.875,
"learning_rate": 8.906477750432904e-05,
"loss": 0.2488,
"reward": 0.905040979385376,
"reward_std": 0.779262512922287,
"rewards/cosine_scaled_reward": -0.02664615958929062,
"rewards/format_reward": 0.9583333432674408,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2594.4584350585938,
"epoch": 0.088,
"grad_norm": 0.1024523600935936,
"kl": 1.0654296875,
"learning_rate": 8.885867090030761e-05,
"loss": 0.0841,
"reward": 0.5415095314383507,
"reward_std": 0.7814789414405823,
"rewards/cosine_scaled_reward": -0.10424522310495377,
"rewards/format_reward": 0.7500000298023224,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 2428.166748046875,
"epoch": 0.08857142857142856,
"grad_norm": 0.07103646546602249,
"kl": 0.88232421875,
"learning_rate": 8.865091407243394e-05,
"loss": 0.0955,
"reward": 0.2886502370238304,
"reward_std": 0.6928622350096703,
"rewards/cosine_scaled_reward": -0.2098415493965149,
"rewards/format_reward": 0.7083333507180214,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1767.3334045410156,
"epoch": 0.08914285714285715,
"grad_norm": 0.15649987757205963,
"kl": 0.688720703125,
"learning_rate": 8.844151714648274e-05,
"loss": 0.0392,
"reward": 0.8681200444698334,
"reward_std": 0.5367059111595154,
"rewards/cosine_scaled_reward": -0.024273302406072617,
"rewards/format_reward": 0.9166666865348816,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2410.5833740234375,
"epoch": 0.08971428571428572,
"grad_norm": 0.12421420216560364,
"kl": 1.00390625,
"learning_rate": 8.823049032816479e-05,
"loss": 0.1328,
"reward": 0.6280432939529419,
"reward_std": 0.6566349938511848,
"rewards/cosine_scaled_reward": -0.10264504700899124,
"rewards/format_reward": 0.833333358168602,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 2807.6666870117188,
"epoch": 0.09028571428571429,
"grad_norm": 0.1208081841468811,
"kl": 1.0380859375,
"learning_rate": 8.801784390262944e-05,
"loss": 0.0709,
"reward": 0.4343430995941162,
"reward_std": 0.4182932637631893,
"rewards/cosine_scaled_reward": -0.2203284539282322,
"rewards/format_reward": 0.8750000149011612,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 2652.791778564453,
"epoch": 0.09085714285714286,
"grad_norm": 0.17998459935188293,
"kl": 0.9296875,
"learning_rate": 8.780358823396352e-05,
"loss": 0.0068,
"reward": 0.2543765474110842,
"reward_std": 0.7230948582291603,
"rewards/cosine_scaled_reward": -0.1853117246646434,
"rewards/format_reward": 0.6250000074505806,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 2881.916748046875,
"epoch": 0.09142857142857143,
"grad_norm": 0.06644508242607117,
"kl": 0.955078125,
"learning_rate": 8.758773376468606e-05,
"loss": 0.1401,
"reward": 0.3839067495428026,
"reward_std": 0.6370702758431435,
"rewards/cosine_scaled_reward": -0.2247133031487465,
"rewards/format_reward": 0.833333358168602,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 3114.4584350585938,
"epoch": 0.092,
"grad_norm": 0.11562149226665497,
"kl": 0.9951171875,
"learning_rate": 8.73702910152393e-05,
"loss": 0.067,
"reward": 0.24938225746154785,
"reward_std": 0.6529880091547966,
"rewards/cosine_scaled_reward": -0.1669755440670997,
"rewards/format_reward": 0.5833333432674408,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2711.0834350585938,
"epoch": 0.09257142857142857,
"grad_norm": 0.06730871647596359,
"kl": 0.8408203125,
"learning_rate": 8.715127058347615e-05,
"loss": 0.0675,
"reward": 0.2713778093457222,
"reward_std": 0.6517889946699142,
"rewards/cosine_scaled_reward": -0.21847776509821415,
"rewards/format_reward": 0.7083333507180214,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2407.3334350585938,
"epoch": 0.09314285714285714,
"grad_norm": 0.12514488399028778,
"kl": 0.7216796875,
"learning_rate": 8.693068314414344e-05,
"loss": 0.1883,
"reward": 0.5021318048238754,
"reward_std": 0.5789435803890228,
"rewards/cosine_scaled_reward": -0.18643410876393318,
"rewards/format_reward": 0.8750000298023224,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2688.0833740234375,
"epoch": 0.09371428571428571,
"grad_norm": 0.1626950055360794,
"kl": 0.791015625,
"learning_rate": 8.670853944836176e-05,
"loss": 0.2289,
"reward": 0.17175179324112833,
"reward_std": 0.4885940235108137,
"rewards/cosine_scaled_reward": -0.24745745211839676,
"rewards/format_reward": 0.6666666716337204,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 2521.666717529297,
"epoch": 0.09428571428571429,
"grad_norm": 0.27727189660072327,
"kl": 0.711669921875,
"learning_rate": 8.648485032310145e-05,
"loss": 0.1999,
"reward": 0.3786450959742069,
"reward_std": 0.6580736637115479,
"rewards/cosine_scaled_reward": -0.20651079155504704,
"rewards/format_reward": 0.7916666865348816,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 3203.416748046875,
"epoch": 0.09485714285714286,
"grad_norm": 0.11694037914276123,
"kl": 0.9208984375,
"learning_rate": 8.625962667065488e-05,
"loss": 0.0458,
"reward": 0.5847738608717918,
"reward_std": 0.642599880695343,
"rewards/cosine_scaled_reward": -0.12427974189631641,
"rewards/format_reward": 0.833333358168602,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 2589.2084045410156,
"epoch": 0.09542857142857143,
"grad_norm": 0.09123259037733078,
"kl": 0.6767578125,
"learning_rate": 8.603287946810515e-05,
"loss": 0.1078,
"reward": 0.7590624950826168,
"reward_std": 1.0844984501600266,
"rewards/cosine_scaled_reward": 0.025364567525684834,
"rewards/format_reward": 0.7083333544433117,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 2705.375030517578,
"epoch": 0.096,
"grad_norm": 0.2029721587896347,
"kl": 0.70361328125,
"learning_rate": 8.5804619766791e-05,
"loss": 0.2496,
"reward": 0.5522685013711452,
"reward_std": 0.6528475284576416,
"rewards/cosine_scaled_reward": -0.09886575862765312,
"rewards/format_reward": 0.7500000223517418,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 2472.041793823242,
"epoch": 0.09657142857142857,
"grad_norm": 0.10081025213003159,
"kl": 0.72509765625,
"learning_rate": 8.557485869176826e-05,
"loss": 0.1258,
"reward": 0.5760277360677719,
"reward_std": 0.8965695053339005,
"rewards/cosine_scaled_reward": -0.1494861477985978,
"rewards/format_reward": 0.8750000149011612,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 2640.9584350585938,
"epoch": 0.09714285714285714,
"grad_norm": 0.06490536034107208,
"kl": 0.6435546875,
"learning_rate": 8.534360744126755e-05,
"loss": 0.1257,
"reward": 0.5584607645869255,
"reward_std": 0.6074354127049446,
"rewards/cosine_scaled_reward": -0.15826964005827904,
"rewards/format_reward": 0.875,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2400.916732788086,
"epoch": 0.09771428571428571,
"grad_norm": 0.10735026746988297,
"kl": 0.585693359375,
"learning_rate": 8.511087728614862e-05,
"loss": 0.0496,
"reward": 0.3593628406524658,
"reward_std": 0.6010008379817009,
"rewards/cosine_scaled_reward": -0.1953185722231865,
"rewards/format_reward": 0.7500000149011612,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1632.1250610351562,
"epoch": 0.09828571428571428,
"grad_norm": 0.5536447167396545,
"kl": 0.45166015625,
"learning_rate": 8.487667956935088e-05,
"loss": 0.4406,
"reward": 0.30203498154878616,
"reward_std": 0.40830350667238235,
"rewards/cosine_scaled_reward": -0.2448158636689186,
"rewards/format_reward": 0.791666679084301,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1648.2500305175781,
"epoch": 0.09885714285714285,
"grad_norm": 0.41387858986854553,
"kl": 0.41180419921875,
"learning_rate": 8.464102570534061e-05,
"loss": 0.3343,
"reward": 0.7647038325667381,
"reward_std": 0.6418692655861378,
"rewards/cosine_scaled_reward": -0.05514809489250183,
"rewards/format_reward": 0.8750000298023224,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 2472.3334350585938,
"epoch": 0.09942857142857142,
"grad_norm": 0.18969063460826874,
"kl": 0.8681640625,
"learning_rate": 8.440392717955476e-05,
"loss": 0.2013,
"reward": 0.21441331086680293,
"reward_std": 0.5078516378998756,
"rewards/cosine_scaled_reward": -0.26779336854815483,
"rewards/format_reward": 0.7500000149011612,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2739.4583740234375,
"epoch": 0.1,
"grad_norm": 0.16940070688724518,
"kl": 0.9853515625,
"learning_rate": 8.416539554784089e-05,
"loss": 0.2297,
"reward": 0.642971895635128,
"reward_std": 1.0668489187955856,
"rewards/cosine_scaled_reward": -0.011847391724586487,
"rewards/format_reward": 0.666666679084301,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1711.041732788086,
"epoch": 0.10057142857142858,
"grad_norm": 0.1455065906047821,
"kl": 0.763671875,
"learning_rate": 8.392544243589427e-05,
"loss": 0.0957,
"reward": 0.7176935896277428,
"reward_std": 0.4694051705300808,
"rewards/cosine_scaled_reward": -0.07865320146083832,
"rewards/format_reward": 0.875,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1795.0417022705078,
"epoch": 0.10114285714285715,
"grad_norm": 0.3457939028739929,
"kl": 0.7919921875,
"learning_rate": 8.368407953869104e-05,
"loss": 0.3018,
"reward": 0.7449050601571798,
"reward_std": 0.9370853900909424,
"rewards/cosine_scaled_reward": -0.04421415273100138,
"rewards/format_reward": 0.8333333432674408,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 2875.7500610351562,
"epoch": 0.10171428571428572,
"grad_norm": 0.12909594178199768,
"kl": 1.1767578125,
"learning_rate": 8.34413186199183e-05,
"loss": 0.1559,
"reward": 0.2506026141345501,
"reward_std": 0.4546685218811035,
"rewards/cosine_scaled_reward": -0.249698705971241,
"rewards/format_reward": 0.7500000149011612,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1944.0417175292969,
"epoch": 0.10228571428571429,
"grad_norm": 0.2241043746471405,
"kl": 0.85870361328125,
"learning_rate": 8.319717151140073e-05,
"loss": 0.2912,
"reward": 1.0774075190420263,
"reward_std": 0.8805922865867615,
"rewards/cosine_scaled_reward": 0.12203706055879593,
"rewards/format_reward": 0.8333333432674408,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 2280.8334350585938,
"epoch": 0.10285714285714286,
"grad_norm": 0.20919553935527802,
"kl": 1.177734375,
"learning_rate": 8.295165011252397e-05,
"loss": 0.2332,
"reward": 0.11883920338004827,
"reward_std": 0.5948300361633301,
"rewards/cosine_scaled_reward": -0.27391375228762627,
"rewards/format_reward": 0.6666666865348816,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 2596.416732788086,
"epoch": 0.10342857142857143,
"grad_norm": 0.18407326936721802,
"kl": 0.93310546875,
"learning_rate": 8.270476638965462e-05,
"loss": 0.1702,
"reward": 0.45228124409914017,
"reward_std": 0.7070431187748909,
"rewards/cosine_scaled_reward": -0.0863594114780426,
"rewards/format_reward": 0.6250000111758709,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1908.9167175292969,
"epoch": 0.104,
"grad_norm": 0.3261708617210388,
"kl": 1.4140625,
"learning_rate": 8.245653237555706e-05,
"loss": 0.1104,
"reward": 0.36228151875548065,
"reward_std": 0.8484360724687576,
"rewards/cosine_scaled_reward": -0.1730259107425809,
"rewards/format_reward": 0.7083333432674408,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2503.2084045410156,
"epoch": 0.10457142857142857,
"grad_norm": 0.2822038531303406,
"kl": 1.244140625,
"learning_rate": 8.220696016880688e-05,
"loss": 0.3006,
"reward": -0.05789138190448284,
"reward_std": 0.6969511806964874,
"rewards/cosine_scaled_reward": -0.2997790314257145,
"rewards/format_reward": 0.5416666865348816,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1496.3750305175781,
"epoch": 0.10514285714285715,
"grad_norm": 0.2194892019033432,
"kl": 0.9404296875,
"learning_rate": 8.195606193320136e-05,
"loss": 0.0467,
"reward": 0.7817502450197935,
"reward_std": 0.814917117357254,
"rewards/cosine_scaled_reward": -0.004958219826221466,
"rewards/format_reward": 0.7916666716337204,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1257.041732788086,
"epoch": 0.10571428571428572,
"grad_norm": 0.3073634207248688,
"kl": 0.860107421875,
"learning_rate": 8.170384989716657e-05,
"loss": 0.1978,
"reward": 1.0940335839986801,
"reward_std": 0.3842375408858061,
"rewards/cosine_scaled_reward": 0.06785010732710361,
"rewards/format_reward": 0.9583333432674408,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1938.2500762939453,
"epoch": 0.10628571428571429,
"grad_norm": 0.16985982656478882,
"kl": 0.8951416015625,
"learning_rate": 8.14503363531613e-05,
"loss": 0.1556,
"reward": 0.7376667633652687,
"reward_std": 0.8680930733680725,
"rewards/cosine_scaled_reward": -0.0061665866523981094,
"rewards/format_reward": 0.7500000223517418,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1639.0833740234375,
"epoch": 0.10685714285714286,
"grad_norm": 0.5398156642913818,
"kl": 0.888671875,
"learning_rate": 8.119553365707803e-05,
"loss": 0.5061,
"reward": 0.03231507260352373,
"reward_std": 0.5724686309695244,
"rewards/cosine_scaled_reward": -0.29634247720241547,
"rewards/format_reward": 0.6250000223517418,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 2887.6250610351562,
"epoch": 0.10742857142857143,
"grad_norm": 0.19194325804710388,
"kl": 1.197265625,
"learning_rate": 8.09394542276407e-05,
"loss": 0.2126,
"reward": -0.3554415591061115,
"reward_std": 0.7380103319883347,
"rewards/cosine_scaled_reward": -0.3235541209578514,
"rewards/format_reward": 0.2916666716337204,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 3177.5833740234375,
"epoch": 0.108,
"grad_norm": 0.13098645210266113,
"kl": 1.07421875,
"learning_rate": 8.068211054579944e-05,
"loss": 0.1842,
"reward": -0.6138647869229317,
"reward_std": 0.514356566593051,
"rewards/cosine_scaled_reward": -0.36943238973617554,
"rewards/format_reward": 0.1250000037252903,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2187.791717529297,
"epoch": 0.10857142857142857,
"grad_norm": 0.13339217007160187,
"kl": 0.84423828125,
"learning_rate": 8.042351515412221e-05,
"loss": 0.1347,
"reward": -0.0517389252781868,
"reward_std": 0.4778178557753563,
"rewards/cosine_scaled_reward": -0.2550361379981041,
"rewards/format_reward": 0.4583333358168602,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1908.4583587646484,
"epoch": 0.10914285714285714,
"grad_norm": 0.11566094309091568,
"kl": 0.6126708984375,
"learning_rate": 8.016368065618361e-05,
"loss": 0.179,
"reward": 0.044671330600976944,
"reward_std": 0.2592208320274949,
"rewards/cosine_scaled_reward": -0.2693310081958771,
"rewards/format_reward": 0.5833333358168602,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1878.2500839233398,
"epoch": 0.10971428571428571,
"grad_norm": 0.170388862490654,
"kl": 0.915771484375,
"learning_rate": 7.99026197159505e-05,
"loss": 0.2365,
"reward": 0.5707967132329941,
"reward_std": 0.27280137967318296,
"rewards/cosine_scaled_reward": 0.035398345440626144,
"rewards/format_reward": 0.5,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 2798.2500610351562,
"epoch": 0.11028571428571429,
"grad_norm": 0.1819745898246765,
"kl": 0.9326171875,
"learning_rate": 7.964034505716477e-05,
"loss": 0.2348,
"reward": -0.5285695753991604,
"reward_std": 0.5819516181945801,
"rewards/cosine_scaled_reward": -0.36845147609710693,
"rewards/format_reward": 0.2083333358168602,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2043.166732788086,
"epoch": 0.11085714285714286,
"grad_norm": 0.20014306902885437,
"kl": 0.966796875,
"learning_rate": 7.93768694627233e-05,
"loss": 0.2299,
"reward": 0.49893204495310783,
"reward_std": 0.8825281783938408,
"rewards/cosine_scaled_reward": 0.06196599453687668,
"rewards/format_reward": 0.3750000037252903,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1638.791732788086,
"epoch": 0.11142857142857143,
"grad_norm": 0.37110522389411926,
"kl": 0.8349609375,
"learning_rate": 7.911220577405484e-05,
"loss": 0.1974,
"reward": 0.5442861206829548,
"reward_std": 0.8290757983922958,
"rewards/cosine_scaled_reward": -0.01952361688017845,
"rewards/format_reward": 0.5833333469927311,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1842.208381652832,
"epoch": 0.112,
"grad_norm": 0.11744756251573563,
"kl": 0.5994873046875,
"learning_rate": 7.884636689049423e-05,
"loss": 0.1054,
"reward": 0.4689091891050339,
"reward_std": 0.5096632726490498,
"rewards/cosine_scaled_reward": -0.03637874871492386,
"rewards/format_reward": 0.5416666679084301,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2997.8750610351562,
"epoch": 0.11257142857142857,
"grad_norm": 0.16055168211460114,
"kl": 1.0537109375,
"learning_rate": 7.857936576865357e-05,
"loss": 0.1749,
"reward": -0.3116486147046089,
"reward_std": 0.7263150736689568,
"rewards/cosine_scaled_reward": -0.23915765061974525,
"rewards/format_reward": 0.1666666716337204,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2395.2500915527344,
"epoch": 0.11314285714285714,
"grad_norm": 0.17808175086975098,
"kl": 0.9326171875,
"learning_rate": 7.831121542179087e-05,
"loss": 0.1602,
"reward": 0.15749725699424744,
"reward_std": 0.8375851400196552,
"rewards/cosine_scaled_reward": -0.10875139385461807,
"rewards/format_reward": 0.3750000149011612,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 2222.7084350585938,
"epoch": 0.11371428571428571,
"grad_norm": 0.29520052671432495,
"kl": 1.09716796875,
"learning_rate": 7.804192891917572e-05,
"loss": 0.3001,
"reward": -0.023016322404146194,
"reward_std": 0.7964953854680061,
"rewards/cosine_scaled_reward": -0.17817485332489014,
"rewards/format_reward": 0.3333333432674408,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2596.916748046875,
"epoch": 0.11428571428571428,
"grad_norm": 0.17907942831516266,
"kl": 1.1689453125,
"learning_rate": 7.777151938545237e-05,
"loss": 0.1107,
"reward": -0.19867387227714062,
"reward_std": 0.3830955922603607,
"rewards/cosine_scaled_reward": -0.16183693706989288,
"rewards/format_reward": 0.1250000037252903,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2207.291717529297,
"epoch": 0.11485714285714285,
"grad_norm": 0.2985213100910187,
"kl": 0.92431640625,
"learning_rate": 7.75e-05,
"loss": 0.2348,
"reward": 0.06418987736105919,
"reward_std": 0.6583275869488716,
"rewards/cosine_scaled_reward": -0.15540507063269615,
"rewards/format_reward": 0.3750000111758709,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2214.291732788086,
"epoch": 0.11542857142857142,
"grad_norm": 0.31010299921035767,
"kl": 1.043212890625,
"learning_rate": 7.72273839962904e-05,
"loss": 0.0332,
"reward": 0.3318791128695011,
"reward_std": 0.5672735497355461,
"rewards/cosine_scaled_reward": -0.04239379055798054,
"rewards/format_reward": 0.4166666716337204,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 2259.041778564453,
"epoch": 0.116,
"grad_norm": 0.21049495041370392,
"kl": 1.07421875,
"learning_rate": 7.695368466124298e-05,
"loss": 0.1093,
"reward": 0.018525540828704834,
"reward_std": 0.5658747386187315,
"rewards/cosine_scaled_reward": -0.15740390308201313,
"rewards/format_reward": 0.3333333469927311,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1843.2084197998047,
"epoch": 0.11657142857142858,
"grad_norm": 0.14651866257190704,
"kl": 0.8848876953125,
"learning_rate": 7.667891533457719e-05,
"loss": 0.0457,
"reward": 0.1518230028450489,
"reward_std": 0.3332018107175827,
"rewards/cosine_scaled_reward": -0.15325517859309912,
"rewards/format_reward": 0.4583333395421505,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2268.2916870117188,
"epoch": 0.11714285714285715,
"grad_norm": 0.2668810486793518,
"kl": 0.9375,
"learning_rate": 7.64030894081624e-05,
"loss": 0.3148,
"reward": -0.03797488193958998,
"reward_std": 0.48975174129009247,
"rewards/cosine_scaled_reward": -0.24815411865711212,
"rewards/format_reward": 0.4583333395421505,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2495.541748046875,
"epoch": 0.11771428571428572,
"grad_norm": 0.2810802459716797,
"kl": 1.427734375,
"learning_rate": 7.612622032536509e-05,
"loss": 0.2099,
"reward": -0.0652031796053052,
"reward_std": 0.4952133148908615,
"rewards/cosine_scaled_reward": -0.1992682572454214,
"rewards/format_reward": 0.3333333395421505,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2103.4584045410156,
"epoch": 0.11828571428571429,
"grad_norm": 0.9285241365432739,
"kl": 1.095703125,
"learning_rate": 7.58483215803938e-05,
"loss": 0.1823,
"reward": 0.27077991724945605,
"reward_std": 0.5788689702749252,
"rewards/cosine_scaled_reward": -0.09377672243863344,
"rewards/format_reward": 0.4583333395421505,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1963.5000915527344,
"epoch": 0.11885714285714286,
"grad_norm": 0.7166336178779602,
"kl": 1.20263671875,
"learning_rate": 7.556940671764125e-05,
"loss": 0.5024,
"reward": 0.5436707381159067,
"reward_std": 0.9241102710366249,
"rewards/cosine_scaled_reward": 0.06350202858448029,
"rewards/format_reward": 0.4166666828095913,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 1874.7917098999023,
"epoch": 0.11942857142857143,
"grad_norm": 0.2397138923406601,
"kl": 0.9052734375,
"learning_rate": 7.52894893310244e-05,
"loss": 0.2168,
"reward": 0.7744720131158829,
"reward_std": 0.30723479902371764,
"rewards/cosine_scaled_reward": 0.1372359935194254,
"rewards/format_reward": 0.5000000111758709,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 1800.0833587646484,
"epoch": 0.12,
"grad_norm": 0.4871465861797333,
"kl": 1.25341796875,
"learning_rate": 7.500858306332173e-05,
"loss": 0.1891,
"reward": 0.04496807977557182,
"reward_std": 0.5449698269367218,
"rewards/cosine_scaled_reward": -0.22751596197485924,
"rewards/format_reward": 0.5000000223517418,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1229.3333587646484,
"epoch": 0.12057142857142857,
"grad_norm": 0.41425731778144836,
"kl": 0.82958984375,
"learning_rate": 7.472670160550849e-05,
"loss": 0.22,
"reward": 1.015406172722578,
"reward_std": 0.726126566529274,
"rewards/cosine_scaled_reward": 0.19520305842161179,
"rewards/format_reward": 0.6250000111758709,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 3078.3751220703125,
"epoch": 0.12114285714285715,
"grad_norm": 0.33234572410583496,
"kl": 1.435546875,
"learning_rate": 7.444385869608922e-05,
"loss": 0.1149,
"reward": -0.23881859704852104,
"reward_std": 0.4091813191771507,
"rewards/cosine_scaled_reward": -0.22357597202062607,
"rewards/format_reward": 0.2083333358168602,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2063.0834350585938,
"epoch": 0.12171428571428572,
"grad_norm": 0.21360896527767181,
"kl": 1.5478515625,
"learning_rate": 7.416006812042828e-05,
"loss": 0.2844,
"reward": 0.1532426355406642,
"reward_std": 0.6949435919523239,
"rewards/cosine_scaled_reward": -0.11087869806215167,
"rewards/format_reward": 0.3750000111758709,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1602.3333740234375,
"epoch": 0.12228571428571429,
"grad_norm": 0.7172893285751343,
"kl": 1.461669921875,
"learning_rate": 7.387534371007797e-05,
"loss": 0.2138,
"reward": 0.2475161775946617,
"reward_std": 0.23651241697371006,
"rewards/cosine_scaled_reward": -0.14707526192069054,
"rewards/format_reward": 0.5416666679084301,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2274.416717529297,
"epoch": 0.12285714285714286,
"grad_norm": 1.329139232635498,
"kl": 2.357421875,
"learning_rate": 7.358969934210438e-05,
"loss": 0.1054,
"reward": -0.20811030641198158,
"reward_std": 0.7684681564569473,
"rewards/cosine_scaled_reward": -0.18738848436623812,
"rewards/format_reward": 0.1666666716337204,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 2036.7083740234375,
"epoch": 0.12342857142857143,
"grad_norm": 0.3380753993988037,
"kl": 1.658203125,
"learning_rate": 7.330314893841101e-05,
"loss": 0.282,
"reward": 0.2936302299494855,
"reward_std": 0.956149123609066,
"rewards/cosine_scaled_reward": 0.0009817667305469513,
"rewards/format_reward": 0.2916666753590107,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 2718.3334350585938,
"epoch": 0.124,
"grad_norm": 0.37306633591651917,
"kl": 1.537109375,
"learning_rate": 7.301570646506028e-05,
"loss": 0.0517,
"reward": -0.10302772559225559,
"reward_std": 0.4567326009273529,
"rewards/cosine_scaled_reward": -0.15568053536117077,
"rewards/format_reward": 0.2083333358168602,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2313.4583740234375,
"epoch": 0.12457142857142857,
"grad_norm": 0.3978230953216553,
"kl": 1.35546875,
"learning_rate": 7.27273859315928e-05,
"loss": 0.2971,
"reward": 0.18653920199722052,
"reward_std": 0.5482294261455536,
"rewards/cosine_scaled_reward": -0.1567304128257092,
"rewards/format_reward": 0.5000000111758709,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1831.2083740234375,
"epoch": 0.12514285714285714,
"grad_norm": 0.5213355422019958,
"kl": 1.0810546875,
"learning_rate": 7.243820139034464e-05,
"loss": 0.0582,
"reward": 0.3415638351580128,
"reward_std": 0.6902635730803013,
"rewards/cosine_scaled_reward": -0.14171809703111649,
"rewards/format_reward": 0.6250000074505806,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2493.541748046875,
"epoch": 0.12571428571428572,
"grad_norm": 0.08655526489019394,
"kl": 0.808837890625,
"learning_rate": 7.214816693576235e-05,
"loss": 0.0852,
"reward": 0.891080267727375,
"reward_std": 0.9232172593474388,
"rewards/cosine_scaled_reward": 0.11220681853592396,
"rewards/format_reward": 0.666666679084301,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 2407.5000610351562,
"epoch": 0.12628571428571428,
"grad_norm": 0.1971471756696701,
"kl": 1.009765625,
"learning_rate": 7.185729670371605e-05,
"loss": 0.266,
"reward": 0.38966307416558266,
"reward_std": 0.9727390855550766,
"rewards/cosine_scaled_reward": -0.03433513268828392,
"rewards/format_reward": 0.4583333469927311,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 2275.500030517578,
"epoch": 0.12685714285714286,
"grad_norm": 0.23895929753780365,
"kl": 0.55322265625,
"learning_rate": 7.156560487081053e-05,
"loss": 0.2723,
"reward": 0.5348676145076752,
"reward_std": 0.4934211131185293,
"rewards/cosine_scaled_reward": -0.12839954253286123,
"rewards/format_reward": 0.791666679084301,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2512.2083740234375,
"epoch": 0.12742857142857142,
"grad_norm": 0.09032748639583588,
"kl": 0.7138671875,
"learning_rate": 7.127310565369415e-05,
"loss": 0.031,
"reward": 0.257486991584301,
"reward_std": 0.4581068307161331,
"rewards/cosine_scaled_reward": -0.246256522834301,
"rewards/format_reward": 0.7500000149011612,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 3016.2501220703125,
"epoch": 0.128,
"grad_norm": 0.08650817722082138,
"kl": 0.82421875,
"learning_rate": 7.097981330836617e-05,
"loss": 0.0687,
"reward": 0.2571569848805666,
"reward_std": 0.7694349959492683,
"rewards/cosine_scaled_reward": -0.07975485920906067,
"rewards/format_reward": 0.4166666753590107,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2969.541717529297,
"epoch": 0.12857142857142856,
"grad_norm": 0.07500854134559631,
"kl": 0.8349609375,
"learning_rate": 7.068574212948169e-05,
"loss": 0.1226,
"reward": 0.1223123692907393,
"reward_std": 0.6229890622198582,
"rewards/cosine_scaled_reward": -0.14717714861035347,
"rewards/format_reward": 0.416666679084301,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 3157.5834350585938,
"epoch": 0.12914285714285714,
"grad_norm": 0.09925863891839981,
"kl": 0.79296875,
"learning_rate": 7.03909064496551e-05,
"loss": 0.0567,
"reward": 0.4053786303848028,
"reward_std": 1.0473359823226929,
"rewards/cosine_scaled_reward": -0.04731069877743721,
"rewards/format_reward": 0.5000000111758709,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2760.791732788086,
"epoch": 0.12971428571428573,
"grad_norm": 0.12794190645217896,
"kl": 0.614013671875,
"learning_rate": 7.009532063876149e-05,
"loss": 0.0846,
"reward": 0.35397324431687593,
"reward_std": 0.9577793553471565,
"rewards/cosine_scaled_reward": -0.09384672529995441,
"rewards/format_reward": 0.5416666716337204,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 2348.7083740234375,
"epoch": 0.13028571428571428,
"grad_norm": 0.3130943179130554,
"kl": 0.71826171875,
"learning_rate": 6.979899910323624e-05,
"loss": 0.296,
"reward": 0.4632652625441551,
"reward_std": 0.8859903812408447,
"rewards/cosine_scaled_reward": -0.018367409706115723,
"rewards/format_reward": 0.5000000186264515,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 3352.916748046875,
"epoch": 0.13085714285714287,
"grad_norm": 0.046103257685899734,
"kl": 0.796875,
"learning_rate": 6.9501956285373e-05,
"loss": 0.1272,
"reward": 0.5160791212692857,
"reward_std": 0.8618348892778158,
"rewards/cosine_scaled_reward": 0.008039550390094519,
"rewards/format_reward": 0.5000000074505806,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 3535.4583740234375,
"epoch": 0.13142857142857142,
"grad_norm": 0.10873083025217056,
"kl": 0.8486328125,
"learning_rate": 6.920420666261962e-05,
"loss": 0.0469,
"reward": -0.26604770543053746,
"reward_std": 0.4407772123813629,
"rewards/cosine_scaled_reward": -0.2580238524824381,
"rewards/format_reward": 0.2500000037252903,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 3418.5000610351562,
"epoch": 0.132,
"grad_norm": 0.09764978289604187,
"kl": 0.9033203125,
"learning_rate": 6.890576474687263e-05,
"loss": 0.0588,
"reward": -0.12345289438962936,
"reward_std": 0.5395884811878204,
"rewards/cosine_scaled_reward": -0.20755979791283607,
"rewards/format_reward": 0.2916666716337204,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2774.2083740234375,
"epoch": 0.13257142857142856,
"grad_norm": 0.09208109974861145,
"kl": 0.7041015625,
"learning_rate": 6.860664508377001e-05,
"loss": 0.1257,
"reward": 0.3880004594102502,
"reward_std": 0.6920578330755234,
"rewards/cosine_scaled_reward": -0.03516644984483719,
"rewards/format_reward": 0.4583333395421505,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 3413.4166870117188,
"epoch": 0.13314285714285715,
"grad_norm": 0.09384088963270187,
"kl": 0.7041015625,
"learning_rate": 6.83068622519821e-05,
"loss": 0.021,
"reward": -0.12629218865185976,
"reward_std": 0.5478029847145081,
"rewards/cosine_scaled_reward": -0.33397944271564484,
"rewards/format_reward": 0.541666679084301,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 3346.6250610351562,
"epoch": 0.1337142857142857,
"grad_norm": 0.04808826744556427,
"kl": 0.6845703125,
"learning_rate": 6.800643086250122e-05,
"loss": 0.0825,
"reward": -0.1951084854081273,
"reward_std": 0.4496830254793167,
"rewards/cosine_scaled_reward": -0.28505424316972494,
"rewards/format_reward": 0.3750000037252903,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2900.666717529297,
"epoch": 0.13428571428571429,
"grad_norm": 0.1497814655303955,
"kl": 0.55810546875,
"learning_rate": 6.770536555792944e-05,
"loss": -0.0319,
"reward": 0.5402148407883942,
"reward_std": 0.37184665352106094,
"rewards/cosine_scaled_reward": -0.04239258915185928,
"rewards/format_reward": 0.6250000111758709,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 2875.666748046875,
"epoch": 0.13485714285714287,
"grad_norm": 0.0731302872300148,
"kl": 0.69921875,
"learning_rate": 6.740368101176496e-05,
"loss": 0.0621,
"reward": 0.16314180195331573,
"reward_std": 0.5131651610136032,
"rewards/cosine_scaled_reward": -0.2517624497413635,
"rewards/format_reward": 0.6666666828095913,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2531.0000610351562,
"epoch": 0.13542857142857143,
"grad_norm": 0.07892952114343643,
"kl": 0.5634765625,
"learning_rate": 6.710139192768695e-05,
"loss": 0.0984,
"reward": 0.2294897036626935,
"reward_std": 0.3668659031391144,
"rewards/cosine_scaled_reward": -0.1560884891077876,
"rewards/format_reward": 0.5416666716337204,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 3519.3333740234375,
"epoch": 0.136,
"grad_norm": 0.048439182341098785,
"kl": 0.55322265625,
"learning_rate": 6.679851303883892e-05,
"loss": 0.0406,
"reward": -0.1855611428618431,
"reward_std": 0.3151257839053869,
"rewards/cosine_scaled_reward": -0.21778057888150215,
"rewards/format_reward": 0.2500000111758709,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 3434.3333740234375,
"epoch": 0.13657142857142857,
"grad_norm": 0.037434171885252,
"kl": 0.53125,
"learning_rate": 6.649505910711058e-05,
"loss": 0.044,
"reward": 0.21306656673550606,
"reward_std": 0.5599532704800367,
"rewards/cosine_scaled_reward": -0.08096674270927906,
"rewards/format_reward": 0.3750000074505806,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 3486.4583740234375,
"epoch": 0.13714285714285715,
"grad_norm": 0.06022648140788078,
"kl": 0.6474609375,
"learning_rate": 6.619104492241848e-05,
"loss": 0.0605,
"reward": -0.09005486592650414,
"reward_std": 0.6559914350509644,
"rewards/cosine_scaled_reward": -0.19086076319217682,
"rewards/format_reward": 0.2916666716337204,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 3027.166748046875,
"epoch": 0.1377142857142857,
"grad_norm": 0.1721280962228775,
"kl": 0.5634765625,
"learning_rate": 6.588648530198504e-05,
"loss": 0.1883,
"reward": 0.39998156833462417,
"reward_std": 0.9498686045408249,
"rewards/cosine_scaled_reward": 0.01249077171087265,
"rewards/format_reward": 0.3750000149011612,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 3415.041748046875,
"epoch": 0.1382857142857143,
"grad_norm": 0.053598903119564056,
"kl": 0.5390625,
"learning_rate": 6.558139508961655e-05,
"loss": 0.0471,
"reward": -0.3487744452431798,
"reward_std": 0.31119491159915924,
"rewards/cosine_scaled_reward": -0.29938721284270287,
"rewards/format_reward": 0.2500000074505806,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 3389.9166870117188,
"epoch": 0.13885714285714285,
"grad_norm": 0.05878981202840805,
"kl": 0.58056640625,
"learning_rate": 6.527578915497951e-05,
"loss": 0.0654,
"reward": -0.28902174066752195,
"reward_std": 0.33487619645893574,
"rewards/cosine_scaled_reward": -0.20701087033376098,
"rewards/format_reward": 0.1250000037252903,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 3095.2083435058594,
"epoch": 0.13942857142857143,
"grad_norm": 0.06762240827083588,
"kl": 0.4462890625,
"learning_rate": 6.496968239287605e-05,
"loss": 0.0881,
"reward": 0.176816888153553,
"reward_std": 0.6285420805215836,
"rewards/cosine_scaled_reward": -0.14075824059545994,
"rewards/format_reward": 0.4583333507180214,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 3193.4584350585938,
"epoch": 0.14,
"grad_norm": 0.055704742670059204,
"kl": 0.67333984375,
"learning_rate": 6.466308972251785e-05,
"loss": 0.0928,
"reward": 0.09640493569895625,
"reward_std": 0.6325432863086462,
"rewards/cosine_scaled_reward": -0.11846420541405678,
"rewards/format_reward": 0.3333333395421505,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 3142.9584350585938,
"epoch": 0.14057142857142857,
"grad_norm": 0.03989458829164505,
"kl": 0.431884765625,
"learning_rate": 6.435602608679918e-05,
"loss": 0.0134,
"reward": 0.4728468209505081,
"reward_std": 0.7001243010163307,
"rewards/cosine_scaled_reward": 0.007256772369146347,
"rewards/format_reward": 0.4583333358168602,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 3543.9166870117188,
"epoch": 0.14114285714285715,
"grad_norm": 0.05796092376112938,
"kl": 0.5068359375,
"learning_rate": 6.404850645156841e-05,
"loss": 0.0442,
"reward": -0.33712251763790846,
"reward_std": 0.5263936333358288,
"rewards/cosine_scaled_reward": -0.21022793650627136,
"rewards/format_reward": 0.0833333358168602,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 2386.4166870117188,
"epoch": 0.1417142857142857,
"grad_norm": 0.12801125645637512,
"kl": 0.49365234375,
"learning_rate": 6.374054580489874e-05,
"loss": 0.2453,
"reward": 0.6528540402650833,
"reward_std": 0.6541059017181396,
"rewards/cosine_scaled_reward": 0.03476031869649887,
"rewards/format_reward": 0.5833333432674408,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 3535.3751220703125,
"epoch": 0.1422857142857143,
"grad_norm": 0.045454155653715134,
"kl": 0.61328125,
"learning_rate": 6.343215915635762e-05,
"loss": 0.0356,
"reward": -0.27654687594622374,
"reward_std": 0.4039493198506534,
"rewards/cosine_scaled_reward": -0.2632734435610473,
"rewards/format_reward": 0.2500000037252903,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 3574.875,
"epoch": 0.14285714285714285,
"grad_norm": 0.06252045184373856,
"kl": 0.53173828125,
"learning_rate": 6.31233615362752e-05,
"loss": 0.0221,
"reward": -0.2779444120824337,
"reward_std": 0.6321102194488049,
"rewards/cosine_scaled_reward": -0.20147221349179745,
"rewards/format_reward": 0.1250000037252903,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 3158.9166870117188,
"epoch": 0.14342857142857143,
"grad_norm": 0.092299684882164,
"kl": 0.59228515625,
"learning_rate": 6.281416799501188e-05,
"loss": 0.0785,
"reward": 0.22740934044122696,
"reward_std": 0.7193168550729752,
"rewards/cosine_scaled_reward": -0.09462865814566612,
"rewards/format_reward": 0.4166666828095913,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 3144.5000610351562,
"epoch": 0.144,
"grad_norm": 0.07481088489294052,
"kl": 0.62548828125,
"learning_rate": 6.250459360222461e-05,
"loss": 0.0965,
"reward": -0.2660303530283272,
"reward_std": 0.5256764851510525,
"rewards/cosine_scaled_reward": -0.3205151781439781,
"rewards/format_reward": 0.3750000037252903,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 3226.5416870117188,
"epoch": 0.14457142857142857,
"grad_norm": 0.1071605458855629,
"kl": 0.5869140625,
"learning_rate": 6.219465344613258e-05,
"loss": 0.0028,
"reward": -0.21571965515613556,
"reward_std": 0.7407341748476028,
"rewards/cosine_scaled_reward": -0.2745265010744333,
"rewards/format_reward": 0.3333333395421505,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 3164.25,
"epoch": 0.14514285714285713,
"grad_norm": 0.07004442811012268,
"kl": 0.6025390625,
"learning_rate": 6.188436263278172e-05,
"loss": 0.0488,
"reward": -0.4852742440998554,
"reward_std": 0.4384588450193405,
"rewards/cosine_scaled_reward": -0.38847045600414276,
"rewards/format_reward": 0.2916666753590107,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 3331.75,
"epoch": 0.1457142857142857,
"grad_norm": 0.07318566739559174,
"kl": 0.70068359375,
"learning_rate": 6.157373628530852e-05,
"loss": 0.0552,
"reward": -0.26235504634678364,
"reward_std": 0.5362003445625305,
"rewards/cosine_scaled_reward": -0.23534419387578964,
"rewards/format_reward": 0.2083333358168602,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 3578.0833740234375,
"epoch": 0.1462857142857143,
"grad_norm": 0.08398139476776123,
"kl": 0.5537109375,
"learning_rate": 6.126278954320295e-05,
"loss": 0.0253,
"reward": -0.49905257299542427,
"reward_std": 0.4306683763861656,
"rewards/cosine_scaled_reward": -0.31202628277242184,
"rewards/format_reward": 0.1250000037252903,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 3329.2500610351562,
"epoch": 0.14685714285714285,
"grad_norm": 0.06330101937055588,
"kl": 0.54541015625,
"learning_rate": 6.095153756157051e-05,
"loss": 0.0436,
"reward": -0.006688140332698822,
"reward_std": 0.640820337459445,
"rewards/cosine_scaled_reward": -0.10751075111329556,
"rewards/format_reward": 0.2083333358168602,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2738.2083740234375,
"epoch": 0.14742857142857144,
"grad_norm": 0.08145475387573242,
"kl": 0.60498046875,
"learning_rate": 6.06399955103937e-05,
"loss": 0.2035,
"reward": -0.2508790194988251,
"reward_std": 0.4752666652202606,
"rewards/cosine_scaled_reward": -0.2504395004361868,
"rewards/format_reward": 0.2500000074505806,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 3575.5416870117188,
"epoch": 0.148,
"grad_norm": 0.0701456069946289,
"kl": 0.49853515625,
"learning_rate": 6.032817857379256e-05,
"loss": 0.023,
"reward": -0.540546678006649,
"reward_std": 0.3376801423728466,
"rewards/cosine_scaled_reward": -0.3327733352780342,
"rewards/format_reward": 0.1250000037252903,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 3471.7083740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.06758905947208405,
"kl": 0.5185546875,
"learning_rate": 6.001610194928464e-05,
"loss": 0.0384,
"reward": -0.6148700146004558,
"reward_std": 0.25194124691188335,
"rewards/cosine_scaled_reward": -0.3491016775369644,
"rewards/format_reward": 0.0833333358168602,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.14914285714285713,
"grad_norm": 0.10130701959133148,
"kl": 0.428466796875,
"learning_rate": 5.970378084704441e-05,
"loss": 0.0172,
"reward": -0.6753373667597771,
"reward_std": 0.3040266986936331,
"rewards/cosine_scaled_reward": -0.37933534011244774,
"rewards/format_reward": 0.0833333358168602,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 3520.8333740234375,
"epoch": 0.14971428571428572,
"grad_norm": 0.05124128237366676,
"kl": 0.4736328125,
"learning_rate": 5.9391230489161734e-05,
"loss": 0.0285,
"reward": -0.4647176805883646,
"reward_std": 0.43572363816201687,
"rewards/cosine_scaled_reward": -0.25319216772913933,
"rewards/format_reward": 0.0416666679084301,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 3579.2916870117188,
"epoch": 0.15028571428571427,
"grad_norm": 0.07916761189699173,
"kl": 0.34814453125,
"learning_rate": 5.907846610890012e-05,
"loss": 0.0166,
"reward": -0.4248216481646523,
"reward_std": 0.35384376160800457,
"rewards/cosine_scaled_reward": -0.23324416455579922,
"rewards/format_reward": 0.0416666679084301,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.15085714285714286,
"grad_norm": 0.04631157964468002,
"kl": 0.36572265625,
"learning_rate": 5.876550294995421e-05,
"loss": 0.0146,
"reward": -0.2554018050432205,
"reward_std": 0.4412471568211913,
"rewards/cosine_scaled_reward": -0.16936755925416946,
"rewards/format_reward": 0.0833333358168602,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3527.416748046875,
"epoch": 0.15142857142857144,
"grad_norm": 0.04877844080328941,
"kl": 0.343017578125,
"learning_rate": 5.8452356265706845e-05,
"loss": 0.0137,
"reward": -0.30376535654067993,
"reward_std": 0.33788828179240227,
"rewards/cosine_scaled_reward": -0.25604934617877007,
"rewards/format_reward": 0.2083333395421505,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 3505.2916870117188,
"epoch": 0.152,
"grad_norm": 0.05829893797636032,
"kl": 0.3427734375,
"learning_rate": 5.813904131848564e-05,
"loss": 0.0447,
"reward": -0.27650847285985947,
"reward_std": 0.3842029310762882,
"rewards/cosine_scaled_reward": -0.17992090061306953,
"rewards/format_reward": 0.0833333358168602,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2713.416748046875,
"epoch": 0.15257142857142858,
"grad_norm": 0.06513495743274689,
"kl": 0.4794921875,
"learning_rate": 5.782557337881911e-05,
"loss": 0.0555,
"reward": 0.09612545743584633,
"reward_std": 0.7547842487692833,
"rewards/cosine_scaled_reward": -0.22277061268687248,
"rewards/format_reward": 0.5416666865348816,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 3205.4583740234375,
"epoch": 0.15314285714285714,
"grad_norm": 0.053954530507326126,
"kl": 0.392578125,
"learning_rate": 5.751196772469237e-05,
"loss": 0.0786,
"reward": -0.2341192662715912,
"reward_std": 0.6096795275807381,
"rewards/cosine_scaled_reward": -0.2628929764032364,
"rewards/format_reward": 0.291666679084301,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 3527.5,
"epoch": 0.15371428571428572,
"grad_norm": 0.09402066469192505,
"kl": 0.4326171875,
"learning_rate": 5.719823964080261e-05,
"loss": 0.018,
"reward": -0.7792681828141212,
"reward_std": 0.16622583265416324,
"rewards/cosine_scaled_reward": -0.4313007518649101,
"rewards/format_reward": 0.0833333358168602,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2934.125,
"epoch": 0.15428571428571428,
"grad_norm": 0.08574939519166946,
"kl": 0.3681640625,
"learning_rate": 5.688440441781399e-05,
"loss": -0.0006,
"reward": 0.4598322659730911,
"reward_std": 0.5430615171790123,
"rewards/cosine_scaled_reward": 0.02158279437571764,
"rewards/format_reward": 0.4166666679084301,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 3205.2916870117188,
"epoch": 0.15485714285714286,
"grad_norm": 0.09958425909280777,
"kl": 0.35693359375,
"learning_rate": 5.657047735161256e-05,
"loss": 0.1366,
"reward": -0.3448881134390831,
"reward_std": 0.23123590275645256,
"rewards/cosine_scaled_reward": -0.25577738881111145,
"rewards/format_reward": 0.1666666716337204,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2611.3334197998047,
"epoch": 0.15542857142857142,
"grad_norm": 0.03576628491282463,
"kl": 0.36773681640625,
"learning_rate": 5.6256473742560614e-05,
"loss": 0.1024,
"reward": 0.011561892926692963,
"reward_std": 0.3712347708642483,
"rewards/cosine_scaled_reward": -0.26505238376557827,
"rewards/format_reward": 0.5416666679084301,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 3323.2500610351562,
"epoch": 0.156,
"grad_norm": 0.06556365638971329,
"kl": 0.4912109375,
"learning_rate": 5.594240889475107e-05,
"loss": 0.0166,
"reward": -0.4284048527479172,
"reward_std": 0.520957512781024,
"rewards/cosine_scaled_reward": -0.25586907658725977,
"rewards/format_reward": 0.0833333358168602,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.15657142857142858,
"grad_norm": 0.03841016814112663,
"kl": 0.299072265625,
"learning_rate": 5.5628298115261545e-05,
"loss": 0.012,
"reward": -0.7124607469886541,
"reward_std": 0.3880233308300376,
"rewards/cosine_scaled_reward": -0.37706371024250984,
"rewards/format_reward": 0.0416666679084301,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 3276.7916870117188,
"epoch": 0.15714285714285714,
"grad_norm": 0.07038947939872742,
"kl": 0.271728515625,
"learning_rate": 5.5314156713408275e-05,
"loss": -0.0357,
"reward": 0.1667029708623886,
"reward_std": 0.4187759216874838,
"rewards/cosine_scaled_reward": -0.041648514568805695,
"rewards/format_reward": 0.25,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 3108.1250610351562,
"epoch": 0.15771428571428572,
"grad_norm": 0.046657588332891464,
"kl": 0.3857421875,
"learning_rate": 5.500000000000001e-05,
"loss": 0.0822,
"reward": -0.2098342329263687,
"reward_std": 0.7249854430556297,
"rewards/cosine_scaled_reward": -0.2299171146005392,
"rewards/format_reward": 0.2500000074505806,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 3534.1666870117188,
"epoch": 0.15828571428571428,
"grad_norm": 0.045905984938144684,
"kl": 0.3125,
"learning_rate": 5.468584328659173e-05,
"loss": 0.0342,
"reward": -0.498178094625473,
"reward_std": 0.19802542310208082,
"rewards/cosine_scaled_reward": -0.2490890473127365,
"rewards/format_reward": 0.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 3167.0416870117188,
"epoch": 0.15885714285714286,
"grad_norm": 0.10243193805217743,
"kl": 0.2861328125,
"learning_rate": 5.4371701884738466e-05,
"loss": 0.1193,
"reward": -0.42556126043200493,
"reward_std": 0.3831705767661333,
"rewards/cosine_scaled_reward": -0.2961139716207981,
"rewards/format_reward": 0.1666666716337204,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 3515.5833740234375,
"epoch": 0.15942857142857142,
"grad_norm": 0.046917758882045746,
"kl": 0.27294921875,
"learning_rate": 5.405759110524894e-05,
"loss": 0.0402,
"reward": -0.1426243856549263,
"reward_std": 0.6854961533099413,
"rewards/cosine_scaled_reward": -0.1754788588732481,
"rewards/format_reward": 0.2083333395421505,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 3225.9583740234375,
"epoch": 0.16,
"grad_norm": 0.05153701454401016,
"kl": 0.27978515625,
"learning_rate": 5.374352625743941e-05,
"loss": -0.0109,
"reward": -0.4667184352874756,
"reward_std": 0.24931692145764828,
"rewards/cosine_scaled_reward": -0.3583592250943184,
"rewards/format_reward": 0.25,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 3036.5834350585938,
"epoch": 0.16057142857142856,
"grad_norm": 0.06122641637921333,
"kl": 0.36865234375,
"learning_rate": 5.342952264838747e-05,
"loss": 0.0865,
"reward": 0.47959111630916595,
"reward_std": 0.37914127111434937,
"rewards/cosine_scaled_reward": 0.010628907009959221,
"rewards/format_reward": 0.4583333395421505,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 3430.25,
"epoch": 0.16114285714285714,
"grad_norm": 0.05524813011288643,
"kl": 0.24755859375,
"learning_rate": 5.311559558218603e-05,
"loss": 0.035,
"reward": 0.04076346941292286,
"reward_std": 0.4443469550460577,
"rewards/cosine_scaled_reward": -0.08378491457551718,
"rewards/format_reward": 0.2083333432674408,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 3506.1250610351562,
"epoch": 0.16171428571428573,
"grad_norm": 0.03167405351996422,
"kl": 0.360107421875,
"learning_rate": 5.28017603591974e-05,
"loss": 0.047,
"reward": -0.20051036775112152,
"reward_std": 0.7524923011660576,
"rewards/cosine_scaled_reward": -0.24608852714300156,
"rewards/format_reward": 0.2916666716337204,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2803.375,
"epoch": 0.16228571428571428,
"grad_norm": 0.0590810589492321,
"kl": 0.223876953125,
"learning_rate": 5.248803227530763e-05,
"loss": 0.1235,
"reward": -0.12046916782855988,
"reward_std": 0.4706810973584652,
"rewards/cosine_scaled_reward": -0.18523459136486053,
"rewards/format_reward": 0.25,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 3491.4166870117188,
"epoch": 0.16285714285714287,
"grad_norm": 0.09945899248123169,
"kl": 0.260009765625,
"learning_rate": 5.2174426621180906e-05,
"loss": 0.0514,
"reward": -0.5829556360840797,
"reward_std": 0.20706506725400686,
"rewards/cosine_scaled_reward": -0.33314448967576027,
"rewards/format_reward": 0.0833333358168602,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2982.8750610351562,
"epoch": 0.16342857142857142,
"grad_norm": 0.18956300616264343,
"kl": 0.47900390625,
"learning_rate": 5.186095868151436e-05,
"loss": 0.0642,
"reward": 0.18999171257019043,
"reward_std": 0.8487022221088409,
"rewards/cosine_scaled_reward": -0.11333749070763588,
"rewards/format_reward": 0.4166666828095913,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 2949.4584045410156,
"epoch": 0.164,
"grad_norm": 0.0727335512638092,
"kl": 0.43017578125,
"learning_rate": 5.154764373429316e-05,
"loss": 0.0658,
"reward": 0.3310265392065048,
"reward_std": 0.8132697474211454,
"rewards/cosine_scaled_reward": -0.021986715495586395,
"rewards/format_reward": 0.3750000149011612,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.16457142857142856,
"grad_norm": 0.05997053533792496,
"kl": 0.27587890625,
"learning_rate": 5.1234497050045814e-05,
"loss": 0.011,
"reward": -0.5673756748437881,
"reward_std": 0.2968660295009613,
"rewards/cosine_scaled_reward": -0.3045211657881737,
"rewards/format_reward": 0.0416666679084301,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.16514285714285715,
"grad_norm": 0.06633277982473373,
"kl": 0.30517578125,
"learning_rate": 5.0921533891099905e-05,
"loss": 0.0122,
"reward": -0.6890946179628372,
"reward_std": 0.15950028970837593,
"rewards/cosine_scaled_reward": -0.3445473089814186,
"rewards/format_reward": 0.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 3561.7916870117188,
"epoch": 0.1657142857142857,
"grad_norm": 0.040217384696006775,
"kl": 0.274169921875,
"learning_rate": 5.0608769510838284e-05,
"loss": 0.0149,
"reward": -0.15157588943839073,
"reward_std": 0.3089019572362304,
"rewards/cosine_scaled_reward": -0.1382879503071308,
"rewards/format_reward": 0.125,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1662857142857143,
"grad_norm": 0.07168910652399063,
"kl": 0.314453125,
"learning_rate": 5.0296219152955604e-05,
"loss": 0.0126,
"reward": -0.4608482411131263,
"reward_std": 0.1107195196673274,
"rewards/cosine_scaled_reward": -0.23042412823997438,
"rewards/format_reward": 0.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 3516.125,
"epoch": 0.16685714285714287,
"grad_norm": 0.04790537431836128,
"kl": 0.427734375,
"learning_rate": 4.998389805071536e-05,
"loss": 0.0372,
"reward": -0.40456270426511765,
"reward_std": 0.47755980491638184,
"rewards/cosine_scaled_reward": -0.2647813465446234,
"rewards/format_reward": 0.125,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 3265.7500610351562,
"epoch": 0.16742857142857143,
"grad_norm": 0.050319403409957886,
"kl": 0.43408203125,
"learning_rate": 4.9671821426207455e-05,
"loss": 0.0514,
"reward": 0.42125577852129936,
"reward_std": 0.8351590689271688,
"rewards/cosine_scaled_reward": 0.04396123066544533,
"rewards/format_reward": 0.3333333432674408,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 3466.5833740234375,
"epoch": 0.168,
"grad_norm": 0.05146944150328636,
"kl": 0.3720703125,
"learning_rate": 4.936000448960631e-05,
"loss": 0.029,
"reward": -0.271098967641592,
"reward_std": 0.37742302753031254,
"rewards/cosine_scaled_reward": -0.19804948195815086,
"rewards/format_reward": 0.125,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 3577.125,
"epoch": 0.16857142857142857,
"grad_norm": 0.06401149928569794,
"kl": 0.409423828125,
"learning_rate": 4.904846243842949e-05,
"loss": 0.0203,
"reward": -0.46894958056509495,
"reward_std": 0.1929320227354765,
"rewards/cosine_scaled_reward": -0.2553081316873431,
"rewards/format_reward": 0.0416666679084301,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 3085.9166870117188,
"epoch": 0.16914285714285715,
"grad_norm": 3.077998161315918,
"kl": 10.78662109375,
"learning_rate": 4.873721045679707e-05,
"loss": 0.2368,
"reward": -0.45518723130226135,
"reward_std": 0.34384622564539313,
"rewards/cosine_scaled_reward": -0.3109269514679909,
"rewards/format_reward": 0.1666666716337204,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 3572.3333740234375,
"epoch": 0.1697142857142857,
"grad_norm": 0.029030799865722656,
"kl": 0.24560546875,
"learning_rate": 4.842626371469149e-05,
"loss": 0.0127,
"reward": -0.0029055774211883545,
"reward_std": 0.9067392088472843,
"rewards/cosine_scaled_reward": -0.14728612639009953,
"rewards/format_reward": 0.2916666679084301,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 3539.1666870117188,
"epoch": 0.1702857142857143,
"grad_norm": 0.03600252792239189,
"kl": 0.282470703125,
"learning_rate": 4.811563736721829e-05,
"loss": 0.0161,
"reward": -0.2380085289478302,
"reward_std": 0.7847420740872622,
"rewards/cosine_scaled_reward": -0.2231709435582161,
"rewards/format_reward": 0.2083333395421505,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 3420.75,
"epoch": 0.17085714285714285,
"grad_norm": 0.04998674616217613,
"kl": 0.34716796875,
"learning_rate": 4.780534655386744e-05,
"loss": 0.0751,
"reward": -0.060495853424072266,
"reward_std": 0.3135654963552952,
"rewards/cosine_scaled_reward": -0.09274792857468128,
"rewards/format_reward": 0.125,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 3197.041748046875,
"epoch": 0.17142857142857143,
"grad_norm": 0.04269924387335777,
"kl": 0.295654296875,
"learning_rate": 4.74954063977754e-05,
"loss": 0.0324,
"reward": -0.26061324402689934,
"reward_std": 0.29013217613101006,
"rewards/cosine_scaled_reward": -0.3178066350519657,
"rewards/format_reward": 0.375,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 3308.5,
"epoch": 0.172,
"grad_norm": 0.061065103858709335,
"kl": 0.324951171875,
"learning_rate": 4.718583200498814e-05,
"loss": 0.0757,
"reward": -0.6140761077404022,
"reward_std": 0.3115503266453743,
"rewards/cosine_scaled_reward": -0.3903713934123516,
"rewards/format_reward": 0.1666666716337204,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2942.25,
"epoch": 0.17257142857142857,
"grad_norm": 0.0670550987124443,
"kl": 0.328369140625,
"learning_rate": 4.687663846372481e-05,
"loss": 0.0489,
"reward": -0.023671671748161316,
"reward_std": 0.2200901247560978,
"rewards/cosine_scaled_reward": -0.15766918659210205,
"rewards/format_reward": 0.2916666679084301,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 3216.2916870117188,
"epoch": 0.17314285714285715,
"grad_norm": 0.12610213458538055,
"kl": 0.3369140625,
"learning_rate": 4.6567840843642384e-05,
"loss": 0.0942,
"reward": 0.028468750417232513,
"reward_std": 0.4689778573811054,
"rewards/cosine_scaled_reward": -0.08993229269981384,
"rewards/format_reward": 0.2083333432674408,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 3306.7500610351562,
"epoch": 0.1737142857142857,
"grad_norm": 0.04432929679751396,
"kl": 0.35595703125,
"learning_rate": 4.6259454195101274e-05,
"loss": 0.0864,
"reward": -0.35086609423160553,
"reward_std": 0.286039125174284,
"rewards/cosine_scaled_reward": -0.2795997243374586,
"rewards/format_reward": 0.2083333395421505,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1742857142857143,
"grad_norm": 0.04625676944851875,
"kl": 0.45068359375,
"learning_rate": 4.5951493548431603e-05,
"loss": 0.018,
"reward": -0.6459367321804166,
"reward_std": 0.4608680563978851,
"rewards/cosine_scaled_reward": -0.36463503539562225,
"rewards/format_reward": 0.0833333358168602,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 2516.0833435058594,
"epoch": 0.17485714285714285,
"grad_norm": 0.11857768148183823,
"kl": 0.3681640625,
"learning_rate": 4.564397391320084e-05,
"loss": 0.0883,
"reward": 0.5657532401382923,
"reward_std": 0.4573374604806304,
"rewards/cosine_scaled_reward": 0.032876621931791306,
"rewards/format_reward": 0.5000000111758709,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17542857142857143,
"grad_norm": 0.07965458929538727,
"kl": 0.306884765625,
"learning_rate": 4.5336910277482156e-05,
"loss": 0.0123,
"reward": -0.8762749880552292,
"reward_std": 0.08273854246363044,
"rewards/cosine_scaled_reward": -0.4381374940276146,
"rewards/format_reward": 0.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2825.1666870117188,
"epoch": 0.176,
"grad_norm": 0.0597933791577816,
"kl": 0.296142578125,
"learning_rate": 4.503031760712397e-05,
"loss": -0.0245,
"reward": 0.3207015171647072,
"reward_std": 0.7405253425240517,
"rewards/cosine_scaled_reward": -0.006315924227237701,
"rewards/format_reward": 0.3333333358168602,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 3479.8333740234375,
"epoch": 0.17657142857142857,
"grad_norm": 0.05388667806982994,
"kl": 0.378173828125,
"learning_rate": 4.47242108450205e-05,
"loss": 0.0445,
"reward": -0.20779564417898655,
"reward_std": 0.20239176135510206,
"rewards/cosine_scaled_reward": -0.16639782628044486,
"rewards/format_reward": 0.125,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17714285714285713,
"grad_norm": 0.0844094380736351,
"kl": 0.225830078125,
"learning_rate": 4.4418604910383456e-05,
"loss": 0.009,
"reward": -0.8865186870098114,
"reward_std": 0.09305100329220295,
"rewards/cosine_scaled_reward": -0.4432593658566475,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 3314.3333740234375,
"epoch": 0.1777142857142857,
"grad_norm": 0.05528863146901131,
"kl": 0.37060546875,
"learning_rate": 4.411351469801496e-05,
"loss": 0.0433,
"reward": -0.24768365547060966,
"reward_std": 0.7055571936070919,
"rewards/cosine_scaled_reward": -0.24884184449911118,
"rewards/format_reward": 0.2500000037252903,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 3523.1666870117188,
"epoch": 0.1782857142857143,
"grad_norm": 0.04280995950102806,
"kl": 0.3310546875,
"learning_rate": 4.380895507758155e-05,
"loss": 0.0242,
"reward": -0.381958182901144,
"reward_std": 0.25565229170024395,
"rewards/cosine_scaled_reward": -0.2326457593590021,
"rewards/format_reward": 0.0833333358168602,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2733.875,
"epoch": 0.17885714285714285,
"grad_norm": 0.06423232704401016,
"kl": 0.25390625,
"learning_rate": 4.3504940892889434e-05,
"loss": -0.0337,
"reward": 0.036944784224033356,
"reward_std": 0.9110874142497778,
"rewards/cosine_scaled_reward": -0.14819425716996193,
"rewards/format_reward": 0.3333333358168602,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 3313.0416870117188,
"epoch": 0.17942857142857144,
"grad_norm": 0.07672199606895447,
"kl": 0.4697265625,
"learning_rate": 4.3201486961161094e-05,
"loss": 0.0886,
"reward": -0.19889018312096596,
"reward_std": 0.8323519490659237,
"rewards/cosine_scaled_reward": -0.22444510459899902,
"rewards/format_reward": 0.2500000037252903,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 3345.0833740234375,
"epoch": 0.18,
"grad_norm": 0.08122944086790085,
"kl": 0.54150390625,
"learning_rate": 4.289860807231305e-05,
"loss": 0.0666,
"reward": -0.32951921597123146,
"reward_std": 0.3918669559061527,
"rewards/cosine_scaled_reward": -0.20642626285552979,
"rewards/format_reward": 0.0833333358168602,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.18057142857142858,
"grad_norm": 0.04830171540379524,
"kl": 0.234619140625,
"learning_rate": 4.259631898823504e-05,
"loss": 0.0094,
"reward": -0.5842305850237608,
"reward_std": 0.18226368352770805,
"rewards/cosine_scaled_reward": -0.292115299962461,
"rewards/format_reward": 0.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 3418.375,
"epoch": 0.18114285714285713,
"grad_norm": 0.07951097935438156,
"kl": 0.27880859375,
"learning_rate": 4.229463444207056e-05,
"loss": 0.0673,
"reward": 1.2218952178955078e-06,
"reward_std": 0.9312711171805859,
"rewards/cosine_scaled_reward": -0.10416605323553085,
"rewards/format_reward": 0.2083333395421505,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 3141.750030517578,
"epoch": 0.18171428571428572,
"grad_norm": 0.05983598157763481,
"kl": 0.32373046875,
"learning_rate": 4.1993569137498776e-05,
"loss": 0.1124,
"reward": -0.1722477674484253,
"reward_std": 0.665809502825141,
"rewards/cosine_scaled_reward": -0.23195721581578255,
"rewards/format_reward": 0.2916666716337204,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 3271.9584350585938,
"epoch": 0.18228571428571427,
"grad_norm": 0.05965941771864891,
"kl": 0.322265625,
"learning_rate": 4.1693137748017916e-05,
"loss": 0.0532,
"reward": 0.3015955649316311,
"reward_std": 0.40980809181928635,
"rewards/cosine_scaled_reward": -0.0783689022064209,
"rewards/format_reward": 0.4583333358168602,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 3099.1666870117188,
"epoch": 0.18285714285714286,
"grad_norm": 0.08215689659118652,
"kl": 0.227294921875,
"learning_rate": 4.1393354916230006e-05,
"loss": -0.0244,
"reward": 0.0645943135023117,
"reward_std": 0.3934327196329832,
"rewards/cosine_scaled_reward": -0.09270285069942474,
"rewards/format_reward": 0.25,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 3561.2083740234375,
"epoch": 0.18342857142857144,
"grad_norm": 0.0447857603430748,
"kl": 0.33349609375,
"learning_rate": 4.109423525312738e-05,
"loss": 0.0264,
"reward": -0.6538648195564747,
"reward_std": 0.1470047291368246,
"rewards/cosine_scaled_reward": -0.32693241722881794,
"rewards/format_reward": 0.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.184,
"grad_norm": 0.0734485313296318,
"kl": 0.2578125,
"learning_rate": 4.079579333738039e-05,
"loss": 0.0103,
"reward": -0.3972032852470875,
"reward_std": 0.285872345790267,
"rewards/cosine_scaled_reward": -0.21943498216569424,
"rewards/format_reward": 0.0416666679084301,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 3581.2500610351562,
"epoch": 0.18457142857142858,
"grad_norm": 0.03482038900256157,
"kl": 0.299072265625,
"learning_rate": 4.049804371462701e-05,
"loss": 0.0135,
"reward": -0.3774775490164757,
"reward_std": 0.3965078853070736,
"rewards/cosine_scaled_reward": -0.25123877450823784,
"rewards/format_reward": 0.1250000037252903,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 3514.4166870117188,
"epoch": 0.18514285714285714,
"grad_norm": 0.06719670444726944,
"kl": 0.283935546875,
"learning_rate": 4.0201000896763766e-05,
"loss": 0.0481,
"reward": -0.4587059337645769,
"reward_std": 0.5845797648653388,
"rewards/cosine_scaled_reward": -0.3126862980425358,
"rewards/format_reward": 0.1666666679084301,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2928.7916717529297,
"epoch": 0.18571428571428572,
"grad_norm": 0.03646574914455414,
"kl": 0.281982421875,
"learning_rate": 3.9904679361238525e-05,
"loss": 0.0589,
"reward": -0.3672609478235245,
"reward_std": 0.27671839017421007,
"rewards/cosine_scaled_reward": -0.3086304762400687,
"rewards/format_reward": 0.25,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2766.958335876465,
"epoch": 0.18628571428571428,
"grad_norm": 0.052393004298210144,
"kl": 0.26123046875,
"learning_rate": 3.960909355034491e-05,
"loss": 0.0333,
"reward": 0.0963091105222702,
"reward_std": 0.6041255034506321,
"rewards/cosine_scaled_reward": -0.1393454596400261,
"rewards/format_reward": 0.3750000037252903,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 3330.875,
"epoch": 0.18685714285714286,
"grad_norm": 0.04328332468867302,
"kl": 0.28857421875,
"learning_rate": 3.9314257870518325e-05,
"loss": 0.0722,
"reward": -0.4075555991148576,
"reward_std": 0.20874548598658293,
"rewards/cosine_scaled_reward": -0.24544446932850406,
"rewards/format_reward": 0.0833333358168602,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2778.7916870117188,
"epoch": 0.18742857142857142,
"grad_norm": 0.045754820108413696,
"kl": 0.18963623046875,
"learning_rate": 3.902018669163384e-05,
"loss": 0.0114,
"reward": -0.1832116525620222,
"reward_std": 0.5292581329122186,
"rewards/cosine_scaled_reward": -0.27910582162439823,
"rewards/format_reward": 0.375,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 3457.6666870117188,
"epoch": 0.188,
"grad_norm": 0.0493839792907238,
"kl": 0.36279296875,
"learning_rate": 3.872689434630585e-05,
"loss": 0.0534,
"reward": -0.25419315695762634,
"reward_std": 0.5991159714758396,
"rewards/cosine_scaled_reward": -0.23126322403550148,
"rewards/format_reward": 0.2083333358168602,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 3402.1666870117188,
"epoch": 0.18857142857142858,
"grad_norm": 0.10224326699972153,
"kl": 0.270751953125,
"learning_rate": 3.843439512918949e-05,
"loss": 0.0955,
"reward": -0.6764328852295876,
"reward_std": 0.2557358928024769,
"rewards/cosine_scaled_reward": -0.3590497747063637,
"rewards/format_reward": 0.0416666679084301,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2853.291717529297,
"epoch": 0.18914285714285714,
"grad_norm": 0.06347351521253586,
"kl": 0.278076171875,
"learning_rate": 3.814270329628396e-05,
"loss": 0.1003,
"reward": 0.17456289008259773,
"reward_std": 0.23762040957808495,
"rewards/cosine_scaled_reward": -0.07938522100448608,
"rewards/format_reward": 0.3333333358168602,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 3217.2083740234375,
"epoch": 0.18971428571428572,
"grad_norm": 0.08903349936008453,
"kl": 0.453369140625,
"learning_rate": 3.785183306423768e-05,
"loss": 0.0416,
"reward": -0.23864453844726086,
"reward_std": 0.47026437893509865,
"rewards/cosine_scaled_reward": -0.2859889483079314,
"rewards/format_reward": 0.3333333432674408,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 3260.7083740234375,
"epoch": 0.19028571428571428,
"grad_norm": 0.07358856499195099,
"kl": 0.28271484375,
"learning_rate": 3.756179860965538e-05,
"loss": 0.0064,
"reward": -0.38410256803035736,
"reward_std": 0.468995469622314,
"rewards/cosine_scaled_reward": -0.3170512933284044,
"rewards/format_reward": 0.2500000111758709,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.19085714285714286,
"grad_norm": 0.05134163424372673,
"kl": 0.31591796875,
"learning_rate": 3.7272614068407205e-05,
"loss": 0.0127,
"reward": -0.6644355654716492,
"reward_std": 0.20398546569049358,
"rewards/cosine_scaled_reward": -0.3322177827358246,
"rewards/format_reward": 0.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2896.4583740234375,
"epoch": 0.19142857142857142,
"grad_norm": 0.15725326538085938,
"kl": 0.28125,
"learning_rate": 3.698429353493974e-05,
"loss": 0.193,
"reward": -0.16589602641761303,
"reward_std": 0.39419085811823606,
"rewards/cosine_scaled_reward": -0.2496146857738495,
"rewards/format_reward": 0.3333333432674408,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 3211.2916870117188,
"epoch": 0.192,
"grad_norm": 0.04808633401989937,
"kl": 0.306396484375,
"learning_rate": 3.6696851061589e-05,
"loss": 0.0454,
"reward": 0.04493848606944084,
"reward_std": 0.9861778020858765,
"rewards/cosine_scaled_reward": -0.12336408998817205,
"rewards/format_reward": 0.2916666753590107,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2941.4166717529297,
"epoch": 0.19257142857142856,
"grad_norm": 0.1766992211341858,
"kl": 0.259521484375,
"learning_rate": 3.6410300657895626e-05,
"loss": -0.0732,
"reward": 0.014326110482215881,
"reward_std": 0.17792627471499145,
"rewards/cosine_scaled_reward": -0.11783694475889206,
"rewards/format_reward": 0.25,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2701.75,
"epoch": 0.19314285714285714,
"grad_norm": 0.175528883934021,
"kl": 0.356689453125,
"learning_rate": 3.6124656289922034e-05,
"loss": 0.1389,
"reward": 0.20876749278977513,
"reward_std": 0.7206097654998302,
"rewards/cosine_scaled_reward": -0.08311627432703972,
"rewards/format_reward": 0.3750000037252903,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 3575.0,
"epoch": 0.19371428571428573,
"grad_norm": 0.036193348467350006,
"kl": 0.239501953125,
"learning_rate": 3.583993187957173e-05,
"loss": 0.0088,
"reward": -0.37282892875373363,
"reward_std": 0.49462850391864777,
"rewards/cosine_scaled_reward": -0.2280811471864581,
"rewards/format_reward": 0.0833333358168602,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2417.3333587646484,
"epoch": 0.19428571428571428,
"grad_norm": 0.13114678859710693,
"kl": 0.198974609375,
"learning_rate": 3.5556141303910795e-05,
"loss": 0.158,
"reward": 0.1652292013168335,
"reward_std": 0.36829282343387604,
"rewards/cosine_scaled_reward": -0.16738539934158325,
"rewards/format_reward": 0.5,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 2132.500030517578,
"epoch": 0.19485714285714287,
"grad_norm": 0.14688146114349365,
"kl": 0.278076171875,
"learning_rate": 3.5273298394491515e-05,
"loss": 0.2139,
"reward": 0.1563252117484808,
"reward_std": 0.2888243719935417,
"rewards/cosine_scaled_reward": -0.23433741927146912,
"rewards/format_reward": 0.6250000149011612,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2552.0416870117188,
"epoch": 0.19542857142857142,
"grad_norm": 0.08089858293533325,
"kl": 0.278076171875,
"learning_rate": 3.499141693667828e-05,
"loss": -0.0431,
"reward": 0.3262595981359482,
"reward_std": 0.29629753855988383,
"rewards/cosine_scaled_reward": -0.02437019906938076,
"rewards/format_reward": 0.375,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 2731.000045776367,
"epoch": 0.196,
"grad_norm": 0.08294232934713364,
"kl": 0.3438720703125,
"learning_rate": 3.4710510668975624e-05,
"loss": -0.009,
"reward": 0.140800341963768,
"reward_std": 0.30971661023795605,
"rewards/cosine_scaled_reward": -0.0754331611096859,
"rewards/format_reward": 0.2916666679084301,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 3287.0000610351562,
"epoch": 0.19657142857142856,
"grad_norm": 0.07776912301778793,
"kl": 0.28564453125,
"learning_rate": 3.443059328235878e-05,
"loss": 0.0645,
"reward": -0.010377008467912674,
"reward_std": 0.8143371604382992,
"rewards/cosine_scaled_reward": -0.13018852844834328,
"rewards/format_reward": 0.2500000074505806,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 2312.4166870117188,
"epoch": 0.19714285714285715,
"grad_norm": 0.5134606957435608,
"kl": 0.472900390625,
"learning_rate": 3.415167841960624e-05,
"loss": -0.0386,
"reward": 0.507152209058404,
"reward_std": 0.5029881596565247,
"rewards/cosine_scaled_reward": -0.01725723221898079,
"rewards/format_reward": 0.5416666679084301,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1977142857142857,
"grad_norm": 0.032926544547080994,
"kl": 0.202880859375,
"learning_rate": 3.387377967463493e-05,
"loss": 0.0081,
"reward": -0.500700056552887,
"reward_std": 0.22377115488052368,
"rewards/cosine_scaled_reward": -0.2503500273451209,
"rewards/format_reward": 0.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 3344.166748046875,
"epoch": 0.1982857142857143,
"grad_norm": 0.06091040372848511,
"kl": 0.4443359375,
"learning_rate": 3.359691059183761e-05,
"loss": 0.0376,
"reward": -0.43880724161863327,
"reward_std": 0.5309533849358559,
"rewards/cosine_scaled_reward": -0.2819036263972521,
"rewards/format_reward": 0.1250000037252903,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 2690.125045776367,
"epoch": 0.19885714285714284,
"grad_norm": 0.15880349278450012,
"kl": 0.33056640625,
"learning_rate": 3.3321084665422807e-05,
"loss": 0.0083,
"reward": -0.3709861980751157,
"reward_std": 0.3275773096829653,
"rewards/cosine_scaled_reward": -0.3938264362514019,
"rewards/format_reward": 0.4166666865348816,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.19942857142857143,
"grad_norm": 0.04959436133503914,
"kl": 0.341796875,
"learning_rate": 3.304631533875703e-05,
"loss": 0.0137,
"reward": -0.4646516516804695,
"reward_std": 0.5127328485250473,
"rewards/cosine_scaled_reward": -0.25315913930535316,
"rewards/format_reward": 0.0416666679084301,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2546.4583740234375,
"epoch": 0.2,
"grad_norm": 0.08026839792728424,
"kl": 0.325927734375,
"learning_rate": 3.2772616003709614e-05,
"loss": 0.1587,
"reward": 0.4196392893791199,
"reward_std": 0.3610045984387398,
"rewards/cosine_scaled_reward": -0.01934703439474106,
"rewards/format_reward": 0.4583333395421505,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 3361.7083740234375,
"epoch": 0.20057142857142857,
"grad_norm": 0.036866363137960434,
"kl": 0.319091796875,
"learning_rate": 3.250000000000001e-05,
"loss": 0.0212,
"reward": -0.3742051422595978,
"reward_std": 0.3687475919723511,
"rewards/cosine_scaled_reward": -0.3329359143972397,
"rewards/format_reward": 0.291666679084301,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2882.4166717529297,
"epoch": 0.20114285714285715,
"grad_norm": 0.1847529113292694,
"kl": 0.2890625,
"learning_rate": 3.222848061454764e-05,
"loss": 0.1266,
"reward": -0.06669257394969463,
"reward_std": 0.5095125660300255,
"rewards/cosine_scaled_reward": -0.13751295860856771,
"rewards/format_reward": 0.2083333432674408,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2834.916717529297,
"epoch": 0.2017142857142857,
"grad_norm": 0.2114069163799286,
"kl": 0.1923828125,
"learning_rate": 3.195807108082429e-05,
"loss": 0.1295,
"reward": 0.1316913142800331,
"reward_std": 0.5846901014447212,
"rewards/cosine_scaled_reward": -0.12165435403585434,
"rewards/format_reward": 0.3750000149011612,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 3222.8750610351562,
"epoch": 0.2022857142857143,
"grad_norm": 0.08327220380306244,
"kl": 0.41943359375,
"learning_rate": 3.168878457820915e-05,
"loss": 0.1263,
"reward": -0.2932204008102417,
"reward_std": 0.5368962581269443,
"rewards/cosine_scaled_reward": -0.2924435433524195,
"rewards/format_reward": 0.2916666716337204,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2713.541717529297,
"epoch": 0.20285714285714285,
"grad_norm": 0.0538349449634552,
"kl": 0.21435546875,
"learning_rate": 3.1420634231346445e-05,
"loss": 0.0415,
"reward": 0.24344536662101746,
"reward_std": 0.3011304475367069,
"rewards/cosine_scaled_reward": -0.08661067485809326,
"rewards/format_reward": 0.4166666865348816,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 2710.7916870117188,
"epoch": 0.20342857142857143,
"grad_norm": 0.0692262351512909,
"kl": 0.243896484375,
"learning_rate": 3.1153633109505784e-05,
"loss": 0.0255,
"reward": 0.8570400476455688,
"reward_std": 0.7603759318590164,
"rewards/cosine_scaled_reward": 0.17851997911930084,
"rewards/format_reward": 0.5000000111758709,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 3192.1250610351562,
"epoch": 0.204,
"grad_norm": 0.09190493822097778,
"kl": 0.352783203125,
"learning_rate": 3.088779422594514e-05,
"loss": -0.008,
"reward": -0.4519059807062149,
"reward_std": 0.3100459352135658,
"rewards/cosine_scaled_reward": -0.39261969178915024,
"rewards/format_reward": 0.3333333358168602,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2996.1666870117188,
"epoch": 0.20457142857142857,
"grad_norm": 0.07240985333919525,
"kl": 0.216064453125,
"learning_rate": 3.062313053727671e-05,
"loss": -0.0162,
"reward": 0.539523258805275,
"reward_std": 0.5989483781158924,
"rewards/cosine_scaled_reward": 0.10309496521949768,
"rewards/format_reward": 0.3333333358168602,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2649.0000610351562,
"epoch": 0.20514285714285715,
"grad_norm": 0.13966155052185059,
"kl": 0.431640625,
"learning_rate": 3.0359654942835248e-05,
"loss": 0.1862,
"reward": -0.014458760619163513,
"reward_std": 0.6829442456364632,
"rewards/cosine_scaled_reward": -0.1530627132160589,
"rewards/format_reward": 0.2916666753590107,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 3582.5416870117188,
"epoch": 0.2057142857142857,
"grad_norm": 0.05827893316745758,
"kl": 0.3515625,
"learning_rate": 3.0097380284049527e-05,
"loss": 0.0144,
"reward": -0.40928656980395317,
"reward_std": 0.5495708473026752,
"rewards/cosine_scaled_reward": -0.30880994349718094,
"rewards/format_reward": 0.2083333395421505,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 3319.6666870117188,
"epoch": 0.2062857142857143,
"grad_norm": 0.07192689180374146,
"kl": 0.47998046875,
"learning_rate": 2.98363193438164e-05,
"loss": 0.085,
"reward": -0.28407811373472214,
"reward_std": 0.5651301890611649,
"rewards/cosine_scaled_reward": -0.20453906618058681,
"rewards/format_reward": 0.125,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 3341.8750610351562,
"epoch": 0.20685714285714285,
"grad_norm": 0.04843948781490326,
"kl": 0.41845703125,
"learning_rate": 2.9576484845877794e-05,
"loss": 0.0374,
"reward": -0.03159081190824509,
"reward_std": 0.49874855391681194,
"rewards/cosine_scaled_reward": -0.11996208503842354,
"rewards/format_reward": 0.2083333395421505,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 3009.625030517578,
"epoch": 0.20742857142857143,
"grad_norm": 0.08178125321865082,
"kl": 0.314453125,
"learning_rate": 2.931788945420058e-05,
"loss": -0.019,
"reward": 0.15126367658376694,
"reward_std": 0.3955496810376644,
"rewards/cosine_scaled_reward": -0.0910348454490304,
"rewards/format_reward": 0.3333333358168602,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 2873.0833435058594,
"epoch": 0.208,
"grad_norm": 0.03954126313328743,
"kl": 0.21868896484375,
"learning_rate": 2.906054577235931e-05,
"loss": 0.0505,
"reward": -0.4637039601802826,
"reward_std": 0.08979167556390166,
"rewards/cosine_scaled_reward": -0.35685197822749615,
"rewards/format_reward": 0.25,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 3243.7916870117188,
"epoch": 0.20857142857142857,
"grad_norm": 0.050534144043922424,
"kl": 0.343994140625,
"learning_rate": 2.880446634292199e-05,
"loss": 0.0642,
"reward": -0.2382236891426146,
"reward_std": 0.3782954253256321,
"rewards/cosine_scaled_reward": -0.2649451866745949,
"rewards/format_reward": 0.2916666716337204,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.20914285714285713,
"grad_norm": 0.038890041410923004,
"kl": 0.322021484375,
"learning_rate": 2.854966364683872e-05,
"loss": 0.0129,
"reward": -0.7458789497613907,
"reward_std": 0.2065310850739479,
"rewards/cosine_scaled_reward": -0.39377281069755554,
"rewards/format_reward": 0.0416666679084301,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2880.2083435058594,
"epoch": 0.20971428571428571,
"grad_norm": 0.15094637870788574,
"kl": 0.240478515625,
"learning_rate": 2.829615010283344e-05,
"loss": 0.0793,
"reward": 0.01062861829996109,
"reward_std": 0.17077413201332092,
"rewards/cosine_scaled_reward": -0.11968569085001945,
"rewards/format_reward": 0.25,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 2936.0,
"epoch": 0.2102857142857143,
"grad_norm": 0.03814147785305977,
"kl": 0.2568359375,
"learning_rate": 2.8043938066798646e-05,
"loss": 0.0351,
"reward": -0.06382668018341064,
"reward_std": 0.13458664249628782,
"rewards/cosine_scaled_reward": -0.15691335499286652,
"rewards/format_reward": 0.25,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 2985.25,
"epoch": 0.21085714285714285,
"grad_norm": 0.2742946147918701,
"kl": 0.27294921875,
"learning_rate": 2.7793039831193136e-05,
"loss": -0.0924,
"reward": -0.20023366808891296,
"reward_std": 0.6205369587987661,
"rewards/cosine_scaled_reward": -0.2459501512348652,
"rewards/format_reward": 0.2916666679084301,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 3553.2916870117188,
"epoch": 0.21142857142857144,
"grad_norm": 0.03684834763407707,
"kl": 0.23046875,
"learning_rate": 2.754346762444296e-05,
"loss": 0.0207,
"reward": -0.3706485256552696,
"reward_std": 0.5973577741533518,
"rewards/cosine_scaled_reward": -0.2478242591023445,
"rewards/format_reward": 0.1250000037252903,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 2341.5416870117188,
"epoch": 0.212,
"grad_norm": 0.0773259699344635,
"kl": 0.28411865234375,
"learning_rate": 2.729523361034538e-05,
"loss": 0.0904,
"reward": 0.24746574461460114,
"reward_std": 0.3304491974413395,
"rewards/cosine_scaled_reward": -0.10543380305171013,
"rewards/format_reward": 0.4583333432674408,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 3566.2916870117188,
"epoch": 0.21257142857142858,
"grad_norm": 0.05479148030281067,
"kl": 0.364990234375,
"learning_rate": 2.7048349887476037e-05,
"loss": 0.0218,
"reward": -0.6281777173280716,
"reward_std": 0.3108227998018265,
"rewards/cosine_scaled_reward": -0.3557555228471756,
"rewards/format_reward": 0.0833333358168602,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2977.125030517578,
"epoch": 0.21314285714285713,
"grad_norm": 0.11434542387723923,
"kl": 0.2933349609375,
"learning_rate": 2.6802828488599297e-05,
"loss": -0.0334,
"reward": -0.07653629779815674,
"reward_std": 0.2992605846375227,
"rewards/cosine_scaled_reward": -0.18410149216651917,
"rewards/format_reward": 0.2916666679084301,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 3578.0833740234375,
"epoch": 0.21371428571428572,
"grad_norm": 0.053493838757276535,
"kl": 0.28466796875,
"learning_rate": 2.6558681380081713e-05,
"loss": 0.0125,
"reward": -0.40124300494790077,
"reward_std": 0.2405980322510004,
"rewards/cosine_scaled_reward": -0.24228817224502563,
"rewards/format_reward": 0.0833333358168602,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 2983.7500610351562,
"epoch": 0.21428571428571427,
"grad_norm": 0.05206667259335518,
"kl": 0.406494140625,
"learning_rate": 2.6315920461308964e-05,
"loss": 0.0623,
"reward": 0.096424276009202,
"reward_std": 0.6552168540656567,
"rewards/cosine_scaled_reward": -0.13928786292672157,
"rewards/format_reward": 0.3750000149011612,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 3149.7500610351562,
"epoch": 0.21485714285714286,
"grad_norm": 0.05979221314191818,
"kl": 0.3193359375,
"learning_rate": 2.6074557564105727e-05,
"loss": 0.0525,
"reward": 0.052399429492652416,
"reward_std": 0.826238114386797,
"rewards/cosine_scaled_reward": -0.16130028385668993,
"rewards/format_reward": 0.3750000037252903,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2414.3334045410156,
"epoch": 0.21542857142857144,
"grad_norm": 0.2308046966791153,
"kl": 0.3388671875,
"learning_rate": 2.5834604452159112e-05,
"loss": 0.2314,
"reward": 0.3543909564614296,
"reward_std": 0.49069568142294884,
"rewards/cosine_scaled_reward": -0.07280451618134975,
"rewards/format_reward": 0.5000000111758709,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 3316.25,
"epoch": 0.216,
"grad_norm": 0.06222301721572876,
"kl": 0.40576171875,
"learning_rate": 2.5596072820445254e-05,
"loss": 0.0485,
"reward": -0.5630598217248917,
"reward_std": 0.21512744203209877,
"rewards/cosine_scaled_reward": -0.3648632522672415,
"rewards/format_reward": 0.1666666716337204,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 3449.0833740234375,
"epoch": 0.21657142857142858,
"grad_norm": 0.053267695009708405,
"kl": 0.39697265625,
"learning_rate": 2.5358974294659375e-05,
"loss": 0.0307,
"reward": -0.44570785015821457,
"reward_std": 0.24237919226288795,
"rewards/cosine_scaled_reward": -0.285353927873075,
"rewards/format_reward": 0.125,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 3125.2500610351562,
"epoch": 0.21714285714285714,
"grad_norm": 0.06081575155258179,
"kl": 0.37646484375,
"learning_rate": 2.5123320430649133e-05,
"loss": 0.078,
"reward": 0.06776145473122597,
"reward_std": 0.9119222313165665,
"rewards/cosine_scaled_reward": -0.13278593588620424,
"rewards/format_reward": 0.3333333432674408,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 3297.25,
"epoch": 0.21771428571428572,
"grad_norm": 0.05626344308257103,
"kl": 0.541015625,
"learning_rate": 2.4889122713851394e-05,
"loss": 0.0828,
"reward": 0.005039989948272705,
"reward_std": 0.6351946890354156,
"rewards/cosine_scaled_reward": -0.03914666548371315,
"rewards/format_reward": 0.0833333358168602,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 3272.9166870117188,
"epoch": 0.21828571428571428,
"grad_norm": 0.06031295284628868,
"kl": 0.32666015625,
"learning_rate": 2.4656392558732464e-05,
"loss": 0.0317,
"reward": -0.2794642001390457,
"reward_std": 0.666092368774116,
"rewards/cosine_scaled_reward": -0.26473210006952286,
"rewards/format_reward": 0.2500000111758709,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2630.250045776367,
"epoch": 0.21885714285714286,
"grad_norm": 0.4203470051288605,
"kl": 0.2860107421875,
"learning_rate": 2.442514130823177e-05,
"loss": -0.1659,
"reward": 0.09679129719734192,
"reward_std": 0.6322224270552397,
"rewards/cosine_scaled_reward": -0.1599376993253827,
"rewards/format_reward": 0.4166666716337204,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21942857142857142,
"grad_norm": 0.1070617288351059,
"kl": 0.404052734375,
"learning_rate": 2.4195380233209008e-05,
"loss": 0.0162,
"reward": -0.7048551961779594,
"reward_std": 0.05190820666030049,
"rewards/cosine_scaled_reward": -0.3524275906383991,
"rewards/format_reward": 0.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2689.666748046875,
"epoch": 0.22,
"grad_norm": 0.07103494554758072,
"kl": 0.529296875,
"learning_rate": 2.396712053189486e-05,
"loss": 0.0642,
"reward": 0.18483632430434227,
"reward_std": 0.6015055403113365,
"rewards/cosine_scaled_reward": -0.09508185088634491,
"rewards/format_reward": 0.3750000037252903,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 3165.875,
"epoch": 0.22057142857142858,
"grad_norm": 0.05382478982210159,
"kl": 0.44775390625,
"learning_rate": 2.374037332934512e-05,
"loss": 0.0748,
"reward": -0.6691429018974304,
"reward_std": 0.313931992277503,
"rewards/cosine_scaled_reward": -0.3970714509487152,
"rewards/format_reward": 0.125,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 1973.0000762939453,
"epoch": 0.22114285714285714,
"grad_norm": 0.46731579303741455,
"kl": 0.2667236328125,
"learning_rate": 2.3515149676898555e-05,
"loss": 0.3138,
"reward": 0.6846924126148224,
"reward_std": 0.5325267240405083,
"rewards/cosine_scaled_reward": 0.0506795197725296,
"rewards/format_reward": 0.5833333358168602,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 2146.6250610351562,
"epoch": 0.22171428571428572,
"grad_norm": 0.06577088683843613,
"kl": 0.330596923828125,
"learning_rate": 2.329146055163824e-05,
"loss": 0.0908,
"reward": 1.1133306175470352,
"reward_std": 0.9890650920569897,
"rewards/cosine_scaled_reward": 0.24416528269648552,
"rewards/format_reward": 0.6250000149011612,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 2011.7083587646484,
"epoch": 0.22228571428571428,
"grad_norm": 0.07788607478141785,
"kl": 0.350677490234375,
"learning_rate": 2.306931685585657e-05,
"loss": 0.1926,
"reward": 0.8576178252696991,
"reward_std": 0.9910986423492432,
"rewards/cosine_scaled_reward": 0.1371422311058268,
"rewards/format_reward": 0.5833333432674408,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 3186.5416870117188,
"epoch": 0.22285714285714286,
"grad_norm": 0.07300270348787308,
"kl": 0.46630859375,
"learning_rate": 2.284872941652386e-05,
"loss": 0.1027,
"reward": -0.17644105851650238,
"reward_std": 0.3873226083815098,
"rewards/cosine_scaled_reward": -0.1923871971666813,
"rewards/format_reward": 0.2083333358168602,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 3476.5833740234375,
"epoch": 0.22342857142857142,
"grad_norm": 0.03151053190231323,
"kl": 0.219482421875,
"learning_rate": 2.2629708984760708e-05,
"loss": 0.0435,
"reward": 0.03934659995138645,
"reward_std": 0.4940398707985878,
"rewards/cosine_scaled_reward": -0.08449336793273687,
"rewards/format_reward": 0.2083333358168602,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 3302.0833740234375,
"epoch": 0.224,
"grad_norm": 0.07931151986122131,
"kl": 0.357666015625,
"learning_rate": 2.2412266235313975e-05,
"loss": -0.0285,
"reward": -0.5167839005589485,
"reward_std": 0.20752658136188984,
"rewards/cosine_scaled_reward": -0.30005861073732376,
"rewards/format_reward": 0.0833333358168602,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 3422.0,
"epoch": 0.22457142857142856,
"grad_norm": 0.05402982234954834,
"kl": 0.3740234375,
"learning_rate": 2.219641176603649e-05,
"loss": 0.0462,
"reward": -0.3995904391631484,
"reward_std": 0.182258821092546,
"rewards/cosine_scaled_reward": -0.2622952158562839,
"rewards/format_reward": 0.125,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 3126.6250610351562,
"epoch": 0.22514285714285714,
"grad_norm": 0.08397813886404037,
"kl": 0.39794921875,
"learning_rate": 2.198215609737056e-05,
"loss": 0.1138,
"reward": -0.33128097280859947,
"reward_std": 0.3852621605619788,
"rewards/cosine_scaled_reward": -0.24897384084761143,
"rewards/format_reward": 0.1666666716337204,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 2954.5833740234375,
"epoch": 0.2257142857142857,
"grad_norm": 0.06730625778436661,
"kl": 0.3223876953125,
"learning_rate": 2.1769509671835224e-05,
"loss": 0.1452,
"reward": 0.2924303896725178,
"reward_std": 0.7555544227361679,
"rewards/cosine_scaled_reward": -0.06211814656853676,
"rewards/format_reward": 0.416666679084301,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 3396.6666870117188,
"epoch": 0.22628571428571428,
"grad_norm": 0.06294995546340942,
"kl": 0.42138671875,
"learning_rate": 2.1558482853517257e-05,
"loss": 0.0685,
"reward": -0.32847850024700165,
"reward_std": 0.589644180610776,
"rewards/cosine_scaled_reward": -0.2892392612993717,
"rewards/format_reward": 0.25,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 3145.2916870117188,
"epoch": 0.22685714285714287,
"grad_norm": 0.04051867127418518,
"kl": 0.35986328125,
"learning_rate": 2.1349085927566073e-05,
"loss": 0.0434,
"reward": -0.27365291118621826,
"reward_std": 0.8279965240508318,
"rewards/cosine_scaled_reward": -0.2618264742195606,
"rewards/format_reward": 0.2500000074505806,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 3471.4583740234375,
"epoch": 0.22742857142857142,
"grad_norm": 0.07548023015260696,
"kl": 0.53759765625,
"learning_rate": 2.114132909969241e-05,
"loss": 0.0366,
"reward": -0.2639296054840088,
"reward_std": 0.5751975458115339,
"rewards/cosine_scaled_reward": -0.2152981460094452,
"rewards/format_reward": 0.1666666716337204,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 3529.4583740234375,
"epoch": 0.228,
"grad_norm": 0.03485243022441864,
"kl": 0.294921875,
"learning_rate": 2.093522249567097e-05,
"loss": 0.0234,
"reward": -0.439264640212059,
"reward_std": 0.28215121757239103,
"rewards/cosine_scaled_reward": -0.28213231824338436,
"rewards/format_reward": 0.125,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 2100.125030517578,
"epoch": 0.22857142857142856,
"grad_norm": 0.2823559045791626,
"kl": 0.52685546875,
"learning_rate": 2.0730776160846853e-05,
"loss": 0.2694,
"reward": 0.5646544303745031,
"reward_std": 1.1762162446975708,
"rewards/cosine_scaled_reward": -0.05100613087415695,
"rewards/format_reward": 0.666666679084301,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 3157.3750610351562,
"epoch": 0.22914285714285715,
"grad_norm": 0.09117607027292252,
"kl": 0.65234375,
"learning_rate": 2.0528000059645997e-05,
"loss": 0.0588,
"reward": -0.3910463247448206,
"reward_std": 0.2949160858988762,
"rewards/cosine_scaled_reward": -0.2580231502652168,
"rewards/format_reward": 0.1250000037252903,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2805.333335876465,
"epoch": 0.2297142857142857,
"grad_norm": 0.06091153621673584,
"kl": 0.24639892578125,
"learning_rate": 2.0326904075089492e-05,
"loss": 0.0546,
"reward": -0.23423780500888824,
"reward_std": 0.1788835395127535,
"rewards/cosine_scaled_reward": -0.24211889691650867,
"rewards/format_reward": 0.25,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2704.3750915527344,
"epoch": 0.2302857142857143,
"grad_norm": 0.11689075827598572,
"kl": 0.18212890625,
"learning_rate": 2.0127498008311922e-05,
"loss": 0.0923,
"reward": 0.480072483420372,
"reward_std": 0.8732938468456268,
"rewards/cosine_scaled_reward": -0.0516304369084537,
"rewards/format_reward": 0.5833333432674408,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 3054.666717529297,
"epoch": 0.23085714285714284,
"grad_norm": 0.06528580188751221,
"kl": 0.48193359375,
"learning_rate": 1.9929791578083658e-05,
"loss": 0.0749,
"reward": -0.4229290783405304,
"reward_std": 0.344521377235651,
"rewards/cosine_scaled_reward": -0.2947978749871254,
"rewards/format_reward": 0.1666666679084301,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2730.2084350585938,
"epoch": 0.23142857142857143,
"grad_norm": 0.07100927829742432,
"kl": 0.482421875,
"learning_rate": 1.9733794420337214e-05,
"loss": 0.0353,
"reward": 0.5909395664930344,
"reward_std": 0.5787490289658308,
"rewards/cosine_scaled_reward": 0.10796979814767838,
"rewards/format_reward": 0.375,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 3417.9583740234375,
"epoch": 0.232,
"grad_norm": 0.03272836655378342,
"kl": 0.313232421875,
"learning_rate": 1.9539516087697518e-05,
"loss": 0.0321,
"reward": -0.10836548218503594,
"reward_std": 0.4936055834405124,
"rewards/cosine_scaled_reward": -0.1583494134247303,
"rewards/format_reward": 0.2083333358168602,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 3065.791748046875,
"epoch": 0.23257142857142857,
"grad_norm": 0.17790015041828156,
"kl": 0.3212890625,
"learning_rate": 1.9346966049016424e-05,
"loss": 0.105,
"reward": 0.5387177914381027,
"reward_std": 1.1512526031583548,
"rewards/cosine_scaled_reward": 0.061025530099868774,
"rewards/format_reward": 0.4166666828095913,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2239.2083587646484,
"epoch": 0.23314285714285715,
"grad_norm": 0.25460880994796753,
"kl": 0.249755859375,
"learning_rate": 1.915615368891117e-05,
"loss": 0.1841,
"reward": 0.13613080978393555,
"reward_std": 0.5948553457856178,
"rewards/cosine_scaled_reward": -0.16110125556588173,
"rewards/format_reward": 0.4583333432674408,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 3243.8750610351562,
"epoch": 0.2337142857142857,
"grad_norm": 0.22207611799240112,
"kl": 0.44970703125,
"learning_rate": 1.8967088307307003e-05,
"loss": 0.1602,
"reward": -0.5169526115059853,
"reward_std": 0.28472404927015305,
"rewards/cosine_scaled_reward": -0.3209763169288635,
"rewards/format_reward": 0.1250000037252903,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2342857142857143,
"grad_norm": 0.05123216286301613,
"kl": 0.356201171875,
"learning_rate": 1.877977911898387e-05,
"loss": 0.0143,
"reward": -0.671182170510292,
"reward_std": 0.408594099804759,
"rewards/cosine_scaled_reward": -0.3564244285225868,
"rewards/format_reward": 0.0416666679084301,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 3441.5833740234375,
"epoch": 0.23485714285714285,
"grad_norm": 0.06985009461641312,
"kl": 0.530029296875,
"learning_rate": 1.8594235253127375e-05,
"loss": 0.0742,
"reward": -0.38147830381058156,
"reward_std": 0.40077026188373566,
"rewards/cosine_scaled_reward": -0.274072487722151,
"rewards/format_reward": 0.1666666716337204,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 3580.75,
"epoch": 0.23542857142857143,
"grad_norm": 0.06430774182081223,
"kl": 0.380859375,
"learning_rate": 1.8410465752883758e-05,
"loss": 0.0169,
"reward": -0.6020461395382881,
"reward_std": 0.310219619423151,
"rewards/cosine_scaled_reward": -0.36352307721972466,
"rewards/format_reward": 0.1250000037252903,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 3413.5833740234375,
"epoch": 0.236,
"grad_norm": 0.0643596202135086,
"kl": 0.56591796875,
"learning_rate": 1.822847957491922e-05,
"loss": 0.069,
"reward": -0.19847530126571655,
"reward_std": 0.632585421204567,
"rewards/cosine_scaled_reward": -0.22423765808343887,
"rewards/format_reward": 0.2500000037252903,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2909.4583435058594,
"epoch": 0.23657142857142857,
"grad_norm": 0.17475536465644836,
"kl": 0.317138671875,
"learning_rate": 1.804828558898332e-05,
"loss": 0.2022,
"reward": -0.07582948263734579,
"reward_std": 0.3062135260552168,
"rewards/cosine_scaled_reward": -0.1420813980512321,
"rewards/format_reward": 0.2083333432674408,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2919.2083435058594,
"epoch": 0.23714285714285716,
"grad_norm": 0.08638226240873337,
"kl": 0.40966796875,
"learning_rate": 1.7869892577476724e-05,
"loss": 0.0055,
"reward": -0.26949138939380646,
"reward_std": 0.3594799619168043,
"rewards/cosine_scaled_reward": -0.30141236586496234,
"rewards/format_reward": 0.3333333432674408,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 3522.125,
"epoch": 0.2377142857142857,
"grad_norm": 0.045446183532476425,
"kl": 0.417724609375,
"learning_rate": 1.769330923502313e-05,
"loss": 0.0411,
"reward": -0.4672236889600754,
"reward_std": 0.2726159645244479,
"rewards/cosine_scaled_reward": -0.25444517843425274,
"rewards/format_reward": 0.0416666679084301,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2717.2916870117188,
"epoch": 0.2382857142857143,
"grad_norm": 0.14400292932987213,
"kl": 0.3536376953125,
"learning_rate": 1.7518544168045525e-05,
"loss": 0.2068,
"reward": 0.2651242660358548,
"reward_std": 0.6696367170661688,
"rewards/cosine_scaled_reward": -0.034104532562196255,
"rewards/format_reward": 0.3333333358168602,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2557.5833435058594,
"epoch": 0.23885714285714285,
"grad_norm": 0.14127807319164276,
"kl": 0.41259765625,
"learning_rate": 1.734560589434673e-05,
"loss": 0.0174,
"reward": 0.01780000329017639,
"reward_std": 0.4921893812716007,
"rewards/cosine_scaled_reward": -0.178600013256073,
"rewards/format_reward": 0.3750000149011612,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 2538.0833740234375,
"epoch": 0.23942857142857144,
"grad_norm": 0.26915180683135986,
"kl": 0.7177734375,
"learning_rate": 1.7174502842694213e-05,
"loss": 0.0781,
"reward": 0.22315247356891632,
"reward_std": 0.7611993253231049,
"rewards/cosine_scaled_reward": -0.2009237576276064,
"rewards/format_reward": 0.6250000111758709,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 2058.4583740234375,
"epoch": 0.24,
"grad_norm": 0.13916106522083282,
"kl": 0.48828125,
"learning_rate": 1.7005243352409334e-05,
"loss": 0.1651,
"reward": 0.5351078482344747,
"reward_std": 0.43670096062123775,
"rewards/cosine_scaled_reward": 0.017553903628140688,
"rewards/format_reward": 0.5,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 2917.625030517578,
"epoch": 0.24057142857142857,
"grad_norm": 0.26665350794792175,
"kl": 0.361328125,
"learning_rate": 1.6837835672960835e-05,
"loss": 0.2609,
"reward": -0.19462932646274567,
"reward_std": 0.6805716454982758,
"rewards/cosine_scaled_reward": -0.24314800277352333,
"rewards/format_reward": 0.291666679084301,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 3578.875,
"epoch": 0.24114285714285713,
"grad_norm": 0.050200000405311584,
"kl": 0.3662109375,
"learning_rate": 1.6672287963562855e-05,
"loss": 0.0176,
"reward": -0.7312059998512268,
"reward_std": 0.228121904656291,
"rewards/cosine_scaled_reward": -0.3864363357424736,
"rewards/format_reward": 0.0416666679084301,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2685.8750610351562,
"epoch": 0.24171428571428571,
"grad_norm": 0.2088881880044937,
"kl": 0.58447265625,
"learning_rate": 1.6508608292777204e-05,
"loss": 0.0649,
"reward": 0.13651205599308014,
"reward_std": 0.7265305370092392,
"rewards/cosine_scaled_reward": -0.14007731387391686,
"rewards/format_reward": 0.4166666828095913,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 2770.7916870117188,
"epoch": 0.2422857142857143,
"grad_norm": 0.6906270384788513,
"kl": 0.63232421875,
"learning_rate": 1.63468046381201e-05,
"loss": 0.1834,
"reward": -0.13689884543418884,
"reward_std": 0.6126798801124096,
"rewards/cosine_scaled_reward": -0.17261607944965363,
"rewards/format_reward": 0.2083333395421505,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 3473.125,
"epoch": 0.24285714285714285,
"grad_norm": 0.1417115479707718,
"kl": 0.53515625,
"learning_rate": 1.6186884885673413e-05,
"loss": -0.0243,
"reward": -0.4850518964231014,
"reward_std": 0.24936811439692974,
"rewards/cosine_scaled_reward": -0.26335928216576576,
"rewards/format_reward": 0.0416666679084301,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 3285.8333740234375,
"epoch": 0.24342857142857144,
"grad_norm": 0.06952951848506927,
"kl": 0.5224609375,
"learning_rate": 1.602885682970026e-05,
"loss": 0.0646,
"reward": -0.4608178175985813,
"reward_std": 0.5314787924289703,
"rewards/cosine_scaled_reward": -0.2929089143872261,
"rewards/format_reward": 0.1250000037252903,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 3526.9583740234375,
"epoch": 0.244,
"grad_norm": 0.03886076807975769,
"kl": 0.365478515625,
"learning_rate": 1.5872728172265147e-05,
"loss": 0.0222,
"reward": -0.21462566033005714,
"reward_std": 0.7559888269752264,
"rewards/cosine_scaled_reward": -0.21147949434816837,
"rewards/format_reward": 0.2083333395421505,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 2593.666748046875,
"epoch": 0.24457142857142858,
"grad_norm": 0.4337615370750427,
"kl": 0.6572265625,
"learning_rate": 1.5718506522858573e-05,
"loss": 0.3042,
"reward": 0.1845724955201149,
"reward_std": 0.9847190231084824,
"rewards/cosine_scaled_reward": -0.11604708188679069,
"rewards/format_reward": 0.4166666828095913,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 3424.1251220703125,
"epoch": 0.24514285714285713,
"grad_norm": 0.05527598410844803,
"kl": 0.55224609375,
"learning_rate": 1.556619939802615e-05,
"loss": 0.085,
"reward": -0.2955315187573433,
"reward_std": 0.5266687069088221,
"rewards/cosine_scaled_reward": -0.27276574447751045,
"rewards/format_reward": 0.2500000074505806,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 3065.5000610351562,
"epoch": 0.24571428571428572,
"grad_norm": 0.15154457092285156,
"kl": 0.68212890625,
"learning_rate": 1.5415814221002267e-05,
"loss": 0.0947,
"reward": -0.09890948422253132,
"reward_std": 0.46929389610886574,
"rewards/cosine_scaled_reward": -0.19528808258473873,
"rewards/format_reward": 0.2916666716337204,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 3439.2916870117188,
"epoch": 0.24628571428571427,
"grad_norm": 0.0829787403345108,
"kl": 0.6484375,
"learning_rate": 1.526735832134829e-05,
"loss": 0.066,
"reward": -0.6627758890390396,
"reward_std": 0.37856999412178993,
"rewards/cosine_scaled_reward": -0.3730546161532402,
"rewards/format_reward": 0.0833333358168602,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 2820.7916870117188,
"epoch": 0.24685714285714286,
"grad_norm": 0.22786198556423187,
"kl": 0.685546875,
"learning_rate": 1.5120838934595339e-05,
"loss": 0.0229,
"reward": -0.5289145242422819,
"reward_std": 0.3426254317164421,
"rewards/cosine_scaled_reward": -0.32695727050304413,
"rewards/format_reward": 0.125,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2546.3334045410156,
"epoch": 0.24742857142857144,
"grad_norm": 0.13099390268325806,
"kl": 0.751953125,
"learning_rate": 1.4976263201891614e-05,
"loss": 0.1944,
"reward": 0.3029240146279335,
"reward_std": 0.840803325176239,
"rewards/cosine_scaled_reward": -0.05687133315950632,
"rewards/format_reward": 0.416666679084301,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 3548.6250610351562,
"epoch": 0.248,
"grad_norm": 0.14448414742946625,
"kl": 0.51416015625,
"learning_rate": 1.4833638169654352e-05,
"loss": 0.0237,
"reward": -0.5506420657038689,
"reward_std": 0.24051987566053867,
"rewards/cosine_scaled_reward": -0.35865436494350433,
"rewards/format_reward": 0.1666666716337204,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2912.666778564453,
"epoch": 0.24857142857142858,
"grad_norm": 0.11906195431947708,
"kl": 0.55078125,
"learning_rate": 1.469297078922642e-05,
"loss": 0.114,
"reward": 0.029549360275268555,
"reward_std": 0.4572529271245003,
"rewards/cosine_scaled_reward": -0.17272532731294632,
"rewards/format_reward": 0.375,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 2759.25,
"epoch": 0.24914285714285714,
"grad_norm": 0.09241659194231033,
"kl": 0.4913330078125,
"learning_rate": 1.4554267916537495e-05,
"loss": 0.0232,
"reward": 0.08335928618907928,
"reward_std": 0.35814575105905533,
"rewards/cosine_scaled_reward": -0.08332037925720215,
"rewards/format_reward": 0.25,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1934.3750305175781,
"epoch": 0.24971428571428572,
"grad_norm": 0.2781325876712799,
"kl": 0.41302490234375,
"learning_rate": 1.4417536311769886e-05,
"loss": 0.1265,
"reward": 0.7840927466750145,
"reward_std": 0.7154708206653595,
"rewards/cosine_scaled_reward": 0.0795463752001524,
"rewards/format_reward": 0.6250000037252903,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 3043.4583435058594,
"epoch": 0.2502857142857143,
"grad_norm": 0.16331548988819122,
"kl": 0.56005859375,
"learning_rate": 1.428278263902913e-05,
"loss": 0.1218,
"reward": -0.6011191233992577,
"reward_std": 0.2519193133339286,
"rewards/cosine_scaled_reward": -0.40472622215747833,
"rewards/format_reward": 0.2083333395421505,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 2903.5001220703125,
"epoch": 0.25085714285714283,
"grad_norm": 0.1437004804611206,
"kl": 0.55859375,
"learning_rate": 1.4150013466019115e-05,
"loss": 0.0634,
"reward": 0.2986115887761116,
"reward_std": 0.6229725033044815,
"rewards/cosine_scaled_reward": -0.121527548879385,
"rewards/format_reward": 0.5416666753590107,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 1969.4167175292969,
"epoch": 0.25142857142857145,
"grad_norm": 0.4684412479400635,
"kl": 0.38134765625,
"learning_rate": 1.4019235263722036e-05,
"loss": 0.279,
"reward": 0.1754133328795433,
"reward_std": 0.7903597727417946,
"rewards/cosine_scaled_reward": -0.1622933349572122,
"rewards/format_reward": 0.5000000223517418,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 3259.2083740234375,
"epoch": 0.252,
"grad_norm": 0.09937479346990585,
"kl": 0.54443359375,
"learning_rate": 1.389045440608296e-05,
"loss": -0.0214,
"reward": -0.5238807797431946,
"reward_std": 0.32005127891898155,
"rewards/cosine_scaled_reward": -0.3452737405896187,
"rewards/format_reward": 0.1666666679084301,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2888.125045776367,
"epoch": 0.25257142857142856,
"grad_norm": 0.2833991050720215,
"kl": 0.44384765625,
"learning_rate": 1.3763677169699218e-05,
"loss": 0.1149,
"reward": 0.1393699049949646,
"reward_std": 0.8629260342568159,
"rewards/cosine_scaled_reward": -0.1386483833193779,
"rewards/format_reward": 0.4166666679084301,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 3266.125,
"epoch": 0.25314285714285717,
"grad_norm": 0.08092425763607025,
"kl": 0.47119140625,
"learning_rate": 1.3638909733514454e-05,
"loss": 0.0795,
"reward": -0.44480053149163723,
"reward_std": 0.36030059307813644,
"rewards/cosine_scaled_reward": -0.3057336136698723,
"rewards/format_reward": 0.1666666679084301,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 3183.3333740234375,
"epoch": 0.2537142857142857,
"grad_norm": 0.07417810708284378,
"kl": 0.58544921875,
"learning_rate": 1.3516158178517482e-05,
"loss": 0.0659,
"reward": -0.37381474673748016,
"reward_std": 0.2710421346127987,
"rewards/cosine_scaled_reward": -0.2910740412771702,
"rewards/format_reward": 0.2083333395421505,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 2618.6250915527344,
"epoch": 0.2542857142857143,
"grad_norm": 0.2676498293876648,
"kl": 0.697265625,
"learning_rate": 1.3395428487445916e-05,
"loss": 0.2333,
"reward": 0.09787814319133759,
"reward_std": 0.7350487858057022,
"rewards/cosine_scaled_reward": -0.1385609395802021,
"rewards/format_reward": 0.3750000111758709,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2239.875015258789,
"epoch": 0.25485714285714284,
"grad_norm": 0.23302024602890015,
"kl": 0.339599609375,
"learning_rate": 1.3276726544494572e-05,
"loss": -0.0018,
"reward": -0.09462682902812958,
"reward_std": 0.2530629448592663,
"rewards/cosine_scaled_reward": -0.3181467577815056,
"rewards/format_reward": 0.5416666679084301,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 3186.875,
"epoch": 0.25542857142857145,
"grad_norm": 0.0788385272026062,
"kl": 0.4404296875,
"learning_rate": 1.3160058135028691e-05,
"loss": 0.0481,
"reward": -0.03993312269449234,
"reward_std": 0.49259845726192,
"rewards/cosine_scaled_reward": -0.16579990461468697,
"rewards/format_reward": 0.291666679084301,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 2922.8333740234375,
"epoch": 0.256,
"grad_norm": 0.11522118002176285,
"kl": 0.56005859375,
"learning_rate": 1.3045428945301954e-05,
"loss": 0.0533,
"reward": 0.417447566986084,
"reward_std": 0.5856805425137281,
"rewards/cosine_scaled_reward": -0.020442910492420197,
"rewards/format_reward": 0.4583333432674408,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 3091.5833740234375,
"epoch": 0.25657142857142856,
"grad_norm": 0.10368100553750992,
"kl": 0.77734375,
"learning_rate": 1.2932844562179353e-05,
"loss": 0.0535,
"reward": -0.07481794245541096,
"reward_std": 0.8016606643795967,
"rewards/cosine_scaled_reward": -0.2457423061132431,
"rewards/format_reward": 0.4166666716337204,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 1368.8750305175781,
"epoch": 0.2571428571428571,
"grad_norm": 0.3712880611419678,
"kl": 0.75482177734375,
"learning_rate": 1.2822310472864884e-05,
"loss": 0.1738,
"reward": 1.4089849265292287,
"reward_std": 0.5784200113266706,
"rewards/cosine_scaled_reward": 0.3294924534857273,
"rewards/format_reward": 0.7500000074505806,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 2171.8333435058594,
"epoch": 0.25771428571428573,
"grad_norm": 0.4426482021808624,
"kl": 0.4508056640625,
"learning_rate": 1.2713832064634126e-05,
"loss": 0.1503,
"reward": 0.019651681184768677,
"reward_std": 0.3091997979208827,
"rewards/cosine_scaled_reward": -0.19850750174373388,
"rewards/format_reward": 0.4166666716337204,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2205.750045776367,
"epoch": 0.2582857142857143,
"grad_norm": 0.4167480170726776,
"kl": 0.468017578125,
"learning_rate": 1.260741462457165e-05,
"loss": 0.2359,
"reward": 0.14184805005788803,
"reward_std": 0.5776838436722755,
"rewards/cosine_scaled_reward": -0.17907597869634628,
"rewards/format_reward": 0.5000000111758709,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2234.9166717529297,
"epoch": 0.25885714285714284,
"grad_norm": 0.39677053689956665,
"kl": 0.59765625,
"learning_rate": 1.2503063339313356e-05,
"loss": 0.3304,
"reward": 0.13150884211063385,
"reward_std": 0.5827006474137306,
"rewards/cosine_scaled_reward": -0.16341226734220982,
"rewards/format_reward": 0.4583333544433117,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 2502.4166870117188,
"epoch": 0.25942857142857145,
"grad_norm": 0.6984990835189819,
"kl": 0.47119140625,
"learning_rate": 1.240078329479367e-05,
"loss": 0.1735,
"reward": 0.2351670740172267,
"reward_std": 0.4651456903666258,
"rewards/cosine_scaled_reward": -0.04908313835039735,
"rewards/format_reward": 0.3333333358168602,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 3326.166748046875,
"epoch": 0.26,
"grad_norm": 0.19114693999290466,
"kl": 1.01171875,
"learning_rate": 1.2300579475997657e-05,
"loss": 0.0617,
"reward": -0.29883327800780535,
"reward_std": 0.41828643530607224,
"rewards/cosine_scaled_reward": -0.23274997994303703,
"rewards/format_reward": 0.1666666716337204,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 3502.4166870117188,
"epoch": 0.26057142857142856,
"grad_norm": 0.055889781564474106,
"kl": 0.406005859375,
"learning_rate": 1.2202456766718093e-05,
"loss": 0.0569,
"reward": -0.4835352450609207,
"reward_std": 0.2959635443985462,
"rewards/cosine_scaled_reward": -0.2834343034774065,
"rewards/format_reward": 0.0833333358168602,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2922.7083435058594,
"epoch": 0.2611428571428571,
"grad_norm": 0.2059084177017212,
"kl": 0.687744140625,
"learning_rate": 1.210641994931739e-05,
"loss": 0.217,
"reward": -0.24368640035390854,
"reward_std": 0.2636729357764125,
"rewards/cosine_scaled_reward": -0.30934320390224457,
"rewards/format_reward": 0.375,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 2602.9166870117188,
"epoch": 0.26171428571428573,
"grad_norm": 0.18258771300315857,
"kl": 0.5390625,
"learning_rate": 1.2012473704494538e-05,
"loss": 0.178,
"reward": 0.1497629238292575,
"reward_std": 0.5888768993318081,
"rewards/cosine_scaled_reward": -0.09178520552814007,
"rewards/format_reward": 0.3333333395421505,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 3391.0833740234375,
"epoch": 0.2622857142857143,
"grad_norm": 0.10063629597425461,
"kl": 0.71728515625,
"learning_rate": 1.1920622611056975e-05,
"loss": 0.0528,
"reward": -0.36289872229099274,
"reward_std": 0.5369972474873066,
"rewards/cosine_scaled_reward": -0.26478270068764687,
"rewards/format_reward": 0.1666666679084301,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2814.000030517578,
"epoch": 0.26285714285714284,
"grad_norm": 0.06488575041294098,
"kl": 0.550048828125,
"learning_rate": 1.1830871145697413e-05,
"loss": 0.0445,
"reward": 0.6600381471216679,
"reward_std": 0.4823018051683903,
"rewards/cosine_scaled_reward": 0.10085240937769413,
"rewards/format_reward": 0.4583333432674408,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2659.00008392334,
"epoch": 0.2634285714285714,
"grad_norm": 0.19850417971611023,
"kl": 0.284912109375,
"learning_rate": 1.174322368277565e-05,
"loss": 0.1124,
"reward": 0.20981285348534584,
"reward_std": 0.5238317660987377,
"rewards/cosine_scaled_reward": -0.10342691093683243,
"rewards/format_reward": 0.4166666716337204,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 3503.0834350585938,
"epoch": 0.264,
"grad_norm": 0.06738544255495071,
"kl": 0.6259765625,
"learning_rate": 1.1657684494105387e-05,
"loss": 0.0494,
"reward": -0.18034307146444917,
"reward_std": 0.8387964870780706,
"rewards/cosine_scaled_reward": -0.2776715336367488,
"rewards/format_reward": 0.3750000074505806,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 3274.916748046875,
"epoch": 0.26457142857142857,
"grad_norm": 0.16633926331996918,
"kl": 0.7099609375,
"learning_rate": 1.1574257748745986e-05,
"loss": -0.0402,
"reward": -0.3673575446009636,
"reward_std": 0.31967577897012234,
"rewards/cosine_scaled_reward": -0.204512108117342,
"rewards/format_reward": 0.0416666679084301,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2274.0416717529297,
"epoch": 0.2651428571428571,
"grad_norm": 0.23727242648601532,
"kl": 0.6483154296875,
"learning_rate": 1.149294751279933e-05,
"loss": 0.033,
"reward": 0.15764057636260986,
"reward_std": 0.6667953277938068,
"rewards/cosine_scaled_reward": -0.21284636482596397,
"rewards/format_reward": 0.5833333358168602,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 3351.291748046875,
"epoch": 0.26571428571428574,
"grad_norm": 0.11981618404388428,
"kl": 0.8818359375,
"learning_rate": 1.1413757749211602e-05,
"loss": 0.0629,
"reward": 0.07552861422300339,
"reward_std": 0.9423349946737289,
"rewards/cosine_scaled_reward": -0.17056902311742306,
"rewards/format_reward": 0.4166666716337204,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2843.5834350585938,
"epoch": 0.2662857142857143,
"grad_norm": 0.1698676198720932,
"kl": 0.6044921875,
"learning_rate": 1.133669231758016e-05,
"loss": 0.2029,
"reward": -0.21855482331011444,
"reward_std": 0.6860118061304092,
"rewards/cosine_scaled_reward": -0.3384440951049328,
"rewards/format_reward": 0.4583333432674408,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 3427.8751220703125,
"epoch": 0.26685714285714285,
"grad_norm": 0.17002388834953308,
"kl": 0.90283203125,
"learning_rate": 1.1261754973965422e-05,
"loss": 0.0599,
"reward": -0.3653205633163452,
"reward_std": 0.37883180007338524,
"rewards/cosine_scaled_reward": -0.2868269607424736,
"rewards/format_reward": 0.2083333358168602,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 2550.291748046875,
"epoch": 0.2674285714285714,
"grad_norm": 1.2961100339889526,
"kl": 0.5009765625,
"learning_rate": 1.1188949370707787e-05,
"loss": 0.1446,
"reward": 0.2884400337934494,
"reward_std": 0.706204243004322,
"rewards/cosine_scaled_reward": -0.10577998217195272,
"rewards/format_reward": 0.5000000223517418,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 2284.125045776367,
"epoch": 0.268,
"grad_norm": 0.5664176940917969,
"kl": 0.47509765625,
"learning_rate": 1.1118279056249655e-05,
"loss": 0.174,
"reward": -0.08516758680343628,
"reward_std": 0.30671944469213486,
"rewards/cosine_scaled_reward": -0.31341712176799774,
"rewards/format_reward": 0.5416666679084301,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 1342.0416870117188,
"epoch": 0.26857142857142857,
"grad_norm": 0.16639195382595062,
"kl": 0.525146484375,
"learning_rate": 1.1049747474962445e-05,
"loss": 0.2126,
"reward": 1.3647146373987198,
"reward_std": 0.4275565594434738,
"rewards/cosine_scaled_reward": 0.2865240015089512,
"rewards/format_reward": 0.7916666679084301,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 2648.3334045410156,
"epoch": 0.26914285714285713,
"grad_norm": 0.10152919590473175,
"kl": 0.640625,
"learning_rate": 1.0983357966978745e-05,
"loss": 0.1102,
"reward": 0.0945998802781105,
"reward_std": 0.5440156869590282,
"rewards/cosine_scaled_reward": -0.1818667290499434,
"rewards/format_reward": 0.4583333395421505,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 2823.0833740234375,
"epoch": 0.26971428571428574,
"grad_norm": 0.28343844413757324,
"kl": 0.8359375,
"learning_rate": 1.0919113768029518e-05,
"loss": 0.0423,
"reward": -0.11053501442074776,
"reward_std": 0.406174935400486,
"rewards/cosine_scaled_reward": -0.3052675127983093,
"rewards/format_reward": 0.5000000074505806,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 3113.0833740234375,
"epoch": 0.2702857142857143,
"grad_norm": 0.07717534154653549,
"kl": 0.66162109375,
"learning_rate": 1.0857018009286382e-05,
"loss": 0.0757,
"reward": -0.2859737928956747,
"reward_std": 0.5037247315049171,
"rewards/cosine_scaled_reward": -0.24715356901288033,
"rewards/format_reward": 0.2083333358168602,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 1836.0000305175781,
"epoch": 0.27085714285714285,
"grad_norm": 0.3636644780635834,
"kl": 0.83648681640625,
"learning_rate": 1.0797073717209014e-05,
"loss": 0.0671,
"reward": 0.6108469665050507,
"reward_std": 0.3949619419872761,
"rewards/cosine_scaled_reward": 0.034590087831020355,
"rewards/format_reward": 0.5416666679084301,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 2542.0416870117188,
"epoch": 0.2714285714285714,
"grad_norm": 0.17265525460243225,
"kl": 0.6083984375,
"learning_rate": 1.0739283813397639e-05,
"loss": 0.0681,
"reward": 0.41628449596464634,
"reward_std": 0.8513908386230469,
"rewards/cosine_scaled_reward": -0.04185774736106396,
"rewards/format_reward": 0.5000000037252903,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 3087.166748046875,
"epoch": 0.272,
"grad_norm": 0.09109177440404892,
"kl": 0.896484375,
"learning_rate": 1.0683651114450641e-05,
"loss": 0.1429,
"reward": -0.14520969986915588,
"reward_std": 0.3790069818496704,
"rewards/cosine_scaled_reward": -0.28093818202614784,
"rewards/format_reward": 0.416666679084301,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 2237.4583587646484,
"epoch": 0.2725714285714286,
"grad_norm": 0.200510174036026,
"kl": 1.012451171875,
"learning_rate": 1.0630178331827282e-05,
"loss": 0.1417,
"reward": -0.027669966220855713,
"reward_std": 0.3241183590143919,
"rewards/cosine_scaled_reward": -0.28466833382844925,
"rewards/format_reward": 0.5416666753590107,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 3540.7916870117188,
"epoch": 0.27314285714285713,
"grad_norm": 0.23702482879161835,
"kl": 0.743408203125,
"learning_rate": 1.0578868071715544e-05,
"loss": 0.0225,
"reward": -0.5009803473949432,
"reward_std": 0.35418499261140823,
"rewards/cosine_scaled_reward": -0.3546568304300308,
"rewards/format_reward": 0.2083333358168602,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2298.375045776367,
"epoch": 0.2737142857142857,
"grad_norm": 0.6971262693405151,
"kl": 1.15380859375,
"learning_rate": 1.0529722834905126e-05,
"loss": 0.2907,
"reward": 0.7373159751296043,
"reward_std": 0.6764814406633377,
"rewards/cosine_scaled_reward": 0.13949130102992058,
"rewards/format_reward": 0.4583333469927311,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 2324.8334045410156,
"epoch": 0.2742857142857143,
"grad_norm": 0.17469684779644012,
"kl": 0.61767578125,
"learning_rate": 1.0482745016665526e-05,
"loss": 0.097,
"reward": 0.40352728590369225,
"reward_std": 0.8156778272241354,
"rewards/cosine_scaled_reward": -0.08990303543396294,
"rewards/format_reward": 0.5833333469927311,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 2138.0417098999023,
"epoch": 0.27485714285714286,
"grad_norm": 0.292651891708374,
"kl": 1.777099609375,
"learning_rate": 1.0437936906629336e-05,
"loss": 0.2001,
"reward": 0.35360522009432316,
"reward_std": 0.4059207197278738,
"rewards/cosine_scaled_reward": -0.05236405599862337,
"rewards/format_reward": 0.4583333358168602,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 2493.6666870117188,
"epoch": 0.2754285714285714,
"grad_norm": 0.4282706081867218,
"kl": 0.57958984375,
"learning_rate": 1.0395300688680626e-05,
"loss": 0.0375,
"reward": 0.8709183055907488,
"reward_std": 0.7465209662914276,
"rewards/cosine_scaled_reward": 0.16462579369544983,
"rewards/format_reward": 0.541666679084301,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 1954.4167175292969,
"epoch": 0.276,
"grad_norm": 0.4806761145591736,
"kl": 1.107421875,
"learning_rate": 1.0354838440848503e-05,
"loss": 0.1812,
"reward": 0.1631168033927679,
"reward_std": 0.6468721106648445,
"rewards/cosine_scaled_reward": -0.12677491828799248,
"rewards/format_reward": 0.4166666679084301,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 1815.0000534057617,
"epoch": 0.2765714285714286,
"grad_norm": 0.7567575573921204,
"kl": 1.1226806640625,
"learning_rate": 1.0316552135205838e-05,
"loss": 0.1186,
"reward": 0.24124560877680779,
"reward_std": 0.4737013475969434,
"rewards/cosine_scaled_reward": -0.19187720585614443,
"rewards/format_reward": 0.6250000074505806,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 2469.4166870117188,
"epoch": 0.27714285714285714,
"grad_norm": 0.1286521553993225,
"kl": 1.216796875,
"learning_rate": 1.0280443637773165e-05,
"loss": 0.1977,
"reward": 0.19717933982610703,
"reward_std": 0.7559288740158081,
"rewards/cosine_scaled_reward": -0.08891034871339798,
"rewards/format_reward": 0.3750000111758709,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 1957.083396911621,
"epoch": 0.2777142857142857,
"grad_norm": 0.1778564304113388,
"kl": 0.53155517578125,
"learning_rate": 1.0246514708427702e-05,
"loss": 0.0623,
"reward": 0.32581200636923313,
"reward_std": 0.43175826454535127,
"rewards/cosine_scaled_reward": -0.14959400426596403,
"rewards/format_reward": 0.6250000037252903,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 1288.9166717529297,
"epoch": 0.2782857142857143,
"grad_norm": 0.5399029850959778,
"kl": 0.40069580078125,
"learning_rate": 1.0214767000817597e-05,
"loss": 0.1209,
"reward": 0.9641948640346527,
"reward_std": 0.6063492856919765,
"rewards/cosine_scaled_reward": 0.0862640580162406,
"rewards/format_reward": 0.7916666679084301,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 3372.9166870117188,
"epoch": 0.27885714285714286,
"grad_norm": 0.09737123548984528,
"kl": 0.74609375,
"learning_rate": 1.0185202062281336e-05,
"loss": 0.0528,
"reward": -0.11789792403578758,
"reward_std": 0.5601080972701311,
"rewards/cosine_scaled_reward": -0.20478228479623795,
"rewards/format_reward": 0.2916666716337204,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 1862.458381652832,
"epoch": 0.2794285714285714,
"grad_norm": 0.1760631799697876,
"kl": 0.479736328125,
"learning_rate": 1.0157821333772305e-05,
"loss": 0.1488,
"reward": 0.6090323962271214,
"reward_std": 0.577803835272789,
"rewards/cosine_scaled_reward": -0.028817158192396164,
"rewards/format_reward": 0.6666666716337204,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 1804.5000610351562,
"epoch": 0.28,
"grad_norm": 1.0395656824111938,
"kl": 1.10107421875,
"learning_rate": 1.0132626149788591e-05,
"loss": 0.3023,
"reward": 0.177335936576128,
"reward_std": 0.549927618354559,
"rewards/cosine_scaled_reward": -0.18216537311673164,
"rewards/format_reward": 0.541666679084301,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2520.666717529297,
"epoch": 0.2805714285714286,
"grad_norm": 0.45748019218444824,
"kl": 0.405517578125,
"learning_rate": 1.0109617738307912e-05,
"loss": 0.2743,
"reward": 0.12348990142345428,
"reward_std": 0.5433773789554834,
"rewards/cosine_scaled_reward": -0.12575505301356316,
"rewards/format_reward": 0.375,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 2241.416748046875,
"epoch": 0.28114285714285714,
"grad_norm": 1.151473045349121,
"kl": 1.1015625,
"learning_rate": 1.008879722072778e-05,
"loss": 0.2926,
"reward": 0.06369444611482322,
"reward_std": 0.7478420436382294,
"rewards/cosine_scaled_reward": -0.23898612521588802,
"rewards/format_reward": 0.5416666716337204,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2081.333335876465,
"epoch": 0.2817142857142857,
"grad_norm": 0.27526819705963135,
"kl": 0.312744140625,
"learning_rate": 1.0070165611810856e-05,
"loss": 0.0627,
"reward": 0.5283693410456181,
"reward_std": 0.6566501557826996,
"rewards/cosine_scaled_reward": -0.006648639217019081,
"rewards/format_reward": 0.5416666679084301,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 1102.6250381469727,
"epoch": 0.2822857142857143,
"grad_norm": 0.5913681387901306,
"kl": 0.3724365234375,
"learning_rate": 1.0053723819635471e-05,
"loss": 0.0625,
"reward": 1.7323738783597946,
"reward_std": 0.44539252668619156,
"rewards/cosine_scaled_reward": 0.36618690751492977,
"rewards/format_reward": 1.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 3075.2916870117188,
"epoch": 0.28285714285714286,
"grad_norm": 0.32760554552078247,
"kl": 0.64306640625,
"learning_rate": 1.0039472645551373e-05,
"loss": 0.0609,
"reward": -0.2853744365274906,
"reward_std": 0.2332111056894064,
"rewards/cosine_scaled_reward": -0.246853890363127,
"rewards/format_reward": 0.2083333395421505,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 1245.208366394043,
"epoch": 0.2834285714285714,
"grad_norm": 0.2870532274246216,
"kl": 0.65533447265625,
"learning_rate": 1.0027412784140691e-05,
"loss": 0.0608,
"reward": 0.566844031214714,
"reward_std": 0.43196946009993553,
"rewards/cosine_scaled_reward": -0.07074464811012149,
"rewards/format_reward": 0.7083333432674408,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 2885.2500610351562,
"epoch": 0.284,
"grad_norm": 0.5022923350334167,
"kl": 0.97265625,
"learning_rate": 1.0017544823184056e-05,
"loss": 0.2029,
"reward": -0.30719401501119137,
"reward_std": 0.604552611708641,
"rewards/cosine_scaled_reward": -0.32026369124650955,
"rewards/format_reward": 0.3333333469927311,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 2770.5833587646484,
"epoch": 0.2845714285714286,
"grad_norm": 0.0918683186173439,
"kl": 0.5137939453125,
"learning_rate": 1.0009869243631953e-05,
"loss": 0.0071,
"reward": 0.11931697279214859,
"reward_std": 0.7490174844861031,
"rewards/cosine_scaled_reward": -0.1278415024280548,
"rewards/format_reward": 0.3750000037252903,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 1951.8750305175781,
"epoch": 0.28514285714285714,
"grad_norm": 0.9043524265289307,
"kl": 0.7900390625,
"learning_rate": 1.000438641958131e-05,
"loss": 0.343,
"reward": 0.27463794499635696,
"reward_std": 0.7951376140117645,
"rewards/cosine_scaled_reward": -0.1960143893957138,
"rewards/format_reward": 0.666666679084301,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1526.2916870117188,
"epoch": 0.2857142857142857,
"grad_norm": 0.7896002531051636,
"kl": 1.0877685546875,
"learning_rate": 1.0001096618257236e-05,
"loss": 0.0515,
"reward": 0.6481724679470062,
"reward_std": 0.6312408894300461,
"rewards/cosine_scaled_reward": -0.07174711301922798,
"rewards/format_reward": 0.791666679084301,
"step": 500
},
{
"epoch": 0.2857142857142857,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.09381090864279394,
"train_runtime": 25053.7653,
"train_samples_per_second": 0.479,
"train_steps_per_second": 0.02
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}