|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5714285714285714, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 3001.9584350585938, |
|
"epoch": 0.001142857142857143, |
|
"grad_norm": 0.18922260403633118, |
|
"kl": 0.0, |
|
"learning_rate": 2e-08, |
|
"loss": -0.0, |
|
"reward": -0.010712452232837677, |
|
"reward_std": 0.48354096710681915, |
|
"rewards/cosine_scaled_reward": -0.1928562317043543, |
|
"rewards/format_reward": 0.37500000558793545, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2822.541717529297, |
|
"epoch": 0.002285714285714286, |
|
"grad_norm": 0.28424975275993347, |
|
"kl": 0.0, |
|
"learning_rate": 4e-08, |
|
"loss": 0.0, |
|
"reward": 0.4385625521535985, |
|
"reward_std": 0.8208381980657578, |
|
"rewards/cosine_scaled_reward": -0.009885392151772976, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2882.4166870117188, |
|
"epoch": 0.0034285714285714284, |
|
"grad_norm": 0.18410934507846832, |
|
"kl": 3.517171717248857e-05, |
|
"learning_rate": 6e-08, |
|
"loss": 0.0, |
|
"reward": -0.291525443084538, |
|
"reward_std": 0.3761885389685631, |
|
"rewards/cosine_scaled_reward": -0.27076271921396255, |
|
"rewards/format_reward": 0.25, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 3245.416748046875, |
|
"epoch": 0.004571428571428572, |
|
"grad_norm": 0.16615347564220428, |
|
"kl": 2.9280781745910645e-05, |
|
"learning_rate": 8e-08, |
|
"loss": 0.0, |
|
"reward": -0.25264428183436394, |
|
"reward_std": 0.4561151713132858, |
|
"rewards/cosine_scaled_reward": -0.24090547114610672, |
|
"rewards/format_reward": 0.22916667349636555, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 2911.3334350585938, |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 0.21166956424713135, |
|
"kl": 3.331899642944336e-05, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0, |
|
"reward": 0.8400040492415428, |
|
"reward_std": 0.885560505092144, |
|
"rewards/cosine_scaled_reward": 0.18041866831481457, |
|
"rewards/format_reward": 0.479166679084301, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 2720.89599609375, |
|
"epoch": 0.006857142857142857, |
|
"grad_norm": 0.23326514661312103, |
|
"kl": 4.035234451293945e-05, |
|
"learning_rate": 1.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.2041575275361538, |
|
"reward_std": 0.6658071056008339, |
|
"rewards/cosine_scaled_reward": -0.11667125090025365, |
|
"rewards/format_reward": 0.4375, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 2360.6458740234375, |
|
"epoch": 0.008, |
|
"grad_norm": 0.2280312329530716, |
|
"kl": 1.850724220275879e-05, |
|
"learning_rate": 1.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.7341702952980995, |
|
"reward_std": 0.44598812609910965, |
|
"rewards/cosine_scaled_reward": 0.09625181555747986, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 2888.8750610351562, |
|
"epoch": 0.009142857142857144, |
|
"grad_norm": 0.20181676745414734, |
|
"kl": 2.9474496841430664e-05, |
|
"learning_rate": 1.6e-07, |
|
"loss": 0.0, |
|
"reward": -0.007411351427435875, |
|
"reward_std": 0.6588219478726387, |
|
"rewards/cosine_scaled_reward": -0.2016223482787609, |
|
"rewards/format_reward": 0.3958333395421505, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 3309.8541870117188, |
|
"epoch": 0.010285714285714285, |
|
"grad_norm": 0.18089492619037628, |
|
"kl": 4.0411949157714844e-05, |
|
"learning_rate": 1.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.08656559139490128, |
|
"reward_std": 0.7023323476314545, |
|
"rewards/cosine_scaled_reward": -0.060883864760398865, |
|
"rewards/format_reward": 0.2083333358168602, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 2354.729217529297, |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 0.2087497115135193, |
|
"kl": 3.822147846221924e-05, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0, |
|
"reward": 0.43211155757308006, |
|
"reward_std": 0.7549905180931091, |
|
"rewards/cosine_scaled_reward": -0.054777566343545914, |
|
"rewards/format_reward": 0.5416666679084301, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2589.041778564453, |
|
"epoch": 0.012571428571428572, |
|
"grad_norm": 0.2392556220293045, |
|
"kl": 4.547834396362305e-05, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 0.0, |
|
"reward": 0.570576427038759, |
|
"reward_std": 1.0850151628255844, |
|
"rewards/cosine_scaled_reward": -0.006378462538123131, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 2472.4375610351562, |
|
"epoch": 0.013714285714285714, |
|
"grad_norm": 0.2759428322315216, |
|
"kl": 2.5950372219085693e-05, |
|
"learning_rate": 2.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.7166856527328491, |
|
"reward_std": 0.7806050479412079, |
|
"rewards/cosine_scaled_reward": 0.056259457021951675, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 2164.854232788086, |
|
"epoch": 0.014857142857142857, |
|
"grad_norm": 0.1908554881811142, |
|
"kl": 2.492964267730713e-05, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.7641210779547691, |
|
"reward_std": 0.6774896830320358, |
|
"rewards/cosine_scaled_reward": 0.10081052035093307, |
|
"rewards/format_reward": 0.5625, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 2820.2501220703125, |
|
"epoch": 0.016, |
|
"grad_norm": 0.18801091611385345, |
|
"kl": 3.5822391510009766e-05, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.0, |
|
"reward": -0.07261240109801292, |
|
"reward_std": 0.6130082383751869, |
|
"rewards/cosine_scaled_reward": -0.2133895456790924, |
|
"rewards/format_reward": 0.3541666753590107, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 3089.104248046875, |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 0.19688484072685242, |
|
"kl": 3.3468008041381836e-05, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.27143352539860643, |
|
"reward_std": 0.8015548288822174, |
|
"rewards/cosine_scaled_reward": 0.0003000907599925995, |
|
"rewards/format_reward": 0.2708333469927311, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 2362.6876220703125, |
|
"epoch": 0.018285714285714287, |
|
"grad_norm": 0.24537329375743866, |
|
"kl": 2.1159648895263672e-05, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.7649998441338539, |
|
"reward_std": 1.021081954240799, |
|
"rewards/cosine_scaled_reward": 0.11166658625006676, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 3128.25, |
|
"epoch": 0.019428571428571427, |
|
"grad_norm": 0.20609241724014282, |
|
"kl": 4.242360591888428e-05, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 0.0, |
|
"reward": -0.19378644227981567, |
|
"reward_std": 0.5115947872400284, |
|
"rewards/cosine_scaled_reward": -0.21147656068205833, |
|
"rewards/format_reward": 0.2291666716337204, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 2980.9793090820312, |
|
"epoch": 0.02057142857142857, |
|
"grad_norm": 0.2714909315109253, |
|
"kl": 3.966689109802246e-05, |
|
"learning_rate": 3.6e-07, |
|
"loss": 0.0, |
|
"reward": -0.022395100444555283, |
|
"reward_std": 0.6723635420203209, |
|
"rewards/cosine_scaled_reward": -0.16744754649698734, |
|
"rewards/format_reward": 0.3125000037252903, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 3212.604248046875, |
|
"epoch": 0.021714285714285714, |
|
"grad_norm": 0.17160819470882416, |
|
"kl": 3.719329833984375e-05, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 0.0, |
|
"reward": 0.07102994620800018, |
|
"reward_std": 0.8850104063749313, |
|
"rewards/cosine_scaled_reward": -0.1207350417971611, |
|
"rewards/format_reward": 0.31250001303851604, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 2613.6041870117188, |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 0.24837209284305573, |
|
"kl": 3.3915042877197266e-05, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0, |
|
"reward": 0.33390188589692116, |
|
"reward_std": 0.713263601064682, |
|
"rewards/cosine_scaled_reward": -0.08304904773831367, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 2582.041748046875, |
|
"epoch": 0.024, |
|
"grad_norm": 0.2642858326435089, |
|
"kl": 2.1037645637989044e-05, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0, |
|
"reward": 0.2965797185897827, |
|
"reward_std": 0.5356749221682549, |
|
"rewards/cosine_scaled_reward": -0.06004347978159785, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 3307.6459350585938, |
|
"epoch": 0.025142857142857144, |
|
"grad_norm": 0.22419147193431854, |
|
"kl": 4.1961669921875e-05, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 0.0, |
|
"reward": 0.25835999101400375, |
|
"reward_std": 1.1261206567287445, |
|
"rewards/cosine_scaled_reward": -0.04790334962308407, |
|
"rewards/format_reward": 0.3541666716337204, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 3249.1876220703125, |
|
"epoch": 0.026285714285714287, |
|
"grad_norm": 0.19173863530158997, |
|
"kl": 4.4405460357666016e-05, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.27271851897239685, |
|
"reward_std": 0.7990642189979553, |
|
"rewards/cosine_scaled_reward": -0.04072406329214573, |
|
"rewards/format_reward": 0.3541666828095913, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 2154.25, |
|
"epoch": 0.027428571428571427, |
|
"grad_norm": 0.259212851524353, |
|
"kl": 1.8768012523651123e-05, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.5731075070798397, |
|
"reward_std": 0.8421577215194702, |
|
"rewards/cosine_scaled_reward": -0.02594624925404787, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 2784.7916870117188, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.29162946343421936, |
|
"kl": 3.090500831604004e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.1705078724771738, |
|
"reward_std": 0.6685621440410614, |
|
"rewards/cosine_scaled_reward": -0.08141273260116577, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 3185.729248046875, |
|
"epoch": 0.029714285714285714, |
|
"grad_norm": 0.15754370391368866, |
|
"kl": 2.549588680267334e-05, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.13516026688739657, |
|
"reward_std": 0.6272664293646812, |
|
"rewards/cosine_scaled_reward": -0.05741987004876137, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 3129.2083740234375, |
|
"epoch": 0.030857142857142857, |
|
"grad_norm": 0.16376672685146332, |
|
"kl": 2.86102294921875e-05, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.0651654414832592, |
|
"reward_std": 0.5805819556117058, |
|
"rewards/cosine_scaled_reward": -0.08200062438845634, |
|
"rewards/format_reward": 0.22916667722165585, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 3173.4791870117188, |
|
"epoch": 0.032, |
|
"grad_norm": 0.2187095433473587, |
|
"kl": 3.802776336669922e-05, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.1244891807436943, |
|
"reward_std": 0.8137174546718597, |
|
"rewards/cosine_scaled_reward": -0.07317209523171186, |
|
"rewards/format_reward": 0.27083334885537624, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 3206.0208740234375, |
|
"epoch": 0.03314285714285714, |
|
"grad_norm": 0.15626287460327148, |
|
"kl": 1.7024576663970947e-05, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0, |
|
"reward": -0.0882865646854043, |
|
"reward_std": 0.6182056441903114, |
|
"rewards/cosine_scaled_reward": -0.13789328234270215, |
|
"rewards/format_reward": 0.1875000111758709, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 3293.979248046875, |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 0.176454558968544, |
|
"kl": 2.8930604457855225e-05, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0, |
|
"reward": 0.12017922103404999, |
|
"reward_std": 0.8002806901931763, |
|
"rewards/cosine_scaled_reward": -0.10657705180346966, |
|
"rewards/format_reward": 0.33333334885537624, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2556.0625610351562, |
|
"epoch": 0.03542857142857143, |
|
"grad_norm": 0.2976699471473694, |
|
"kl": 3.6090612411499023e-05, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.13020960986614227, |
|
"reward_std": 0.5589020624756813, |
|
"rewards/cosine_scaled_reward": -0.15364519506692886, |
|
"rewards/format_reward": 0.43750001303851604, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 3466.125, |
|
"epoch": 0.036571428571428574, |
|
"grad_norm": 0.15761366486549377, |
|
"kl": 3.0994415283203125e-05, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0, |
|
"reward": -0.34774322621524334, |
|
"reward_std": 0.4613388404250145, |
|
"rewards/cosine_scaled_reward": -0.22595495358109474, |
|
"rewards/format_reward": 0.1041666679084301, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 3078.729248046875, |
|
"epoch": 0.037714285714285714, |
|
"grad_norm": 0.17744146287441254, |
|
"kl": 1.9311904907226562e-05, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.1523735709488392, |
|
"reward_std": 0.7702403217554092, |
|
"rewards/cosine_scaled_reward": -0.10089654847979546, |
|
"rewards/format_reward": 0.354166679084301, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 3068.2083740234375, |
|
"epoch": 0.038857142857142854, |
|
"grad_norm": 0.2183830887079239, |
|
"kl": 1.940131187438965e-05, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.029434625059366226, |
|
"reward_std": 0.7817529812455177, |
|
"rewards/cosine_scaled_reward": -0.1415326923597604, |
|
"rewards/format_reward": 0.3125000111758709, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 3028.916748046875, |
|
"epoch": 0.04, |
|
"grad_norm": 0.1779097616672516, |
|
"kl": 2.0578503608703613e-05, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0, |
|
"reward": 0.2327469252049923, |
|
"reward_std": 0.9538670182228088, |
|
"rewards/cosine_scaled_reward": -0.09195987693965435, |
|
"rewards/format_reward": 0.41666667349636555, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 2689.3959350585938, |
|
"epoch": 0.04114285714285714, |
|
"grad_norm": 0.16330143809318542, |
|
"kl": 4.976987838745117e-05, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.5622920989990234, |
|
"reward_std": 0.39920446276664734, |
|
"rewards/cosine_scaled_reward": 0.07281268946826458, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 2801.6459045410156, |
|
"epoch": 0.04228571428571429, |
|
"grad_norm": 0.19838035106658936, |
|
"kl": 4.4226646423339844e-05, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.41371238604187965, |
|
"reward_std": 0.575165145099163, |
|
"rewards/cosine_scaled_reward": -0.07439382094889879, |
|
"rewards/format_reward": 0.5625000149011612, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 3009.5000610351562, |
|
"epoch": 0.04342857142857143, |
|
"grad_norm": 0.1800134778022766, |
|
"kl": 6.097555160522461e-05, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.12262389855459332, |
|
"reward_std": 0.6652626991271973, |
|
"rewards/cosine_scaled_reward": -0.15743806213140488, |
|
"rewards/format_reward": 0.4375000149011612, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 3139.604248046875, |
|
"epoch": 0.044571428571428574, |
|
"grad_norm": 0.23411938548088074, |
|
"kl": 4.2323023080825806e-05, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.24589091911911964, |
|
"reward_std": 0.8911770880222321, |
|
"rewards/cosine_scaled_reward": -0.07497121207416058, |
|
"rewards/format_reward": 0.3958333395421505, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 3011.8958740234375, |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.1625184565782547, |
|
"kl": 4.693865776062012e-05, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0, |
|
"reward": 0.12772860191762447, |
|
"reward_std": 0.7576778829097748, |
|
"rewards/cosine_scaled_reward": -0.12363571301102638, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 3124.2500610351562, |
|
"epoch": 0.046857142857142854, |
|
"grad_norm": 0.17387458682060242, |
|
"kl": 1.8164515495300293e-05, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.21644378546625376, |
|
"reward_std": 0.6694483831524849, |
|
"rewards/cosine_scaled_reward": -0.06886144913733006, |
|
"rewards/format_reward": 0.354166679084301, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 2182.5000610351562, |
|
"epoch": 0.048, |
|
"grad_norm": 0.34351351857185364, |
|
"kl": 0.00022931396961212158, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.6580724753439426, |
|
"reward_std": 0.8123672902584076, |
|
"rewards/cosine_scaled_reward": 0.016536223702132702, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 2828.7709350585938, |
|
"epoch": 0.04914285714285714, |
|
"grad_norm": 0.19178950786590576, |
|
"kl": 3.0837953090667725e-05, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.31538891326636076, |
|
"reward_std": 0.8877717405557632, |
|
"rewards/cosine_scaled_reward": -0.06105554662644863, |
|
"rewards/format_reward": 0.4375000149011612, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 3036.6250610351562, |
|
"epoch": 0.05028571428571429, |
|
"grad_norm": 0.22349753975868225, |
|
"kl": 0.0003269314765930176, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0, |
|
"reward": -0.01474527781829238, |
|
"reward_std": 0.5097765326499939, |
|
"rewards/cosine_scaled_reward": -0.15320597402751446, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2778.5000610351562, |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.18280261754989624, |
|
"kl": 5.6609511375427246e-05, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0, |
|
"reward": 0.8612850233912468, |
|
"reward_std": 1.1412108689546585, |
|
"rewards/cosine_scaled_reward": 0.13897587358951569, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2998.229248046875, |
|
"epoch": 0.052571428571428575, |
|
"grad_norm": 0.16858145594596863, |
|
"kl": 5.383044481277466e-05, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.28540395572781563, |
|
"reward_std": 0.43213801458477974, |
|
"rewards/cosine_scaled_reward": -0.02396468259394169, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 2551.6458587646484, |
|
"epoch": 0.053714285714285714, |
|
"grad_norm": 0.23799559473991394, |
|
"kl": 9.156018495559692e-05, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.44500812888145447, |
|
"reward_std": 0.783466711640358, |
|
"rewards/cosine_scaled_reward": -0.04832928255200386, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 2939.1041870117188, |
|
"epoch": 0.054857142857142854, |
|
"grad_norm": 0.18564291298389435, |
|
"kl": 0.00010335445404052734, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.34564049541950226, |
|
"reward_std": 0.9494538530707359, |
|
"rewards/cosine_scaled_reward": -0.02509642019867897, |
|
"rewards/format_reward": 0.3958333395421505, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 2282.895835876465, |
|
"epoch": 0.056, |
|
"grad_norm": 0.23054172098636627, |
|
"kl": 0.00024145841598510742, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.44885391741991043, |
|
"reward_std": 0.7631752789020538, |
|
"rewards/cosine_scaled_reward": -0.06723971478641033, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 2204.1041870117188, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.29597678780555725, |
|
"kl": 0.0005988925695419312, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0, |
|
"reward": 0.5718545913696289, |
|
"reward_std": 0.6146985068917274, |
|
"rewards/cosine_scaled_reward": 0.02551062125712633, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 3201.6666870117188, |
|
"epoch": 0.05828571428571429, |
|
"grad_norm": 0.15334689617156982, |
|
"kl": 8.338689804077148e-05, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.0, |
|
"reward": 0.14526839554309845, |
|
"reward_std": 0.8655073121190071, |
|
"rewards/cosine_scaled_reward": -0.07319913152605295, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 3215.0000610351562, |
|
"epoch": 0.05942857142857143, |
|
"grad_norm": 0.17531076073646545, |
|
"kl": 0.0001531541347503662, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": 0.0, |
|
"reward": -0.015788130462169647, |
|
"reward_std": 0.7165202274918556, |
|
"rewards/cosine_scaled_reward": -0.13289407594129443, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 2991.8125610351562, |
|
"epoch": 0.060571428571428575, |
|
"grad_norm": 0.28014782071113586, |
|
"kl": 0.000295490026473999, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": 0.0, |
|
"reward": -0.08945630304515362, |
|
"reward_std": 0.6164149194955826, |
|
"rewards/cosine_scaled_reward": -0.1905614770948887, |
|
"rewards/format_reward": 0.2916666828095913, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 2437.4375, |
|
"epoch": 0.061714285714285715, |
|
"grad_norm": 0.21041618287563324, |
|
"kl": 0.00014019012451171875, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": 0.0, |
|
"reward": 0.08023202046751976, |
|
"reward_std": 0.43379058688879013, |
|
"rewards/cosine_scaled_reward": -0.18905067443847656, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 3115.2501220703125, |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 0.18965038657188416, |
|
"kl": 0.00021153688430786133, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0, |
|
"reward": 0.14397013932466507, |
|
"reward_std": 0.7335802316665649, |
|
"rewards/cosine_scaled_reward": -0.07384827360510826, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 3097.0625, |
|
"epoch": 0.064, |
|
"grad_norm": 0.1861303448677063, |
|
"kl": 0.0006959438323974609, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0, |
|
"reward": 0.21384139358997345, |
|
"reward_std": 0.7854617610573769, |
|
"rewards/cosine_scaled_reward": -0.038912639021873474, |
|
"rewards/format_reward": 0.29166667722165585, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 2959.479248046875, |
|
"epoch": 0.06514285714285714, |
|
"grad_norm": 0.17092932760715485, |
|
"kl": 0.0004864931106567383, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0, |
|
"reward": -0.0896548442542553, |
|
"reward_std": 0.597286906093359, |
|
"rewards/cosine_scaled_reward": -0.22191076539456844, |
|
"rewards/format_reward": 0.3541666716337204, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 3173.9583740234375, |
|
"epoch": 0.06628571428571428, |
|
"grad_norm": 0.16764329373836517, |
|
"kl": 0.0009320974349975586, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0, |
|
"reward": -0.259409268386662, |
|
"reward_std": 0.41252440214157104, |
|
"rewards/cosine_scaled_reward": -0.22345462441444397, |
|
"rewards/format_reward": 0.1875, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 2903.4375610351562, |
|
"epoch": 0.06742857142857143, |
|
"grad_norm": 0.2438689023256302, |
|
"kl": 0.0004966259002685547, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0, |
|
"reward": 0.5037799216806889, |
|
"reward_std": 0.6180380508303642, |
|
"rewards/cosine_scaled_reward": 0.05397331342101097, |
|
"rewards/format_reward": 0.39583334513008595, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 2846.9375610351562, |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.19560052454471588, |
|
"kl": 0.0006766319274902344, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0, |
|
"reward": 0.619435504078865, |
|
"reward_std": 0.54927659034729, |
|
"rewards/cosine_scaled_reward": 0.12221772782504559, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 2432.729248046875, |
|
"epoch": 0.06971428571428571, |
|
"grad_norm": 0.18966424465179443, |
|
"kl": 0.00019347667694091797, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.0, |
|
"reward": 0.8035399168729782, |
|
"reward_std": 0.6529746800661087, |
|
"rewards/cosine_scaled_reward": 0.12051995098590851, |
|
"rewards/format_reward": 0.5625000149011612, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 3356.4583740234375, |
|
"epoch": 0.07085714285714285, |
|
"grad_norm": 0.1480625420808792, |
|
"kl": 0.0005307793617248535, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": 0.0, |
|
"reward": 0.33844682574272156, |
|
"reward_std": 0.7905219346284866, |
|
"rewards/cosine_scaled_reward": 0.023390088230371475, |
|
"rewards/format_reward": 0.29166667722165585, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 1956.7084350585938, |
|
"epoch": 0.072, |
|
"grad_norm": 0.26979929208755493, |
|
"kl": 0.006456255912780762, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7974750846624374, |
|
"reward_std": 0.7315621674060822, |
|
"rewards/cosine_scaled_reward": 0.05498753860592842, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 2895.8958740234375, |
|
"epoch": 0.07314285714285715, |
|
"grad_norm": 0.14358438551425934, |
|
"kl": 0.00026297569274902344, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0, |
|
"reward": 0.6833263337612152, |
|
"reward_std": 0.5810272544622421, |
|
"rewards/cosine_scaled_reward": 0.12291315197944641, |
|
"rewards/format_reward": 0.4375000149011612, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 2582.1875610351562, |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 0.1719641089439392, |
|
"kl": 0.0006394386291503906, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0, |
|
"reward": 0.4943835213780403, |
|
"reward_std": 0.9102021306753159, |
|
"rewards/cosine_scaled_reward": -0.002808244898915291, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 3066.2709350585938, |
|
"epoch": 0.07542857142857143, |
|
"grad_norm": 0.15914808213710785, |
|
"kl": 0.00035455822944641113, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0, |
|
"reward": 0.270260289311409, |
|
"reward_std": 0.6933658719062805, |
|
"rewards/cosine_scaled_reward": -0.10445320140570402, |
|
"rewards/format_reward": 0.47916667722165585, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 3021.1666870117188, |
|
"epoch": 0.07657142857142857, |
|
"grad_norm": 0.15889614820480347, |
|
"kl": 0.0007028579711914062, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0, |
|
"reward": 0.2754221335053444, |
|
"reward_std": 0.6702268719673157, |
|
"rewards/cosine_scaled_reward": -0.028955606278032064, |
|
"rewards/format_reward": 0.33333334140479565, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 2520.7291870117188, |
|
"epoch": 0.07771428571428571, |
|
"grad_norm": 0.25743117928504944, |
|
"kl": 0.001796722412109375, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2481890171766281, |
|
"reward_std": 0.4365886226296425, |
|
"rewards/cosine_scaled_reward": -0.06340551376342773, |
|
"rewards/format_reward": 0.375, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 2519.2500610351562, |
|
"epoch": 0.07885714285714286, |
|
"grad_norm": 0.1728557050228119, |
|
"kl": 0.0004031658172607422, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": 0.0, |
|
"reward": 0.4316702373325825, |
|
"reward_std": 0.455346904695034, |
|
"rewards/cosine_scaled_reward": -0.02374822273850441, |
|
"rewards/format_reward": 0.4791666716337204, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 3165.7708740234375, |
|
"epoch": 0.08, |
|
"grad_norm": 0.13548843562602997, |
|
"kl": 0.0004048347473144531, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0, |
|
"reward": 0.46206507831811905, |
|
"reward_std": 0.8794394135475159, |
|
"rewards/cosine_scaled_reward": -0.008550799917429686, |
|
"rewards/format_reward": 0.4791666716337204, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 2529.8751220703125, |
|
"epoch": 0.08114285714285714, |
|
"grad_norm": 0.22490063309669495, |
|
"kl": 0.003349781036376953, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5197920426726341, |
|
"reward_std": 0.823016032576561, |
|
"rewards/cosine_scaled_reward": -0.010937327519059181, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 2654.4791870117188, |
|
"epoch": 0.08228571428571428, |
|
"grad_norm": 0.20955628156661987, |
|
"kl": 0.0012530684471130371, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7414695173501968, |
|
"reward_std": 0.5663128644227982, |
|
"rewards/cosine_scaled_reward": 0.05823474656790495, |
|
"rewards/format_reward": 0.625, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 1640.041748046875, |
|
"epoch": 0.08342857142857144, |
|
"grad_norm": 0.2790416479110718, |
|
"kl": 0.005096435546875, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0002, |
|
"reward": 0.91811203956604, |
|
"reward_std": 0.8141122311353683, |
|
"rewards/cosine_scaled_reward": 0.08405601512640715, |
|
"rewards/format_reward": 0.75, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 2470.5833740234375, |
|
"epoch": 0.08457142857142858, |
|
"grad_norm": 0.1816757470369339, |
|
"kl": 0.001283407211303711, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.0001, |
|
"reward": 0.595103541854769, |
|
"reward_std": 0.6585821881890297, |
|
"rewards/cosine_scaled_reward": 0.01630176231265068, |
|
"rewards/format_reward": 0.5625000074505806, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2660.8750915527344, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.2641614079475403, |
|
"kl": 0.0019817352294921875, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2930721901357174, |
|
"reward_std": 0.7745417281985283, |
|
"rewards/cosine_scaled_reward": -0.06179725006222725, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 2939.604248046875, |
|
"epoch": 0.08685714285714285, |
|
"grad_norm": 0.3027961552143097, |
|
"kl": 0.0008873939514160156, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0, |
|
"reward": 0.3248658664524555, |
|
"reward_std": 0.909210205078125, |
|
"rewards/cosine_scaled_reward": -0.014650408178567886, |
|
"rewards/format_reward": 0.3541666828095913, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 3152.1666870117188, |
|
"epoch": 0.088, |
|
"grad_norm": 0.16062307357788086, |
|
"kl": 0.001007080078125, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": 0.0, |
|
"reward": 0.11907588690519333, |
|
"reward_std": 0.613786868751049, |
|
"rewards/cosine_scaled_reward": -0.06546205282211304, |
|
"rewards/format_reward": 0.25000000558793545, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 2635.041717529297, |
|
"epoch": 0.08914285714285715, |
|
"grad_norm": 0.19166067242622375, |
|
"kl": 0.0008903741836547852, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0, |
|
"reward": 0.5825019255280495, |
|
"reward_std": 0.7261854261159897, |
|
"rewards/cosine_scaled_reward": 0.06208430230617523, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 2755.5209350585938, |
|
"epoch": 0.09028571428571429, |
|
"grad_norm": 0.17263904213905334, |
|
"kl": 0.0004019737243652344, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": 0.0, |
|
"reward": 0.3458161875605583, |
|
"reward_std": 0.717200756072998, |
|
"rewards/cosine_scaled_reward": -0.056258589029312134, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 3565.4375, |
|
"epoch": 0.09142857142857143, |
|
"grad_norm": 0.1458665281534195, |
|
"kl": 0.00030177831649780273, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0, |
|
"reward": -0.254756236448884, |
|
"reward_std": 0.5783224925398827, |
|
"rewards/cosine_scaled_reward": -0.17946145310997963, |
|
"rewards/format_reward": 0.10416666977107525, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 3124.291748046875, |
|
"epoch": 0.09257142857142857, |
|
"grad_norm": 0.17081034183502197, |
|
"kl": 0.0008420944213867188, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": 0.0, |
|
"reward": 0.06758889555931091, |
|
"reward_std": 0.7439121454954147, |
|
"rewards/cosine_scaled_reward": -0.10162222757935524, |
|
"rewards/format_reward": 0.27083333767950535, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 3004.0833740234375, |
|
"epoch": 0.09371428571428571, |
|
"grad_norm": 0.17911851406097412, |
|
"kl": 0.0006622076034545898, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": 0.0, |
|
"reward": 0.3707614615559578, |
|
"reward_std": 0.8215866684913635, |
|
"rewards/cosine_scaled_reward": -0.0437859346420737, |
|
"rewards/format_reward": 0.45833334885537624, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 3440.3125, |
|
"epoch": 0.09485714285714286, |
|
"grad_norm": 0.15295840799808502, |
|
"kl": 0.00017774105072021484, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.0, |
|
"reward": -0.39509791135787964, |
|
"reward_std": 0.5668186843395233, |
|
"rewards/cosine_scaled_reward": -0.260048970580101, |
|
"rewards/format_reward": 0.125, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 2530.7083740234375, |
|
"epoch": 0.096, |
|
"grad_norm": 0.2006414830684662, |
|
"kl": 0.0003807544708251953, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0, |
|
"reward": 0.7480560662224889, |
|
"reward_std": 1.0157663226127625, |
|
"rewards/cosine_scaled_reward": 0.0927780270576477, |
|
"rewards/format_reward": 0.5625000149011612, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 3185.8541870117188, |
|
"epoch": 0.09714285714285714, |
|
"grad_norm": 0.17822831869125366, |
|
"kl": 0.0009975433349609375, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0, |
|
"reward": 0.07128806412220001, |
|
"reward_std": 0.8152596428990364, |
|
"rewards/cosine_scaled_reward": -0.08935598330572248, |
|
"rewards/format_reward": 0.2500000111758709, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 2827.5625610351562, |
|
"epoch": 0.09828571428571428, |
|
"grad_norm": 0.1663668006658554, |
|
"kl": 0.0010325908660888672, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": 0.0, |
|
"reward": 0.19359283335506916, |
|
"reward_std": 0.644717700779438, |
|
"rewards/cosine_scaled_reward": -0.10112026333808899, |
|
"rewards/format_reward": 0.3958333544433117, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 2942.5, |
|
"epoch": 0.09942857142857142, |
|
"grad_norm": 0.1894853264093399, |
|
"kl": 0.0014657974243164062, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1782783716917038, |
|
"reward_std": 0.7724725604057312, |
|
"rewards/cosine_scaled_reward": -0.08794412622228265, |
|
"rewards/format_reward": 0.3541666716337204, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 2836.3959350585938, |
|
"epoch": 0.10057142857142858, |
|
"grad_norm": 0.1908150315284729, |
|
"kl": 0.0014390945434570312, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3470733240246773, |
|
"reward_std": 0.8534664362668991, |
|
"rewards/cosine_scaled_reward": -0.024379996582865715, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 3278.6041870117188, |
|
"epoch": 0.10171428571428572, |
|
"grad_norm": 0.1539601981639862, |
|
"kl": 0.0004895925521850586, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": 0.0, |
|
"reward": 0.30410441011190414, |
|
"reward_std": 0.6761599257588387, |
|
"rewards/cosine_scaled_reward": 0.006218895316123962, |
|
"rewards/format_reward": 0.29166667722165585, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 2985.3958740234375, |
|
"epoch": 0.10285714285714286, |
|
"grad_norm": 0.17722909152507782, |
|
"kl": 0.00044274330139160156, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0, |
|
"reward": 0.2143753319978714, |
|
"reward_std": 0.6936175674200058, |
|
"rewards/cosine_scaled_reward": -0.0594789981842041, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 3058.5000610351562, |
|
"epoch": 0.104, |
|
"grad_norm": 0.19192735850811005, |
|
"kl": 0.0004374980926513672, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0, |
|
"reward": 0.302869388833642, |
|
"reward_std": 0.5636695921421051, |
|
"rewards/cosine_scaled_reward": -0.04648197069764137, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 2699.6250610351562, |
|
"epoch": 0.10514285714285715, |
|
"grad_norm": 0.17412729561328888, |
|
"kl": 0.0012140274047851562, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0, |
|
"reward": 0.5564358681440353, |
|
"reward_std": 0.717531181871891, |
|
"rewards/cosine_scaled_reward": 0.059467924758791924, |
|
"rewards/format_reward": 0.4375000111758709, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 2346.1250610351562, |
|
"epoch": 0.10628571428571429, |
|
"grad_norm": 0.2216739058494568, |
|
"kl": 0.0015277862548828125, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5152877140790224, |
|
"reward_std": 0.6053595095872879, |
|
"rewards/cosine_scaled_reward": -0.0027728192508220673, |
|
"rewards/format_reward": 0.5208333358168602, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 3211.8333740234375, |
|
"epoch": 0.10742857142857143, |
|
"grad_norm": 0.18879717588424683, |
|
"kl": 0.000827789306640625, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.0, |
|
"reward": -0.015797210857272148, |
|
"reward_std": 0.735307015478611, |
|
"rewards/cosine_scaled_reward": -0.13289860635995865, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 3198.1459350585938, |
|
"epoch": 0.10857142857142857, |
|
"grad_norm": 0.15773996710777283, |
|
"kl": 0.00040841102600097656, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0, |
|
"reward": -0.016605263575911522, |
|
"reward_std": 0.7409057542681694, |
|
"rewards/cosine_scaled_reward": -0.16455264016985893, |
|
"rewards/format_reward": 0.3125000037252903, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 2382.2084350585938, |
|
"epoch": 0.10971428571428571, |
|
"grad_norm": 0.2195434868335724, |
|
"kl": 0.0015625953674316406, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3714839336462319, |
|
"reward_std": 0.5286353975534439, |
|
"rewards/cosine_scaled_reward": -0.08509137481451035, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 2969.479248046875, |
|
"epoch": 0.11085714285714286, |
|
"grad_norm": 0.22521458566188812, |
|
"kl": 0.0011968612670898438, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0, |
|
"reward": 0.023297425359487534, |
|
"reward_std": 0.6409126222133636, |
|
"rewards/cosine_scaled_reward": -0.14460130035877228, |
|
"rewards/format_reward": 0.3125000111758709, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 2782.479248046875, |
|
"epoch": 0.112, |
|
"grad_norm": 0.48463404178619385, |
|
"kl": 0.01540231704711914, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0006, |
|
"reward": 0.2789543569087982, |
|
"reward_std": 0.6075774282217026, |
|
"rewards/cosine_scaled_reward": -0.05843949131667614, |
|
"rewards/format_reward": 0.39583333395421505, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 2909.1458435058594, |
|
"epoch": 0.11314285714285714, |
|
"grad_norm": 0.19729964435100555, |
|
"kl": 0.000751495361328125, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": 0.0, |
|
"reward": 0.3117349073290825, |
|
"reward_std": 0.5036360248923302, |
|
"rewards/cosine_scaled_reward": -0.010799217969179153, |
|
"rewards/format_reward": 0.33333333395421505, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 3006.9375610351562, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.1744341254234314, |
|
"kl": 0.0009255409240722656, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0, |
|
"reward": 0.4609271613880992, |
|
"reward_std": 0.859523817896843, |
|
"rewards/cosine_scaled_reward": 0.022130253724753857, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 2650.979217529297, |
|
"epoch": 0.11542857142857142, |
|
"grad_norm": 0.18743358552455902, |
|
"kl": 0.0010256767272949219, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.0, |
|
"reward": 0.21766437217593193, |
|
"reward_std": 0.6801646202802658, |
|
"rewards/cosine_scaled_reward": -0.09950115904211998, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 2554.1875610351562, |
|
"epoch": 0.11657142857142858, |
|
"grad_norm": 0.18099477887153625, |
|
"kl": 0.001209259033203125, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.0, |
|
"reward": 0.29797927755862474, |
|
"reward_std": 0.4223637208342552, |
|
"rewards/cosine_scaled_reward": -0.10101036727428436, |
|
"rewards/format_reward": 0.5, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 2658.979248046875, |
|
"epoch": 0.11771428571428572, |
|
"grad_norm": 0.15931963920593262, |
|
"kl": 0.0010652542114257812, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": 0.0, |
|
"reward": 0.23173093050718307, |
|
"reward_std": 0.6561538353562355, |
|
"rewards/cosine_scaled_reward": -0.10288454219698906, |
|
"rewards/format_reward": 0.4375, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 2505.5833740234375, |
|
"epoch": 0.11885714285714286, |
|
"grad_norm": 0.18099236488342285, |
|
"kl": 0.0005369186401367188, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0, |
|
"reward": 0.9495851993560791, |
|
"reward_std": 0.7366478592157364, |
|
"rewards/cosine_scaled_reward": 0.19354257080703974, |
|
"rewards/format_reward": 0.5625000149011612, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 2778.3125, |
|
"epoch": 0.12, |
|
"grad_norm": 0.20704622566699982, |
|
"kl": 0.0013036727905273438, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0001, |
|
"reward": 0.24978191778063774, |
|
"reward_std": 0.7690765783190727, |
|
"rewards/cosine_scaled_reward": -0.09385904669761658, |
|
"rewards/format_reward": 0.4375000074505806, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 2914.6250610351562, |
|
"epoch": 0.12114285714285715, |
|
"grad_norm": 0.1921072155237198, |
|
"kl": 0.001560211181640625, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3622821723110974, |
|
"reward_std": 0.912909746170044, |
|
"rewards/cosine_scaled_reward": -0.01677557732909918, |
|
"rewards/format_reward": 0.3958333395421505, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 2109.354248046875, |
|
"epoch": 0.12228571428571429, |
|
"grad_norm": 0.2092333436012268, |
|
"kl": 0.0012311935424804688, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.0, |
|
"reward": 0.5422526616603136, |
|
"reward_std": 0.8620414137840271, |
|
"rewards/cosine_scaled_reward": -0.062207008711993694, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 2214.9583587646484, |
|
"epoch": 0.12342857142857143, |
|
"grad_norm": 0.2060472071170807, |
|
"kl": 0.0030279159545898438, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6110497042536736, |
|
"reward_std": 0.7519760131835938, |
|
"rewards/cosine_scaled_reward": 0.03469152469187975, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2803.2083740234375, |
|
"epoch": 0.12457142857142857, |
|
"grad_norm": 0.20012950897216797, |
|
"kl": 0.0010724067687988281, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": 0.0, |
|
"reward": 0.08136957883834839, |
|
"reward_std": 0.6037605702877045, |
|
"rewards/cosine_scaled_reward": -0.15723188465926796, |
|
"rewards/format_reward": 0.3958333395421505, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 2464.7500610351562, |
|
"epoch": 0.12571428571428572, |
|
"grad_norm": 0.22272928059101105, |
|
"kl": 0.0024871826171875, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9461969807744026, |
|
"reward_std": 0.842521145939827, |
|
"rewards/cosine_scaled_reward": 0.1710151496808976, |
|
"rewards/format_reward": 0.604166679084301, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 1819.3750610351562, |
|
"epoch": 0.12685714285714286, |
|
"grad_norm": 0.24292264878749847, |
|
"kl": 0.0017080307006835938, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7363898158073425, |
|
"reward_std": 0.7160477414727211, |
|
"rewards/cosine_scaled_reward": 0.003611571155488491, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 2938.8333740234375, |
|
"epoch": 0.128, |
|
"grad_norm": 0.20110748708248138, |
|
"kl": 0.0015575885772705078, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": 0.0001, |
|
"reward": 0.47850653529167175, |
|
"reward_std": 0.8684659749269485, |
|
"rewards/cosine_scaled_reward": 0.020503249019384384, |
|
"rewards/format_reward": 0.4375000149011612, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 2170.791717529297, |
|
"epoch": 0.12914285714285714, |
|
"grad_norm": 0.22352100908756256, |
|
"kl": 0.0018672943115234375, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8810203373432159, |
|
"reward_std": 0.6223750859498978, |
|
"rewards/cosine_scaled_reward": 0.09676016308367252, |
|
"rewards/format_reward": 0.6875000074505806, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1761.1250457763672, |
|
"epoch": 0.13028571428571428, |
|
"grad_norm": 0.22250227630138397, |
|
"kl": 0.0011968612670898438, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": 0.0, |
|
"reward": 0.8257871624082327, |
|
"reward_std": 0.5129944495856762, |
|
"rewards/cosine_scaled_reward": 0.08997690677642822, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 2406.6875610351562, |
|
"epoch": 0.13142857142857142, |
|
"grad_norm": 0.18063588440418243, |
|
"kl": 0.0014677047729492188, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7925823777914047, |
|
"reward_std": 1.05247762799263, |
|
"rewards/cosine_scaled_reward": 0.08379119075834751, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 2417.8958435058594, |
|
"epoch": 0.13257142857142856, |
|
"grad_norm": 0.23391500115394592, |
|
"kl": 0.004772186279296875, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.0002, |
|
"reward": 0.15583854354918003, |
|
"reward_std": 0.6483651623129845, |
|
"rewards/cosine_scaled_reward": -0.14083073096117005, |
|
"rewards/format_reward": 0.4375, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 3263.666748046875, |
|
"epoch": 0.1337142857142857, |
|
"grad_norm": 0.1551298350095749, |
|
"kl": 0.0015048980712890625, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0001, |
|
"reward": -0.017866918817162514, |
|
"reward_std": 0.6443519741296768, |
|
"rewards/cosine_scaled_reward": -0.10268345987424254, |
|
"rewards/format_reward": 0.18750000558793545, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 2010.8750305175781, |
|
"epoch": 0.13485714285714287, |
|
"grad_norm": 0.21352525055408478, |
|
"kl": 0.001308441162109375, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8165969103574753, |
|
"reward_std": 0.8080126643180847, |
|
"rewards/cosine_scaled_reward": 0.012465095147490501, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 1884.0625610351562, |
|
"epoch": 0.136, |
|
"grad_norm": 0.27621325850486755, |
|
"kl": 0.0025424957275390625, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5952838063240051, |
|
"reward_std": 0.5625797361135483, |
|
"rewards/cosine_scaled_reward": -0.046108097303658724, |
|
"rewards/format_reward": 0.6875, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 2755.3959350585938, |
|
"epoch": 0.13714285714285715, |
|
"grad_norm": 0.23236262798309326, |
|
"kl": 0.0017757415771484375, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0001, |
|
"reward": 0.063610197044909, |
|
"reward_std": 0.8038829490542412, |
|
"rewards/cosine_scaled_reward": -0.18694491172209382, |
|
"rewards/format_reward": 0.4375000074505806, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 2782.5833740234375, |
|
"epoch": 0.1382857142857143, |
|
"grad_norm": 0.186203733086586, |
|
"kl": 0.0014486312866210938, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2508644163608551, |
|
"reward_std": 0.5808881223201752, |
|
"rewards/cosine_scaled_reward": -0.09331781789660454, |
|
"rewards/format_reward": 0.4375000111758709, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 2584.0625915527344, |
|
"epoch": 0.13942857142857143, |
|
"grad_norm": 0.2748485803604126, |
|
"kl": 0.0027008056640625, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": 0.0001, |
|
"reward": 0.24610598012804985, |
|
"reward_std": 0.4119979292154312, |
|
"rewards/cosine_scaled_reward": -0.11653035134077072, |
|
"rewards/format_reward": 0.47916667722165585, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 2821.416748046875, |
|
"epoch": 0.14057142857142857, |
|
"grad_norm": 0.27934524416923523, |
|
"kl": 0.0057964324951171875, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.0002, |
|
"reward": 0.10106497257947922, |
|
"reward_std": 0.7212768346071243, |
|
"rewards/cosine_scaled_reward": -0.10571751650422812, |
|
"rewards/format_reward": 0.31250000186264515, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 2463.0000915527344, |
|
"epoch": 0.1417142857142857, |
|
"grad_norm": 0.22744229435920715, |
|
"kl": 0.0014429092407226562, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8311970978975296, |
|
"reward_std": 0.8409435376524925, |
|
"rewards/cosine_scaled_reward": 0.11351519823074341, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 3028.6459350585938, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.230963334441185, |
|
"kl": 0.0016689300537109375, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0001, |
|
"reward": 0.08886492438614368, |
|
"reward_std": 0.5733988359570503, |
|
"rewards/cosine_scaled_reward": -0.13265088573098183, |
|
"rewards/format_reward": 0.35416666977107525, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 1943.6667175292969, |
|
"epoch": 0.144, |
|
"grad_norm": 0.26326608657836914, |
|
"kl": 0.0029306411743164062, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6195674315094948, |
|
"reward_std": 0.7148094028234482, |
|
"rewards/cosine_scaled_reward": -0.03396627772599459, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 2635.9166870117188, |
|
"epoch": 0.14514285714285713, |
|
"grad_norm": 0.2009022980928421, |
|
"kl": 0.0012750625610351562, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": 0.0001, |
|
"reward": 0.41267674416303635, |
|
"reward_std": 0.8958253264427185, |
|
"rewards/cosine_scaled_reward": -0.022828295826911926, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 2321.2500610351562, |
|
"epoch": 0.1462857142857143, |
|
"grad_norm": 0.19144511222839355, |
|
"kl": 0.00170135498046875, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5039072521030903, |
|
"reward_std": 0.8606824576854706, |
|
"rewards/cosine_scaled_reward": -0.029296381399035454, |
|
"rewards/format_reward": 0.5625000149011612, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 1837.7917175292969, |
|
"epoch": 0.14742857142857144, |
|
"grad_norm": 0.22352828085422516, |
|
"kl": 0.0069904327392578125, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5292131304740906, |
|
"reward_std": 0.6835447549819946, |
|
"rewards/cosine_scaled_reward": -0.09997677942737937, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 2737.7918090820312, |
|
"epoch": 0.14857142857142858, |
|
"grad_norm": 0.21162550151348114, |
|
"kl": 0.00159454345703125, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1000329963862896, |
|
"reward_std": 0.6349897980690002, |
|
"rewards/cosine_scaled_reward": -0.15831685485318303, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 2639.3750610351562, |
|
"epoch": 0.14971428571428572, |
|
"grad_norm": 0.21849536895751953, |
|
"kl": 0.001689910888671875, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3575108479708433, |
|
"reward_std": 0.7335042506456375, |
|
"rewards/cosine_scaled_reward": -0.07124457694590092, |
|
"rewards/format_reward": 0.5000000055879354, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 2595.791748046875, |
|
"epoch": 0.15085714285714286, |
|
"grad_norm": 0.2819630801677704, |
|
"kl": 0.0028476715087890625, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.0001, |
|
"reward": -0.14514993596822023, |
|
"reward_std": 0.4842909276485443, |
|
"rewards/cosine_scaled_reward": -0.2704916410148144, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 2752.2084350585938, |
|
"epoch": 0.152, |
|
"grad_norm": 0.20234017074108124, |
|
"kl": 0.002986907958984375, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0001, |
|
"reward": 0.15741661936044693, |
|
"reward_std": 0.6936222016811371, |
|
"rewards/cosine_scaled_reward": -0.15045835822820663, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1920.0000610351562, |
|
"epoch": 0.15314285714285714, |
|
"grad_norm": 0.30035078525543213, |
|
"kl": 0.00372314453125, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7273320555686951, |
|
"reward_std": 0.7046244740486145, |
|
"rewards/cosine_scaled_reward": 0.01991601102054119, |
|
"rewards/format_reward": 0.6875, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2167.8334350585938, |
|
"epoch": 0.15428571428571428, |
|
"grad_norm": 0.2100658118724823, |
|
"kl": 0.0017566680908203125, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0001, |
|
"reward": 0.4464118145406246, |
|
"reward_std": 0.3928603231906891, |
|
"rewards/cosine_scaled_reward": -0.03721076436340809, |
|
"rewards/format_reward": 0.520833333954215, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 2357.2084350585938, |
|
"epoch": 0.15542857142857142, |
|
"grad_norm": 0.24143747985363007, |
|
"kl": 0.00250244140625, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6343938559293747, |
|
"reward_std": 0.7614049315452576, |
|
"rewards/cosine_scaled_reward": 0.004696924239397049, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 2482.1875610351562, |
|
"epoch": 0.15657142857142858, |
|
"grad_norm": 0.22769631445407867, |
|
"kl": 0.003353118896484375, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": 0.0001, |
|
"reward": 0.09337181597948074, |
|
"reward_std": 0.6429625153541565, |
|
"rewards/cosine_scaled_reward": -0.18248076736927032, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 2043.5000610351562, |
|
"epoch": 0.15771428571428572, |
|
"grad_norm": 0.2516387403011322, |
|
"kl": 0.003204345703125, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9434101283550262, |
|
"reward_std": 0.8629068434238434, |
|
"rewards/cosine_scaled_reward": 0.13837172836065292, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 2047.5625, |
|
"epoch": 0.15885714285714286, |
|
"grad_norm": 0.2453654259443283, |
|
"kl": 0.0018482208251953125, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7086100317537785, |
|
"reward_std": 0.5908297449350357, |
|
"rewards/cosine_scaled_reward": 0.00013833213597536087, |
|
"rewards/format_reward": 0.7083333507180214, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 2375.7084350585938, |
|
"epoch": 0.16, |
|
"grad_norm": 0.1979781836271286, |
|
"kl": 0.0024309158325195312, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.0001, |
|
"reward": 0.619663898833096, |
|
"reward_std": 0.41604873538017273, |
|
"rewards/cosine_scaled_reward": 0.007748600095510483, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 2381.7708740234375, |
|
"epoch": 0.16114285714285714, |
|
"grad_norm": 0.23859204351902008, |
|
"kl": 0.003826141357421875, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": 0.0002, |
|
"reward": 0.3829444032162428, |
|
"reward_std": 0.5856426432728767, |
|
"rewards/cosine_scaled_reward": -0.05852780118584633, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 2265.4584045410156, |
|
"epoch": 0.16228571428571428, |
|
"grad_norm": 0.26838216185569763, |
|
"kl": 0.0041828155517578125, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.0002, |
|
"reward": 0.459966566413641, |
|
"reward_std": 0.5846913754940033, |
|
"rewards/cosine_scaled_reward": -0.030433382838964462, |
|
"rewards/format_reward": 0.5208333358168602, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 2539.479248046875, |
|
"epoch": 0.16342857142857142, |
|
"grad_norm": 0.18913578987121582, |
|
"kl": 0.0029649734497070312, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.0001, |
|
"reward": 0.42852520011365414, |
|
"reward_std": 0.6579816788434982, |
|
"rewards/cosine_scaled_reward": -0.05657072924077511, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 2627.8125610351562, |
|
"epoch": 0.16457142857142856, |
|
"grad_norm": 0.1790352761745453, |
|
"kl": 0.0033435821533203125, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7180662602186203, |
|
"reward_std": 0.9851722121238708, |
|
"rewards/cosine_scaled_reward": 0.05694979056715965, |
|
"rewards/format_reward": 0.6041666939854622, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 2526.541778564453, |
|
"epoch": 0.1657142857142857, |
|
"grad_norm": 0.22108972072601318, |
|
"kl": 0.0050811767578125, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.0002, |
|
"reward": 0.40608268324285746, |
|
"reward_std": 0.8329223841428757, |
|
"rewards/cosine_scaled_reward": -0.0573753397911787, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 2217.0625610351562, |
|
"epoch": 0.16685714285714287, |
|
"grad_norm": 0.37226402759552, |
|
"kl": 0.00292205810546875, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": 0.0001, |
|
"reward": 0.29790709912776947, |
|
"reward_std": 0.6913768872618675, |
|
"rewards/cosine_scaled_reward": -0.19479646161198616, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 2100.500030517578, |
|
"epoch": 0.168, |
|
"grad_norm": 0.1791330724954605, |
|
"kl": 0.0022411346435546875, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6166809126734734, |
|
"reward_std": 0.7499666661024094, |
|
"rewards/cosine_scaled_reward": -0.014576207846403122, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 1953.1667022705078, |
|
"epoch": 0.16914285714285715, |
|
"grad_norm": 0.26837047934532166, |
|
"kl": 0.0032806396484375, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.0001, |
|
"reward": 0.22773092985153198, |
|
"reward_std": 0.5684618726372719, |
|
"rewards/cosine_scaled_reward": -0.21946788486093283, |
|
"rewards/format_reward": 0.6666666679084301, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 2537.8958740234375, |
|
"epoch": 0.1702857142857143, |
|
"grad_norm": 0.19118516147136688, |
|
"kl": 0.0032138824462890625, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9459018707275391, |
|
"reward_std": 0.6409400217235088, |
|
"rewards/cosine_scaled_reward": 0.18128425255417824, |
|
"rewards/format_reward": 0.5833333488553762, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 2327.9375610351562, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.22041891515254974, |
|
"kl": 0.004886627197265625, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0002, |
|
"reward": 0.2033998966217041, |
|
"reward_std": 0.6696746721863747, |
|
"rewards/cosine_scaled_reward": -0.1587167321704328, |
|
"rewards/format_reward": 0.5208333488553762, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 2248.9375610351562, |
|
"epoch": 0.17257142857142857, |
|
"grad_norm": 0.20176808536052704, |
|
"kl": 0.0022144317626953125, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2744840234518051, |
|
"reward_std": 0.46313488483428955, |
|
"rewards/cosine_scaled_reward": -0.11275799572467804, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 2933.0833740234375, |
|
"epoch": 0.1737142857142857, |
|
"grad_norm": 0.26598456501960754, |
|
"kl": 0.005481719970703125, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.0002, |
|
"reward": 0.04549443535506725, |
|
"reward_std": 0.48727741837501526, |
|
"rewards/cosine_scaled_reward": -0.13350277952849865, |
|
"rewards/format_reward": 0.3125000074505806, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 2075.729278564453, |
|
"epoch": 0.17485714285714285, |
|
"grad_norm": 0.19798634946346283, |
|
"kl": 0.0023260116577148438, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9074295610189438, |
|
"reward_std": 0.5949664637446404, |
|
"rewards/cosine_scaled_reward": 0.09954808466136456, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 2754.791748046875, |
|
"epoch": 0.176, |
|
"grad_norm": 0.19806884229183197, |
|
"kl": 0.0027751922607421875, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6114984676241875, |
|
"reward_std": 0.6458263993263245, |
|
"rewards/cosine_scaled_reward": 0.08699923381209373, |
|
"rewards/format_reward": 0.4375000149011612, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 3066.354248046875, |
|
"epoch": 0.17714285714285713, |
|
"grad_norm": 0.29352447390556335, |
|
"kl": 0.00366973876953125, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.0001, |
|
"reward": 0.017834719270467758, |
|
"reward_std": 0.8173775672912598, |
|
"rewards/cosine_scaled_reward": -0.14733264222741127, |
|
"rewards/format_reward": 0.31250000558793545, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 2754.4375610351562, |
|
"epoch": 0.1782857142857143, |
|
"grad_norm": 0.19924919307231903, |
|
"kl": 0.005084991455078125, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": 0.0002, |
|
"reward": 0.06147514842450619, |
|
"reward_std": 0.5927468463778496, |
|
"rewards/cosine_scaled_reward": -0.14634575322270393, |
|
"rewards/format_reward": 0.3541666828095913, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 2206.250030517578, |
|
"epoch": 0.17942857142857144, |
|
"grad_norm": 0.23515692353248596, |
|
"kl": 0.0025005340576171875, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9370372518897057, |
|
"reward_std": 0.8283505141735077, |
|
"rewards/cosine_scaled_reward": 0.1456019375473261, |
|
"rewards/format_reward": 0.6458333507180214, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 2862.7708740234375, |
|
"epoch": 0.18057142857142858, |
|
"grad_norm": 0.19533918797969818, |
|
"kl": 0.004573822021484375, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.0002, |
|
"reward": -0.053055196069180965, |
|
"reward_std": 0.6198497340083122, |
|
"rewards/cosine_scaled_reward": -0.23486093431711197, |
|
"rewards/format_reward": 0.41666667349636555, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 1894.8542175292969, |
|
"epoch": 0.18171428571428572, |
|
"grad_norm": 0.22211147844791412, |
|
"kl": 0.002826690673828125, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2009564340114594, |
|
"reward_std": 0.7813936918973923, |
|
"rewards/cosine_scaled_reward": 0.1942282197996974, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 1964.9792175292969, |
|
"epoch": 0.18285714285714286, |
|
"grad_norm": 0.21944580972194672, |
|
"kl": 0.003116607666015625, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9964812844991684, |
|
"reward_std": 0.7849611788988113, |
|
"rewards/cosine_scaled_reward": 0.10240732878446579, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 2853.4583740234375, |
|
"epoch": 0.184, |
|
"grad_norm": 0.1943131983280182, |
|
"kl": 0.00357818603515625, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": 0.0001, |
|
"reward": 0.10683573782444, |
|
"reward_std": 0.6206659823656082, |
|
"rewards/cosine_scaled_reward": -0.1549154706299305, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 2725.604248046875, |
|
"epoch": 0.18514285714285714, |
|
"grad_norm": 0.18736310303211212, |
|
"kl": 0.00328826904296875, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.0001, |
|
"reward": 0.16440774500370026, |
|
"reward_std": 0.7531605362892151, |
|
"rewards/cosine_scaled_reward": -0.1782128056511283, |
|
"rewards/format_reward": 0.5208333507180214, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 2378.604248046875, |
|
"epoch": 0.18628571428571428, |
|
"grad_norm": 0.27985262870788574, |
|
"kl": 0.00499725341796875, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": 0.0002, |
|
"reward": 0.3438632491452154, |
|
"reward_std": 0.8545256406068802, |
|
"rewards/cosine_scaled_reward": -0.06765171512961388, |
|
"rewards/format_reward": 0.479166679084301, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 2041.791748046875, |
|
"epoch": 0.18742857142857142, |
|
"grad_norm": 0.2726307511329651, |
|
"kl": 0.00522613525390625, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1412298008799553, |
|
"reward_std": 0.45817675441503525, |
|
"rewards/cosine_scaled_reward": -0.18980177072808146, |
|
"rewards/format_reward": 0.520833333954215, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 2056.8958740234375, |
|
"epoch": 0.18857142857142858, |
|
"grad_norm": 0.24950121343135834, |
|
"kl": 0.0033512115478515625, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0001, |
|
"reward": 0.4896044433116913, |
|
"reward_std": 0.6808345168828964, |
|
"rewards/cosine_scaled_reward": -0.057281110901385546, |
|
"rewards/format_reward": 0.6041666828095913, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1701.5625, |
|
"epoch": 0.18971428571428572, |
|
"grad_norm": 0.20629195868968964, |
|
"kl": 0.0029201507568359375, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0001, |
|
"reward": 0.4639076357707381, |
|
"reward_std": 0.4746507927775383, |
|
"rewards/cosine_scaled_reward": -0.13262954354286194, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 2674.4375610351562, |
|
"epoch": 0.19085714285714286, |
|
"grad_norm": 0.19312238693237305, |
|
"kl": 0.004119873046875, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": 0.0002, |
|
"reward": 0.26991652697324753, |
|
"reward_std": 0.8362310528755188, |
|
"rewards/cosine_scaled_reward": -0.11504174256697297, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 1723.5209045410156, |
|
"epoch": 0.192, |
|
"grad_norm": 0.19439440965652466, |
|
"kl": 0.002704620361328125, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7088751941919327, |
|
"reward_std": 0.7652025148272514, |
|
"rewards/cosine_scaled_reward": -0.051812431775033474, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 2231.979248046875, |
|
"epoch": 0.19314285714285714, |
|
"grad_norm": 0.2904442250728607, |
|
"kl": 0.004784584045410156, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9261031150817871, |
|
"reward_std": 0.9859992563724518, |
|
"rewards/cosine_scaled_reward": 0.12971824035048485, |
|
"rewards/format_reward": 0.6666666939854622, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 2114.0208740234375, |
|
"epoch": 0.19428571428571428, |
|
"grad_norm": 0.19766280055046082, |
|
"kl": 0.003971099853515625, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0002, |
|
"reward": 0.3019937239587307, |
|
"reward_std": 0.6615323200821877, |
|
"rewards/cosine_scaled_reward": -0.1510864682495594, |
|
"rewards/format_reward": 0.6041666865348816, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1614.7708435058594, |
|
"epoch": 0.19542857142857142, |
|
"grad_norm": 0.23038722574710846, |
|
"kl": 0.0032196044921875, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5918858665972948, |
|
"reward_std": 0.47077811881899834, |
|
"rewards/cosine_scaled_reward": -0.09989039599895477, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1724.5833740234375, |
|
"epoch": 0.19657142857142856, |
|
"grad_norm": 0.2515551447868347, |
|
"kl": 0.00432586669921875, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8076295026112348, |
|
"reward_std": 0.5722271054983139, |
|
"rewards/cosine_scaled_reward": 0.04964808002114296, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 2433.4375610351562, |
|
"epoch": 0.1977142857142857, |
|
"grad_norm": 0.3010346591472626, |
|
"kl": 0.004909515380859375, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.0002, |
|
"reward": 0.34551432851003483, |
|
"reward_std": 0.7096427381038666, |
|
"rewards/cosine_scaled_reward": -0.08765951948589645, |
|
"rewards/format_reward": 0.5208333488553762, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 2419.979217529297, |
|
"epoch": 0.19885714285714284, |
|
"grad_norm": 0.19969363510608673, |
|
"kl": 0.005977630615234375, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0002, |
|
"reward": 0.27888998575508595, |
|
"reward_std": 0.5231342613697052, |
|
"rewards/cosine_scaled_reward": -0.1001383513212204, |
|
"rewards/format_reward": 0.4791666716337204, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 2777.1250610351562, |
|
"epoch": 0.2, |
|
"grad_norm": 0.17995108664035797, |
|
"kl": 0.0071868896484375, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6402685008943081, |
|
"reward_std": 0.7186409756541252, |
|
"rewards/cosine_scaled_reward": 0.0909675620496273, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 2093.25, |
|
"epoch": 0.20114285714285715, |
|
"grad_norm": 0.20400448143482208, |
|
"kl": 0.00519561767578125, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5985848978161812, |
|
"reward_std": 0.7769260108470917, |
|
"rewards/cosine_scaled_reward": -0.03404088690876961, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 2018.1875915527344, |
|
"epoch": 0.2022857142857143, |
|
"grad_norm": 0.21771669387817383, |
|
"kl": 0.004276275634765625, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5329161509871483, |
|
"reward_std": 0.5947398841381073, |
|
"rewards/cosine_scaled_reward": -0.0772919338196516, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 1553.2083892822266, |
|
"epoch": 0.20342857142857143, |
|
"grad_norm": 0.2806382477283478, |
|
"kl": 0.00424957275390625, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9667995385825634, |
|
"reward_std": 0.4322159215807915, |
|
"rewards/cosine_scaled_reward": 0.09798309206962585, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 2260.9584350585938, |
|
"epoch": 0.20457142857142857, |
|
"grad_norm": 0.19229480624198914, |
|
"kl": 0.00655364990234375, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6450915709137917, |
|
"reward_std": 0.6193302199244499, |
|
"rewards/cosine_scaled_reward": 0.030879119411110878, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 2204.104248046875, |
|
"epoch": 0.2057142857142857, |
|
"grad_norm": 0.34143543243408203, |
|
"kl": 0.00551605224609375, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0002, |
|
"reward": 0.49652543663978577, |
|
"reward_std": 0.9920015186071396, |
|
"rewards/cosine_scaled_reward": -0.06423728261142969, |
|
"rewards/format_reward": 0.6250000111758709, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 2376.3333740234375, |
|
"epoch": 0.20685714285714285, |
|
"grad_norm": 0.1844940185546875, |
|
"kl": 0.002460479736328125, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7340436186641455, |
|
"reward_std": 0.7672436386346817, |
|
"rewards/cosine_scaled_reward": 0.033688463270664215, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1781.0208435058594, |
|
"epoch": 0.208, |
|
"grad_norm": 0.27145451307296753, |
|
"kl": 0.005458831787109375, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6259329319000244, |
|
"reward_std": 0.7968147397041321, |
|
"rewards/cosine_scaled_reward": -0.062033540569245815, |
|
"rewards/format_reward": 0.7500000111758709, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 2343.8959045410156, |
|
"epoch": 0.20914285714285713, |
|
"grad_norm": 0.2297639399766922, |
|
"kl": 0.00566864013671875, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.0002, |
|
"reward": 0.11410272493958473, |
|
"reward_std": 0.5572097525000572, |
|
"rewards/cosine_scaled_reward": -0.2658653110265732, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 1841.2292175292969, |
|
"epoch": 0.2102857142857143, |
|
"grad_norm": 0.2628481388092041, |
|
"kl": 0.003875732421875, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.0002, |
|
"reward": 1.0498279109597206, |
|
"reward_std": 0.812163732945919, |
|
"rewards/cosine_scaled_reward": 0.1394973020069301, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 2690.666748046875, |
|
"epoch": 0.21142857142857144, |
|
"grad_norm": 0.2707008719444275, |
|
"kl": 0.006504058837890625, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0003, |
|
"reward": 0.12935106456279755, |
|
"reward_std": 0.6062737256288528, |
|
"rewards/cosine_scaled_reward": -0.15407447703182697, |
|
"rewards/format_reward": 0.4375000149011612, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 2648.7083740234375, |
|
"epoch": 0.21257142857142858, |
|
"grad_norm": 0.277004599571228, |
|
"kl": 0.00493621826171875, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0002, |
|
"reward": 0.3933283071964979, |
|
"reward_std": 0.6029615625739098, |
|
"rewards/cosine_scaled_reward": -0.07416917383670807, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 2165.416778564453, |
|
"epoch": 0.21371428571428572, |
|
"grad_norm": 0.2298295795917511, |
|
"kl": 0.005840301513671875, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7806095313280821, |
|
"reward_std": 0.7954358160495758, |
|
"rewards/cosine_scaled_reward": 0.025721419602632523, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1475.4167022705078, |
|
"epoch": 0.21485714285714286, |
|
"grad_norm": 0.24691948294639587, |
|
"kl": 0.005809783935546875, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.0002, |
|
"reward": 1.0945345759391785, |
|
"reward_std": 0.8786479085683823, |
|
"rewards/cosine_scaled_reward": 0.10976729169487953, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1539.8333740234375, |
|
"epoch": 0.216, |
|
"grad_norm": 0.27775290608406067, |
|
"kl": 0.0064697265625, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6156105473637581, |
|
"reward_std": 0.7454669773578644, |
|
"rewards/cosine_scaled_reward": -0.119278060272336, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 2440.979278564453, |
|
"epoch": 0.21714285714285714, |
|
"grad_norm": 0.22604604065418243, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4239200847223401, |
|
"reward_std": 0.8845669329166412, |
|
"rewards/cosine_scaled_reward": -0.038039978593587875, |
|
"rewards/format_reward": 0.500000013038516, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1904.729248046875, |
|
"epoch": 0.21828571428571428, |
|
"grad_norm": 0.2662159204483032, |
|
"kl": 0.00586700439453125, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4985937252640724, |
|
"reward_std": 0.7315638810396194, |
|
"rewards/cosine_scaled_reward": -0.09445315971970558, |
|
"rewards/format_reward": 0.6875000074505806, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 2417.1875610351562, |
|
"epoch": 0.21942857142857142, |
|
"grad_norm": 0.27427810430526733, |
|
"kl": 0.009246826171875, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0004, |
|
"reward": 0.3124541025608778, |
|
"reward_std": 0.6425688564777374, |
|
"rewards/cosine_scaled_reward": -0.12502295151352882, |
|
"rewards/format_reward": 0.5625000074505806, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1865.0208740234375, |
|
"epoch": 0.22057142857142858, |
|
"grad_norm": 0.2562018930912018, |
|
"kl": 0.01080322265625, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0004, |
|
"reward": 0.5529625415802002, |
|
"reward_std": 0.5897716134786606, |
|
"rewards/cosine_scaled_reward": -0.0985187292098999, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1062.4375305175781, |
|
"epoch": 0.22171428571428572, |
|
"grad_norm": 0.22855594754219055, |
|
"kl": 0.0037994384765625, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": 0.0002, |
|
"reward": 1.7262530326843262, |
|
"reward_std": 0.826399639248848, |
|
"rewards/cosine_scaled_reward": 0.3631264716386795, |
|
"rewards/format_reward": 1.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1257.5625305175781, |
|
"epoch": 0.22285714285714286, |
|
"grad_norm": 0.26371893286705017, |
|
"kl": 0.00643157958984375, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.0003, |
|
"reward": 1.151860922574997, |
|
"reward_std": 0.6702793166041374, |
|
"rewards/cosine_scaled_reward": 0.12801377475261688, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 2729.5000610351562, |
|
"epoch": 0.224, |
|
"grad_norm": 0.2403058111667633, |
|
"kl": 0.00676727294921875, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": 0.0003, |
|
"reward": 0.26913030445575714, |
|
"reward_std": 0.6797884181141853, |
|
"rewards/cosine_scaled_reward": -0.08418486639857292, |
|
"rewards/format_reward": 0.4375, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 2123.791717529297, |
|
"epoch": 0.22514285714285714, |
|
"grad_norm": 0.22864989936351776, |
|
"kl": 0.006134033203125, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.0002, |
|
"reward": 0.06347193196415901, |
|
"reward_std": 0.40899810940027237, |
|
"rewards/cosine_scaled_reward": -0.24951404333114624, |
|
"rewards/format_reward": 0.5625000149011612, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 2316.791748046875, |
|
"epoch": 0.22628571428571428, |
|
"grad_norm": 0.2166266292333603, |
|
"kl": 0.00603485107421875, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4236091636121273, |
|
"reward_std": 0.794644683599472, |
|
"rewards/cosine_scaled_reward": -0.1215287372469902, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 2175.5416870117188, |
|
"epoch": 0.22742857142857142, |
|
"grad_norm": 0.2332044243812561, |
|
"kl": 0.005603790283203125, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.0002, |
|
"reward": 1.2629163265228271, |
|
"reward_std": 0.7567542046308517, |
|
"rewards/cosine_scaled_reward": 0.2252081297338009, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1545.8125305175781, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.3451651632785797, |
|
"kl": 0.006610870361328125, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0003, |
|
"reward": 1.1884014122188091, |
|
"reward_std": 0.868816927075386, |
|
"rewards/cosine_scaled_reward": 0.1983673730865121, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1524.0625610351562, |
|
"epoch": 0.2297142857142857, |
|
"grad_norm": 0.21861064434051514, |
|
"kl": 0.0051116943359375, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": 0.0002, |
|
"reward": 0.675473814830184, |
|
"reward_std": 0.6859661787748337, |
|
"rewards/cosine_scaled_reward": -0.07892975211143494, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1321.3959045410156, |
|
"epoch": 0.23085714285714284, |
|
"grad_norm": 0.24629908800125122, |
|
"kl": 0.007568359375, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.0003, |
|
"reward": 0.9415311962366104, |
|
"reward_std": 0.7775374501943588, |
|
"rewards/cosine_scaled_reward": 0.02284892648458481, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1379.7709045410156, |
|
"epoch": 0.232, |
|
"grad_norm": 0.27627113461494446, |
|
"kl": 0.00753021240234375, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.0003, |
|
"reward": 1.1194992661476135, |
|
"reward_std": 0.7730197608470917, |
|
"rewards/cosine_scaled_reward": 0.09099959582090378, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1120.1667022705078, |
|
"epoch": 0.23314285714285715, |
|
"grad_norm": 0.26729604601860046, |
|
"kl": 0.00640869140625, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.0003, |
|
"reward": 1.2363095879554749, |
|
"reward_std": 0.8477204591035843, |
|
"rewards/cosine_scaled_reward": 0.15982142463326454, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1971.2291870117188, |
|
"epoch": 0.2342857142857143, |
|
"grad_norm": 0.2195984125137329, |
|
"kl": 0.00698089599609375, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.0003, |
|
"reward": 0.8974205702543259, |
|
"reward_std": 0.8895229697227478, |
|
"rewards/cosine_scaled_reward": 0.052876945585012436, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 2061.3334350585938, |
|
"epoch": 0.23542857142857143, |
|
"grad_norm": 0.268889844417572, |
|
"kl": 0.007232666015625, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": 0.0003, |
|
"reward": 0.2433365173637867, |
|
"reward_std": 0.5611164793372154, |
|
"rewards/cosine_scaled_reward": -0.2324984148144722, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1904.5000305175781, |
|
"epoch": 0.23657142857142857, |
|
"grad_norm": 0.19144296646118164, |
|
"kl": 0.00501251220703125, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9468748420476913, |
|
"reward_std": 0.6613385528326035, |
|
"rewards/cosine_scaled_reward": 0.05677075684070587, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1996.8125610351562, |
|
"epoch": 0.2377142857142857, |
|
"grad_norm": 0.29941245913505554, |
|
"kl": 0.00930023193359375, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.0004, |
|
"reward": 0.29106441140174866, |
|
"reward_std": 0.6116437911987305, |
|
"rewards/cosine_scaled_reward": -0.17738447710871696, |
|
"rewards/format_reward": 0.6458333544433117, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1285.9166870117188, |
|
"epoch": 0.23885714285714285, |
|
"grad_norm": 0.3566973805427551, |
|
"kl": 0.00695037841796875, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5402148813009262, |
|
"reward_std": 0.7111145555973053, |
|
"rewards/cosine_scaled_reward": -0.14655922167003155, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 1275.5000305175781, |
|
"epoch": 0.24, |
|
"grad_norm": 0.2605917155742645, |
|
"kl": 0.006221771240234375, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3301078528165817, |
|
"reward_std": 0.8438884019851685, |
|
"rewards/cosine_scaled_reward": 0.18588725943118334, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 2328.6251220703125, |
|
"epoch": 0.24114285714285713, |
|
"grad_norm": 0.1887713074684143, |
|
"kl": 0.0092010498046875, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.0004, |
|
"reward": 0.6505604535341263, |
|
"reward_std": 0.5875601321458817, |
|
"rewards/cosine_scaled_reward": -0.008053132332861423, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 1282.166732788086, |
|
"epoch": 0.2422857142857143, |
|
"grad_norm": 0.2532815635204315, |
|
"kl": 0.0076446533203125, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": 0.0003, |
|
"reward": 1.1198171079158783, |
|
"reward_std": 0.8018105626106262, |
|
"rewards/cosine_scaled_reward": 0.11199186649173498, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1855.0834045410156, |
|
"epoch": 0.24342857142857144, |
|
"grad_norm": 0.2252466082572937, |
|
"kl": 0.0081024169921875, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": 0.0003, |
|
"reward": 0.2633536756038666, |
|
"reward_std": 0.42707760632038116, |
|
"rewards/cosine_scaled_reward": -0.2224898338317871, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1971.8959045410156, |
|
"epoch": 0.24457142857142858, |
|
"grad_norm": 0.1935078501701355, |
|
"kl": 0.004627227783203125, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7197396508418024, |
|
"reward_std": 0.6319501101970673, |
|
"rewards/cosine_scaled_reward": -0.056796859949827194, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1413.3541870117188, |
|
"epoch": 0.24571428571428572, |
|
"grad_norm": 0.2069859504699707, |
|
"kl": 0.00742340087890625, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0003, |
|
"reward": 1.0066201090812683, |
|
"reward_std": 0.9675450921058655, |
|
"rewards/cosine_scaled_reward": 0.03456003498286009, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 2199.2083435058594, |
|
"epoch": 0.24685714285714286, |
|
"grad_norm": 0.2034756988286972, |
|
"kl": 0.00646209716796875, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.0003, |
|
"reward": 0.47810695320367813, |
|
"reward_std": 0.7864377945661545, |
|
"rewards/cosine_scaled_reward": -0.12552986666560173, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1527.8958740234375, |
|
"epoch": 0.248, |
|
"grad_norm": 0.2671460807323456, |
|
"kl": 0.006622314453125, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7772237807512283, |
|
"reward_std": 0.6489354968070984, |
|
"rewards/cosine_scaled_reward": -0.038471437990665436, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 1419.3542175292969, |
|
"epoch": 0.24914285714285714, |
|
"grad_norm": 0.2513315677642822, |
|
"kl": 0.00665283203125, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.0003, |
|
"reward": 1.1047292775474489, |
|
"reward_std": 0.6393595859408379, |
|
"rewards/cosine_scaled_reward": 0.1252813059836626, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 1275.3125305175781, |
|
"epoch": 0.2502857142857143, |
|
"grad_norm": 0.2648639380931854, |
|
"kl": 0.00737762451171875, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6924525499343872, |
|
"reward_std": 0.6107815653085709, |
|
"rewards/cosine_scaled_reward": -0.13294040283653885, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 1247.9791717529297, |
|
"epoch": 0.25142857142857145, |
|
"grad_norm": 0.22622907161712646, |
|
"kl": 0.006031036376953125, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9419594034552574, |
|
"reward_std": 0.675844706594944, |
|
"rewards/cosine_scaled_reward": 0.023063029162585735, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 2002.0833740234375, |
|
"epoch": 0.25257142857142856, |
|
"grad_norm": 0.218685120344162, |
|
"kl": 0.0069580078125, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6575891096144915, |
|
"reward_std": 0.6497488841414452, |
|
"rewards/cosine_scaled_reward": 0.026711229234933853, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 2015.6250915527344, |
|
"epoch": 0.2537142857142857, |
|
"grad_norm": 0.20831483602523804, |
|
"kl": 0.0071258544921875, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.0003, |
|
"reward": 0.14067217335104942, |
|
"reward_std": 0.48574624210596085, |
|
"rewards/cosine_scaled_reward": -0.28383059799671173, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1517.1875305175781, |
|
"epoch": 0.25485714285714284, |
|
"grad_norm": 0.24125142395496368, |
|
"kl": 0.005817413330078125, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6348569616675377, |
|
"reward_std": 0.5405807122588158, |
|
"rewards/cosine_scaled_reward": -0.09923820104449987, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 1914.0000305175781, |
|
"epoch": 0.256, |
|
"grad_norm": 0.2622263431549072, |
|
"kl": 0.00728607177734375, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5525996647775173, |
|
"reward_std": 0.5521951243281364, |
|
"rewards/cosine_scaled_reward": -0.04661682341247797, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1250.0000305175781, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.2181866317987442, |
|
"kl": 0.00460052490234375, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0002, |
|
"reward": 1.2948355674743652, |
|
"reward_std": 0.6228364408016205, |
|
"rewards/cosine_scaled_reward": 0.16825110744684935, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1461.6666870117188, |
|
"epoch": 0.2582857142857143, |
|
"grad_norm": 0.36098772287368774, |
|
"kl": 0.00760650634765625, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0003, |
|
"reward": 0.3821214698255062, |
|
"reward_std": 0.5764878466725349, |
|
"rewards/cosine_scaled_reward": -0.20477261394262314, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1328.5416870117188, |
|
"epoch": 0.25942857142857145, |
|
"grad_norm": 0.27139514684677124, |
|
"kl": 0.00934600830078125, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.0004, |
|
"reward": 0.7815765663981438, |
|
"reward_std": 0.7309335023164749, |
|
"rewards/cosine_scaled_reward": -0.03629505028948188, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1626.4167175292969, |
|
"epoch": 0.26057142857142856, |
|
"grad_norm": 0.23888561129570007, |
|
"kl": 0.00714874267578125, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6099164858460426, |
|
"reward_std": 0.7778853923082352, |
|
"rewards/cosine_scaled_reward": -0.09087510220706463, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1440.6875457763672, |
|
"epoch": 0.26171428571428573, |
|
"grad_norm": 0.2864842116832733, |
|
"kl": 0.0119171142578125, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": 0.0005, |
|
"reward": 0.7616857700049877, |
|
"reward_std": 0.7498719990253448, |
|
"rewards/cosine_scaled_reward": -0.025407111272215843, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1527.8958740234375, |
|
"epoch": 0.26285714285714284, |
|
"grad_norm": 0.2429640144109726, |
|
"kl": 0.00768280029296875, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0003, |
|
"reward": 0.9651975035667419, |
|
"reward_std": 0.824803501367569, |
|
"rewards/cosine_scaled_reward": 0.06593206711113453, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1934.2084045410156, |
|
"epoch": 0.264, |
|
"grad_norm": 0.3963330090045929, |
|
"kl": 0.0092926025390625, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0004, |
|
"reward": 0.6242162762209773, |
|
"reward_std": 0.8598367348313332, |
|
"rewards/cosine_scaled_reward": -0.03164188005030155, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1455.8333740234375, |
|
"epoch": 0.2651428571428571, |
|
"grad_norm": 0.23361533880233765, |
|
"kl": 0.00798797607421875, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7301613166928291, |
|
"reward_std": 0.7596315294504166, |
|
"rewards/cosine_scaled_reward": -0.07241935143247247, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 2010.8333740234375, |
|
"epoch": 0.2662857142857143, |
|
"grad_norm": 0.2177191823720932, |
|
"kl": 0.0087738037109375, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.0004, |
|
"reward": 0.6955921053886414, |
|
"reward_std": 0.8746853768825531, |
|
"rewards/cosine_scaled_reward": -0.0480372947640717, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 1696.8541870117188, |
|
"epoch": 0.2674285714285714, |
|
"grad_norm": 0.2090214192867279, |
|
"kl": 0.008090972900390625, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6760512292385101, |
|
"reward_std": 0.6585969775915146, |
|
"rewards/cosine_scaled_reward": -0.0578077242244035, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1389.6667175292969, |
|
"epoch": 0.26857142857142857, |
|
"grad_norm": 0.2434709221124649, |
|
"kl": 0.006389617919921875, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6067800773307681, |
|
"reward_std": 0.41319192945957184, |
|
"rewards/cosine_scaled_reward": -0.1028599888086319, |
|
"rewards/format_reward": 0.8125, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1396.0625305175781, |
|
"epoch": 0.26971428571428574, |
|
"grad_norm": 0.23188619315624237, |
|
"kl": 0.0070953369140625, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5102774500846863, |
|
"reward_std": 0.4952257424592972, |
|
"rewards/cosine_scaled_reward": -0.21361128613352776, |
|
"rewards/format_reward": 0.9375, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1236.3958892822266, |
|
"epoch": 0.27085714285714285, |
|
"grad_norm": 0.20340608060359955, |
|
"kl": 0.00693511962890625, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": 0.0003, |
|
"reward": 0.9460461437702179, |
|
"reward_std": 0.5520051866769791, |
|
"rewards/cosine_scaled_reward": -0.01656026765704155, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 1293.0208435058594, |
|
"epoch": 0.272, |
|
"grad_norm": 0.28505584597587585, |
|
"kl": 0.00867462158203125, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6806494742631912, |
|
"reward_std": 0.7478103041648865, |
|
"rewards/cosine_scaled_reward": -0.11800861544907093, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 1973.0000915527344, |
|
"epoch": 0.27314285714285713, |
|
"grad_norm": 0.29544979333877563, |
|
"kl": 0.0121917724609375, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.0005, |
|
"reward": 0.5413463786244392, |
|
"reward_std": 0.7216374576091766, |
|
"rewards/cosine_scaled_reward": -0.10432682058308274, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1372.6250305175781, |
|
"epoch": 0.2742857142857143, |
|
"grad_norm": 0.2549286484718323, |
|
"kl": 0.00878143310546875, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0004, |
|
"reward": 1.4046210646629333, |
|
"reward_std": 0.5067310631275177, |
|
"rewards/cosine_scaled_reward": 0.2543938383460045, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1410.0417175292969, |
|
"epoch": 0.2754285714285714, |
|
"grad_norm": 0.32747364044189453, |
|
"kl": 0.009124755859375, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.0004, |
|
"reward": 0.8229547590017319, |
|
"reward_std": 0.6901429891586304, |
|
"rewards/cosine_scaled_reward": -0.026022649370133877, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1473.6875228881836, |
|
"epoch": 0.2765714285714286, |
|
"grad_norm": 0.3592205345630646, |
|
"kl": 0.012908935546875, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.0005, |
|
"reward": 0.7551293671131134, |
|
"reward_std": 0.663729339838028, |
|
"rewards/cosine_scaled_reward": -0.0599353089928627, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1187.0625610351562, |
|
"epoch": 0.2777142857142857, |
|
"grad_norm": 0.22532759606838226, |
|
"kl": 0.0076751708984375, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0003, |
|
"reward": 0.8520705178380013, |
|
"reward_std": 0.45504674315452576, |
|
"rewards/cosine_scaled_reward": -0.05313139781355858, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1377.7291870117188, |
|
"epoch": 0.27885714285714286, |
|
"grad_norm": 0.2351863533258438, |
|
"kl": 0.00748443603515625, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": 0.0003, |
|
"reward": 1.1842002123594284, |
|
"reward_std": 1.0244361460208893, |
|
"rewards/cosine_scaled_reward": 0.15460011083632708, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1255.4167175292969, |
|
"epoch": 0.28, |
|
"grad_norm": 0.34309011697769165, |
|
"kl": 0.01132965087890625, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0005, |
|
"reward": 0.8439341634511948, |
|
"reward_std": 0.6376037150621414, |
|
"rewards/cosine_scaled_reward": -0.025949583388864994, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 1144.6250305175781, |
|
"epoch": 0.28114285714285714, |
|
"grad_norm": 0.27464959025382996, |
|
"kl": 0.0088958740234375, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0004, |
|
"reward": 0.9792153835296631, |
|
"reward_std": 0.5227902606129646, |
|
"rewards/cosine_scaled_reward": 2.4352222681045532e-05, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 985.2083587646484, |
|
"epoch": 0.2822857142857143, |
|
"grad_norm": 0.3299963176250458, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": 0.0005, |
|
"reward": 1.1308997794985771, |
|
"reward_std": 0.699935294687748, |
|
"rewards/cosine_scaled_reward": 0.09669988602399826, |
|
"rewards/format_reward": 0.9375, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1719.0208740234375, |
|
"epoch": 0.2834285714285714, |
|
"grad_norm": 0.27527329325675964, |
|
"kl": 0.0107879638671875, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0004, |
|
"reward": 0.808366172015667, |
|
"reward_std": 0.725908488035202, |
|
"rewards/cosine_scaled_reward": -0.012483585625886917, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1970.2083740234375, |
|
"epoch": 0.2845714285714286, |
|
"grad_norm": 0.27165958285331726, |
|
"kl": 0.011810302734375, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.0005, |
|
"reward": 0.3851170837879181, |
|
"reward_std": 0.7313886731863022, |
|
"rewards/cosine_scaled_reward": -0.16160813719034195, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1342.3750305175781, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.2968628704547882, |
|
"kl": 0.00905609130859375, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.0004, |
|
"reward": 0.8386699110269547, |
|
"reward_std": 0.7474230378866196, |
|
"rewards/cosine_scaled_reward": -0.02858173381537199, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 1774.0208740234375, |
|
"epoch": 0.28685714285714287, |
|
"grad_norm": 0.39884844422340393, |
|
"kl": 0.0139312744140625, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": 0.0006, |
|
"reward": 1.0918036848306656, |
|
"reward_std": 1.1108526289463043, |
|
"rewards/cosine_scaled_reward": 0.1917351707816124, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1604.7500610351562, |
|
"epoch": 0.288, |
|
"grad_norm": 0.32672712206840515, |
|
"kl": 0.0103912353515625, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": 0.0004, |
|
"reward": 0.7797054275870323, |
|
"reward_std": 0.6628324761986732, |
|
"rewards/cosine_scaled_reward": -0.06848062574863434, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 1578.9167175292969, |
|
"epoch": 0.28914285714285715, |
|
"grad_norm": 0.3023277223110199, |
|
"kl": 0.01739501953125, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": 0.0007, |
|
"reward": 0.7126835100352764, |
|
"reward_std": 0.786981999874115, |
|
"rewards/cosine_scaled_reward": -0.039491571485996246, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 1142.6458435058594, |
|
"epoch": 0.29028571428571426, |
|
"grad_norm": 0.4636065363883972, |
|
"kl": 0.0141448974609375, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0006, |
|
"reward": 1.1053122580051422, |
|
"reward_std": 0.47931085526943207, |
|
"rewards/cosine_scaled_reward": 0.16723946738056839, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1286.0833435058594, |
|
"epoch": 0.2914285714285714, |
|
"grad_norm": 0.28858116269111633, |
|
"kl": 0.009735107421875, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0004, |
|
"reward": 1.1211883053183556, |
|
"reward_std": 0.4671914726495743, |
|
"rewards/cosine_scaled_reward": 0.09184413589537144, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1213.2083740234375, |
|
"epoch": 0.2925714285714286, |
|
"grad_norm": 0.32043886184692383, |
|
"kl": 0.015838623046875, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.0006, |
|
"reward": 0.823145791888237, |
|
"reward_std": 0.5544667765498161, |
|
"rewards/cosine_scaled_reward": -0.0675937756896019, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1316.8541870117188, |
|
"epoch": 0.2937142857142857, |
|
"grad_norm": 0.23760062456130981, |
|
"kl": 0.0098114013671875, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.0004, |
|
"reward": 0.49779732525348663, |
|
"reward_std": 0.46498920768499374, |
|
"rewards/cosine_scaled_reward": -0.23026802763342857, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 1751.2708435058594, |
|
"epoch": 0.2948571428571429, |
|
"grad_norm": 0.254151314496994, |
|
"kl": 0.0131378173828125, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.0005, |
|
"reward": 0.6079542301595211, |
|
"reward_std": 0.6472693011164665, |
|
"rewards/cosine_scaled_reward": -0.10227290168404579, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 978.6458435058594, |
|
"epoch": 0.296, |
|
"grad_norm": 0.29362747073173523, |
|
"kl": 0.00855255126953125, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7723531350493431, |
|
"reward_std": 0.7641957998275757, |
|
"rewards/cosine_scaled_reward": -0.10340679436922073, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 1655.1875305175781, |
|
"epoch": 0.29714285714285715, |
|
"grad_norm": 0.38929465413093567, |
|
"kl": 0.0145721435546875, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0006, |
|
"reward": 0.37256848718971014, |
|
"reward_std": 0.6071458011865616, |
|
"rewards/cosine_scaled_reward": -0.18871578108519316, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1198.8333892822266, |
|
"epoch": 0.29828571428571427, |
|
"grad_norm": 0.2716739773750305, |
|
"kl": 0.0102081298828125, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0004, |
|
"reward": 0.633615754544735, |
|
"reward_std": 0.586112380027771, |
|
"rewards/cosine_scaled_reward": -0.1519421450793743, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1484.1042175292969, |
|
"epoch": 0.29942857142857143, |
|
"grad_norm": 0.2628130316734314, |
|
"kl": 0.0106353759765625, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.0004, |
|
"reward": 0.4845110587775707, |
|
"reward_std": 0.536693274974823, |
|
"rewards/cosine_scaled_reward": -0.18482780829071999, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 1064.0833587646484, |
|
"epoch": 0.30057142857142854, |
|
"grad_norm": 0.36321043968200684, |
|
"kl": 0.0098876953125, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.0004, |
|
"reward": 0.9257199168205261, |
|
"reward_std": 0.8942077457904816, |
|
"rewards/cosine_scaled_reward": -0.016306710429489613, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1316.1042175292969, |
|
"epoch": 0.3017142857142857, |
|
"grad_norm": 0.26243484020233154, |
|
"kl": 0.00811004638671875, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6479791402816772, |
|
"reward_std": 0.6652240380644798, |
|
"rewards/cosine_scaled_reward": -0.15517710940912366, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 1909.6250915527344, |
|
"epoch": 0.3028571428571429, |
|
"grad_norm": 0.2833210229873657, |
|
"kl": 0.01206207275390625, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.0005, |
|
"reward": 0.7709198147058487, |
|
"reward_std": 0.6459413915872574, |
|
"rewards/cosine_scaled_reward": 4.323199391365051e-05, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 1645.9583740234375, |
|
"epoch": 0.304, |
|
"grad_norm": 0.3645997941493988, |
|
"kl": 0.01513671875, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.0006, |
|
"reward": 0.4407457821071148, |
|
"reward_std": 0.6104073449969292, |
|
"rewards/cosine_scaled_reward": -0.1858771131373942, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 1644.0209045410156, |
|
"epoch": 0.30514285714285716, |
|
"grad_norm": 0.27599290013313293, |
|
"kl": 0.014923095703125, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.0006, |
|
"reward": 0.7639178857207298, |
|
"reward_std": 0.5990442484617233, |
|
"rewards/cosine_scaled_reward": -0.013874400407075882, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1681.4583587646484, |
|
"epoch": 0.3062857142857143, |
|
"grad_norm": 0.2269752472639084, |
|
"kl": 0.01216888427734375, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.0005, |
|
"reward": 0.5377051346004009, |
|
"reward_std": 0.6556981913745403, |
|
"rewards/cosine_scaled_reward": -0.14781412575393915, |
|
"rewards/format_reward": 0.8333333358168602, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1047.1250457763672, |
|
"epoch": 0.30742857142857144, |
|
"grad_norm": 0.3485262095928192, |
|
"kl": 0.01123046875, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": 0.0005, |
|
"reward": 0.68592269718647, |
|
"reward_std": 0.5478300377726555, |
|
"rewards/cosine_scaled_reward": -0.13620532862842083, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1012.0208587646484, |
|
"epoch": 0.30857142857142855, |
|
"grad_norm": 0.3116324543952942, |
|
"kl": 0.007354736328125, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0003, |
|
"reward": 0.9341000914573669, |
|
"reward_std": 0.6274480298161507, |
|
"rewards/cosine_scaled_reward": -0.03294998221099377, |
|
"rewards/format_reward": 1.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1732.6459350585938, |
|
"epoch": 0.3097142857142857, |
|
"grad_norm": 0.42208629846572876, |
|
"kl": 0.016632080078125, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": 0.0007, |
|
"reward": 0.7596426885575056, |
|
"reward_std": 0.77412149310112, |
|
"rewards/cosine_scaled_reward": -0.0055953278206288815, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 2177.270965576172, |
|
"epoch": 0.31085714285714283, |
|
"grad_norm": 0.5940233469009399, |
|
"kl": 0.02764892578125, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.0011, |
|
"reward": 0.4223189577460289, |
|
"reward_std": 0.8890358135104179, |
|
"rewards/cosine_scaled_reward": -0.10134052112698555, |
|
"rewards/format_reward": 0.6250000223517418, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 1954.8333740234375, |
|
"epoch": 0.312, |
|
"grad_norm": 0.3138795495033264, |
|
"kl": 0.02874755859375, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.0012, |
|
"reward": 0.3654465600848198, |
|
"reward_std": 0.5650510713458061, |
|
"rewards/cosine_scaled_reward": -0.1506100632250309, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1267.4583587646484, |
|
"epoch": 0.31314285714285717, |
|
"grad_norm": 0.274538516998291, |
|
"kl": 0.01172637939453125, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": 0.0005, |
|
"reward": 0.837937019765377, |
|
"reward_std": 0.6332506015896797, |
|
"rewards/cosine_scaled_reward": -0.03936483711004257, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 2021.4583740234375, |
|
"epoch": 0.3142857142857143, |
|
"grad_norm": 0.30807891488075256, |
|
"kl": 0.02382659912109375, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.001, |
|
"reward": 0.9066380485892296, |
|
"reward_std": 0.9834412485361099, |
|
"rewards/cosine_scaled_reward": 0.1095690238289535, |
|
"rewards/format_reward": 0.6875000223517418, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 1135.8750305175781, |
|
"epoch": 0.31542857142857145, |
|
"grad_norm": 0.2739965319633484, |
|
"kl": 0.010345458984375, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.0004, |
|
"reward": 1.0236308723688126, |
|
"reward_std": 0.5140665993094444, |
|
"rewards/cosine_scaled_reward": 0.04306542640551925, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1247.2500305175781, |
|
"epoch": 0.31657142857142856, |
|
"grad_norm": 0.37156689167022705, |
|
"kl": 0.00988006591796875, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0004, |
|
"reward": 0.8927154019474983, |
|
"reward_std": 0.8232090175151825, |
|
"rewards/cosine_scaled_reward": -0.0015590004622936249, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 1309.3125610351562, |
|
"epoch": 0.3177142857142857, |
|
"grad_norm": 0.2857230603694916, |
|
"kl": 0.0121002197265625, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.0005, |
|
"reward": 0.942589208483696, |
|
"reward_std": 0.6875655725598335, |
|
"rewards/cosine_scaled_reward": 0.012961250729858875, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1435.4791717529297, |
|
"epoch": 0.31885714285714284, |
|
"grad_norm": 0.43713676929473877, |
|
"kl": 0.01840972900390625, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.0007, |
|
"reward": 0.37974046915769577, |
|
"reward_std": 0.5385972559452057, |
|
"rewards/cosine_scaled_reward": -0.2163797914981842, |
|
"rewards/format_reward": 0.8125, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1156.062515258789, |
|
"epoch": 0.32, |
|
"grad_norm": 0.2634413242340088, |
|
"kl": 0.0093536376953125, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0004, |
|
"reward": 0.980226680636406, |
|
"reward_std": 0.5487889721989632, |
|
"rewards/cosine_scaled_reward": 0.0005300038028508425, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1187.3125457763672, |
|
"epoch": 0.3211428571428571, |
|
"grad_norm": 0.32134756445884705, |
|
"kl": 0.008182525634765625, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.0003, |
|
"reward": 0.9637440145015717, |
|
"reward_std": 0.8387380540370941, |
|
"rewards/cosine_scaled_reward": 0.0027053444646298885, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1651.604232788086, |
|
"epoch": 0.3222857142857143, |
|
"grad_norm": 0.37761881947517395, |
|
"kl": 0.0242919921875, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.001, |
|
"reward": 0.5987202003598213, |
|
"reward_std": 0.6141614019870758, |
|
"rewards/cosine_scaled_reward": -0.10688992030918598, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1207.1250610351562, |
|
"epoch": 0.32342857142857145, |
|
"grad_norm": 0.3965184688568115, |
|
"kl": 0.0155487060546875, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.0006, |
|
"reward": 0.6064153388142586, |
|
"reward_std": 0.6052896529436111, |
|
"rewards/cosine_scaled_reward": -0.16554233682109043, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1520.5625610351562, |
|
"epoch": 0.32457142857142857, |
|
"grad_norm": 0.4802095890045166, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": 0.0006, |
|
"reward": 0.6298409104347229, |
|
"reward_std": 0.7485504075884819, |
|
"rewards/cosine_scaled_reward": -0.10174621269106865, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 1631.4166870117188, |
|
"epoch": 0.32571428571428573, |
|
"grad_norm": 0.5242775678634644, |
|
"kl": 0.0250244140625, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.001, |
|
"reward": 0.4431188479065895, |
|
"reward_std": 0.6355130672454834, |
|
"rewards/cosine_scaled_reward": -0.20552390813827515, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1278.2917175292969, |
|
"epoch": 0.32685714285714285, |
|
"grad_norm": 0.2374052256345749, |
|
"kl": 0.009552001953125, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.0004, |
|
"reward": 0.9486149102449417, |
|
"reward_std": 0.6938119828701019, |
|
"rewards/cosine_scaled_reward": -0.025692567229270935, |
|
"rewards/format_reward": 1.0, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1549.5208740234375, |
|
"epoch": 0.328, |
|
"grad_norm": 0.5496286749839783, |
|
"kl": 0.0344696044921875, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0014, |
|
"reward": 0.6046592518687248, |
|
"reward_std": 0.7527553886175156, |
|
"rewards/cosine_scaled_reward": -0.12475371174514294, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 906.6041717529297, |
|
"epoch": 0.3291428571428571, |
|
"grad_norm": 0.40935972332954407, |
|
"kl": 0.014556884765625, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.0006, |
|
"reward": 1.1515108793973923, |
|
"reward_std": 0.670855775475502, |
|
"rewards/cosine_scaled_reward": 0.07575542479753494, |
|
"rewards/format_reward": 1.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1302.3958587646484, |
|
"epoch": 0.3302857142857143, |
|
"grad_norm": 0.33956480026245117, |
|
"kl": 0.01055145263671875, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": 0.0004, |
|
"reward": 0.8325834274291992, |
|
"reward_std": 0.469268262386322, |
|
"rewards/cosine_scaled_reward": -0.0628749430179596, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1219.1667022705078, |
|
"epoch": 0.3314285714285714, |
|
"grad_norm": 0.5517088174819946, |
|
"kl": 0.0243072509765625, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.001, |
|
"reward": 1.3289316296577454, |
|
"reward_std": 0.8555012494325638, |
|
"rewards/cosine_scaled_reward": 0.1957157626748085, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 1010.8541870117188, |
|
"epoch": 0.3325714285714286, |
|
"grad_norm": 0.44879335165023804, |
|
"kl": 0.020660400390625, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.0008, |
|
"reward": 0.886590301990509, |
|
"reward_std": 0.6713649779558182, |
|
"rewards/cosine_scaled_reward": 0.00579514354467392, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 1223.687515258789, |
|
"epoch": 0.33371428571428574, |
|
"grad_norm": 0.3632715344429016, |
|
"kl": 0.0165252685546875, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": 0.0007, |
|
"reward": 0.8943772986531258, |
|
"reward_std": 0.8097474128007889, |
|
"rewards/cosine_scaled_reward": -0.0007280493155121803, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1374.0208740234375, |
|
"epoch": 0.33485714285714285, |
|
"grad_norm": 0.4008006155490875, |
|
"kl": 0.0230865478515625, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": 0.0009, |
|
"reward": 0.6955159157514572, |
|
"reward_std": 0.6655403971672058, |
|
"rewards/cosine_scaled_reward": -0.11057540401816368, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 1269.2917022705078, |
|
"epoch": 0.336, |
|
"grad_norm": 0.20706599950790405, |
|
"kl": 0.00942230224609375, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": 0.0004, |
|
"reward": 1.2973814010620117, |
|
"reward_std": 0.7491715997457504, |
|
"rewards/cosine_scaled_reward": 0.16952402517199516, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 2128.9375610351562, |
|
"epoch": 0.33714285714285713, |
|
"grad_norm": 0.3372494876384735, |
|
"kl": 0.03973388671875, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4113108851015568, |
|
"reward_std": 0.6322937309741974, |
|
"rewards/cosine_scaled_reward": -0.11726122908294201, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 938.1042022705078, |
|
"epoch": 0.3382857142857143, |
|
"grad_norm": 0.3853781521320343, |
|
"kl": 0.01088714599609375, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0004, |
|
"reward": 1.2917412221431732, |
|
"reward_std": 0.859960287809372, |
|
"rewards/cosine_scaled_reward": 0.16670391708612442, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1234.5000305175781, |
|
"epoch": 0.3394285714285714, |
|
"grad_norm": 0.32493123412132263, |
|
"kl": 0.02001953125, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0008, |
|
"reward": 0.5709987878799438, |
|
"reward_std": 0.5739526003599167, |
|
"rewards/cosine_scaled_reward": -0.1728339404799044, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 1737.8750305175781, |
|
"epoch": 0.3405714285714286, |
|
"grad_norm": 0.6288060545921326, |
|
"kl": 0.02978515625, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.0012, |
|
"reward": 0.7676291763782501, |
|
"reward_std": 0.6836749911308289, |
|
"rewards/cosine_scaled_reward": -0.0328520848415792, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1495.9583740234375, |
|
"epoch": 0.3417142857142857, |
|
"grad_norm": 0.5687032341957092, |
|
"kl": 0.02923583984375, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.0012, |
|
"reward": 0.7650880664587021, |
|
"reward_std": 0.7526437044143677, |
|
"rewards/cosine_scaled_reward": -0.0757893230766058, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1482.9167022705078, |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 0.307375431060791, |
|
"kl": 0.0286102294921875, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0011, |
|
"reward": 1.0939996913075447, |
|
"reward_std": 0.6226199977099895, |
|
"rewards/cosine_scaled_reward": 0.10949981957674026, |
|
"rewards/format_reward": 0.875, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 1668.7708740234375, |
|
"epoch": 0.344, |
|
"grad_norm": 0.3655478358268738, |
|
"kl": 0.0385589599609375, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.0015, |
|
"reward": 0.9393804222345352, |
|
"reward_std": 0.5472998023033142, |
|
"rewards/cosine_scaled_reward": 0.04260684549808502, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 1460.1667175292969, |
|
"epoch": 0.34514285714285714, |
|
"grad_norm": 0.5648999810218811, |
|
"kl": 0.0326690673828125, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0013, |
|
"reward": 0.8058248609304428, |
|
"reward_std": 0.5371805727481842, |
|
"rewards/cosine_scaled_reward": -0.034587569534778595, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 1176.3958740234375, |
|
"epoch": 0.3462857142857143, |
|
"grad_norm": 0.2893832325935364, |
|
"kl": 0.01714324951171875, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": 0.0007, |
|
"reward": 0.9501863233745098, |
|
"reward_std": 0.7773154973983765, |
|
"rewards/cosine_scaled_reward": 0.006343139801174402, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 1049.2708587646484, |
|
"epoch": 0.3474285714285714, |
|
"grad_norm": 0.3396141231060028, |
|
"kl": 0.017486572265625, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0007, |
|
"reward": 1.3814565241336823, |
|
"reward_std": 0.7759077772498131, |
|
"rewards/cosine_scaled_reward": 0.21156160347163677, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 1675.1667175292969, |
|
"epoch": 0.3485714285714286, |
|
"grad_norm": 0.32787981629371643, |
|
"kl": 0.0314483642578125, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.0013, |
|
"reward": 0.7963635921478271, |
|
"reward_std": 0.7029048502445221, |
|
"rewards/cosine_scaled_reward": -0.018484866246581078, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 1478.7916870117188, |
|
"epoch": 0.3497142857142857, |
|
"grad_norm": 0.7724860310554504, |
|
"kl": 0.04205322265625, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0017, |
|
"reward": 0.934022843837738, |
|
"reward_std": 0.6418131068348885, |
|
"rewards/cosine_scaled_reward": 0.039928069338202477, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1466.062515258789, |
|
"epoch": 0.35085714285714287, |
|
"grad_norm": 0.7901880741119385, |
|
"kl": 0.048187255859375, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.0019, |
|
"reward": 1.208159700036049, |
|
"reward_std": 0.9337977021932602, |
|
"rewards/cosine_scaled_reward": 0.16657985746860504, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1011.0, |
|
"epoch": 0.352, |
|
"grad_norm": 0.43838247656822205, |
|
"kl": 0.02608489990234375, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.001, |
|
"reward": 0.8291152790188789, |
|
"reward_std": 0.4770050719380379, |
|
"rewards/cosine_scaled_reward": -0.022942371666431427, |
|
"rewards/format_reward": 0.875, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 1304.812515258789, |
|
"epoch": 0.35314285714285715, |
|
"grad_norm": 0.34449389576911926, |
|
"kl": 0.027618408203125, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0011, |
|
"reward": 0.9580995887517929, |
|
"reward_std": 0.9287254959344864, |
|
"rewards/cosine_scaled_reward": 0.020716451108455658, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1587.1875610351562, |
|
"epoch": 0.35428571428571426, |
|
"grad_norm": 0.3894862234592438, |
|
"kl": 0.0413360595703125, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": 0.0017, |
|
"reward": 0.967779666185379, |
|
"reward_std": 0.49595198780298233, |
|
"rewards/cosine_scaled_reward": 0.04638980980962515, |
|
"rewards/format_reward": 0.875, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1005.9583587646484, |
|
"epoch": 0.3554285714285714, |
|
"grad_norm": 0.5111984610557556, |
|
"kl": 0.0352020263671875, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.0014, |
|
"reward": 0.8524645194411278, |
|
"reward_std": 0.6346431374549866, |
|
"rewards/cosine_scaled_reward": -0.042517755180597305, |
|
"rewards/format_reward": 0.9375, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1219.1042175292969, |
|
"epoch": 0.3565714285714286, |
|
"grad_norm": 0.24494509398937225, |
|
"kl": 0.0164337158203125, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": 0.0007, |
|
"reward": 1.1846633851528168, |
|
"reward_std": 0.8452874422073364, |
|
"rewards/cosine_scaled_reward": 0.09233169769868255, |
|
"rewards/format_reward": 1.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 1772.6459045410156, |
|
"epoch": 0.3577142857142857, |
|
"grad_norm": 0.32232236862182617, |
|
"kl": 0.05014801025390625, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": 0.002, |
|
"reward": 0.7452734671533108, |
|
"reward_std": 0.5130416378378868, |
|
"rewards/cosine_scaled_reward": -0.09611329552717507, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1450.9167022705078, |
|
"epoch": 0.3588571428571429, |
|
"grad_norm": 0.7095328569412231, |
|
"kl": 0.04193115234375, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": 0.0017, |
|
"reward": 0.695801317691803, |
|
"reward_std": 0.6888641864061356, |
|
"rewards/cosine_scaled_reward": -0.07918267324566841, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 890.2083435058594, |
|
"epoch": 0.36, |
|
"grad_norm": 0.8648350238800049, |
|
"kl": 0.02002716064453125, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0008, |
|
"reward": 0.8261366635560989, |
|
"reward_std": 0.5876666381955147, |
|
"rewards/cosine_scaled_reward": -0.07651501428335905, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 1495.2917175292969, |
|
"epoch": 0.36114285714285715, |
|
"grad_norm": 0.7055426239967346, |
|
"kl": 0.0330047607421875, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": 0.0013, |
|
"reward": 0.5078188478946686, |
|
"reward_std": 0.6738529801368713, |
|
"rewards/cosine_scaled_reward": -0.18359058536589146, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 2028.3959350585938, |
|
"epoch": 0.36228571428571427, |
|
"grad_norm": 0.6188425421714783, |
|
"kl": 0.1023101806640625, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": 0.0041, |
|
"reward": 0.6971778050065041, |
|
"reward_std": 0.6844265758991241, |
|
"rewards/cosine_scaled_reward": 0.015255570411682129, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 1409.0833740234375, |
|
"epoch": 0.36342857142857143, |
|
"grad_norm": 0.4974954426288605, |
|
"kl": 0.04302978515625, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.0017, |
|
"reward": 0.9140654609072953, |
|
"reward_std": 0.590464636683464, |
|
"rewards/cosine_scaled_reward": 0.040366058237850666, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 1569.2500305175781, |
|
"epoch": 0.36457142857142855, |
|
"grad_norm": 0.3885625898838043, |
|
"kl": 0.0399017333984375, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": 0.0016, |
|
"reward": 0.7486050575971603, |
|
"reward_std": 0.9239681512117386, |
|
"rewards/cosine_scaled_reward": -0.08403081598225981, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 1059.1875305175781, |
|
"epoch": 0.3657142857142857, |
|
"grad_norm": 0.5842506289482117, |
|
"kl": 0.0222625732421875, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0009, |
|
"reward": 0.872907280921936, |
|
"reward_std": 0.7717489525675774, |
|
"rewards/cosine_scaled_reward": -0.0635463809594512, |
|
"rewards/format_reward": 1.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1675.8334045410156, |
|
"epoch": 0.3668571428571429, |
|
"grad_norm": 1.7648111581802368, |
|
"kl": 0.10723876953125, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0043, |
|
"reward": 0.6553980484604836, |
|
"reward_std": 0.6528958007693291, |
|
"rewards/cosine_scaled_reward": -0.06813432276248932, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 1275.2708587646484, |
|
"epoch": 0.368, |
|
"grad_norm": 0.6659455895423889, |
|
"kl": 0.0338134765625, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0013, |
|
"reward": 1.113847702741623, |
|
"reward_std": 0.7288860827684402, |
|
"rewards/cosine_scaled_reward": 0.14025717787444592, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 1612.9166717529297, |
|
"epoch": 0.36914285714285716, |
|
"grad_norm": 0.6660499572753906, |
|
"kl": 0.0994873046875, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.004, |
|
"reward": 1.0578741058707237, |
|
"reward_std": 0.49694500118494034, |
|
"rewards/cosine_scaled_reward": 0.09143703989684582, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 1938.4584045410156, |
|
"epoch": 0.3702857142857143, |
|
"grad_norm": 1.3754856586456299, |
|
"kl": 0.1189422607421875, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": 0.0047, |
|
"reward": 0.27103549893945456, |
|
"reward_std": 0.5563001856207848, |
|
"rewards/cosine_scaled_reward": -0.23948227241635323, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 953.395881652832, |
|
"epoch": 0.37142857142857144, |
|
"grad_norm": 0.36472955346107483, |
|
"kl": 0.0133514404296875, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0005, |
|
"reward": 1.0393343269824982, |
|
"reward_std": 0.7779577225446701, |
|
"rewards/cosine_scaled_reward": 0.01966716069728136, |
|
"rewards/format_reward": 1.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1773.9167175292969, |
|
"epoch": 0.37257142857142855, |
|
"grad_norm": 1.3614015579223633, |
|
"kl": 0.09857177734375, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.0039, |
|
"reward": 0.4826292358338833, |
|
"reward_std": 0.624052882194519, |
|
"rewards/cosine_scaled_reward": -0.1753520662896335, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 1750.354232788086, |
|
"epoch": 0.3737142857142857, |
|
"grad_norm": 1.2214010953903198, |
|
"kl": 0.092681884765625, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.0037, |
|
"reward": 0.6829137187451124, |
|
"reward_std": 0.5776621401309967, |
|
"rewards/cosine_scaled_reward": -0.05437648296356201, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1816.6250610351562, |
|
"epoch": 0.37485714285714283, |
|
"grad_norm": 0.6874251961708069, |
|
"kl": 0.08489990234375, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0034, |
|
"reward": 0.796313688158989, |
|
"reward_std": 1.0243088752031326, |
|
"rewards/cosine_scaled_reward": -0.008093174546957016, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 2015.5000305175781, |
|
"epoch": 0.376, |
|
"grad_norm": 1.2615182399749756, |
|
"kl": 0.15081787109375, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.006, |
|
"reward": 0.6527662584558129, |
|
"reward_std": 0.7077807486057281, |
|
"rewards/cosine_scaled_reward": -0.09028353914618492, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 2259.291717529297, |
|
"epoch": 0.37714285714285717, |
|
"grad_norm": 1.1345741748809814, |
|
"kl": 0.1787109375, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0071, |
|
"reward": 0.5901373848319054, |
|
"reward_std": 0.9298846423625946, |
|
"rewards/cosine_scaled_reward": -0.03826466016471386, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 2008.9584045410156, |
|
"epoch": 0.3782857142857143, |
|
"grad_norm": 1.42428719997406, |
|
"kl": 0.228515625, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.0092, |
|
"reward": 0.315967773552984, |
|
"reward_std": 0.7076915055513382, |
|
"rewards/cosine_scaled_reward": -0.19618277810513973, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1344.7917175292969, |
|
"epoch": 0.37942857142857145, |
|
"grad_norm": 1.2130067348480225, |
|
"kl": 0.17629241943359375, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": 0.007, |
|
"reward": 1.3930619359016418, |
|
"reward_std": 0.7922802865505219, |
|
"rewards/cosine_scaled_reward": 0.26944761723279953, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1645.3750305175781, |
|
"epoch": 0.38057142857142856, |
|
"grad_norm": 1.1035624742507935, |
|
"kl": 0.212158203125, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.0085, |
|
"reward": 0.28750851564109325, |
|
"reward_std": 0.3911990597844124, |
|
"rewards/cosine_scaled_reward": -0.252079077064991, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 1476.8333740234375, |
|
"epoch": 0.38171428571428573, |
|
"grad_norm": 1.2739105224609375, |
|
"kl": 0.1046142578125, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0042, |
|
"reward": 0.836659163236618, |
|
"reward_std": 0.8671058118343353, |
|
"rewards/cosine_scaled_reward": -0.04000374022871256, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 1723.3333892822266, |
|
"epoch": 0.38285714285714284, |
|
"grad_norm": 1.672484040260315, |
|
"kl": 0.21124267578125, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.0084, |
|
"reward": 0.5473275234689936, |
|
"reward_std": 0.6023431569337845, |
|
"rewards/cosine_scaled_reward": -0.11175291612744331, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 1291.8542175292969, |
|
"epoch": 0.384, |
|
"grad_norm": 1.1949892044067383, |
|
"kl": 0.13958740234375, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0056, |
|
"reward": 0.43108636140823364, |
|
"reward_std": 0.42728982865810394, |
|
"rewards/cosine_scaled_reward": -0.22195683978497982, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 1024.3333740234375, |
|
"epoch": 0.3851428571428571, |
|
"grad_norm": 0.902114748954773, |
|
"kl": 0.11954498291015625, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0048, |
|
"reward": 1.0072421729564667, |
|
"reward_std": 0.6713532879948616, |
|
"rewards/cosine_scaled_reward": 0.045287732034921646, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 1448.9375457763672, |
|
"epoch": 0.3862857142857143, |
|
"grad_norm": 1.2535922527313232, |
|
"kl": 0.1561737060546875, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0062, |
|
"reward": 0.5455589033663273, |
|
"reward_std": 0.535648949444294, |
|
"rewards/cosine_scaled_reward": -0.13347055204212666, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 2035.6459045410156, |
|
"epoch": 0.38742857142857146, |
|
"grad_norm": 0.746334969997406, |
|
"kl": 0.178955078125, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.0071, |
|
"reward": 0.593096449971199, |
|
"reward_std": 0.948510006070137, |
|
"rewards/cosine_scaled_reward": -0.08886844478547573, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 1074.1041870117188, |
|
"epoch": 0.38857142857142857, |
|
"grad_norm": 0.5353119373321533, |
|
"kl": 0.041778564453125, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.0017, |
|
"reward": 1.1473649591207504, |
|
"reward_std": 0.7414836436510086, |
|
"rewards/cosine_scaled_reward": 0.07368248514831066, |
|
"rewards/format_reward": 1.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 1197.7083892822266, |
|
"epoch": 0.38971428571428574, |
|
"grad_norm": 1.1976772546768188, |
|
"kl": 0.1300048828125, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.0052, |
|
"reward": 1.3422877192497253, |
|
"reward_std": 0.6801351606845856, |
|
"rewards/cosine_scaled_reward": 0.1919771609827876, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 1645.479248046875, |
|
"epoch": 0.39085714285714285, |
|
"grad_norm": 1.725953459739685, |
|
"kl": 0.33575439453125, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.0135, |
|
"reward": 0.45842229574918747, |
|
"reward_std": 0.5379992946982384, |
|
"rewards/cosine_scaled_reward": -0.1457888763397932, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 1599.0416870117188, |
|
"epoch": 0.392, |
|
"grad_norm": 1.9615150690078735, |
|
"kl": 0.2664794921875, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": 0.0107, |
|
"reward": 0.8948207944631577, |
|
"reward_std": 0.6805202513933182, |
|
"rewards/cosine_scaled_reward": 0.009910388849675655, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 1742.8750610351562, |
|
"epoch": 0.3931428571428571, |
|
"grad_norm": 41.373931884765625, |
|
"kl": 0.8536376953125, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": 0.0343, |
|
"reward": 0.8157278522849083, |
|
"reward_std": 0.7099937200546265, |
|
"rewards/cosine_scaled_reward": -0.029636098071932793, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 1436.8125610351562, |
|
"epoch": 0.3942857142857143, |
|
"grad_norm": 1.8816604614257812, |
|
"kl": 0.26708984375, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0107, |
|
"reward": 0.9264494627714157, |
|
"reward_std": 0.6063774973154068, |
|
"rewards/cosine_scaled_reward": 0.015308059751987457, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 1363.9792022705078, |
|
"epoch": 0.3954285714285714, |
|
"grad_norm": 2.3164970874786377, |
|
"kl": 0.09549713134765625, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": 0.0038, |
|
"reward": 1.0577785670757294, |
|
"reward_std": 0.8408964425325394, |
|
"rewards/cosine_scaled_reward": 0.07055594399571419, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 1337.7083892822266, |
|
"epoch": 0.3965714285714286, |
|
"grad_norm": 1.1808565855026245, |
|
"kl": 0.2160491943359375, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": 0.0086, |
|
"reward": 1.5314601063728333, |
|
"reward_std": 0.8420404642820358, |
|
"rewards/cosine_scaled_reward": 0.31781339878216386, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 1250.9791870117188, |
|
"epoch": 0.3977142857142857, |
|
"grad_norm": 1.317704439163208, |
|
"kl": 0.178863525390625, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": 0.0072, |
|
"reward": 0.746107667684555, |
|
"reward_std": 0.5483127310872078, |
|
"rewards/cosine_scaled_reward": -0.10611284070182592, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 1193.000015258789, |
|
"epoch": 0.39885714285714285, |
|
"grad_norm": 1.4739476442337036, |
|
"kl": 0.249725341796875, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.01, |
|
"reward": 1.2662739604711533, |
|
"reward_std": 0.7532782405614853, |
|
"rewards/cosine_scaled_reward": 0.18522025644779205, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 1177.3541870117188, |
|
"epoch": 0.4, |
|
"grad_norm": 2.2546327114105225, |
|
"kl": 0.24993896484375, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.01, |
|
"reward": 1.0857526510953903, |
|
"reward_std": 0.6757391728460789, |
|
"rewards/cosine_scaled_reward": 0.09495963307563215, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 1196.0000457763672, |
|
"epoch": 0.40114285714285713, |
|
"grad_norm": 1.1820554733276367, |
|
"kl": 0.15570068359375, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.0062, |
|
"reward": 0.5796034894883633, |
|
"reward_std": 0.5287084579467773, |
|
"rewards/cosine_scaled_reward": -0.178948275744915, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 1270.5416870117188, |
|
"epoch": 0.4022857142857143, |
|
"grad_norm": 1.5620218515396118, |
|
"kl": 0.12603759765625, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": 0.005, |
|
"reward": 0.7529177367687225, |
|
"reward_std": 0.6913501024246216, |
|
"rewards/cosine_scaled_reward": -0.09229113161563873, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 1443.1666870117188, |
|
"epoch": 0.4034285714285714, |
|
"grad_norm": 2.1429245471954346, |
|
"kl": 0.18780517578125, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.0075, |
|
"reward": 0.9008820652961731, |
|
"reward_std": 0.6463882178068161, |
|
"rewards/cosine_scaled_reward": -0.007892303168773651, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 1230.6458587646484, |
|
"epoch": 0.4045714285714286, |
|
"grad_norm": 0.9503077268600464, |
|
"kl": 0.2082061767578125, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": 0.0083, |
|
"reward": 0.4887809455394745, |
|
"reward_std": 0.4611463025212288, |
|
"rewards/cosine_scaled_reward": -0.23477619886398315, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 930.0625305175781, |
|
"epoch": 0.4057142857142857, |
|
"grad_norm": 0.8462334275245667, |
|
"kl": 0.1320648193359375, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0053, |
|
"reward": 0.8714643996208906, |
|
"reward_std": 0.35595114156603813, |
|
"rewards/cosine_scaled_reward": -0.022601131349802017, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 1521.2500610351562, |
|
"epoch": 0.40685714285714286, |
|
"grad_norm": 3.6062703132629395, |
|
"kl": 0.31671142578125, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": 0.0126, |
|
"reward": 0.7512324824929237, |
|
"reward_std": 0.7443573772907257, |
|
"rewards/cosine_scaled_reward": -0.05146709643304348, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 1307.0625305175781, |
|
"epoch": 0.408, |
|
"grad_norm": 2.073002576828003, |
|
"kl": 0.2852783203125, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": 0.0114, |
|
"reward": 0.6376607120037079, |
|
"reward_std": 0.5459994077682495, |
|
"rewards/cosine_scaled_reward": -0.13950299471616745, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 1298.2083587646484, |
|
"epoch": 0.40914285714285714, |
|
"grad_norm": 1.5467488765716553, |
|
"kl": 0.34130859375, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.0136, |
|
"reward": 0.881379060447216, |
|
"reward_std": 0.8507445156574249, |
|
"rewards/cosine_scaled_reward": 0.003189507406204939, |
|
"rewards/format_reward": 0.875, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 1172.7916870117188, |
|
"epoch": 0.4102857142857143, |
|
"grad_norm": 1.671886682510376, |
|
"kl": 0.314453125, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.0126, |
|
"reward": 0.7981488406658173, |
|
"reward_std": 0.5063908323645592, |
|
"rewards/cosine_scaled_reward": -0.04884226247668266, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 990.9792175292969, |
|
"epoch": 0.4114285714285714, |
|
"grad_norm": 2.374612808227539, |
|
"kl": 0.41845703125, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0168, |
|
"reward": 1.1945213824510574, |
|
"reward_std": 0.7698078602552414, |
|
"rewards/cosine_scaled_reward": 0.11809402331709862, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 1524.3541870117188, |
|
"epoch": 0.4125714285714286, |
|
"grad_norm": 3.4913723468780518, |
|
"kl": 1.1523895263671875, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": 0.0461, |
|
"reward": 0.6581477224826813, |
|
"reward_std": 0.6756026446819305, |
|
"rewards/cosine_scaled_reward": -0.05634281662059948, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 1185.2708587646484, |
|
"epoch": 0.4137142857142857, |
|
"grad_norm": 34.620460510253906, |
|
"kl": 0.832366943359375, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": 0.0334, |
|
"reward": 0.8447651118040085, |
|
"reward_std": 0.7907865196466446, |
|
"rewards/cosine_scaled_reward": -0.046367482747882605, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 1141.0208892822266, |
|
"epoch": 0.41485714285714287, |
|
"grad_norm": 2.2585678100585938, |
|
"kl": 0.352935791015625, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0141, |
|
"reward": 0.5232757963240147, |
|
"reward_std": 0.5396992526948452, |
|
"rewards/cosine_scaled_reward": -0.1966954478994012, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 1359.1667175292969, |
|
"epoch": 0.416, |
|
"grad_norm": 3.5956501960754395, |
|
"kl": 0.715576171875, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": 0.0286, |
|
"reward": 0.6145341023802757, |
|
"reward_std": 0.5419690161943436, |
|
"rewards/cosine_scaled_reward": -0.17189963907003403, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 1670.229248046875, |
|
"epoch": 0.41714285714285715, |
|
"grad_norm": 3.5323920249938965, |
|
"kl": 1.1943359375, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0479, |
|
"reward": 0.40414058696478605, |
|
"reward_std": 0.691382423043251, |
|
"rewards/cosine_scaled_reward": -0.17292970418930054, |
|
"rewards/format_reward": 0.7500000074505806, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 1720.5000305175781, |
|
"epoch": 0.41828571428571426, |
|
"grad_norm": 3.34942364692688, |
|
"kl": 1.042236328125, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.0417, |
|
"reward": 0.6996188908815384, |
|
"reward_std": 0.7247656881809235, |
|
"rewards/cosine_scaled_reward": -0.0460239015519619, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 927.6458587646484, |
|
"epoch": 0.41942857142857143, |
|
"grad_norm": 1.461083173751831, |
|
"kl": 0.38287353515625, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.0153, |
|
"reward": 1.2472570352256298, |
|
"reward_std": 0.6670772060751915, |
|
"rewards/cosine_scaled_reward": 0.14446185529232025, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 1301.9792175292969, |
|
"epoch": 0.4205714285714286, |
|
"grad_norm": 2.861830711364746, |
|
"kl": 0.337615966796875, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0135, |
|
"reward": 1.2516463994979858, |
|
"reward_std": 1.0124248266220093, |
|
"rewards/cosine_scaled_reward": 0.17790652811527252, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 1215.5625, |
|
"epoch": 0.4217142857142857, |
|
"grad_norm": 1.6978230476379395, |
|
"kl": 0.335235595703125, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": 0.0134, |
|
"reward": 1.1675912141799927, |
|
"reward_std": 0.5593391507863998, |
|
"rewards/cosine_scaled_reward": 0.13587890937924385, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 1309.5208740234375, |
|
"epoch": 0.4228571428571429, |
|
"grad_norm": 2.2963132858276367, |
|
"kl": 0.4232177734375, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0169, |
|
"reward": 0.5871782079339027, |
|
"reward_std": 0.778529703617096, |
|
"rewards/cosine_scaled_reward": -0.14391089417040348, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 1090.1042022705078, |
|
"epoch": 0.424, |
|
"grad_norm": 0.9345715045928955, |
|
"kl": 0.28076171875, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0112, |
|
"reward": 0.6981654912233353, |
|
"reward_std": 0.6373357623815536, |
|
"rewards/cosine_scaled_reward": -0.09883392881602049, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 1309.9375610351562, |
|
"epoch": 0.42514285714285716, |
|
"grad_norm": 0.7092093825340271, |
|
"kl": 0.486602783203125, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": 0.0194, |
|
"reward": 0.8811748586595058, |
|
"reward_std": 0.5044156312942505, |
|
"rewards/cosine_scaled_reward": -0.007329270243644714, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 1460.7708587646484, |
|
"epoch": 0.42628571428571427, |
|
"grad_norm": 2.416278839111328, |
|
"kl": 0.5400390625, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.0216, |
|
"reward": 0.9694189727306366, |
|
"reward_std": 0.6003681719303131, |
|
"rewards/cosine_scaled_reward": 0.0472094789147377, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 1132.5208587646484, |
|
"epoch": 0.42742857142857144, |
|
"grad_norm": 1.0274022817611694, |
|
"kl": 0.29541015625, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.0118, |
|
"reward": 1.1347306370735168, |
|
"reward_std": 0.5300607345998287, |
|
"rewards/cosine_scaled_reward": 0.07778198271989822, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 1205.6042175292969, |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 1.4747833013534546, |
|
"kl": 0.336669921875, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0135, |
|
"reward": 0.914167582988739, |
|
"reward_std": 0.7538186013698578, |
|
"rewards/cosine_scaled_reward": -0.011666236445307732, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 1323.9583740234375, |
|
"epoch": 0.4297142857142857, |
|
"grad_norm": 1.5346781015396118, |
|
"kl": 0.40960693359375, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": 0.0164, |
|
"reward": 0.9741204902529716, |
|
"reward_std": 0.8010425865650177, |
|
"rewards/cosine_scaled_reward": 0.018310231156647205, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 1173.8333892822266, |
|
"epoch": 0.4308571428571429, |
|
"grad_norm": 1.3860251903533936, |
|
"kl": 0.32177734375, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.0129, |
|
"reward": 1.1005370616912842, |
|
"reward_std": 0.8687849044799805, |
|
"rewards/cosine_scaled_reward": 0.11276852712035179, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 1077.0208587646484, |
|
"epoch": 0.432, |
|
"grad_norm": 1.6222304105758667, |
|
"kl": 0.325439453125, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": 0.013, |
|
"reward": 0.7557893544435501, |
|
"reward_std": 0.46596937626600266, |
|
"rewards/cosine_scaled_reward": -0.12210530787706375, |
|
"rewards/format_reward": 1.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 1240.3333740234375, |
|
"epoch": 0.43314285714285716, |
|
"grad_norm": 1.0828309059143066, |
|
"kl": 0.356201171875, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.0142, |
|
"reward": 0.8545220792293549, |
|
"reward_std": 0.6498586684465408, |
|
"rewards/cosine_scaled_reward": -0.05190563574433327, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 1270.6667175292969, |
|
"epoch": 0.4342857142857143, |
|
"grad_norm": 1.8103889226913452, |
|
"kl": 0.383026123046875, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0153, |
|
"reward": 0.6777837425470352, |
|
"reward_std": 0.7819753885269165, |
|
"rewards/cosine_scaled_reward": -0.08819146640598774, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 1318.7292175292969, |
|
"epoch": 0.43542857142857144, |
|
"grad_norm": 1.7676132917404175, |
|
"kl": 0.482666015625, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.0193, |
|
"reward": 0.7255931571125984, |
|
"reward_std": 0.6526116281747818, |
|
"rewards/cosine_scaled_reward": -0.10595342982560396, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 1141.7917022705078, |
|
"epoch": 0.43657142857142855, |
|
"grad_norm": 1.832429051399231, |
|
"kl": 0.3201904296875, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": 0.0128, |
|
"reward": 0.7391386441886425, |
|
"reward_std": 0.7291494160890579, |
|
"rewards/cosine_scaled_reward": -0.07834736630320549, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 1615.541732788086, |
|
"epoch": 0.4377142857142857, |
|
"grad_norm": 2.4247608184814453, |
|
"kl": 1.1395721435546875, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": 0.0456, |
|
"reward": 0.8856848031282425, |
|
"reward_std": 0.5492001250386238, |
|
"rewards/cosine_scaled_reward": 0.005342394113540649, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 1421.0833435058594, |
|
"epoch": 0.43885714285714283, |
|
"grad_norm": 1.442662000656128, |
|
"kl": 0.374847412109375, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": 0.015, |
|
"reward": 0.32889158837497234, |
|
"reward_std": 0.4332951605319977, |
|
"rewards/cosine_scaled_reward": -0.26263754442334175, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 1334.5208435058594, |
|
"epoch": 0.44, |
|
"grad_norm": 1.4730134010314941, |
|
"kl": 0.422088623046875, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0169, |
|
"reward": 0.8442478328943253, |
|
"reward_std": 0.6724153012037277, |
|
"rewards/cosine_scaled_reward": -0.05704276263713837, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 1111.4583740234375, |
|
"epoch": 0.44114285714285717, |
|
"grad_norm": 1.538129448890686, |
|
"kl": 0.181640625, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.0073, |
|
"reward": 0.8161994330585003, |
|
"reward_std": 0.4277452155947685, |
|
"rewards/cosine_scaled_reward": -0.06065032631158829, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 1218.0208435058594, |
|
"epoch": 0.4422857142857143, |
|
"grad_norm": 1.626770257949829, |
|
"kl": 0.309906005859375, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": 0.0124, |
|
"reward": 0.6058969795703888, |
|
"reward_std": 0.7323561906814575, |
|
"rewards/cosine_scaled_reward": -0.1658015362918377, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 1258.187515258789, |
|
"epoch": 0.44342857142857145, |
|
"grad_norm": 1.7491554021835327, |
|
"kl": 0.3914794921875, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": 0.0156, |
|
"reward": 0.7627854868769646, |
|
"reward_std": 0.7414772808551788, |
|
"rewards/cosine_scaled_reward": -0.05610728543251753, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 1268.4375305175781, |
|
"epoch": 0.44457142857142856, |
|
"grad_norm": 1.6758959293365479, |
|
"kl": 0.440826416015625, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.0176, |
|
"reward": 1.1525626480579376, |
|
"reward_std": 0.7809968441724777, |
|
"rewards/cosine_scaled_reward": 0.12836465798318386, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 389 |
|
}, |
|
{ |
|
"completion_length": 940.9166870117188, |
|
"epoch": 0.44571428571428573, |
|
"grad_norm": 1.7198045253753662, |
|
"kl": 0.3330078125, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.0133, |
|
"reward": 0.9264688044786453, |
|
"reward_std": 0.6931325197219849, |
|
"rewards/cosine_scaled_reward": -0.03676560753956437, |
|
"rewards/format_reward": 1.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 978.7291717529297, |
|
"epoch": 0.44685714285714284, |
|
"grad_norm": 0.6671801805496216, |
|
"kl": 0.029083251953125, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": 0.0012, |
|
"reward": 0.6134733706712723, |
|
"reward_std": 0.4728550612926483, |
|
"rewards/cosine_scaled_reward": -0.1932633202522993, |
|
"rewards/format_reward": 1.0, |
|
"step": 391 |
|
}, |
|
{ |
|
"completion_length": 1309.5417175292969, |
|
"epoch": 0.448, |
|
"grad_norm": 2.3814332485198975, |
|
"kl": 0.251953125, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.0101, |
|
"reward": 1.0050799548625946, |
|
"reward_std": 0.8785246461629868, |
|
"rewards/cosine_scaled_reward": 0.07545664254575968, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 1097.3125305175781, |
|
"epoch": 0.4491428571428571, |
|
"grad_norm": 1.722952961921692, |
|
"kl": 0.2423095703125, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.0097, |
|
"reward": 0.832743689417839, |
|
"reward_std": 0.8567025661468506, |
|
"rewards/cosine_scaled_reward": -0.0419615093851462, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 1223.0833740234375, |
|
"epoch": 0.4502857142857143, |
|
"grad_norm": 2.990572452545166, |
|
"kl": 0.4295654296875, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.0172, |
|
"reward": 0.5410304628312588, |
|
"reward_std": 0.6248824968934059, |
|
"rewards/cosine_scaled_reward": -0.13573478162288666, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 1189.500015258789, |
|
"epoch": 0.4514285714285714, |
|
"grad_norm": 2.629859685897827, |
|
"kl": 0.321044921875, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0128, |
|
"reward": 0.5656278505921364, |
|
"reward_std": 0.7980157136917114, |
|
"rewards/cosine_scaled_reward": -0.1130194254219532, |
|
"rewards/format_reward": 0.7916667014360428, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 1043.1458435058594, |
|
"epoch": 0.45257142857142857, |
|
"grad_norm": 1.0702786445617676, |
|
"kl": 0.0563507080078125, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": 0.0023, |
|
"reward": 0.7221028283238411, |
|
"reward_std": 0.5050450935959816, |
|
"rewards/cosine_scaled_reward": -0.11811527609825134, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 1230.7292175292969, |
|
"epoch": 0.45371428571428574, |
|
"grad_norm": 3.2597548961639404, |
|
"kl": 0.392822265625, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": 0.0157, |
|
"reward": 1.312897451221943, |
|
"reward_std": 0.6480658948421478, |
|
"rewards/cosine_scaled_reward": 0.2293653730303049, |
|
"rewards/format_reward": 0.8541667014360428, |
|
"step": 397 |
|
}, |
|
{ |
|
"completion_length": 1381.5625305175781, |
|
"epoch": 0.45485714285714285, |
|
"grad_norm": 1.6307339668273926, |
|
"kl": 0.20714569091796875, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0083, |
|
"reward": 0.7576578855514526, |
|
"reward_std": 0.7408057749271393, |
|
"rewards/cosine_scaled_reward": -0.10033774503972381, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 1487.0208740234375, |
|
"epoch": 0.456, |
|
"grad_norm": 2.9457855224609375, |
|
"kl": 0.426025390625, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": 0.0171, |
|
"reward": 0.6220748424530029, |
|
"reward_std": 0.5146789476275444, |
|
"rewards/cosine_scaled_reward": -0.09521258249878883, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 399 |
|
}, |
|
{ |
|
"completion_length": 1041.7708587646484, |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 5.961241722106934, |
|
"kl": 0.289337158203125, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0116, |
|
"reward": 0.8267193324863911, |
|
"reward_std": 0.6178570240736008, |
|
"rewards/cosine_scaled_reward": -0.04497369006276131, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 965.6250305175781, |
|
"epoch": 0.4582857142857143, |
|
"grad_norm": 1.4047675132751465, |
|
"kl": 0.1659698486328125, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": 0.0066, |
|
"reward": 0.982437789440155, |
|
"reward_std": 0.7459337636828423, |
|
"rewards/cosine_scaled_reward": 0.0537188770249486, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 401 |
|
}, |
|
{ |
|
"completion_length": 1609.729248046875, |
|
"epoch": 0.4594285714285714, |
|
"grad_norm": 2.311453342437744, |
|
"kl": 0.615234375, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.0246, |
|
"reward": 0.3766894303262234, |
|
"reward_std": 0.658165842294693, |
|
"rewards/cosine_scaled_reward": -0.2387386392802, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 1466.9375610351562, |
|
"epoch": 0.4605714285714286, |
|
"grad_norm": 2.0651133060455322, |
|
"kl": 0.7806396484375, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": 0.0312, |
|
"reward": 0.5563420876860619, |
|
"reward_std": 0.6794729232788086, |
|
"rewards/cosine_scaled_reward": -0.1489123017527163, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 403 |
|
}, |
|
{ |
|
"completion_length": 1289.2708740234375, |
|
"epoch": 0.4617142857142857, |
|
"grad_norm": 2.5164127349853516, |
|
"kl": 0.3046875, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.0122, |
|
"reward": 0.8672967702150345, |
|
"reward_std": 0.7338433116674423, |
|
"rewards/cosine_scaled_reward": -0.03510164050385356, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 1246.5833587646484, |
|
"epoch": 0.46285714285714286, |
|
"grad_norm": 1.7397247552871704, |
|
"kl": 0.14056396484375, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.0056, |
|
"reward": 0.7972104996442795, |
|
"reward_std": 0.6757538244128227, |
|
"rewards/cosine_scaled_reward": 0.0027718953788280487, |
|
"rewards/format_reward": 0.7916667014360428, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 1569.5209045410156, |
|
"epoch": 0.464, |
|
"grad_norm": 3.1697490215301514, |
|
"kl": 1.05712890625, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0423, |
|
"reward": 0.7342821173369884, |
|
"reward_std": 0.6933150887489319, |
|
"rewards/cosine_scaled_reward": -0.05994228646159172, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 970.0625457763672, |
|
"epoch": 0.46514285714285714, |
|
"grad_norm": 2.7586686611175537, |
|
"kl": 0.496337890625, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.0198, |
|
"reward": 0.7943353094160557, |
|
"reward_std": 0.5273813158273697, |
|
"rewards/cosine_scaled_reward": -0.07158234342932701, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 407 |
|
}, |
|
{ |
|
"completion_length": 954.7083435058594, |
|
"epoch": 0.4662857142857143, |
|
"grad_norm": 1.7943812608718872, |
|
"kl": 0.168701171875, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0067, |
|
"reward": 0.9154380261898041, |
|
"reward_std": 0.63597172498703, |
|
"rewards/cosine_scaled_reward": -0.02144765853881836, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 1268.2291717529297, |
|
"epoch": 0.4674285714285714, |
|
"grad_norm": 0.9967200756072998, |
|
"kl": 0.53662109375, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": 0.0215, |
|
"reward": 0.9619596749544144, |
|
"reward_std": 0.6098195463418961, |
|
"rewards/cosine_scaled_reward": 0.022646483033895493, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 409 |
|
}, |
|
{ |
|
"completion_length": 1628.0000610351562, |
|
"epoch": 0.4685714285714286, |
|
"grad_norm": 5.845465660095215, |
|
"kl": 0.8017578125, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0321, |
|
"reward": 0.6380558162927628, |
|
"reward_std": 0.5308569446206093, |
|
"rewards/cosine_scaled_reward": -0.14972211251733825, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 1665.7083740234375, |
|
"epoch": 0.4697142857142857, |
|
"grad_norm": 1.869497537612915, |
|
"kl": 0.796875, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": 0.0319, |
|
"reward": 0.31100673973560333, |
|
"reward_std": 0.5528412610292435, |
|
"rewards/cosine_scaled_reward": -0.27157998457551, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 411 |
|
}, |
|
{ |
|
"completion_length": 1114.604232788086, |
|
"epoch": 0.47085714285714286, |
|
"grad_norm": 1.987026333808899, |
|
"kl": 0.284912109375, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.0114, |
|
"reward": 0.7781837359070778, |
|
"reward_std": 0.5904239565134048, |
|
"rewards/cosine_scaled_reward": -0.09007478877902031, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 1371.5417175292969, |
|
"epoch": 0.472, |
|
"grad_norm": 2.023742198944092, |
|
"kl": 0.481689453125, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0193, |
|
"reward": 0.7588743381202221, |
|
"reward_std": 0.7781134992837906, |
|
"rewards/cosine_scaled_reward": -0.06847950583323836, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 413 |
|
}, |
|
{ |
|
"completion_length": 1710.7292175292969, |
|
"epoch": 0.47314285714285714, |
|
"grad_norm": 3.7236738204956055, |
|
"kl": 1.0467529296875, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.042, |
|
"reward": 0.6331272795796394, |
|
"reward_std": 0.7422880977392197, |
|
"rewards/cosine_scaled_reward": -0.06885303813032806, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 1409.2708740234375, |
|
"epoch": 0.4742857142857143, |
|
"grad_norm": 2.454448938369751, |
|
"kl": 0.506103515625, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0202, |
|
"reward": 0.9847299754619598, |
|
"reward_std": 0.6669813543558121, |
|
"rewards/cosine_scaled_reward": 0.023614969104528427, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 1729.4791870117188, |
|
"epoch": 0.4754285714285714, |
|
"grad_norm": 2.803572416305542, |
|
"kl": 0.96875, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0387, |
|
"reward": 0.4970005638897419, |
|
"reward_std": 0.8149459362030029, |
|
"rewards/cosine_scaled_reward": -0.10566640645265579, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 1810.1875457763672, |
|
"epoch": 0.4765714285714286, |
|
"grad_norm": 5.9517717361450195, |
|
"kl": 1.576904296875, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.0632, |
|
"reward": 0.5241810567677021, |
|
"reward_std": 0.64292823523283, |
|
"rewards/cosine_scaled_reward": -0.08165947627276182, |
|
"rewards/format_reward": 0.6875000074505806, |
|
"step": 417 |
|
}, |
|
{ |
|
"completion_length": 1492.8958740234375, |
|
"epoch": 0.4777142857142857, |
|
"grad_norm": 4.798679828643799, |
|
"kl": 0.8544921875, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": 0.0342, |
|
"reward": 0.4330623224377632, |
|
"reward_std": 0.5981302931904793, |
|
"rewards/cosine_scaled_reward": -0.22096885181963444, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 1555.2708435058594, |
|
"epoch": 0.47885714285714287, |
|
"grad_norm": 4.279539585113525, |
|
"kl": 0.86328125, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.0346, |
|
"reward": 0.5117529258131981, |
|
"reward_std": 0.6584911718964577, |
|
"rewards/cosine_scaled_reward": -0.1399568784981966, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 419 |
|
}, |
|
{ |
|
"completion_length": 1216.3542175292969, |
|
"epoch": 0.48, |
|
"grad_norm": 2.101778030395508, |
|
"kl": 0.41534423828125, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0167, |
|
"reward": 0.896861981600523, |
|
"reward_std": 0.856042891740799, |
|
"rewards/cosine_scaled_reward": 0.0005143135786056519, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 1162.3750457763672, |
|
"epoch": 0.48114285714285715, |
|
"grad_norm": 2.8854591846466064, |
|
"kl": 0.33807373046875, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0135, |
|
"reward": 0.8688920065760612, |
|
"reward_std": 0.7225290387868881, |
|
"rewards/cosine_scaled_reward": -0.013470660895109177, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 421 |
|
}, |
|
{ |
|
"completion_length": 1370.8541870117188, |
|
"epoch": 0.48228571428571426, |
|
"grad_norm": 1.7124054431915283, |
|
"kl": 0.51220703125, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0205, |
|
"reward": 0.7653668001294136, |
|
"reward_std": 0.7626539617776871, |
|
"rewards/cosine_scaled_reward": -0.07564992923289537, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 1183.708396911621, |
|
"epoch": 0.48342857142857143, |
|
"grad_norm": 1.421221137046814, |
|
"kl": 0.293243408203125, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0117, |
|
"reward": 0.9164831042289734, |
|
"reward_std": 0.533292543143034, |
|
"rewards/cosine_scaled_reward": -0.01050846092402935, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 423 |
|
}, |
|
{ |
|
"completion_length": 1333.6250305175781, |
|
"epoch": 0.4845714285714286, |
|
"grad_norm": 2.020719289779663, |
|
"kl": 0.547119140625, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": 0.0219, |
|
"reward": 0.5903327092528343, |
|
"reward_std": 0.5696832239627838, |
|
"rewards/cosine_scaled_reward": -0.11108366213738918, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 1426.3542175292969, |
|
"epoch": 0.4857142857142857, |
|
"grad_norm": 2.2707631587982178, |
|
"kl": 0.79052734375, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0316, |
|
"reward": 0.5219599902629852, |
|
"reward_std": 0.5760443955659866, |
|
"rewards/cosine_scaled_reward": -0.14527002349495888, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 1259.3333740234375, |
|
"epoch": 0.4868571428571429, |
|
"grad_norm": 2.267915725708008, |
|
"kl": 0.395263671875, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": 0.0158, |
|
"reward": 0.8012158274650574, |
|
"reward_std": 0.667873740196228, |
|
"rewards/cosine_scaled_reward": -0.06814211048185825, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 1615.0416870117188, |
|
"epoch": 0.488, |
|
"grad_norm": 2.34485125541687, |
|
"kl": 0.85791015625, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": 0.0343, |
|
"reward": 0.7913745269179344, |
|
"reward_std": 0.7925111949443817, |
|
"rewards/cosine_scaled_reward": -0.00014608167111873627, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 427 |
|
}, |
|
{ |
|
"completion_length": 1747.8333740234375, |
|
"epoch": 0.48914285714285716, |
|
"grad_norm": 2.9694297313690186, |
|
"kl": 0.9736328125, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0389, |
|
"reward": 0.55228191614151, |
|
"reward_std": 0.6319544315338135, |
|
"rewards/cosine_scaled_reward": -0.1509423702955246, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 1631.5417175292969, |
|
"epoch": 0.49028571428571427, |
|
"grad_norm": 3.960716724395752, |
|
"kl": 1.03509521484375, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0414, |
|
"reward": 0.8934581205248833, |
|
"reward_std": 0.5868038833141327, |
|
"rewards/cosine_scaled_reward": 0.03006240352988243, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 429 |
|
}, |
|
{ |
|
"completion_length": 1621.4375610351562, |
|
"epoch": 0.49142857142857144, |
|
"grad_norm": 8.593777656555176, |
|
"kl": 1.48388671875, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0594, |
|
"reward": 0.48539859987795353, |
|
"reward_std": 0.7700510919094086, |
|
"rewards/cosine_scaled_reward": -0.12188405683264136, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 1201.0208587646484, |
|
"epoch": 0.49257142857142855, |
|
"grad_norm": 1.7287312746047974, |
|
"kl": 0.67041015625, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0268, |
|
"reward": 0.9445644542574883, |
|
"reward_std": 0.5076880529522896, |
|
"rewards/cosine_scaled_reward": 0.0035321786999702454, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 431 |
|
}, |
|
{ |
|
"completion_length": 1444.2084045410156, |
|
"epoch": 0.4937142857142857, |
|
"grad_norm": 3.417004346847534, |
|
"kl": 0.6171875, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": 0.0247, |
|
"reward": 0.6378493383526802, |
|
"reward_std": 0.7811735272407532, |
|
"rewards/cosine_scaled_reward": -0.09774199151434004, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 1373.5417175292969, |
|
"epoch": 0.4948571428571429, |
|
"grad_norm": 1.7710994482040405, |
|
"kl": 0.712890625, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.0286, |
|
"reward": 0.6063527911901474, |
|
"reward_std": 0.6604797914624214, |
|
"rewards/cosine_scaled_reward": -0.144740279763937, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 433 |
|
}, |
|
{ |
|
"completion_length": 977.5208435058594, |
|
"epoch": 0.496, |
|
"grad_norm": 1.736846685409546, |
|
"kl": 0.294708251953125, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": 0.0118, |
|
"reward": 1.2148061096668243, |
|
"reward_std": 0.5557400360703468, |
|
"rewards/cosine_scaled_reward": 0.11781970039010048, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 1433.8542175292969, |
|
"epoch": 0.49714285714285716, |
|
"grad_norm": 2.2994189262390137, |
|
"kl": 1.136810302734375, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0454, |
|
"reward": 0.4324219524860382, |
|
"reward_std": 0.8103198558092117, |
|
"rewards/cosine_scaled_reward": -0.1900390349328518, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 1499.3750305175781, |
|
"epoch": 0.4982857142857143, |
|
"grad_norm": 2.9299046993255615, |
|
"kl": 0.6876220703125, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.0275, |
|
"reward": 0.6081612259149551, |
|
"reward_std": 0.8525863587856293, |
|
"rewards/cosine_scaled_reward": -0.1334194028750062, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 1131.1458892822266, |
|
"epoch": 0.49942857142857144, |
|
"grad_norm": 1.970673680305481, |
|
"kl": 0.24951171875, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": 0.01, |
|
"reward": 0.8882552683353424, |
|
"reward_std": 0.7682344913482666, |
|
"rewards/cosine_scaled_reward": -0.02462236536666751, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 437 |
|
}, |
|
{ |
|
"completion_length": 1025.5208740234375, |
|
"epoch": 0.5005714285714286, |
|
"grad_norm": 1.1531387567520142, |
|
"kl": 0.20703125, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.0083, |
|
"reward": 0.9214688986539841, |
|
"reward_std": 0.6478733271360397, |
|
"rewards/cosine_scaled_reward": -0.03926557023078203, |
|
"rewards/format_reward": 1.0, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 1004.7708740234375, |
|
"epoch": 0.5017142857142857, |
|
"grad_norm": 1.5415436029434204, |
|
"kl": 0.22320556640625, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": 0.0089, |
|
"reward": 0.6858842521905899, |
|
"reward_std": 0.581585705280304, |
|
"rewards/cosine_scaled_reward": -0.1466412227600813, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 439 |
|
}, |
|
{ |
|
"completion_length": 1667.5000610351562, |
|
"epoch": 0.5028571428571429, |
|
"grad_norm": 1.8554582595825195, |
|
"kl": 0.833984375, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0334, |
|
"reward": 0.6555005759000778, |
|
"reward_std": 0.8219043761491776, |
|
"rewards/cosine_scaled_reward": -0.0784997058508452, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 1507.8542175292969, |
|
"epoch": 0.504, |
|
"grad_norm": 3.88932466506958, |
|
"kl": 0.70849609375, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": 0.0284, |
|
"reward": 0.7302692234516144, |
|
"reward_std": 0.6611650586128235, |
|
"rewards/cosine_scaled_reward": -0.07236538827419281, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 441 |
|
}, |
|
{ |
|
"completion_length": 1471.1250305175781, |
|
"epoch": 0.5051428571428571, |
|
"grad_norm": 3.0517144203186035, |
|
"kl": 0.61572265625, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.0247, |
|
"reward": 0.6592699624598026, |
|
"reward_std": 0.7461230009794235, |
|
"rewards/cosine_scaled_reward": -0.11828170018270612, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 1453.1458740234375, |
|
"epoch": 0.5062857142857143, |
|
"grad_norm": 2.053025722503662, |
|
"kl": 0.724609375, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.029, |
|
"reward": 0.5722346976399422, |
|
"reward_std": 0.643949382007122, |
|
"rewards/cosine_scaled_reward": -0.1305493265390396, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 443 |
|
}, |
|
{ |
|
"completion_length": 1375.6875305175781, |
|
"epoch": 0.5074285714285715, |
|
"grad_norm": 1.6717604398727417, |
|
"kl": 0.594390869140625, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0237, |
|
"reward": 1.0815926790237427, |
|
"reward_std": 0.7383088618516922, |
|
"rewards/cosine_scaled_reward": 0.08246299810707569, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 1785.2708740234375, |
|
"epoch": 0.5085714285714286, |
|
"grad_norm": 3.6348721981048584, |
|
"kl": 1.021484375, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0408, |
|
"reward": 0.6278277039527893, |
|
"reward_std": 0.6637818515300751, |
|
"rewards/cosine_scaled_reward": -0.10275283083319664, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 1080.1458587646484, |
|
"epoch": 0.5097142857142857, |
|
"grad_norm": 1.7892571687698364, |
|
"kl": 0.1240234375, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": 0.005, |
|
"reward": 0.8095807060599327, |
|
"reward_std": 0.6449108496308327, |
|
"rewards/cosine_scaled_reward": -0.053542979061603546, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 1139.9166870117188, |
|
"epoch": 0.5108571428571429, |
|
"grad_norm": 1.82309091091156, |
|
"kl": 0.305145263671875, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.0122, |
|
"reward": 0.9205853343009949, |
|
"reward_std": 0.7828942686319351, |
|
"rewards/cosine_scaled_reward": -0.02929066913202405, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 447 |
|
}, |
|
{ |
|
"completion_length": 1278.1250305175781, |
|
"epoch": 0.512, |
|
"grad_norm": 2.259813070297241, |
|
"kl": 0.251708984375, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": 0.0101, |
|
"reward": 0.8140432685613632, |
|
"reward_std": 0.658391922712326, |
|
"rewards/cosine_scaled_reward": -0.05131170805543661, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 1173.3125305175781, |
|
"epoch": 0.5131428571428571, |
|
"grad_norm": 1.361348032951355, |
|
"kl": 0.3782958984375, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.0151, |
|
"reward": 0.9565356224775314, |
|
"reward_std": 0.7290000915527344, |
|
"rewards/cosine_scaled_reward": 0.019934438169002533, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 449 |
|
}, |
|
{ |
|
"completion_length": 1371.1667175292969, |
|
"epoch": 0.5142857142857142, |
|
"grad_norm": 3.834918737411499, |
|
"kl": 0.513427734375, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0205, |
|
"reward": 0.7457224242389202, |
|
"reward_std": 0.5297245979309082, |
|
"rewards/cosine_scaled_reward": -0.04380548745393753, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 1255.6250305175781, |
|
"epoch": 0.5154285714285715, |
|
"grad_norm": 2.335192918777466, |
|
"kl": 0.6416015625, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.0257, |
|
"reward": 1.0948645919561386, |
|
"reward_std": 0.7944803088903427, |
|
"rewards/cosine_scaled_reward": 0.10993227222934365, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 451 |
|
}, |
|
{ |
|
"completion_length": 1190.6666870117188, |
|
"epoch": 0.5165714285714286, |
|
"grad_norm": 1.7608739137649536, |
|
"kl": 0.500244140625, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.02, |
|
"reward": 0.5270581915974617, |
|
"reward_std": 0.6462119966745377, |
|
"rewards/cosine_scaled_reward": -0.1843875776976347, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 452 |
|
}, |
|
{ |
|
"completion_length": 1610.3125610351562, |
|
"epoch": 0.5177142857142857, |
|
"grad_norm": 2.1905384063720703, |
|
"kl": 0.96484375, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.0386, |
|
"reward": 0.5264739021658897, |
|
"reward_std": 0.707670621573925, |
|
"rewards/cosine_scaled_reward": -0.1325964080169797, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 453 |
|
}, |
|
{ |
|
"completion_length": 1178.0000305175781, |
|
"epoch": 0.5188571428571429, |
|
"grad_norm": 1.7162837982177734, |
|
"kl": 0.55322265625, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0221, |
|
"reward": 0.7971140295267105, |
|
"reward_std": 0.5913056135177612, |
|
"rewards/cosine_scaled_reward": -0.059776326175779104, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 454 |
|
}, |
|
{ |
|
"completion_length": 1480.6250305175781, |
|
"epoch": 0.52, |
|
"grad_norm": 4.159846305847168, |
|
"kl": 1.05126953125, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0421, |
|
"reward": 0.6366595476865768, |
|
"reward_std": 0.44621995836496353, |
|
"rewards/cosine_scaled_reward": -0.11917022056877613, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 1499.1458740234375, |
|
"epoch": 0.5211428571428571, |
|
"grad_norm": 3.277935743331909, |
|
"kl": 0.708251953125, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.0284, |
|
"reward": 0.7240877486765385, |
|
"reward_std": 0.5471076965332031, |
|
"rewards/cosine_scaled_reward": -0.05462279508356005, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 1379.7292175292969, |
|
"epoch": 0.5222857142857142, |
|
"grad_norm": 2.5969083309173584, |
|
"kl": 0.6796875, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": 0.0272, |
|
"reward": 0.869850842282176, |
|
"reward_std": 0.700515478849411, |
|
"rewards/cosine_scaled_reward": 0.02867540717124939, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 457 |
|
}, |
|
{ |
|
"completion_length": 1159.9791870117188, |
|
"epoch": 0.5234285714285715, |
|
"grad_norm": 2.355196952819824, |
|
"kl": 0.31744384765625, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.0127, |
|
"reward": 0.923922210931778, |
|
"reward_std": 0.5514720380306244, |
|
"rewards/cosine_scaled_reward": -0.017205584794282913, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 458 |
|
}, |
|
{ |
|
"completion_length": 1273.5833892822266, |
|
"epoch": 0.5245714285714286, |
|
"grad_norm": 4.148768901824951, |
|
"kl": 0.59765625, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.0239, |
|
"reward": 0.574098750948906, |
|
"reward_std": 0.7631915658712387, |
|
"rewards/cosine_scaled_reward": -0.16086730360984802, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 459 |
|
}, |
|
{ |
|
"completion_length": 1453.9792175292969, |
|
"epoch": 0.5257142857142857, |
|
"grad_norm": 3.0608623027801514, |
|
"kl": 0.8349609375, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0335, |
|
"reward": 1.0540032014250755, |
|
"reward_std": 0.7098206132650375, |
|
"rewards/cosine_scaled_reward": 0.07908494677394629, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 1280.1667175292969, |
|
"epoch": 0.5268571428571428, |
|
"grad_norm": 1.6031488180160522, |
|
"kl": 0.51483154296875, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.0206, |
|
"reward": 1.0345441699028015, |
|
"reward_std": 0.464998334646225, |
|
"rewards/cosine_scaled_reward": 0.05893874540925026, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 461 |
|
}, |
|
{ |
|
"completion_length": 1348.2083740234375, |
|
"epoch": 0.528, |
|
"grad_norm": 2.087766647338867, |
|
"kl": 0.288330078125, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.0115, |
|
"reward": 0.5173667185008526, |
|
"reward_std": 0.5679958164691925, |
|
"rewards/cosine_scaled_reward": -0.18923332425765693, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 1184.8333892822266, |
|
"epoch": 0.5291428571428571, |
|
"grad_norm": 2.3618013858795166, |
|
"kl": 0.67828369140625, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": 0.027, |
|
"reward": 0.8587629348039627, |
|
"reward_std": 0.5902151763439178, |
|
"rewards/cosine_scaled_reward": 0.002298124134540558, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 463 |
|
}, |
|
{ |
|
"completion_length": 1381.3750610351562, |
|
"epoch": 0.5302857142857142, |
|
"grad_norm": 2.5598762035369873, |
|
"kl": 0.4290771484375, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.0172, |
|
"reward": 0.8569861799478531, |
|
"reward_std": 0.5644854605197906, |
|
"rewards/cosine_scaled_reward": -0.04025692865252495, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 464 |
|
}, |
|
{ |
|
"completion_length": 1754.0000610351562, |
|
"epoch": 0.5314285714285715, |
|
"grad_norm": 3.649667739868164, |
|
"kl": 1.2236328125, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.049, |
|
"reward": 0.6691107526421547, |
|
"reward_std": 0.7533622533082962, |
|
"rewards/cosine_scaled_reward": -0.050861308351159096, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 1626.979248046875, |
|
"epoch": 0.5325714285714286, |
|
"grad_norm": 4.014570713043213, |
|
"kl": 1.1494140625, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.046, |
|
"reward": 0.39536508079618216, |
|
"reward_std": 0.5428843200206757, |
|
"rewards/cosine_scaled_reward": -0.18773413076996803, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 466 |
|
}, |
|
{ |
|
"completion_length": 1410.1875305175781, |
|
"epoch": 0.5337142857142857, |
|
"grad_norm": 6.032597064971924, |
|
"kl": 0.89404296875, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.0358, |
|
"reward": 0.7157600894570351, |
|
"reward_std": 0.6754554212093353, |
|
"rewards/cosine_scaled_reward": -0.07961997389793396, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 467 |
|
}, |
|
{ |
|
"completion_length": 1505.0208740234375, |
|
"epoch": 0.5348571428571428, |
|
"grad_norm": 3.0937063694000244, |
|
"kl": 1.083251953125, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": 0.0434, |
|
"reward": 1.0479508265852928, |
|
"reward_std": 0.7750177532434464, |
|
"rewards/cosine_scaled_reward": 0.08647541608661413, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 1116.0208435058594, |
|
"epoch": 0.536, |
|
"grad_norm": 2.7102174758911133, |
|
"kl": 0.68310546875, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.0273, |
|
"reward": 0.8038362823426723, |
|
"reward_std": 0.7213334441184998, |
|
"rewards/cosine_scaled_reward": -0.04599856585264206, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 469 |
|
}, |
|
{ |
|
"completion_length": 1321.1458740234375, |
|
"epoch": 0.5371428571428571, |
|
"grad_norm": 6.9517316818237305, |
|
"kl": 1.054931640625, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0422, |
|
"reward": 0.6858988218009472, |
|
"reward_std": 0.6414925083518028, |
|
"rewards/cosine_scaled_reward": -0.0424672719091177, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 1012.8125457763672, |
|
"epoch": 0.5382857142857143, |
|
"grad_norm": 15.012682914733887, |
|
"kl": 2.00054931640625, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": 0.0798, |
|
"reward": 0.7205886021256447, |
|
"reward_std": 0.697634182870388, |
|
"rewards/cosine_scaled_reward": -0.1084557194262743, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 471 |
|
}, |
|
{ |
|
"completion_length": 1118.0000305175781, |
|
"epoch": 0.5394285714285715, |
|
"grad_norm": 1.921370506286621, |
|
"kl": 0.70703125, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": 0.0282, |
|
"reward": 0.8347459137439728, |
|
"reward_std": 0.7502800822257996, |
|
"rewards/cosine_scaled_reward": -0.03054371359758079, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 472 |
|
}, |
|
{ |
|
"completion_length": 1343.3333740234375, |
|
"epoch": 0.5405714285714286, |
|
"grad_norm": 1.8898627758026123, |
|
"kl": 0.716796875, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.0287, |
|
"reward": 0.3787691295146942, |
|
"reward_std": 0.4741926044225693, |
|
"rewards/cosine_scaled_reward": -0.2585321292281151, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 473 |
|
}, |
|
{ |
|
"completion_length": 1074.5417022705078, |
|
"epoch": 0.5417142857142857, |
|
"grad_norm": 1.3952125310897827, |
|
"kl": 0.275390625, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.011, |
|
"reward": 1.0772456228733063, |
|
"reward_std": 0.8115750551223755, |
|
"rewards/cosine_scaled_reward": 0.09070614166557789, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 474 |
|
}, |
|
{ |
|
"completion_length": 1536.7708740234375, |
|
"epoch": 0.5428571428571428, |
|
"grad_norm": 1.9422410726547241, |
|
"kl": 0.89697265625, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.036, |
|
"reward": 0.7740568369626999, |
|
"reward_std": 0.7255712598562241, |
|
"rewards/cosine_scaled_reward": -0.06088825827464461, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 1238.6250305175781, |
|
"epoch": 0.544, |
|
"grad_norm": 1.791479468345642, |
|
"kl": 0.49365234375, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.0198, |
|
"reward": 1.1282098963856697, |
|
"reward_std": 0.7448728978633881, |
|
"rewards/cosine_scaled_reward": 0.1266049058176577, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 476 |
|
}, |
|
{ |
|
"completion_length": 926.4167175292969, |
|
"epoch": 0.5451428571428572, |
|
"grad_norm": 1.0713694095611572, |
|
"kl": 0.20794677734375, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0083, |
|
"reward": 1.242761254310608, |
|
"reward_std": 0.48385217040777206, |
|
"rewards/cosine_scaled_reward": 0.13179726898670197, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 477 |
|
}, |
|
{ |
|
"completion_length": 1389.8750457763672, |
|
"epoch": 0.5462857142857143, |
|
"grad_norm": 1.6024447679519653, |
|
"kl": 0.315399169921875, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.0126, |
|
"reward": 0.5958605632185936, |
|
"reward_std": 0.5410900861024857, |
|
"rewards/cosine_scaled_reward": -0.17081973887979984, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 478 |
|
}, |
|
{ |
|
"completion_length": 1060.0000305175781, |
|
"epoch": 0.5474285714285714, |
|
"grad_norm": 1.9674124717712402, |
|
"kl": 0.351318359375, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.0141, |
|
"reward": 0.9066205322742462, |
|
"reward_std": 0.8546933829784393, |
|
"rewards/cosine_scaled_reward": -0.036273106932640076, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 479 |
|
}, |
|
{ |
|
"completion_length": 1312.5208740234375, |
|
"epoch": 0.5485714285714286, |
|
"grad_norm": 1.8547768592834473, |
|
"kl": 0.3245697021484375, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.013, |
|
"reward": 1.0583224594593048, |
|
"reward_std": 0.5566529557108879, |
|
"rewards/cosine_scaled_reward": 0.04999455437064171, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 1382.7500610351562, |
|
"epoch": 0.5497142857142857, |
|
"grad_norm": 1.5025635957717896, |
|
"kl": 0.718505859375, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": 0.0287, |
|
"reward": 0.6311604380607605, |
|
"reward_std": 0.6715650781989098, |
|
"rewards/cosine_scaled_reward": -0.1323364470154047, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 481 |
|
}, |
|
{ |
|
"completion_length": 1061.8750305175781, |
|
"epoch": 0.5508571428571428, |
|
"grad_norm": 2.283987522125244, |
|
"kl": 0.368408203125, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.0148, |
|
"reward": 0.9338921532034874, |
|
"reward_std": 0.663320891559124, |
|
"rewards/cosine_scaled_reward": -0.022637249901890755, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 482 |
|
}, |
|
{ |
|
"completion_length": 1664.0000305175781, |
|
"epoch": 0.552, |
|
"grad_norm": 3.247492551803589, |
|
"kl": 1.43994140625, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.0576, |
|
"reward": 0.4382214695215225, |
|
"reward_std": 0.6629593223333359, |
|
"rewards/cosine_scaled_reward": -0.1454725954681635, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 483 |
|
}, |
|
{ |
|
"completion_length": 1166.9375457763672, |
|
"epoch": 0.5531428571428572, |
|
"grad_norm": 1.635184407234192, |
|
"kl": 0.59814453125, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.0239, |
|
"reward": 1.209791585803032, |
|
"reward_std": 0.48193909227848053, |
|
"rewards/cosine_scaled_reward": 0.12572911009192467, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 484 |
|
}, |
|
{ |
|
"completion_length": 1562.9167022705078, |
|
"epoch": 0.5542857142857143, |
|
"grad_norm": 1.5177675485610962, |
|
"kl": 1.32794189453125, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.0532, |
|
"reward": 0.6973680332303047, |
|
"reward_std": 0.6229738295078278, |
|
"rewards/cosine_scaled_reward": -0.07839931827038527, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 1258.0416870117188, |
|
"epoch": 0.5554285714285714, |
|
"grad_norm": 1.6634312868118286, |
|
"kl": 0.598846435546875, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.0239, |
|
"reward": 0.9936239048838615, |
|
"reward_std": 0.7174008414149284, |
|
"rewards/cosine_scaled_reward": 0.038478586822748184, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 486 |
|
}, |
|
{ |
|
"completion_length": 1263.3125610351562, |
|
"epoch": 0.5565714285714286, |
|
"grad_norm": 3.9874322414398193, |
|
"kl": 0.56591796875, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.0226, |
|
"reward": 0.9569487422704697, |
|
"reward_std": 0.776068776845932, |
|
"rewards/cosine_scaled_reward": 0.009724359028041363, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 487 |
|
}, |
|
{ |
|
"completion_length": 1440.8333435058594, |
|
"epoch": 0.5577142857142857, |
|
"grad_norm": 3.2814691066741943, |
|
"kl": 0.67236328125, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.0269, |
|
"reward": 0.4271909072995186, |
|
"reward_std": 0.5705131366848946, |
|
"rewards/cosine_scaled_reward": -0.2447379156947136, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 488 |
|
}, |
|
{ |
|
"completion_length": 983.0625457763672, |
|
"epoch": 0.5588571428571428, |
|
"grad_norm": 1.347419261932373, |
|
"kl": 0.39642333984375, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.0158, |
|
"reward": 1.1842274367809296, |
|
"reward_std": 0.5909858122467995, |
|
"rewards/cosine_scaled_reward": 0.11294705420732498, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 489 |
|
}, |
|
{ |
|
"completion_length": 1493.3958740234375, |
|
"epoch": 0.56, |
|
"grad_norm": 1.7785853147506714, |
|
"kl": 0.92236328125, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.0368, |
|
"reward": 0.5224965363740921, |
|
"reward_std": 0.5668257884681225, |
|
"rewards/cosine_scaled_reward": -0.1450017336755991, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 1626.2916870117188, |
|
"epoch": 0.5611428571428572, |
|
"grad_norm": 1.94631028175354, |
|
"kl": 1.02734375, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.0411, |
|
"reward": 0.6662824004888535, |
|
"reward_std": 0.6974209845066071, |
|
"rewards/cosine_scaled_reward": -0.07310881093144417, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 491 |
|
}, |
|
{ |
|
"completion_length": 1049.5833740234375, |
|
"epoch": 0.5622857142857143, |
|
"grad_norm": 2.7264091968536377, |
|
"kl": 0.43701171875, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.0175, |
|
"reward": 0.8967996649444103, |
|
"reward_std": 0.40792329236865044, |
|
"rewards/cosine_scaled_reward": -0.02035021036863327, |
|
"rewards/format_reward": 0.9375, |
|
"step": 492 |
|
}, |
|
{ |
|
"completion_length": 1251.0000610351562, |
|
"epoch": 0.5634285714285714, |
|
"grad_norm": 1.9115831851959229, |
|
"kl": 0.535888671875, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.0215, |
|
"reward": 0.7575453743338585, |
|
"reward_std": 0.6094752550125122, |
|
"rewards/cosine_scaled_reward": -0.11081065610051155, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 493 |
|
}, |
|
{ |
|
"completion_length": 1035.9375305175781, |
|
"epoch": 0.5645714285714286, |
|
"grad_norm": 3.7157654762268066, |
|
"kl": 0.62744140625, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": 0.0251, |
|
"reward": 0.49045146629214287, |
|
"reward_std": 0.4288835674524307, |
|
"rewards/cosine_scaled_reward": -0.21310760331107304, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 494 |
|
}, |
|
{ |
|
"completion_length": 1336.2291870117188, |
|
"epoch": 0.5657142857142857, |
|
"grad_norm": 1.2452759742736816, |
|
"kl": 0.7421875, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0296, |
|
"reward": 1.002578854560852, |
|
"reward_std": 0.5317458659410477, |
|
"rewards/cosine_scaled_reward": 0.05337274447083473, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 1363.2500305175781, |
|
"epoch": 0.5668571428571428, |
|
"grad_norm": 1.7971941232681274, |
|
"kl": 0.7275390625, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.0291, |
|
"reward": 1.0207700282335281, |
|
"reward_std": 0.8502290099859238, |
|
"rewards/cosine_scaled_reward": 0.09371834748890251, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 496 |
|
}, |
|
{ |
|
"completion_length": 1141.7500305175781, |
|
"epoch": 0.568, |
|
"grad_norm": 2.350130796432495, |
|
"kl": 0.56689453125, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0227, |
|
"reward": 1.0808594226837158, |
|
"reward_std": 0.7858606725931168, |
|
"rewards/cosine_scaled_reward": 0.14459637086838484, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 497 |
|
}, |
|
{ |
|
"completion_length": 1354.0625610351562, |
|
"epoch": 0.5691428571428572, |
|
"grad_norm": 2.9282283782958984, |
|
"kl": 0.8212890625, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.0329, |
|
"reward": 0.7300854474306107, |
|
"reward_std": 0.6824060678482056, |
|
"rewards/cosine_scaled_reward": -0.08287395909428596, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 498 |
|
}, |
|
{ |
|
"completion_length": 1752.0625610351562, |
|
"epoch": 0.5702857142857143, |
|
"grad_norm": 1.9427164793014526, |
|
"kl": 0.99267578125, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": 0.0397, |
|
"reward": 0.6041746586561203, |
|
"reward_std": 0.5943188220262527, |
|
"rewards/cosine_scaled_reward": -0.12499601114541292, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 499 |
|
}, |
|
{ |
|
"completion_length": 1389.7292175292969, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 3.8614306449890137, |
|
"kl": 0.90966796875, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0364, |
|
"reward": 0.8081704080104828, |
|
"reward_std": 0.7910114228725433, |
|
"rewards/cosine_scaled_reward": -0.022998109459877014, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.007592763372914685, |
|
"train_runtime": 38120.128, |
|
"train_samples_per_second": 0.63, |
|
"train_steps_per_second": 0.013 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|