|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.856898029134533, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2770.8472290039062, |
|
"epoch": 0.001713796058269066, |
|
"grad_norm": 0.15192405879497528, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.014, |
|
"reward": -0.06689765583723783, |
|
"reward_std": 0.505804143846035, |
|
"rewards/cosine_scaled_reward": -0.03344883490353823, |
|
"rewards/format_reward": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2785.013916015625, |
|
"epoch": 0.003427592116538132, |
|
"grad_norm": 0.1657538264989853, |
|
"kl": 0.0, |
|
"learning_rate": 2e-08, |
|
"loss": -0.0211, |
|
"reward": -0.4646243788301945, |
|
"reward_std": 0.39301297068595886, |
|
"rewards/cosine_scaled_reward": -0.23231217823922634, |
|
"rewards/format_reward": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2713.027801513672, |
|
"epoch": 0.005141388174807198, |
|
"grad_norm": 0.1747598648071289, |
|
"kl": 3.5196542739868164e-05, |
|
"learning_rate": 4e-08, |
|
"loss": -0.0275, |
|
"reward": -0.23865782655775547, |
|
"reward_std": 0.4481763616204262, |
|
"rewards/cosine_scaled_reward": -0.11932891746982932, |
|
"rewards/format_reward": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2938.5277709960938, |
|
"epoch": 0.006855184233076264, |
|
"grad_norm": 0.16107600927352905, |
|
"kl": 3.7282705307006836e-05, |
|
"learning_rate": 6e-08, |
|
"loss": -0.0289, |
|
"reward": 0.06913903169333935, |
|
"reward_std": 0.6892540901899338, |
|
"rewards/cosine_scaled_reward": 0.03456950932741165, |
|
"rewards/format_reward": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2532.7222290039062, |
|
"epoch": 0.00856898029134533, |
|
"grad_norm": 0.15964782238006592, |
|
"kl": 2.065300941467285e-05, |
|
"learning_rate": 8e-08, |
|
"loss": -0.0052, |
|
"reward": -0.15601756004616618, |
|
"reward_std": 0.5161308571696281, |
|
"rewards/cosine_scaled_reward": -0.07800877187401056, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3131.25, |
|
"epoch": 0.010282776349614395, |
|
"grad_norm": 0.13910692930221558, |
|
"kl": 4.1961669921875e-05, |
|
"learning_rate": 1e-07, |
|
"loss": 0.029, |
|
"reward": -0.13883829297265038, |
|
"reward_std": 0.5291023775935173, |
|
"rewards/cosine_scaled_reward": -0.06941914733033627, |
|
"rewards/format_reward": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2258.6944885253906, |
|
"epoch": 0.011996572407883462, |
|
"grad_norm": 0.21499329805374146, |
|
"kl": 3.059208393096924e-05, |
|
"learning_rate": 1.2e-07, |
|
"loss": -0.0297, |
|
"reward": -0.22816578298807144, |
|
"reward_std": 0.5721099078655243, |
|
"rewards/cosine_scaled_reward": -0.11408288218080997, |
|
"rewards/format_reward": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3106.65283203125, |
|
"epoch": 0.013710368466152529, |
|
"grad_norm": 0.15807782113552094, |
|
"kl": 3.281235694885254e-05, |
|
"learning_rate": 1.4e-07, |
|
"loss": 0.0518, |
|
"reward": -0.1028524599969387, |
|
"reward_std": 0.7277905195951462, |
|
"rewards/cosine_scaled_reward": -0.051426228135824203, |
|
"rewards/format_reward": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2652.2777709960938, |
|
"epoch": 0.015424164524421594, |
|
"grad_norm": 0.14988838136196136, |
|
"kl": 3.746151924133301e-05, |
|
"learning_rate": 1.6e-07, |
|
"loss": -0.0052, |
|
"reward": -0.04764566984522389, |
|
"reward_std": 0.6422684416174889, |
|
"rewards/cosine_scaled_reward": -0.023822834249585867, |
|
"rewards/format_reward": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2956.250030517578, |
|
"epoch": 0.01713796058269066, |
|
"grad_norm": 0.15577340126037598, |
|
"kl": 3.62396240234375e-05, |
|
"learning_rate": 1.8e-07, |
|
"loss": 0.0369, |
|
"reward": -0.09274669736623764, |
|
"reward_std": 0.6059432476758957, |
|
"rewards/cosine_scaled_reward": -0.046373344492167234, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2610.430633544922, |
|
"epoch": 0.018851756640959727, |
|
"grad_norm": 0.18031956255435944, |
|
"kl": 2.753734588623047e-05, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0126, |
|
"reward": 0.17614622993642115, |
|
"reward_std": 0.7455325201153755, |
|
"rewards/cosine_scaled_reward": 0.08807311341661261, |
|
"rewards/format_reward": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2977.2638549804688, |
|
"epoch": 0.02056555269922879, |
|
"grad_norm": 0.15254004299640656, |
|
"kl": 3.084540367126465e-05, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": -0.0238, |
|
"reward": -0.2835669822525233, |
|
"reward_std": 0.6270563155412674, |
|
"rewards/cosine_scaled_reward": -0.14178348786663264, |
|
"rewards/format_reward": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2601.7916870117188, |
|
"epoch": 0.022279348757497857, |
|
"grad_norm": 0.1897689402103424, |
|
"kl": 4.309415817260742e-05, |
|
"learning_rate": 2.4e-07, |
|
"loss": -0.008, |
|
"reward": -0.08701697085052729, |
|
"reward_std": 0.6209904551506042, |
|
"rewards/cosine_scaled_reward": -0.04350848635658622, |
|
"rewards/format_reward": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2891.3472290039062, |
|
"epoch": 0.023993144815766924, |
|
"grad_norm": 0.17451944947242737, |
|
"kl": 3.2007694244384766e-05, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.0134, |
|
"reward": -0.11856314726173878, |
|
"reward_std": 0.5714613646268845, |
|
"rewards/cosine_scaled_reward": -0.059281568974256516, |
|
"rewards/format_reward": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3376.9444580078125, |
|
"epoch": 0.02570694087403599, |
|
"grad_norm": 0.19522128999233246, |
|
"kl": 4.00543212890625e-05, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.0625, |
|
"reward": -0.3375568427145481, |
|
"reward_std": 0.5690607726573944, |
|
"rewards/cosine_scaled_reward": -0.16877843253314495, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2385.9861450195312, |
|
"epoch": 0.027420736932305057, |
|
"grad_norm": 0.17156164348125458, |
|
"kl": 3.203749656677246e-05, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0479, |
|
"reward": 0.31096187606453896, |
|
"reward_std": 0.719051368534565, |
|
"rewards/cosine_scaled_reward": 0.15548093989491463, |
|
"rewards/format_reward": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2834.4166870117188, |
|
"epoch": 0.02913453299057412, |
|
"grad_norm": 0.1916762739419937, |
|
"kl": 3.910064697265625e-05, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.0288, |
|
"reward": -0.1371638989658095, |
|
"reward_std": 0.43335365504026413, |
|
"rewards/cosine_scaled_reward": -0.06858194415690377, |
|
"rewards/format_reward": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3107.9166870117188, |
|
"epoch": 0.030848329048843187, |
|
"grad_norm": 0.20290644466876984, |
|
"kl": 3.56137752532959e-05, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 0.0182, |
|
"reward": -0.2907893192023039, |
|
"reward_std": 0.43716832995414734, |
|
"rewards/cosine_scaled_reward": -0.14539465866982937, |
|
"rewards/format_reward": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3065.611083984375, |
|
"epoch": 0.032562125107112254, |
|
"grad_norm": 0.1492234468460083, |
|
"kl": 4.0084123611450195e-05, |
|
"learning_rate": 3.6e-07, |
|
"loss": 0.0216, |
|
"reward": -0.19093798706308007, |
|
"reward_std": 0.7698801159858704, |
|
"rewards/cosine_scaled_reward": -0.09546899236738682, |
|
"rewards/format_reward": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3355.7222900390625, |
|
"epoch": 0.03427592116538132, |
|
"grad_norm": 0.14321106672286987, |
|
"kl": 3.36766242980957e-05, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": -0.0048, |
|
"reward": -0.2757381685078144, |
|
"reward_std": 0.5536239072680473, |
|
"rewards/cosine_scaled_reward": -0.1378690842539072, |
|
"rewards/format_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2938.125, |
|
"epoch": 0.03598971722365039, |
|
"grad_norm": 0.20512644946575165, |
|
"kl": 4.1961669921875e-05, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0577, |
|
"reward": -0.1858626427128911, |
|
"reward_std": 0.6686508804559708, |
|
"rewards/cosine_scaled_reward": -0.09293132461607456, |
|
"rewards/format_reward": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3192.2361450195312, |
|
"epoch": 0.037703513281919454, |
|
"grad_norm": 0.13245940208435059, |
|
"kl": 3.49879264831543e-05, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0372, |
|
"reward": -0.186855623498559, |
|
"reward_std": 0.5942067578434944, |
|
"rewards/cosine_scaled_reward": -0.09342780988663435, |
|
"rewards/format_reward": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.02783203125, |
|
"epoch": 0.03941730934018852, |
|
"grad_norm": 0.14223958551883698, |
|
"kl": 2.8640031814575195e-05, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": -0.0208, |
|
"reward": -0.4465179964900017, |
|
"reward_std": 0.36973506212234497, |
|
"rewards/cosine_scaled_reward": -0.223259000107646, |
|
"rewards/format_reward": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2707.6250610351562, |
|
"epoch": 0.04113110539845758, |
|
"grad_norm": 0.20090773701667786, |
|
"kl": 2.9414892196655273e-05, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0292, |
|
"reward": 0.08563654706813395, |
|
"reward_std": 0.4666801244020462, |
|
"rewards/cosine_scaled_reward": 0.04281827830709517, |
|
"rewards/format_reward": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2578.9443969726562, |
|
"epoch": 0.04284490145672665, |
|
"grad_norm": 0.19762183725833893, |
|
"kl": 2.6911497116088867e-05, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0547, |
|
"reward": -0.15825002267956734, |
|
"reward_std": 0.6721501722931862, |
|
"rewards/cosine_scaled_reward": -0.07912501133978367, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3199.1111450195312, |
|
"epoch": 0.044558697514995714, |
|
"grad_norm": 0.14947673678398132, |
|
"kl": 3.1381845474243164e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0771, |
|
"reward": -0.3339938232675195, |
|
"reward_std": 0.5660227835178375, |
|
"rewards/cosine_scaled_reward": -0.16699691163375974, |
|
"rewards/format_reward": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3103.3193969726562, |
|
"epoch": 0.04627249357326478, |
|
"grad_norm": 0.12868770956993103, |
|
"kl": 2.6017427444458008e-05, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0118, |
|
"reward": -0.2791058011353016, |
|
"reward_std": 0.49328897148370743, |
|
"rewards/cosine_scaled_reward": -0.13955289125442505, |
|
"rewards/format_reward": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2378.2222595214844, |
|
"epoch": 0.04798628963153385, |
|
"grad_norm": 0.2462579607963562, |
|
"kl": 2.7805566787719727e-05, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0596, |
|
"reward": 0.03218653332442045, |
|
"reward_std": 0.6807225868105888, |
|
"rewards/cosine_scaled_reward": 0.016093265498057008, |
|
"rewards/format_reward": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2971.291748046875, |
|
"epoch": 0.049700085689802914, |
|
"grad_norm": 0.16591639816761017, |
|
"kl": 3.515183925628662e-05, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0141, |
|
"reward": 0.011478596366941929, |
|
"reward_std": 0.7397755682468414, |
|
"rewards/cosine_scaled_reward": 0.005739298183470964, |
|
"rewards/format_reward": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2913.3611450195312, |
|
"epoch": 0.05141388174807198, |
|
"grad_norm": 0.13886681199073792, |
|
"kl": 3.2573938369750977e-05, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0258, |
|
"reward": 0.05036446265876293, |
|
"reward_std": 0.6957473307847977, |
|
"rewards/cosine_scaled_reward": 0.025182233192026615, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2665.041748046875, |
|
"epoch": 0.05312767780634105, |
|
"grad_norm": 0.16625739634037018, |
|
"kl": 2.000480890274048e-05, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0045, |
|
"reward": -0.044122666819021106, |
|
"reward_std": 0.4255269840359688, |
|
"rewards/cosine_scaled_reward": -0.022061329917050898, |
|
"rewards/format_reward": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2951.8611450195312, |
|
"epoch": 0.054841473864610114, |
|
"grad_norm": 0.15594074130058289, |
|
"kl": 1.9147992134094238e-05, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.0942, |
|
"reward": -0.3072533793747425, |
|
"reward_std": 0.4980456754565239, |
|
"rewards/cosine_scaled_reward": -0.15362668968737125, |
|
"rewards/format_reward": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2260.0833435058594, |
|
"epoch": 0.056555269922879174, |
|
"grad_norm": 0.21370142698287964, |
|
"kl": 3.5665929317474365e-05, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0063, |
|
"reward": 0.06617816537618637, |
|
"reward_std": 0.5614925771951675, |
|
"rewards/cosine_scaled_reward": 0.033089087810367346, |
|
"rewards/format_reward": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2807.013916015625, |
|
"epoch": 0.05826906598114824, |
|
"grad_norm": 0.20051412284374237, |
|
"kl": 1.2192875146865845e-05, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0328, |
|
"reward": -0.17473484575748444, |
|
"reward_std": 0.6600858569145203, |
|
"rewards/cosine_scaled_reward": -0.08736742846667767, |
|
"rewards/format_reward": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3120.7500610351562, |
|
"epoch": 0.05998286203941731, |
|
"grad_norm": 0.13361996412277222, |
|
"kl": 3.407895565032959e-05, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0472, |
|
"reward": -0.4979929216206074, |
|
"reward_std": 0.39260104298591614, |
|
"rewards/cosine_scaled_reward": -0.2489964533597231, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2625.0694885253906, |
|
"epoch": 0.061696658097686374, |
|
"grad_norm": 0.16467803716659546, |
|
"kl": 2.7239322662353516e-05, |
|
"learning_rate": 7e-07, |
|
"loss": -0.0168, |
|
"reward": -0.35937849269248545, |
|
"reward_std": 0.45373768359422684, |
|
"rewards/cosine_scaled_reward": -0.1796892363927327, |
|
"rewards/format_reward": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3042.1806030273438, |
|
"epoch": 0.06341045415595545, |
|
"grad_norm": 0.15104345977306366, |
|
"kl": 2.9146671295166016e-05, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0068, |
|
"reward": -0.37954360246658325, |
|
"reward_std": 0.5432159453630447, |
|
"rewards/cosine_scaled_reward": -0.18977180123329163, |
|
"rewards/format_reward": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3193.2083740234375, |
|
"epoch": 0.06512425021422451, |
|
"grad_norm": 0.20619741082191467, |
|
"kl": 1.7097219824790955e-05, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0389, |
|
"reward": -0.29821273358538747, |
|
"reward_std": 0.5581861883401871, |
|
"rewards/cosine_scaled_reward": -0.1491063602734357, |
|
"rewards/format_reward": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3018.5834045410156, |
|
"epoch": 0.06683804627249357, |
|
"grad_norm": 0.12940338253974915, |
|
"kl": 3.90857458114624e-05, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0162, |
|
"reward": -0.25728118792176247, |
|
"reward_std": 0.34478260576725006, |
|
"rewards/cosine_scaled_reward": -0.12864059768617153, |
|
"rewards/format_reward": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2860.7500610351562, |
|
"epoch": 0.06855184233076264, |
|
"grad_norm": 0.25654301047325134, |
|
"kl": 0.0001112818717956543, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0545, |
|
"reward": 0.13069207593798637, |
|
"reward_std": 0.5447051227092743, |
|
"rewards/cosine_scaled_reward": 0.06534605007618666, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2696.3611450195312, |
|
"epoch": 0.0702656383890317, |
|
"grad_norm": 0.19896458089351654, |
|
"kl": 4.2378902435302734e-05, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0826, |
|
"reward": 0.2564197585452348, |
|
"reward_std": 0.6877201497554779, |
|
"rewards/cosine_scaled_reward": 0.12820987740997225, |
|
"rewards/format_reward": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2642.3333740234375, |
|
"epoch": 0.07197943444730077, |
|
"grad_norm": 0.1658892035484314, |
|
"kl": 0.00020813941955566406, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0149, |
|
"reward": -0.03526473790407181, |
|
"reward_std": 0.6603178381919861, |
|
"rewards/cosine_scaled_reward": -0.017632372677326202, |
|
"rewards/format_reward": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2897.388916015625, |
|
"epoch": 0.07369323050556983, |
|
"grad_norm": 0.2002326250076294, |
|
"kl": 6.079673767089844e-05, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.055, |
|
"reward": 0.08917492628097534, |
|
"reward_std": 0.4714968279004097, |
|
"rewards/cosine_scaled_reward": 0.04458745941519737, |
|
"rewards/format_reward": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2802.9583740234375, |
|
"epoch": 0.07540702656383891, |
|
"grad_norm": 0.14357756078243256, |
|
"kl": 0.00016063451766967773, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0109, |
|
"reward": -0.2601087912917137, |
|
"reward_std": 0.5872670859098434, |
|
"rewards/cosine_scaled_reward": -0.13005439937114716, |
|
"rewards/format_reward": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3034.8194580078125, |
|
"epoch": 0.07712082262210797, |
|
"grad_norm": 0.23196536302566528, |
|
"kl": 0.00012412667274475098, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0428, |
|
"reward": -0.2070726901292801, |
|
"reward_std": 0.5877418145537376, |
|
"rewards/cosine_scaled_reward": -0.10353635251522064, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2306.2083435058594, |
|
"epoch": 0.07883461868037704, |
|
"grad_norm": 0.2409650981426239, |
|
"kl": 0.0003217458724975586, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0337, |
|
"reward": -0.01094321720302105, |
|
"reward_std": 0.6599317938089371, |
|
"rewards/cosine_scaled_reward": -0.00547160767018795, |
|
"rewards/format_reward": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2936.1388549804688, |
|
"epoch": 0.0805484147386461, |
|
"grad_norm": 0.1777871698141098, |
|
"kl": 0.0003833882510662079, |
|
"learning_rate": 9.2e-07, |
|
"loss": -0.0387, |
|
"reward": -0.12989605404436588, |
|
"reward_std": 0.6336122080683708, |
|
"rewards/cosine_scaled_reward": -0.0649480305146426, |
|
"rewards/format_reward": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2661.6806030273438, |
|
"epoch": 0.08226221079691516, |
|
"grad_norm": 0.3158990442752838, |
|
"kl": 0.00035144388675689697, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.1047, |
|
"reward": 0.12476684269495308, |
|
"reward_std": 0.5184459760785103, |
|
"rewards/cosine_scaled_reward": 0.06238342053256929, |
|
"rewards/format_reward": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2053.8611450195312, |
|
"epoch": 0.08397600685518423, |
|
"grad_norm": 0.19082558155059814, |
|
"kl": 0.0006046295166015625, |
|
"learning_rate": 9.6e-07, |
|
"loss": -0.0144, |
|
"reward": 0.012501850724220276, |
|
"reward_std": 0.603157639503479, |
|
"rewards/cosine_scaled_reward": 0.006250927224755287, |
|
"rewards/format_reward": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.6527404785156, |
|
"epoch": 0.0856898029134533, |
|
"grad_norm": 0.22663110494613647, |
|
"kl": 0.0009310245513916016, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0409, |
|
"reward": -0.30116934701800346, |
|
"reward_std": 0.6284962445497513, |
|
"rewards/cosine_scaled_reward": -0.1505846632644534, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2839.6944885253906, |
|
"epoch": 0.08740359897172237, |
|
"grad_norm": 0.17685562372207642, |
|
"kl": 0.0002518892288208008, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0272, |
|
"reward": -0.16751686483621597, |
|
"reward_std": 0.5093529745936394, |
|
"rewards/cosine_scaled_reward": -0.08375842124223709, |
|
"rewards/format_reward": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3141.4583740234375, |
|
"epoch": 0.08911739502999143, |
|
"grad_norm": 0.14409120380878448, |
|
"kl": 0.0003941059112548828, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": -0.0079, |
|
"reward": -0.19580290652811527, |
|
"reward_std": 0.589723251760006, |
|
"rewards/cosine_scaled_reward": -0.09790145605802536, |
|
"rewards/format_reward": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3054.9445190429688, |
|
"epoch": 0.0908311910882605, |
|
"grad_norm": 0.13203154504299164, |
|
"kl": 0.0002570152282714844, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": 0.0455, |
|
"reward": -0.2164551168680191, |
|
"reward_std": 0.6407450139522552, |
|
"rewards/cosine_scaled_reward": -0.10822756588459015, |
|
"rewards/format_reward": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3393.8055419921875, |
|
"epoch": 0.09254498714652956, |
|
"grad_norm": 0.11958733946084976, |
|
"kl": 0.0005993843078613281, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": -0.007, |
|
"reward": -0.27613697946071625, |
|
"reward_std": 0.5631539821624756, |
|
"rewards/cosine_scaled_reward": -0.13806848879903555, |
|
"rewards/format_reward": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3430.3055419921875, |
|
"epoch": 0.09425878320479864, |
|
"grad_norm": 0.13475047051906586, |
|
"kl": 0.0003286600112915039, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": 0.0301, |
|
"reward": -0.2911250814795494, |
|
"reward_std": 0.5787934809923172, |
|
"rewards/cosine_scaled_reward": -0.145562544465065, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.1527709960938, |
|
"epoch": 0.0959725792630677, |
|
"grad_norm": 0.14396199584007263, |
|
"kl": 0.0008380413055419922, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0481, |
|
"reward": -0.058986596763134, |
|
"reward_std": 0.5793360769748688, |
|
"rewards/cosine_scaled_reward": -0.029493287205696106, |
|
"rewards/format_reward": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3232.9722900390625, |
|
"epoch": 0.09768637532133675, |
|
"grad_norm": 0.14357316493988037, |
|
"kl": 0.0003731250762939453, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0542, |
|
"reward": -0.08436356298625469, |
|
"reward_std": 0.4788799285888672, |
|
"rewards/cosine_scaled_reward": -0.042181783355772495, |
|
"rewards/format_reward": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3087.3194580078125, |
|
"epoch": 0.09940017137960583, |
|
"grad_norm": 0.15331892669200897, |
|
"kl": 0.0012726783752441406, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0529, |
|
"reward": -0.29565126448869705, |
|
"reward_std": 0.5033575221896172, |
|
"rewards/cosine_scaled_reward": -0.1478256327100098, |
|
"rewards/format_reward": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3110.6111450195312, |
|
"epoch": 0.10111396743787489, |
|
"grad_norm": 0.14103592932224274, |
|
"kl": 0.0015869140625, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0384, |
|
"reward": 0.018456660211086273, |
|
"reward_std": 0.8149007856845856, |
|
"rewards/cosine_scaled_reward": 0.009228323586285114, |
|
"rewards/format_reward": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3305.236083984375, |
|
"epoch": 0.10282776349614396, |
|
"grad_norm": 0.12172071635723114, |
|
"kl": 0.00035071372985839844, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0086, |
|
"reward": -0.27341870963573456, |
|
"reward_std": 0.7006796821951866, |
|
"rewards/cosine_scaled_reward": -0.13670935295522213, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3224.0555419921875, |
|
"epoch": 0.10454155955441302, |
|
"grad_norm": 0.13248133659362793, |
|
"kl": 0.0005254745483398438, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": -0.0068, |
|
"reward": -0.2998387850821018, |
|
"reward_std": 0.3452136740088463, |
|
"rewards/cosine_scaled_reward": -0.14991939440369606, |
|
"rewards/format_reward": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2643.5833740234375, |
|
"epoch": 0.1062553556126821, |
|
"grad_norm": 0.17902526259422302, |
|
"kl": 0.0021648406982421875, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.057, |
|
"reward": 0.017559568164870143, |
|
"reward_std": 0.5955966338515282, |
|
"rewards/cosine_scaled_reward": 0.008779789437539876, |
|
"rewards/format_reward": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3496.375, |
|
"epoch": 0.10796915167095116, |
|
"grad_norm": 0.1432785838842392, |
|
"kl": 0.00047206878662109375, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": 0.0277, |
|
"reward": -0.17097678780555725, |
|
"reward_std": 0.6070086807012558, |
|
"rewards/cosine_scaled_reward": -0.08548840321600437, |
|
"rewards/format_reward": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2792.486114501953, |
|
"epoch": 0.10968294772922023, |
|
"grad_norm": 0.16470499336719513, |
|
"kl": 0.0011835098266601562, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.0207, |
|
"reward": -0.26402536034584045, |
|
"reward_std": 0.43254173547029495, |
|
"rewards/cosine_scaled_reward": -0.13201268389821053, |
|
"rewards/format_reward": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3128.4861450195312, |
|
"epoch": 0.11139674378748929, |
|
"grad_norm": 0.1882910132408142, |
|
"kl": 0.006333351135253906, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0385, |
|
"reward": -0.0892822165042162, |
|
"reward_std": 0.6130652017891407, |
|
"rewards/cosine_scaled_reward": -0.04464110638946295, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2806.75, |
|
"epoch": 0.11311053984575835, |
|
"grad_norm": 0.15443913638591766, |
|
"kl": 0.0003552436828613281, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0038, |
|
"reward": -0.04117146506905556, |
|
"reward_std": 0.4872736781835556, |
|
"rewards/cosine_scaled_reward": -0.02058573253452778, |
|
"rewards/format_reward": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3150.8194580078125, |
|
"epoch": 0.11482433590402742, |
|
"grad_norm": 0.15191471576690674, |
|
"kl": 0.0016102790832519531, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0316, |
|
"reward": -0.23821864277124405, |
|
"reward_std": 0.5326030105352402, |
|
"rewards/cosine_scaled_reward": -0.11910932138562202, |
|
"rewards/format_reward": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2845.6806030273438, |
|
"epoch": 0.11653813196229648, |
|
"grad_norm": 0.1388000249862671, |
|
"kl": 0.0018000602722167969, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0376, |
|
"reward": -0.17579936794936657, |
|
"reward_std": 0.6001454517245293, |
|
"rewards/cosine_scaled_reward": -0.08789968676865101, |
|
"rewards/format_reward": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3050.7361450195312, |
|
"epoch": 0.11825192802056556, |
|
"grad_norm": 0.13662724196910858, |
|
"kl": 0.0015287399291992188, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": 0.0787, |
|
"reward": -0.09626813535578549, |
|
"reward_std": 0.6232626661658287, |
|
"rewards/cosine_scaled_reward": -0.04813406406901777, |
|
"rewards/format_reward": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2883.9722290039062, |
|
"epoch": 0.11996572407883462, |
|
"grad_norm": 0.18917521834373474, |
|
"kl": 0.00302886962890625, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": -0.0608, |
|
"reward": 0.05035170167684555, |
|
"reward_std": 0.4191203862428665, |
|
"rewards/cosine_scaled_reward": 0.025175858289003372, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3137.4583740234375, |
|
"epoch": 0.12167952013710369, |
|
"grad_norm": 0.15267273783683777, |
|
"kl": 0.0017466545104980469, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0362, |
|
"reward": -0.04426470585167408, |
|
"reward_std": 0.6740965843200684, |
|
"rewards/cosine_scaled_reward": -0.022132341749966145, |
|
"rewards/format_reward": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2443.0138549804688, |
|
"epoch": 0.12339331619537275, |
|
"grad_norm": 0.16214598715305328, |
|
"kl": 0.003936767578125, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": -0.0396, |
|
"reward": 0.09306424111127853, |
|
"reward_std": 0.43733419477939606, |
|
"rewards/cosine_scaled_reward": 0.04653212707489729, |
|
"rewards/format_reward": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3163.513916015625, |
|
"epoch": 0.12510711225364182, |
|
"grad_norm": 0.23524802923202515, |
|
"kl": 0.018090248107910156, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": -0.0168, |
|
"reward": -0.17970024980604649, |
|
"reward_std": 0.4914797991514206, |
|
"rewards/cosine_scaled_reward": -0.0898501230403781, |
|
"rewards/format_reward": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2410.25, |
|
"epoch": 0.1268209083119109, |
|
"grad_norm": 0.15706373751163483, |
|
"kl": 0.0030879974365234375, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0191, |
|
"reward": 0.2525464817881584, |
|
"reward_std": 0.6606673151254654, |
|
"rewards/cosine_scaled_reward": 0.12627324275672436, |
|
"rewards/format_reward": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3146.8611450195312, |
|
"epoch": 0.12853470437017994, |
|
"grad_norm": 0.15255555510520935, |
|
"kl": 0.0032701492309570312, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.0281, |
|
"reward": -0.07365414220839739, |
|
"reward_std": 0.5634644776582718, |
|
"rewards/cosine_scaled_reward": -0.036827060393989086, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2159.249984741211, |
|
"epoch": 0.13024850042844902, |
|
"grad_norm": 0.39581403136253357, |
|
"kl": 0.018310546875, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": -0.0072, |
|
"reward": 0.14826004952192307, |
|
"reward_std": 0.6063434556126595, |
|
"rewards/cosine_scaled_reward": 0.07413001451641321, |
|
"rewards/format_reward": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3143.9443969726562, |
|
"epoch": 0.1319622964867181, |
|
"grad_norm": 0.13312797248363495, |
|
"kl": 0.00225830078125, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0387, |
|
"reward": 0.15560828521847725, |
|
"reward_std": 0.680296927690506, |
|
"rewards/cosine_scaled_reward": 0.07780414074659348, |
|
"rewards/format_reward": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3317.5416870117188, |
|
"epoch": 0.13367609254498714, |
|
"grad_norm": 0.13495096564292908, |
|
"kl": 0.0019426345825195312, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": -0.0019, |
|
"reward": -0.4046759568154812, |
|
"reward_std": 0.5655369237065315, |
|
"rewards/cosine_scaled_reward": -0.20233797095716, |
|
"rewards/format_reward": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2373.2361755371094, |
|
"epoch": 0.1353898886032562, |
|
"grad_norm": 0.26138797402381897, |
|
"kl": 0.010517120361328125, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0351, |
|
"reward": -0.17695464938879013, |
|
"reward_std": 0.34004002809524536, |
|
"rewards/cosine_scaled_reward": -0.08847732283174992, |
|
"rewards/format_reward": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.5000610351562, |
|
"epoch": 0.13710368466152528, |
|
"grad_norm": 0.17277857661247253, |
|
"kl": 0.0012784004211425781, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": 0.0488, |
|
"reward": -0.08927152771502733, |
|
"reward_std": 0.6381218209862709, |
|
"rewards/cosine_scaled_reward": -0.04463577060960233, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3080.2777709960938, |
|
"epoch": 0.13881748071979436, |
|
"grad_norm": 0.14923037588596344, |
|
"kl": 0.0020084381103515625, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0073, |
|
"reward": -0.27667392790317535, |
|
"reward_std": 0.39360568672418594, |
|
"rewards/cosine_scaled_reward": -0.13833696395158768, |
|
"rewards/format_reward": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2893.77783203125, |
|
"epoch": 0.1405312767780634, |
|
"grad_norm": 0.3161645531654358, |
|
"kl": 0.011153221130371094, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": 0.0838, |
|
"reward": -0.08123429818078876, |
|
"reward_std": 0.6654616445302963, |
|
"rewards/cosine_scaled_reward": -0.040617153281345963, |
|
"rewards/format_reward": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2858.736083984375, |
|
"epoch": 0.14224507283633248, |
|
"grad_norm": 0.1683678925037384, |
|
"kl": 0.001474142074584961, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": -0.0148, |
|
"reward": -0.12576034758239985, |
|
"reward_std": 0.6605924665927887, |
|
"rewards/cosine_scaled_reward": -0.06288017379119992, |
|
"rewards/format_reward": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2913.3194885253906, |
|
"epoch": 0.14395886889460155, |
|
"grad_norm": 0.22510592639446259, |
|
"kl": 0.0032978057861328125, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.042, |
|
"reward": -0.05945697799324989, |
|
"reward_std": 0.5878739953041077, |
|
"rewards/cosine_scaled_reward": -0.0297284796833992, |
|
"rewards/format_reward": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2990.7916870117188, |
|
"epoch": 0.1456726649528706, |
|
"grad_norm": 0.14112693071365356, |
|
"kl": 0.0014820098876953125, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0518, |
|
"reward": -0.05626801133621484, |
|
"reward_std": 0.5443409904837608, |
|
"rewards/cosine_scaled_reward": -0.028134002874139696, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2831.6805419921875, |
|
"epoch": 0.14738646101113967, |
|
"grad_norm": 0.17547817528247833, |
|
"kl": 0.00238037109375, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0229, |
|
"reward": -0.25049374252557755, |
|
"reward_std": 0.6190591081976891, |
|
"rewards/cosine_scaled_reward": -0.12524686381220818, |
|
"rewards/format_reward": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3439.2639770507812, |
|
"epoch": 0.14910025706940874, |
|
"grad_norm": 0.12470373511314392, |
|
"kl": 0.0005965232849121094, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": 0.0309, |
|
"reward": -0.15761397371534258, |
|
"reward_std": 0.568816527724266, |
|
"rewards/cosine_scaled_reward": -0.07880698406370357, |
|
"rewards/format_reward": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3291.013916015625, |
|
"epoch": 0.15081405312767782, |
|
"grad_norm": 0.17072485387325287, |
|
"kl": 0.0011734962463378906, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": -0.0003, |
|
"reward": -0.31209783256053925, |
|
"reward_std": 0.4534567594528198, |
|
"rewards/cosine_scaled_reward": -0.15604891628026962, |
|
"rewards/format_reward": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2711.6111450195312, |
|
"epoch": 0.15252784918594686, |
|
"grad_norm": 0.17909394204616547, |
|
"kl": 0.00319671630859375, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": -0.0351, |
|
"reward": -0.39153438061475754, |
|
"reward_std": 0.44514787942171097, |
|
"rewards/cosine_scaled_reward": -0.19576718658208847, |
|
"rewards/format_reward": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2884.3611755371094, |
|
"epoch": 0.15424164524421594, |
|
"grad_norm": 0.1545180082321167, |
|
"kl": 0.0027666091918945312, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": 0.0231, |
|
"reward": -0.12805988639593124, |
|
"reward_std": 0.41310104727745056, |
|
"rewards/cosine_scaled_reward": -0.06402994319796562, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2961.8194580078125, |
|
"epoch": 0.155955441302485, |
|
"grad_norm": 0.17576223611831665, |
|
"kl": 0.004119873046875, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0293, |
|
"reward": -0.06583835743367672, |
|
"reward_std": 0.6373212188482285, |
|
"rewards/cosine_scaled_reward": -0.03291917638853192, |
|
"rewards/format_reward": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2929.5277709960938, |
|
"epoch": 0.15766923736075408, |
|
"grad_norm": 0.14930115640163422, |
|
"kl": 0.0034623146057128906, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0296, |
|
"reward": -0.18032184429466724, |
|
"reward_std": 0.6196585968136787, |
|
"rewards/cosine_scaled_reward": -0.09016093239188194, |
|
"rewards/format_reward": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2748.7361450195312, |
|
"epoch": 0.15938303341902313, |
|
"grad_norm": 0.1628389209508896, |
|
"kl": 0.0011005401611328125, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0544, |
|
"reward": -0.048349371179938316, |
|
"reward_std": 0.5468417555093765, |
|
"rewards/cosine_scaled_reward": -0.024174699559807777, |
|
"rewards/format_reward": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2974.8194580078125, |
|
"epoch": 0.1610968294772922, |
|
"grad_norm": 0.17104412615299225, |
|
"kl": 0.0025157928466796875, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": 0.0562, |
|
"reward": -0.10820803185924888, |
|
"reward_std": 0.5462353378534317, |
|
"rewards/cosine_scaled_reward": -0.05410401395056397, |
|
"rewards/format_reward": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2822.3055419921875, |
|
"epoch": 0.16281062553556128, |
|
"grad_norm": 0.22087068855762482, |
|
"kl": 0.0032253265380859375, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.0157, |
|
"reward": -0.2787464428693056, |
|
"reward_std": 0.5101591870188713, |
|
"rewards/cosine_scaled_reward": -0.139373216079548, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3177.1389770507812, |
|
"epoch": 0.16452442159383032, |
|
"grad_norm": 0.13341942429542542, |
|
"kl": 0.0016889572143554688, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0599, |
|
"reward": 0.22422180697321892, |
|
"reward_std": 0.6203102543950081, |
|
"rewards/cosine_scaled_reward": 0.11211090348660946, |
|
"rewards/format_reward": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3359.1666870117188, |
|
"epoch": 0.1662382176520994, |
|
"grad_norm": 0.17103053629398346, |
|
"kl": 0.0048770904541015625, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0584, |
|
"reward": -0.34769631922245026, |
|
"reward_std": 0.5649063661694527, |
|
"rewards/cosine_scaled_reward": -0.17384816892445087, |
|
"rewards/format_reward": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2853.8472290039062, |
|
"epoch": 0.16795201371036847, |
|
"grad_norm": 0.16162103414535522, |
|
"kl": 0.002391815185546875, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": -0.0363, |
|
"reward": 0.04994424246251583, |
|
"reward_std": 0.4738911837339401, |
|
"rewards/cosine_scaled_reward": 0.024972120765596628, |
|
"rewards/format_reward": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3113.9722290039062, |
|
"epoch": 0.16966580976863754, |
|
"grad_norm": 0.17794044315814972, |
|
"kl": 0.002719879150390625, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0017, |
|
"reward": -0.16785867512226105, |
|
"reward_std": 0.5008634850382805, |
|
"rewards/cosine_scaled_reward": -0.08392933756113052, |
|
"rewards/format_reward": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2779.9862060546875, |
|
"epoch": 0.1713796058269066, |
|
"grad_norm": 0.1735229194164276, |
|
"kl": 0.005786895751953125, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": -0.0595, |
|
"reward": -0.15765622071921825, |
|
"reward_std": 0.4426313266158104, |
|
"rewards/cosine_scaled_reward": -0.07882811967283487, |
|
"rewards/format_reward": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2904.5833129882812, |
|
"epoch": 0.17309340188517566, |
|
"grad_norm": 0.16130799055099487, |
|
"kl": 0.0022287368774414062, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": -0.027, |
|
"reward": -0.2833556551486254, |
|
"reward_std": 0.41574449837207794, |
|
"rewards/cosine_scaled_reward": -0.14167783502489328, |
|
"rewards/format_reward": 0.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2956.9444580078125, |
|
"epoch": 0.17480719794344474, |
|
"grad_norm": 0.14904557168483734, |
|
"kl": 0.0023751258850097656, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.0249, |
|
"reward": -0.005829242058098316, |
|
"reward_std": 0.49208924546837807, |
|
"rewards/cosine_scaled_reward": -0.002914619166404009, |
|
"rewards/format_reward": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3127.75, |
|
"epoch": 0.17652099400171378, |
|
"grad_norm": 0.13523682951927185, |
|
"kl": 0.0023593902587890625, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.0048, |
|
"reward": -0.16767939552664757, |
|
"reward_std": 0.497691310942173, |
|
"rewards/cosine_scaled_reward": -0.08383970521390438, |
|
"rewards/format_reward": 0.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3349.888916015625, |
|
"epoch": 0.17823479005998286, |
|
"grad_norm": 0.16127026081085205, |
|
"kl": 0.002029895782470703, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": -0.0079, |
|
"reward": -0.4253583773970604, |
|
"reward_std": 0.5213425680994987, |
|
"rewards/cosine_scaled_reward": -0.2126791886985302, |
|
"rewards/format_reward": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2762.3056030273438, |
|
"epoch": 0.17994858611825193, |
|
"grad_norm": 0.22534409165382385, |
|
"kl": 0.004019737243652344, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0786, |
|
"reward": -0.13280940428376198, |
|
"reward_std": 0.6939076110720634, |
|
"rewards/cosine_scaled_reward": -0.06640470400452614, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3027.013885498047, |
|
"epoch": 0.181662382176521, |
|
"grad_norm": 0.18191885948181152, |
|
"kl": 0.001827239990234375, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0572, |
|
"reward": -0.30150486156344414, |
|
"reward_std": 0.5941706523299217, |
|
"rewards/cosine_scaled_reward": -0.15075243171304464, |
|
"rewards/format_reward": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3236.4166870117188, |
|
"epoch": 0.18337617823479005, |
|
"grad_norm": 0.12520797550678253, |
|
"kl": 0.002315521240234375, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0039, |
|
"reward": 0.061343319714069366, |
|
"reward_std": 0.5028644949197769, |
|
"rewards/cosine_scaled_reward": 0.030671661719679832, |
|
"rewards/format_reward": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3337.3056030273438, |
|
"epoch": 0.18508997429305912, |
|
"grad_norm": 0.14343461394309998, |
|
"kl": 0.0016498565673828125, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.0438, |
|
"reward": -0.17464184761047363, |
|
"reward_std": 0.5610974803566933, |
|
"rewards/cosine_scaled_reward": -0.08732092566788197, |
|
"rewards/format_reward": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2781.4305419921875, |
|
"epoch": 0.1868037703513282, |
|
"grad_norm": 0.1800822913646698, |
|
"kl": 0.0033397674560546875, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.0242, |
|
"reward": -0.26444700360298157, |
|
"reward_std": 0.5241282097995281, |
|
"rewards/cosine_scaled_reward": -0.13222350925207138, |
|
"rewards/format_reward": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2989.3472290039062, |
|
"epoch": 0.18851756640959727, |
|
"grad_norm": 0.24952495098114014, |
|
"kl": 0.00222015380859375, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": 0.1294, |
|
"reward": -0.038819944486021996, |
|
"reward_std": 0.7193348854780197, |
|
"rewards/cosine_scaled_reward": -0.019409974105656147, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2775.6111450195312, |
|
"epoch": 0.19023136246786632, |
|
"grad_norm": 0.16514870524406433, |
|
"kl": 0.0029087066650390625, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0565, |
|
"reward": -0.3495597681030631, |
|
"reward_std": 0.3909125030040741, |
|
"rewards/cosine_scaled_reward": -0.17477987939491868, |
|
"rewards/format_reward": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2831.8055419921875, |
|
"epoch": 0.1919451585261354, |
|
"grad_norm": 0.13825589418411255, |
|
"kl": 0.0023212432861328125, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0231, |
|
"reward": -0.17934568971395493, |
|
"reward_std": 0.5252480655908585, |
|
"rewards/cosine_scaled_reward": -0.08967284485697746, |
|
"rewards/format_reward": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2977.8333740234375, |
|
"epoch": 0.19365895458440446, |
|
"grad_norm": 0.15359072387218475, |
|
"kl": 0.002410888671875, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": 0.0053, |
|
"reward": 0.25896316685248166, |
|
"reward_std": 0.705707773566246, |
|
"rewards/cosine_scaled_reward": 0.12948158156359568, |
|
"rewards/format_reward": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2234.2500610351562, |
|
"epoch": 0.1953727506426735, |
|
"grad_norm": 0.17650143802165985, |
|
"kl": 0.002643585205078125, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": -0.0481, |
|
"reward": -0.20779240669799037, |
|
"reward_std": 0.50680061429739, |
|
"rewards/cosine_scaled_reward": -0.10389620521164034, |
|
"rewards/format_reward": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2372.65283203125, |
|
"epoch": 0.19708654670094258, |
|
"grad_norm": 0.21576084196567535, |
|
"kl": 0.007110595703125, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": 0.1072, |
|
"reward": 0.03794890362769365, |
|
"reward_std": 0.6275844648480415, |
|
"rewards/cosine_scaled_reward": 0.018974455073475838, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3263.27783203125, |
|
"epoch": 0.19880034275921166, |
|
"grad_norm": 0.18672628700733185, |
|
"kl": 0.0029048919677734375, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": -0.034, |
|
"reward": -0.3033560863696039, |
|
"reward_std": 0.5516846142709255, |
|
"rewards/cosine_scaled_reward": -0.15167804807424545, |
|
"rewards/format_reward": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3024.2500610351562, |
|
"epoch": 0.20051413881748073, |
|
"grad_norm": 0.1308911144733429, |
|
"kl": 0.00705718994140625, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.0178, |
|
"reward": 0.19102132320404053, |
|
"reward_std": 0.7014489844441414, |
|
"rewards/cosine_scaled_reward": 0.09551066905260086, |
|
"rewards/format_reward": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2574.013916015625, |
|
"epoch": 0.20222793487574978, |
|
"grad_norm": 0.325631320476532, |
|
"kl": 0.013393402099609375, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0659, |
|
"reward": -0.29521266371011734, |
|
"reward_std": 0.5856474936008453, |
|
"rewards/cosine_scaled_reward": -0.1476063383743167, |
|
"rewards/format_reward": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2724.3333740234375, |
|
"epoch": 0.20394173093401885, |
|
"grad_norm": 0.14827784895896912, |
|
"kl": 0.0027828216552734375, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.0141, |
|
"reward": -0.03255775198340416, |
|
"reward_std": 0.34701335430145264, |
|
"rewards/cosine_scaled_reward": -0.01627887785434723, |
|
"rewards/format_reward": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2813.1111450195312, |
|
"epoch": 0.20565552699228792, |
|
"grad_norm": 0.21779808402061462, |
|
"kl": 0.0053081512451171875, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0158, |
|
"reward": -0.19739244412630796, |
|
"reward_std": 0.6424184814095497, |
|
"rewards/cosine_scaled_reward": -0.0986962317256257, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2874.0000610351562, |
|
"epoch": 0.207369323050557, |
|
"grad_norm": 0.2778118848800659, |
|
"kl": 0.0032444000244140625, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0937, |
|
"reward": -0.15650038793683052, |
|
"reward_std": 0.5867400094866753, |
|
"rewards/cosine_scaled_reward": -0.07825020421296358, |
|
"rewards/format_reward": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3251.1112060546875, |
|
"epoch": 0.20908311910882604, |
|
"grad_norm": 0.12883791327476501, |
|
"kl": 0.003650665283203125, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.02, |
|
"reward": -0.30216934718191624, |
|
"reward_std": 0.5233990028500557, |
|
"rewards/cosine_scaled_reward": -0.1510846719611436, |
|
"rewards/format_reward": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2774.3194580078125, |
|
"epoch": 0.21079691516709512, |
|
"grad_norm": 0.20982016623020172, |
|
"kl": 0.002071380615234375, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": 0.0498, |
|
"reward": 0.3517572022974491, |
|
"reward_std": 0.7633289247751236, |
|
"rewards/cosine_scaled_reward": 0.17587858624756336, |
|
"rewards/format_reward": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3077.4166259765625, |
|
"epoch": 0.2125107112253642, |
|
"grad_norm": 0.14578428864479065, |
|
"kl": 0.0024623870849609375, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.002, |
|
"reward": -0.09189963340759277, |
|
"reward_std": 0.4004024267196655, |
|
"rewards/cosine_scaled_reward": -0.04594981297850609, |
|
"rewards/format_reward": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2898.5556030273438, |
|
"epoch": 0.21422450728363324, |
|
"grad_norm": 0.19108974933624268, |
|
"kl": 0.0016632080078125, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.0218, |
|
"reward": 0.01400849362835288, |
|
"reward_std": 0.5958191454410553, |
|
"rewards/cosine_scaled_reward": 0.007004249142482877, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3005.916748046875, |
|
"epoch": 0.2159383033419023, |
|
"grad_norm": 0.26980966329574585, |
|
"kl": 0.004871368408203125, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0539, |
|
"reward": -0.19987820833921432, |
|
"reward_std": 0.5232749357819557, |
|
"rewards/cosine_scaled_reward": -0.09993909671902657, |
|
"rewards/format_reward": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2932.5416870117188, |
|
"epoch": 0.21765209940017138, |
|
"grad_norm": 0.15654343366622925, |
|
"kl": 0.0043792724609375, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.0405, |
|
"reward": -0.17467445600777864, |
|
"reward_std": 0.5738040953874588, |
|
"rewards/cosine_scaled_reward": -0.0873372326605022, |
|
"rewards/format_reward": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3097.6666870117188, |
|
"epoch": 0.21936589545844046, |
|
"grad_norm": 0.16381874680519104, |
|
"kl": 0.00551605224609375, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": -0.0288, |
|
"reward": -0.20874720811843872, |
|
"reward_std": 0.5535652860999107, |
|
"rewards/cosine_scaled_reward": -0.10437360778450966, |
|
"rewards/format_reward": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2531.861114501953, |
|
"epoch": 0.2210796915167095, |
|
"grad_norm": 0.26021480560302734, |
|
"kl": 0.006000518798828125, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": 0.0459, |
|
"reward": -0.044261377304792404, |
|
"reward_std": 0.4739195331931114, |
|
"rewards/cosine_scaled_reward": -0.022130683064460754, |
|
"rewards/format_reward": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2091.138916015625, |
|
"epoch": 0.22279348757497858, |
|
"grad_norm": 0.22645580768585205, |
|
"kl": 0.00482940673828125, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": -0.0704, |
|
"reward": 0.38943320140242577, |
|
"reward_std": 0.7351026237010956, |
|
"rewards/cosine_scaled_reward": 0.19471661932766438, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3235.513916015625, |
|
"epoch": 0.22450728363324765, |
|
"grad_norm": 0.14915120601654053, |
|
"kl": 0.003574371337890625, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0073, |
|
"reward": -0.32377296313643456, |
|
"reward_std": 0.48132046312093735, |
|
"rewards/cosine_scaled_reward": -0.16188647784292698, |
|
"rewards/format_reward": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3044.4166870117188, |
|
"epoch": 0.2262210796915167, |
|
"grad_norm": 0.16817504167556763, |
|
"kl": 0.003704071044921875, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": 0.0174, |
|
"reward": -0.18535634828731418, |
|
"reward_std": 0.6574838161468506, |
|
"rewards/cosine_scaled_reward": -0.0926781720481813, |
|
"rewards/format_reward": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3003.4444580078125, |
|
"epoch": 0.22793487574978577, |
|
"grad_norm": 0.15358978509902954, |
|
"kl": 0.0041980743408203125, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.0337, |
|
"reward": -0.05171632254496217, |
|
"reward_std": 0.5909973978996277, |
|
"rewards/cosine_scaled_reward": -0.025858158012852073, |
|
"rewards/format_reward": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2156.4583740234375, |
|
"epoch": 0.22964867180805484, |
|
"grad_norm": 0.1683642566204071, |
|
"kl": 0.00467681884765625, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0033, |
|
"reward": 0.009663693606853485, |
|
"reward_std": 0.4995303153991699, |
|
"rewards/cosine_scaled_reward": 0.004831850528717041, |
|
"rewards/format_reward": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2540.2777709960938, |
|
"epoch": 0.23136246786632392, |
|
"grad_norm": 0.1953487992286682, |
|
"kl": 0.00795745849609375, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": 0.0474, |
|
"reward": -0.21851413743570447, |
|
"reward_std": 0.5443524122238159, |
|
"rewards/cosine_scaled_reward": -0.10925705661065876, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2695.0694885253906, |
|
"epoch": 0.23307626392459296, |
|
"grad_norm": 0.1705743372440338, |
|
"kl": 0.0045166015625, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0191, |
|
"reward": 0.05242172256112099, |
|
"reward_std": 0.5593772605061531, |
|
"rewards/cosine_scaled_reward": 0.026210861280560493, |
|
"rewards/format_reward": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3158.9444580078125, |
|
"epoch": 0.23479005998286204, |
|
"grad_norm": 0.17036336660385132, |
|
"kl": 0.00504302978515625, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": 0.0254, |
|
"reward": 0.028430916368961334, |
|
"reward_std": 0.7066435366868973, |
|
"rewards/cosine_scaled_reward": 0.014215447008609772, |
|
"rewards/format_reward": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3004.0416259765625, |
|
"epoch": 0.2365038560411311, |
|
"grad_norm": 0.1331450194120407, |
|
"kl": 0.003459930419921875, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": -0.0176, |
|
"reward": -0.014733657240867615, |
|
"reward_std": 0.5561396405100822, |
|
"rewards/cosine_scaled_reward": -0.007366828620433807, |
|
"rewards/format_reward": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2905.4444580078125, |
|
"epoch": 0.23821765209940018, |
|
"grad_norm": 0.17066888511180878, |
|
"kl": 0.00594329833984375, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0097, |
|
"reward": -0.19389863312244415, |
|
"reward_std": 0.47480132430791855, |
|
"rewards/cosine_scaled_reward": -0.09694933146238327, |
|
"rewards/format_reward": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2243.4722595214844, |
|
"epoch": 0.23993144815766923, |
|
"grad_norm": 0.2052508443593979, |
|
"kl": 0.0069751739501953125, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0149, |
|
"reward": -0.17606773134320974, |
|
"reward_std": 0.4814153388142586, |
|
"rewards/cosine_scaled_reward": -0.08803386008366942, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3140.3472290039062, |
|
"epoch": 0.2416452442159383, |
|
"grad_norm": 0.2093152105808258, |
|
"kl": 0.009227752685546875, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": -0.0173, |
|
"reward": -0.2940823882818222, |
|
"reward_std": 0.46395206451416016, |
|
"rewards/cosine_scaled_reward": -0.1470412015914917, |
|
"rewards/format_reward": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3106.0833740234375, |
|
"epoch": 0.24335904027420738, |
|
"grad_norm": 0.15536610782146454, |
|
"kl": 0.003704071044921875, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": -0.0133, |
|
"reward": -0.12113199383020401, |
|
"reward_std": 0.5028039142489433, |
|
"rewards/cosine_scaled_reward": -0.06056600622832775, |
|
"rewards/format_reward": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3045.0, |
|
"epoch": 0.24507283633247642, |
|
"grad_norm": 0.1554841846227646, |
|
"kl": 0.00397491455078125, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.0112, |
|
"reward": -0.24326159805059433, |
|
"reward_std": 0.545206792652607, |
|
"rewards/cosine_scaled_reward": -0.12163079530000687, |
|
"rewards/format_reward": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3281.52783203125, |
|
"epoch": 0.2467866323907455, |
|
"grad_norm": 0.15741369128227234, |
|
"kl": 0.004161834716796875, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.0205, |
|
"reward": -0.3316431827843189, |
|
"reward_std": 0.5960408300161362, |
|
"rewards/cosine_scaled_reward": -0.16582159511744976, |
|
"rewards/format_reward": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2848.000030517578, |
|
"epoch": 0.24850042844901457, |
|
"grad_norm": 0.16731388866901398, |
|
"kl": 0.004131317138671875, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0076, |
|
"reward": -0.030747827142477036, |
|
"reward_std": 0.532738171517849, |
|
"rewards/cosine_scaled_reward": -0.015373910777270794, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2467.6944580078125, |
|
"epoch": 0.25021422450728364, |
|
"grad_norm": 0.21354977786540985, |
|
"kl": 0.004360198974609375, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.0206, |
|
"reward": 0.0201254915446043, |
|
"reward_std": 0.8671004623174667, |
|
"rewards/cosine_scaled_reward": 0.010062748566269875, |
|
"rewards/format_reward": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2855.6666870117188, |
|
"epoch": 0.2519280205655527, |
|
"grad_norm": 0.144964799284935, |
|
"kl": 0.004791259765625, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": -0.0329, |
|
"reward": -0.2643125932663679, |
|
"reward_std": 0.5043439790606499, |
|
"rewards/cosine_scaled_reward": -0.13215629663318396, |
|
"rewards/format_reward": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3096.513916015625, |
|
"epoch": 0.2536418166238218, |
|
"grad_norm": 0.14218087494373322, |
|
"kl": 0.003002166748046875, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": -0.0089, |
|
"reward": -0.221635602414608, |
|
"reward_std": 0.4477159082889557, |
|
"rewards/cosine_scaled_reward": -0.11081778630614281, |
|
"rewards/format_reward": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2720.263916015625, |
|
"epoch": 0.25535561268209084, |
|
"grad_norm": 0.17918290197849274, |
|
"kl": 0.004497528076171875, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": -0.0669, |
|
"reward": -0.07472209073603153, |
|
"reward_std": 0.6641673818230629, |
|
"rewards/cosine_scaled_reward": -0.03736104257404804, |
|
"rewards/format_reward": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2809.666717529297, |
|
"epoch": 0.2570694087403599, |
|
"grad_norm": 0.13525010645389557, |
|
"kl": 0.004337310791015625, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": 0.0411, |
|
"reward": 0.32146409433335066, |
|
"reward_std": 0.6463728100061417, |
|
"rewards/cosine_scaled_reward": 0.16073204344138503, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2688.6944580078125, |
|
"epoch": 0.258783204798629, |
|
"grad_norm": 0.29565665125846863, |
|
"kl": 0.0060577392578125, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": -0.0998, |
|
"reward": -0.27624649833887815, |
|
"reward_std": 0.46585455536842346, |
|
"rewards/cosine_scaled_reward": -0.13812324171885848, |
|
"rewards/format_reward": 0.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2944.888916015625, |
|
"epoch": 0.26049700085689803, |
|
"grad_norm": 0.15996259450912476, |
|
"kl": 0.005001068115234375, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0467, |
|
"reward": -0.09553277865052223, |
|
"reward_std": 0.5195184722542763, |
|
"rewards/cosine_scaled_reward": -0.047766391187906265, |
|
"rewards/format_reward": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2880.5833740234375, |
|
"epoch": 0.2622107969151671, |
|
"grad_norm": 0.16603334248065948, |
|
"kl": 0.003936767578125, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.0105, |
|
"reward": -0.19235826842486858, |
|
"reward_std": 0.5736033394932747, |
|
"rewards/cosine_scaled_reward": -0.09617912326939404, |
|
"rewards/format_reward": 0.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2859.0972900390625, |
|
"epoch": 0.2639245929734362, |
|
"grad_norm": 0.17567692697048187, |
|
"kl": 0.004161834716796875, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": -0.0056, |
|
"reward": -0.19033684581518173, |
|
"reward_std": 0.5773953720927238, |
|
"rewards/cosine_scaled_reward": -0.09516842663288116, |
|
"rewards/format_reward": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3215.1666870117188, |
|
"epoch": 0.2656383890317052, |
|
"grad_norm": 0.14003609120845795, |
|
"kl": 0.004474639892578125, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.0216, |
|
"reward": -0.1411176547408104, |
|
"reward_std": 0.6216752380132675, |
|
"rewards/cosine_scaled_reward": -0.07055883854627609, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2929.1250610351562, |
|
"epoch": 0.26735218508997427, |
|
"grad_norm": 0.14357882738113403, |
|
"kl": 0.003513336181640625, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": -0.0355, |
|
"reward": -0.26859963312745094, |
|
"reward_std": 0.501942828297615, |
|
"rewards/cosine_scaled_reward": -0.13429982028901577, |
|
"rewards/format_reward": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2811.6666870117188, |
|
"epoch": 0.26906598114824337, |
|
"grad_norm": 0.18389619886875153, |
|
"kl": 0.006900787353515625, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": -0.049, |
|
"reward": 0.005984093062579632, |
|
"reward_std": 0.7341288924217224, |
|
"rewards/cosine_scaled_reward": 0.0029920428059995174, |
|
"rewards/format_reward": 0.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2909.77783203125, |
|
"epoch": 0.2707797772065124, |
|
"grad_norm": 0.13957001268863678, |
|
"kl": 0.0042877197265625, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.0033, |
|
"reward": -0.17342954874038696, |
|
"reward_std": 0.4903194531798363, |
|
"rewards/cosine_scaled_reward": -0.08671476691961288, |
|
"rewards/format_reward": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3235.013916015625, |
|
"epoch": 0.27249357326478146, |
|
"grad_norm": 0.15003739297389984, |
|
"kl": 0.005523681640625, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.0068, |
|
"reward": -0.3410843312740326, |
|
"reward_std": 0.502905935049057, |
|
"rewards/cosine_scaled_reward": -0.17054216749966145, |
|
"rewards/format_reward": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3160.6805419921875, |
|
"epoch": 0.27420736932305056, |
|
"grad_norm": 0.1586807668209076, |
|
"kl": 0.00443267822265625, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": 0.0141, |
|
"reward": 0.04759278893470764, |
|
"reward_std": 0.6465433575212955, |
|
"rewards/cosine_scaled_reward": 0.02379640005528927, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2961.3333129882812, |
|
"epoch": 0.2759211653813196, |
|
"grad_norm": 0.18396639823913574, |
|
"kl": 0.0078125, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": 0.0282, |
|
"reward": -0.32911188155412674, |
|
"reward_std": 0.6032818555831909, |
|
"rewards/cosine_scaled_reward": -0.16455595009028912, |
|
"rewards/format_reward": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2621.2222290039062, |
|
"epoch": 0.2776349614395887, |
|
"grad_norm": 0.15461236238479614, |
|
"kl": 0.00507354736328125, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": -0.0194, |
|
"reward": -0.4356637103483081, |
|
"reward_std": 0.36323027312755585, |
|
"rewards/cosine_scaled_reward": -0.21783185191452503, |
|
"rewards/format_reward": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3038.4027709960938, |
|
"epoch": 0.27934875749785776, |
|
"grad_norm": 0.12717723846435547, |
|
"kl": 0.005706787109375, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.0023, |
|
"reward": -0.04007915942929685, |
|
"reward_std": 0.6919823586940765, |
|
"rewards/cosine_scaled_reward": -0.02003958181012422, |
|
"rewards/format_reward": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2678.7361755371094, |
|
"epoch": 0.2810625535561268, |
|
"grad_norm": 0.19941791892051697, |
|
"kl": 0.005207061767578125, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": 0.0441, |
|
"reward": -0.040945328772068024, |
|
"reward_std": 0.5933430567383766, |
|
"rewards/cosine_scaled_reward": -0.020472656935453415, |
|
"rewards/format_reward": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2895.0416870117188, |
|
"epoch": 0.2827763496143959, |
|
"grad_norm": 0.16098277270793915, |
|
"kl": 0.0064697265625, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0293, |
|
"reward": -0.09013996832072735, |
|
"reward_std": 0.5875271111726761, |
|
"rewards/cosine_scaled_reward": -0.04506997298449278, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2754.263916015625, |
|
"epoch": 0.28449014567266495, |
|
"grad_norm": 0.15243615210056305, |
|
"kl": 0.00676727294921875, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0191, |
|
"reward": -0.0630449466407299, |
|
"reward_std": 0.6104780063033104, |
|
"rewards/cosine_scaled_reward": -0.0315224789083004, |
|
"rewards/format_reward": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3024.9166870117188, |
|
"epoch": 0.286203941730934, |
|
"grad_norm": 0.14153960347175598, |
|
"kl": 0.0053863525390625, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0428, |
|
"reward": -0.1417745603248477, |
|
"reward_std": 0.7242364957928658, |
|
"rewards/cosine_scaled_reward": -0.07088728016242385, |
|
"rewards/format_reward": 0.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3074.4306030273438, |
|
"epoch": 0.2879177377892031, |
|
"grad_norm": 0.1459978222846985, |
|
"kl": 0.0064067840576171875, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": 0.0112, |
|
"reward": -0.01038459874689579, |
|
"reward_std": 0.7124739363789558, |
|
"rewards/cosine_scaled_reward": -0.0051923105493187904, |
|
"rewards/format_reward": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3006.638916015625, |
|
"epoch": 0.28963153384747214, |
|
"grad_norm": 0.2151106894016266, |
|
"kl": 0.00925445556640625, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": 0.0553, |
|
"reward": -0.2934446856379509, |
|
"reward_std": 0.5195991396903992, |
|
"rewards/cosine_scaled_reward": -0.14672234281897545, |
|
"rewards/format_reward": 0.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2720.8194274902344, |
|
"epoch": 0.2913453299057412, |
|
"grad_norm": 0.16352801024913788, |
|
"kl": 0.00519561767578125, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.061, |
|
"reward": 0.10783382831141353, |
|
"reward_std": 0.6230225935578346, |
|
"rewards/cosine_scaled_reward": 0.053916911128908396, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2681.388946533203, |
|
"epoch": 0.2930591259640103, |
|
"grad_norm": 0.17118766903877258, |
|
"kl": 0.00641632080078125, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.026, |
|
"reward": -0.0785403607878834, |
|
"reward_std": 0.5736416950821877, |
|
"rewards/cosine_scaled_reward": -0.039270187029615045, |
|
"rewards/format_reward": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2987.2361450195312, |
|
"epoch": 0.29477292202227934, |
|
"grad_norm": 0.15370719134807587, |
|
"kl": 0.0035552978515625, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": -0.0033, |
|
"reward": -0.02974682953208685, |
|
"reward_std": 0.5253070890903473, |
|
"rewards/cosine_scaled_reward": -0.014873407315462828, |
|
"rewards/format_reward": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2586.4027709960938, |
|
"epoch": 0.29648671808054844, |
|
"grad_norm": 0.22191597521305084, |
|
"kl": 0.0059356689453125, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": -0.0092, |
|
"reward": -0.2831332399509847, |
|
"reward_std": 0.5445848181843758, |
|
"rewards/cosine_scaled_reward": -0.14156663417816162, |
|
"rewards/format_reward": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2655.2222900390625, |
|
"epoch": 0.2982005141388175, |
|
"grad_norm": 0.22858025133609772, |
|
"kl": 0.01116943359375, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": -0.0181, |
|
"reward": -0.2866486459970474, |
|
"reward_std": 0.5677091330289841, |
|
"rewards/cosine_scaled_reward": -0.14332432113587856, |
|
"rewards/format_reward": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2976.8194580078125, |
|
"epoch": 0.29991431019708653, |
|
"grad_norm": 0.15686574578285217, |
|
"kl": 0.00734710693359375, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0308, |
|
"reward": -0.3254437707364559, |
|
"reward_std": 0.5169026479125023, |
|
"rewards/cosine_scaled_reward": -0.16272189188748598, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2500.736114501953, |
|
"epoch": 0.30162810625535563, |
|
"grad_norm": 0.2628232538700104, |
|
"kl": 0.03589630126953125, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0534, |
|
"reward": -0.1589430421590805, |
|
"reward_std": 0.6641415655612946, |
|
"rewards/cosine_scaled_reward": -0.07947152107954025, |
|
"rewards/format_reward": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3302.7361450195312, |
|
"epoch": 0.3033419023136247, |
|
"grad_norm": 0.13509048521518707, |
|
"kl": 0.003936767578125, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": -0.0077, |
|
"reward": -0.3392331041395664, |
|
"reward_std": 0.44542837142944336, |
|
"rewards/cosine_scaled_reward": -0.1696165525354445, |
|
"rewards/format_reward": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2946.9306030273438, |
|
"epoch": 0.3050556983718937, |
|
"grad_norm": 0.14318227767944336, |
|
"kl": 0.004425048828125, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0129, |
|
"reward": 0.040801383554935455, |
|
"reward_std": 0.47273271530866623, |
|
"rewards/cosine_scaled_reward": 0.02040068805217743, |
|
"rewards/format_reward": 0.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3092.125, |
|
"epoch": 0.3067694944301628, |
|
"grad_norm": 0.15563301742076874, |
|
"kl": 0.010219573974609375, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.045, |
|
"reward": -0.1892098607495427, |
|
"reward_std": 0.5936430767178535, |
|
"rewards/cosine_scaled_reward": -0.09460492385551333, |
|
"rewards/format_reward": 0.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1971.5138854980469, |
|
"epoch": 0.30848329048843187, |
|
"grad_norm": 0.19795306026935577, |
|
"kl": 0.00974273681640625, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": -0.0138, |
|
"reward": -0.11939475126564503, |
|
"reward_std": 0.6153334528207779, |
|
"rewards/cosine_scaled_reward": -0.05969736957922578, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3067.1945190429688, |
|
"epoch": 0.3101970865467009, |
|
"grad_norm": 0.15797466039657593, |
|
"kl": 0.005138397216796875, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": -0.0212, |
|
"reward": 0.10869292449206114, |
|
"reward_std": 0.6324612945318222, |
|
"rewards/cosine_scaled_reward": 0.054346468299627304, |
|
"rewards/format_reward": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3268.486083984375, |
|
"epoch": 0.31191088260497, |
|
"grad_norm": 0.15513566136360168, |
|
"kl": 0.006145477294921875, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.0633, |
|
"reward": -0.2609336208552122, |
|
"reward_std": 0.49053191393613815, |
|
"rewards/cosine_scaled_reward": -0.13046680949628353, |
|
"rewards/format_reward": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2989.84716796875, |
|
"epoch": 0.31362467866323906, |
|
"grad_norm": 0.15209534764289856, |
|
"kl": 0.00447845458984375, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0061, |
|
"reward": -0.040327644906938076, |
|
"reward_std": 0.717703215777874, |
|
"rewards/cosine_scaled_reward": -0.02016383269801736, |
|
"rewards/format_reward": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2890.4583740234375, |
|
"epoch": 0.31533847472150817, |
|
"grad_norm": 0.1359020322561264, |
|
"kl": 0.006763458251953125, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.0369, |
|
"reward": -0.22703023999929428, |
|
"reward_std": 0.6005472913384438, |
|
"rewards/cosine_scaled_reward": -0.11351512093096972, |
|
"rewards/format_reward": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2926.3194580078125, |
|
"epoch": 0.3170522707797772, |
|
"grad_norm": 0.13524238765239716, |
|
"kl": 0.0067901611328125, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.0495, |
|
"reward": -0.17516471818089485, |
|
"reward_std": 0.5499648228287697, |
|
"rewards/cosine_scaled_reward": -0.08758235163986683, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2758.0833740234375, |
|
"epoch": 0.31876606683804626, |
|
"grad_norm": 0.19634363055229187, |
|
"kl": 0.0046234130859375, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": -0.0094, |
|
"reward": -0.17140711098909378, |
|
"reward_std": 0.5592127367854118, |
|
"rewards/cosine_scaled_reward": -0.08570355176925659, |
|
"rewards/format_reward": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2399.749969482422, |
|
"epoch": 0.32047986289631536, |
|
"grad_norm": 0.14529581367969513, |
|
"kl": 0.00421905517578125, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0233, |
|
"reward": 0.07888301834464073, |
|
"reward_std": 0.7940803468227386, |
|
"rewards/cosine_scaled_reward": 0.03944151382893324, |
|
"rewards/format_reward": 0.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3158.486083984375, |
|
"epoch": 0.3221936589545844, |
|
"grad_norm": 0.15482233464717865, |
|
"kl": 0.0067596435546875, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": -0.0061, |
|
"reward": -0.22822286747395992, |
|
"reward_std": 0.48042069375514984, |
|
"rewards/cosine_scaled_reward": -0.1141114397905767, |
|
"rewards/format_reward": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2875.52783203125, |
|
"epoch": 0.32390745501285345, |
|
"grad_norm": 0.16830354928970337, |
|
"kl": 0.00731658935546875, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": -0.0214, |
|
"reward": -0.27129118889570236, |
|
"reward_std": 0.44227684289216995, |
|
"rewards/cosine_scaled_reward": -0.13564559258520603, |
|
"rewards/format_reward": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3101.52783203125, |
|
"epoch": 0.32562125107112255, |
|
"grad_norm": 0.17314012348651886, |
|
"kl": 0.006618499755859375, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0361, |
|
"reward": -0.29743205150589347, |
|
"reward_std": 0.6253781244158745, |
|
"rewards/cosine_scaled_reward": -0.1487160255201161, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3005.0694580078125, |
|
"epoch": 0.3273350471293916, |
|
"grad_norm": 0.13242636620998383, |
|
"kl": 0.005157470703125, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0398, |
|
"reward": -0.2783219777047634, |
|
"reward_std": 0.5744869485497475, |
|
"rewards/cosine_scaled_reward": -0.139160992577672, |
|
"rewards/format_reward": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.2777709960938, |
|
"epoch": 0.32904884318766064, |
|
"grad_norm": 0.1771107167005539, |
|
"kl": 0.00849151611328125, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": -0.0275, |
|
"reward": -0.16758478805422783, |
|
"reward_std": 0.5308270826935768, |
|
"rewards/cosine_scaled_reward": -0.08379239588975906, |
|
"rewards/format_reward": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2224.2083587646484, |
|
"epoch": 0.33076263924592975, |
|
"grad_norm": 0.2606137990951538, |
|
"kl": 0.0111236572265625, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0598, |
|
"reward": -0.1425977125763893, |
|
"reward_std": 0.6462048292160034, |
|
"rewards/cosine_scaled_reward": -0.0712988581508398, |
|
"rewards/format_reward": 0.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2366.65283203125, |
|
"epoch": 0.3324764353041988, |
|
"grad_norm": 0.20748130977153778, |
|
"kl": 0.00627899169921875, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0302, |
|
"reward": 0.07216466031968594, |
|
"reward_std": 0.5604969188570976, |
|
"rewards/cosine_scaled_reward": 0.03608234319835901, |
|
"rewards/format_reward": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3086.3889770507812, |
|
"epoch": 0.3341902313624679, |
|
"grad_norm": 0.16518257558345795, |
|
"kl": 0.007572174072265625, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": 0.0403, |
|
"reward": -0.2750488445162773, |
|
"reward_std": 0.44911373406648636, |
|
"rewards/cosine_scaled_reward": -0.13752441480755806, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2695.0416870117188, |
|
"epoch": 0.33590402742073694, |
|
"grad_norm": 0.14707054197788239, |
|
"kl": 0.0079803466796875, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.0299, |
|
"reward": 0.3252771459519863, |
|
"reward_std": 0.7292146235704422, |
|
"rewards/cosine_scaled_reward": 0.162638571113348, |
|
"rewards/format_reward": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2711.7083129882812, |
|
"epoch": 0.337617823479006, |
|
"grad_norm": 0.19674766063690186, |
|
"kl": 0.00566864013671875, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": 0.0125, |
|
"reward": 0.1904342882335186, |
|
"reward_std": 0.6823486983776093, |
|
"rewards/cosine_scaled_reward": 0.09521715994924307, |
|
"rewards/format_reward": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2801.2361450195312, |
|
"epoch": 0.3393316195372751, |
|
"grad_norm": 0.17002622783184052, |
|
"kl": 0.00652313232421875, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.0551, |
|
"reward": -0.1881256103515625, |
|
"reward_std": 0.41709040850400925, |
|
"rewards/cosine_scaled_reward": -0.0940628070384264, |
|
"rewards/format_reward": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2945.0139770507812, |
|
"epoch": 0.34104541559554413, |
|
"grad_norm": 0.17246587574481964, |
|
"kl": 0.006256103515625, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": -0.0014, |
|
"reward": -0.20545833744108677, |
|
"reward_std": 0.5765868201851845, |
|
"rewards/cosine_scaled_reward": -0.10272916965186596, |
|
"rewards/format_reward": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2718.9583435058594, |
|
"epoch": 0.3427592116538132, |
|
"grad_norm": 0.184196338057518, |
|
"kl": 0.008544921875, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.016, |
|
"reward": -0.036401793360710144, |
|
"reward_std": 0.7076919972896576, |
|
"rewards/cosine_scaled_reward": -0.01820090040564537, |
|
"rewards/format_reward": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2624.166717529297, |
|
"epoch": 0.3444730077120823, |
|
"grad_norm": 0.2025025188922882, |
|
"kl": 0.00604248046875, |
|
"learning_rate": 7.75e-07, |
|
"loss": -0.0684, |
|
"reward": -0.23150286450982094, |
|
"reward_std": 0.4834456667304039, |
|
"rewards/cosine_scaled_reward": -0.11575142852962017, |
|
"rewards/format_reward": 0.0, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2488.5556030273438, |
|
"epoch": 0.3461868037703513, |
|
"grad_norm": 0.15225747227668762, |
|
"kl": 0.005893707275390625, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": -0.0317, |
|
"reward": 0.06343521224334836, |
|
"reward_std": 0.6216820403933525, |
|
"rewards/cosine_scaled_reward": 0.03171759960241616, |
|
"rewards/format_reward": 0.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2638.0556030273438, |
|
"epoch": 0.34790059982862037, |
|
"grad_norm": 0.19878201186656952, |
|
"kl": 0.00801849365234375, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.0177, |
|
"reward": 0.24296507984399796, |
|
"reward_std": 0.7006724625825882, |
|
"rewards/cosine_scaled_reward": 0.12148253805935383, |
|
"rewards/format_reward": 0.0, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2701.2499389648438, |
|
"epoch": 0.3496143958868895, |
|
"grad_norm": 0.16115106642246246, |
|
"kl": 0.005344390869140625, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.0175, |
|
"reward": -0.01583041623234749, |
|
"reward_std": 0.5048926845192909, |
|
"rewards/cosine_scaled_reward": -0.007915209047496319, |
|
"rewards/format_reward": 0.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2820.15283203125, |
|
"epoch": 0.3513281919451585, |
|
"grad_norm": 0.1620146483182907, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.0106, |
|
"reward": 0.10508427396416664, |
|
"reward_std": 0.5011924579739571, |
|
"rewards/cosine_scaled_reward": 0.05254213139414787, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2766.77783203125, |
|
"epoch": 0.35304198800342756, |
|
"grad_norm": 0.17725811898708344, |
|
"kl": 0.0068359375, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.0292, |
|
"reward": -0.025651058182120323, |
|
"reward_std": 0.6831357106566429, |
|
"rewards/cosine_scaled_reward": -0.012825531885027885, |
|
"rewards/format_reward": 0.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2571.3333740234375, |
|
"epoch": 0.35475578406169667, |
|
"grad_norm": 0.2153560221195221, |
|
"kl": 0.00772857666015625, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": -0.0053, |
|
"reward": -0.0772455558180809, |
|
"reward_std": 0.5703203156590462, |
|
"rewards/cosine_scaled_reward": -0.038622772321105, |
|
"rewards/format_reward": 0.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2722.9444580078125, |
|
"epoch": 0.3564695801199657, |
|
"grad_norm": 0.2059468924999237, |
|
"kl": 0.00662994384765625, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.0612, |
|
"reward": -0.18379988404922187, |
|
"reward_std": 0.6482012867927551, |
|
"rewards/cosine_scaled_reward": -0.09189994307234883, |
|
"rewards/format_reward": 0.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2714.9445190429688, |
|
"epoch": 0.3581833761782348, |
|
"grad_norm": 0.1764851063489914, |
|
"kl": 0.00818634033203125, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.0477, |
|
"reward": -0.011997078021522611, |
|
"reward_std": 0.6311939656734467, |
|
"rewards/cosine_scaled_reward": -0.005998534747050144, |
|
"rewards/format_reward": 0.0, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2458.9583740234375, |
|
"epoch": 0.35989717223650386, |
|
"grad_norm": 0.23969826102256775, |
|
"kl": 0.00959014892578125, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": -0.0174, |
|
"reward": -0.052909690886735916, |
|
"reward_std": 0.6342033296823502, |
|
"rewards/cosine_scaled_reward": -0.026454854756593704, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3275.9722290039062, |
|
"epoch": 0.3616109682947729, |
|
"grad_norm": 0.14406003057956696, |
|
"kl": 0.0072784423828125, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": -0.0075, |
|
"reward": -0.4154173508286476, |
|
"reward_std": 0.47341830283403397, |
|
"rewards/cosine_scaled_reward": -0.2077086754143238, |
|
"rewards/format_reward": 0.0, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2626.1805419921875, |
|
"epoch": 0.363324764353042, |
|
"grad_norm": 0.1677497923374176, |
|
"kl": 0.007781982421875, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": -0.0218, |
|
"reward": -0.05068176053464413, |
|
"reward_std": 0.5218113884329796, |
|
"rewards/cosine_scaled_reward": -0.025340883061289787, |
|
"rewards/format_reward": 0.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2894.8472290039062, |
|
"epoch": 0.36503856041131105, |
|
"grad_norm": 0.17942826449871063, |
|
"kl": 0.00614166259765625, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": 0.0757, |
|
"reward": -0.09456230141222477, |
|
"reward_std": 0.6797711104154587, |
|
"rewards/cosine_scaled_reward": -0.0472811465151608, |
|
"rewards/format_reward": 0.0, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2945.8750610351562, |
|
"epoch": 0.3667523564695801, |
|
"grad_norm": 0.1967238038778305, |
|
"kl": 0.0100555419921875, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": -0.0128, |
|
"reward": -0.10412277281284332, |
|
"reward_std": 0.7091450989246368, |
|
"rewards/cosine_scaled_reward": -0.05206138640642166, |
|
"rewards/format_reward": 0.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2597.4444580078125, |
|
"epoch": 0.3684661525278492, |
|
"grad_norm": 0.19232463836669922, |
|
"kl": 0.00603485107421875, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.0557, |
|
"reward": 0.07514850981533527, |
|
"reward_std": 0.5688696801662445, |
|
"rewards/cosine_scaled_reward": 0.03757425490766764, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3071.6805419921875, |
|
"epoch": 0.37017994858611825, |
|
"grad_norm": 0.15334580838680267, |
|
"kl": 0.00914764404296875, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": -0.0092, |
|
"reward": -0.3550204383209348, |
|
"reward_std": 0.36161456257104874, |
|
"rewards/cosine_scaled_reward": -0.1775102224200964, |
|
"rewards/format_reward": 0.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2696.5972595214844, |
|
"epoch": 0.3718937446443873, |
|
"grad_norm": 0.1864735186100006, |
|
"kl": 0.005496978759765625, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.0101, |
|
"reward": -0.07679219171404839, |
|
"reward_std": 0.6243979334831238, |
|
"rewards/cosine_scaled_reward": -0.03839609259739518, |
|
"rewards/format_reward": 0.0, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2563.0834045410156, |
|
"epoch": 0.3736075407026564, |
|
"grad_norm": 0.16931480169296265, |
|
"kl": 0.005645751953125, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": -0.006, |
|
"reward": -0.06438015587627888, |
|
"reward_std": 0.4739932492375374, |
|
"rewards/cosine_scaled_reward": -0.032190063036978245, |
|
"rewards/format_reward": 0.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3087.9306030273438, |
|
"epoch": 0.37532133676092544, |
|
"grad_norm": 0.15486636757850647, |
|
"kl": 0.00830841064453125, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.034, |
|
"reward": -0.1913878731429577, |
|
"reward_std": 0.7374170869588852, |
|
"rewards/cosine_scaled_reward": -0.09569394029676914, |
|
"rewards/format_reward": 0.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3060.9445190429688, |
|
"epoch": 0.37703513281919454, |
|
"grad_norm": 0.1392551213502884, |
|
"kl": 0.00780487060546875, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": -0.0219, |
|
"reward": -0.3524288460612297, |
|
"reward_std": 0.4711146801710129, |
|
"rewards/cosine_scaled_reward": -0.17621441558003426, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3082.2083129882812, |
|
"epoch": 0.3787489288774636, |
|
"grad_norm": 0.15662223100662231, |
|
"kl": 0.006549835205078125, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.0127, |
|
"reward": -0.051485654432326555, |
|
"reward_std": 0.5929789990186691, |
|
"rewards/cosine_scaled_reward": -0.025742830068338662, |
|
"rewards/format_reward": 0.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2778.666717529297, |
|
"epoch": 0.38046272493573263, |
|
"grad_norm": 0.17736981809139252, |
|
"kl": 0.008647918701171875, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": 0.0084, |
|
"reward": -0.13847951218485832, |
|
"reward_std": 0.5384139195084572, |
|
"rewards/cosine_scaled_reward": -0.06923975050449371, |
|
"rewards/format_reward": 0.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3187.9305419921875, |
|
"epoch": 0.38217652099400173, |
|
"grad_norm": 0.13862548768520355, |
|
"kl": 0.00606536865234375, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.0539, |
|
"reward": -0.3446214310824871, |
|
"reward_std": 0.46420831978321075, |
|
"rewards/cosine_scaled_reward": -0.1723107136785984, |
|
"rewards/format_reward": 0.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3035.513916015625, |
|
"epoch": 0.3838903170522708, |
|
"grad_norm": 0.17018531262874603, |
|
"kl": 0.0105438232421875, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0259, |
|
"reward": -0.21396764740347862, |
|
"reward_std": 0.5872293263673782, |
|
"rewards/cosine_scaled_reward": -0.10698381997644901, |
|
"rewards/format_reward": 0.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2804.013916015625, |
|
"epoch": 0.3856041131105398, |
|
"grad_norm": 0.19500206410884857, |
|
"kl": 0.00768280029296875, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.1055, |
|
"reward": -0.22558368369936943, |
|
"reward_std": 0.6132937371730804, |
|
"rewards/cosine_scaled_reward": -0.11279183439910412, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2667.0555725097656, |
|
"epoch": 0.3873179091688089, |
|
"grad_norm": 0.18770119547843933, |
|
"kl": 0.007568359375, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0042, |
|
"reward": -0.05162630486302078, |
|
"reward_std": 0.6696203723549843, |
|
"rewards/cosine_scaled_reward": -0.025813143118284643, |
|
"rewards/format_reward": 0.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2906.6805419921875, |
|
"epoch": 0.389031705227078, |
|
"grad_norm": 0.16604122519493103, |
|
"kl": 0.0076141357421875, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0146, |
|
"reward": -0.1345351382624358, |
|
"reward_std": 0.7545941472053528, |
|
"rewards/cosine_scaled_reward": -0.0672675691312179, |
|
"rewards/format_reward": 0.0, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2913.791748046875, |
|
"epoch": 0.390745501285347, |
|
"grad_norm": 0.36757490038871765, |
|
"kl": 0.006805419921875, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.0796, |
|
"reward": 0.061263229697942734, |
|
"reward_std": 0.5674895793199539, |
|
"rewards/cosine_scaled_reward": 0.030631612986326218, |
|
"rewards/format_reward": 0.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2708.4027709960938, |
|
"epoch": 0.3924592973436161, |
|
"grad_norm": 0.17164915800094604, |
|
"kl": 0.0065765380859375, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.061, |
|
"reward": -0.17019816813990474, |
|
"reward_std": 0.5833596885204315, |
|
"rewards/cosine_scaled_reward": -0.08509908034466207, |
|
"rewards/format_reward": 0.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2934.0555419921875, |
|
"epoch": 0.39417309340188517, |
|
"grad_norm": 0.16252191364765167, |
|
"kl": 0.0125732421875, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": -0.0251, |
|
"reward": -0.27500685676932335, |
|
"reward_std": 0.4450754225254059, |
|
"rewards/cosine_scaled_reward": -0.13750343304127455, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3062.4583740234375, |
|
"epoch": 0.39588688946015427, |
|
"grad_norm": 0.13106314837932587, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0074, |
|
"reward": -0.11593299638479948, |
|
"reward_std": 0.5865771174430847, |
|
"rewards/cosine_scaled_reward": -0.057966490276157856, |
|
"rewards/format_reward": 0.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2548.4444580078125, |
|
"epoch": 0.3976006855184233, |
|
"grad_norm": 0.15283794701099396, |
|
"kl": 0.00823974609375, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0156, |
|
"reward": 0.012726329267024994, |
|
"reward_std": 0.6339813768863678, |
|
"rewards/cosine_scaled_reward": 0.006363175809383392, |
|
"rewards/format_reward": 0.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1752.4861602783203, |
|
"epoch": 0.39931448157669236, |
|
"grad_norm": 0.17673321068286896, |
|
"kl": 0.0053558349609375, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": 0.0344, |
|
"reward": 0.3881940320134163, |
|
"reward_std": 0.6750105991959572, |
|
"rewards/cosine_scaled_reward": 0.19409702718257904, |
|
"rewards/format_reward": 0.0, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2785.6111450195312, |
|
"epoch": 0.40102827763496146, |
|
"grad_norm": 0.172316312789917, |
|
"kl": 0.01160430908203125, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": -0.0154, |
|
"reward": -0.2950245440006256, |
|
"reward_std": 0.6799461841583252, |
|
"rewards/cosine_scaled_reward": -0.1475122720003128, |
|
"rewards/format_reward": 0.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3018.3056030273438, |
|
"epoch": 0.4027420736932305, |
|
"grad_norm": 0.15055446326732635, |
|
"kl": 0.0095977783203125, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": -0.0457, |
|
"reward": -0.19169194623827934, |
|
"reward_std": 0.4096248298883438, |
|
"rewards/cosine_scaled_reward": -0.09584598150104284, |
|
"rewards/format_reward": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3233.8889770507812, |
|
"epoch": 0.40445586975149955, |
|
"grad_norm": 0.14838387072086334, |
|
"kl": 0.009735107421875, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0306, |
|
"reward": -0.14736445620656013, |
|
"reward_std": 0.6041549146175385, |
|
"rewards/cosine_scaled_reward": -0.07368221180513501, |
|
"rewards/format_reward": 0.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2462.7361755371094, |
|
"epoch": 0.40616966580976865, |
|
"grad_norm": 0.21186563372612, |
|
"kl": 0.0077667236328125, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.0142, |
|
"reward": -0.2296012807637453, |
|
"reward_std": 0.5129070654511452, |
|
"rewards/cosine_scaled_reward": -0.11480064131319523, |
|
"rewards/format_reward": 0.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3028.5277709960938, |
|
"epoch": 0.4078834618680377, |
|
"grad_norm": 0.15430369973182678, |
|
"kl": 0.00980377197265625, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": 0.0326, |
|
"reward": -0.04171431064605713, |
|
"reward_std": 0.5160864554345608, |
|
"rewards/cosine_scaled_reward": -0.020857159048318863, |
|
"rewards/format_reward": 0.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2878.597198486328, |
|
"epoch": 0.40959725792630675, |
|
"grad_norm": 0.1511092185974121, |
|
"kl": 0.00661468505859375, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.0053, |
|
"reward": -0.08533445000648499, |
|
"reward_std": 0.48660216480493546, |
|
"rewards/cosine_scaled_reward": -0.04266723245382309, |
|
"rewards/format_reward": 0.0, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2145.986114501953, |
|
"epoch": 0.41131105398457585, |
|
"grad_norm": 0.19034837186336517, |
|
"kl": 0.00772857666015625, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.0412, |
|
"reward": 0.22470230411272496, |
|
"reward_std": 0.5070570334792137, |
|
"rewards/cosine_scaled_reward": 0.11235115380259231, |
|
"rewards/format_reward": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2227.7361450195312, |
|
"epoch": 0.4130248500428449, |
|
"grad_norm": 0.1779133826494217, |
|
"kl": 0.008626937866210938, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0419, |
|
"reward": -0.0513172447681427, |
|
"reward_std": 0.617318756878376, |
|
"rewards/cosine_scaled_reward": -0.025658607482910156, |
|
"rewards/format_reward": 0.0, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3336.3333740234375, |
|
"epoch": 0.414738646101114, |
|
"grad_norm": 0.1791980117559433, |
|
"kl": 0.00740814208984375, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": -0.0064, |
|
"reward": -0.14741731621325016, |
|
"reward_std": 0.7067866027355194, |
|
"rewards/cosine_scaled_reward": -0.07370865810662508, |
|
"rewards/format_reward": 0.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3000.888916015625, |
|
"epoch": 0.41645244215938304, |
|
"grad_norm": 0.146419495344162, |
|
"kl": 0.006435394287109375, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.0311, |
|
"reward": -0.012151572853326797, |
|
"reward_std": 0.7768204510211945, |
|
"rewards/cosine_scaled_reward": -0.006075790151953697, |
|
"rewards/format_reward": 0.0, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2964.9444885253906, |
|
"epoch": 0.4181662382176521, |
|
"grad_norm": 0.1862625777721405, |
|
"kl": 0.00849151611328125, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0372, |
|
"reward": -0.16059484332799911, |
|
"reward_std": 0.5683267489075661, |
|
"rewards/cosine_scaled_reward": -0.08029741793870926, |
|
"rewards/format_reward": 0.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1833.4166717529297, |
|
"epoch": 0.4198800342759212, |
|
"grad_norm": 0.3224428594112396, |
|
"kl": 0.0082550048828125, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": -0.0459, |
|
"reward": 0.06598322093486786, |
|
"reward_std": 0.6559992954134941, |
|
"rewards/cosine_scaled_reward": 0.032991619780659676, |
|
"rewards/format_reward": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2580.0694580078125, |
|
"epoch": 0.42159383033419023, |
|
"grad_norm": 0.1514631062746048, |
|
"kl": 0.01023101806640625, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0527, |
|
"reward": -0.02805427461862564, |
|
"reward_std": 0.6845656186342239, |
|
"rewards/cosine_scaled_reward": -0.01402713917195797, |
|
"rewards/format_reward": 0.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3155.2500610351562, |
|
"epoch": 0.4233076263924593, |
|
"grad_norm": 0.1348743587732315, |
|
"kl": 0.01012420654296875, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0538, |
|
"reward": -0.11579635553061962, |
|
"reward_std": 0.7224173843860626, |
|
"rewards/cosine_scaled_reward": -0.057898176833987236, |
|
"rewards/format_reward": 0.0, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2719.7222290039062, |
|
"epoch": 0.4250214224507284, |
|
"grad_norm": 0.16808690130710602, |
|
"kl": 0.010040283203125, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": -0.0027, |
|
"reward": -0.1423700600862503, |
|
"reward_std": 0.41877883672714233, |
|
"rewards/cosine_scaled_reward": -0.07118503004312515, |
|
"rewards/format_reward": 0.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3017.5416870117188, |
|
"epoch": 0.4267352185089974, |
|
"grad_norm": 0.13636116683483124, |
|
"kl": 0.01111602783203125, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0335, |
|
"reward": -0.16177499457262456, |
|
"reward_std": 0.41518206894397736, |
|
"rewards/cosine_scaled_reward": -0.08088749897433445, |
|
"rewards/format_reward": 0.0, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2664.6666870117188, |
|
"epoch": 0.4284490145672665, |
|
"grad_norm": 0.1738223433494568, |
|
"kl": 0.00909423828125, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.0581, |
|
"reward": -0.14353771694004536, |
|
"reward_std": 0.5664958357810974, |
|
"rewards/cosine_scaled_reward": -0.07176885847002268, |
|
"rewards/format_reward": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2695.2916870117188, |
|
"epoch": 0.4301628106255356, |
|
"grad_norm": 0.16924519836902618, |
|
"kl": 0.009357452392578125, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": -0.0019, |
|
"reward": 0.09459428116679192, |
|
"reward_std": 0.6146803349256516, |
|
"rewards/cosine_scaled_reward": 0.04729713872075081, |
|
"rewards/format_reward": 0.0, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2714.5416259765625, |
|
"epoch": 0.4318766066838046, |
|
"grad_norm": 0.19001764059066772, |
|
"kl": 0.0080413818359375, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": -0.0049, |
|
"reward": -0.00815525185316801, |
|
"reward_std": 0.5676329433917999, |
|
"rewards/cosine_scaled_reward": -0.00407763384282589, |
|
"rewards/format_reward": 0.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2847.4166870117188, |
|
"epoch": 0.43359040274207367, |
|
"grad_norm": 0.1463892161846161, |
|
"kl": 0.01003265380859375, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": -0.0337, |
|
"reward": 0.009062569588422775, |
|
"reward_std": 0.5907448679208755, |
|
"rewards/cosine_scaled_reward": 0.0045312922447919846, |
|
"rewards/format_reward": 0.0, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2974.9166870117188, |
|
"epoch": 0.43530419880034277, |
|
"grad_norm": 0.15965710580348969, |
|
"kl": 0.009674072265625, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": 0.0032, |
|
"reward": 0.09970302879810333, |
|
"reward_std": 0.4728682413697243, |
|
"rewards/cosine_scaled_reward": 0.04985151067376137, |
|
"rewards/format_reward": 0.0, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2707.4027709960938, |
|
"epoch": 0.4370179948586118, |
|
"grad_norm": 0.1550796627998352, |
|
"kl": 0.01021575927734375, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0007, |
|
"reward": -0.08888162672519684, |
|
"reward_std": 0.4977044016122818, |
|
"rewards/cosine_scaled_reward": -0.04444081336259842, |
|
"rewards/format_reward": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2744.763916015625, |
|
"epoch": 0.4387317909168809, |
|
"grad_norm": 0.21653364598751068, |
|
"kl": 0.009979248046875, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0062, |
|
"reward": -0.24449253268539906, |
|
"reward_std": 0.4354872331023216, |
|
"rewards/cosine_scaled_reward": -0.12224626448005438, |
|
"rewards/format_reward": 0.0, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2490.9722290039062, |
|
"epoch": 0.44044558697514996, |
|
"grad_norm": 0.1892397254705429, |
|
"kl": 0.00962066650390625, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.0269, |
|
"reward": 0.1365387246478349, |
|
"reward_std": 0.6730539947748184, |
|
"rewards/cosine_scaled_reward": 0.06826936185825616, |
|
"rewards/format_reward": 0.0, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2796.7916870117188, |
|
"epoch": 0.442159383033419, |
|
"grad_norm": 0.16943146288394928, |
|
"kl": 0.0094757080078125, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": -0.0094, |
|
"reward": -0.27603928185999393, |
|
"reward_std": 0.517802283167839, |
|
"rewards/cosine_scaled_reward": -0.13801964186131954, |
|
"rewards/format_reward": 0.0, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2211.4305419921875, |
|
"epoch": 0.4438731790916881, |
|
"grad_norm": 0.23119370639324188, |
|
"kl": 0.0067596435546875, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.1106, |
|
"reward": -0.10240336135029793, |
|
"reward_std": 0.5084675773978233, |
|
"rewards/cosine_scaled_reward": -0.05120168812572956, |
|
"rewards/format_reward": 0.0, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2785.3055419921875, |
|
"epoch": 0.44558697514995715, |
|
"grad_norm": 0.21458233892917633, |
|
"kl": 0.00876617431640625, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": -0.0013, |
|
"reward": 0.051987094804644585, |
|
"reward_std": 0.5341488644480705, |
|
"rewards/cosine_scaled_reward": 0.02599355112761259, |
|
"rewards/format_reward": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2539.2083740234375, |
|
"epoch": 0.4473007712082262, |
|
"grad_norm": 0.1954081803560257, |
|
"kl": 0.01007843017578125, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0316, |
|
"reward": 0.026572998613119125, |
|
"reward_std": 0.42085136845707893, |
|
"rewards/cosine_scaled_reward": 0.013286499306559563, |
|
"rewards/format_reward": 0.0, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2897.3472900390625, |
|
"epoch": 0.4490145672664953, |
|
"grad_norm": 0.1940917670726776, |
|
"kl": 0.01140594482421875, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0318, |
|
"reward": -0.05599740147590637, |
|
"reward_std": 0.6964142769575119, |
|
"rewards/cosine_scaled_reward": -0.027998706325888634, |
|
"rewards/format_reward": 0.0, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2977.875, |
|
"epoch": 0.45072836332476435, |
|
"grad_norm": 0.2107793092727661, |
|
"kl": 0.011260986328125, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.0458, |
|
"reward": -0.443071685731411, |
|
"reward_std": 0.4884059280157089, |
|
"rewards/cosine_scaled_reward": -0.22153585404157639, |
|
"rewards/format_reward": 0.0, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2835.9166564941406, |
|
"epoch": 0.4524421593830334, |
|
"grad_norm": 0.1562654972076416, |
|
"kl": 0.0097503662109375, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.0527, |
|
"reward": -0.31120575219392776, |
|
"reward_std": 0.42043986171483994, |
|
"rewards/cosine_scaled_reward": -0.15560288727283478, |
|
"rewards/format_reward": 0.0, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.416748046875, |
|
"epoch": 0.4541559554413025, |
|
"grad_norm": 0.35641464591026306, |
|
"kl": 0.008880615234375, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0023, |
|
"reward": 0.14537757262587547, |
|
"reward_std": 0.4222983121871948, |
|
"rewards/cosine_scaled_reward": 0.07268879748880863, |
|
"rewards/format_reward": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2631.9445190429688, |
|
"epoch": 0.45586975149957154, |
|
"grad_norm": 0.1993650197982788, |
|
"kl": 0.006988525390625, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.02, |
|
"reward": -0.21425554435700178, |
|
"reward_std": 0.6343535855412483, |
|
"rewards/cosine_scaled_reward": -0.10712776239961386, |
|
"rewards/format_reward": 0.0, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2562.3611450195312, |
|
"epoch": 0.45758354755784064, |
|
"grad_norm": 0.22106732428073883, |
|
"kl": 0.01442718505859375, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.0396, |
|
"reward": -0.292802631855011, |
|
"reward_std": 0.3813341185450554, |
|
"rewards/cosine_scaled_reward": -0.1464013159275055, |
|
"rewards/format_reward": 0.0, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2572.2083740234375, |
|
"epoch": 0.4592973436161097, |
|
"grad_norm": 0.1711571365594864, |
|
"kl": 0.00759124755859375, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.0197, |
|
"reward": -0.2553995121270418, |
|
"reward_std": 0.5235799252986908, |
|
"rewards/cosine_scaled_reward": -0.1276997560635209, |
|
"rewards/format_reward": 0.0, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3053.2638549804688, |
|
"epoch": 0.46101113967437873, |
|
"grad_norm": 0.1386088728904724, |
|
"kl": 0.00843048095703125, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.0184, |
|
"reward": -0.17865224927663803, |
|
"reward_std": 0.5562375336885452, |
|
"rewards/cosine_scaled_reward": -0.08932612743228674, |
|
"rewards/format_reward": 0.0, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2723.638916015625, |
|
"epoch": 0.46272493573264784, |
|
"grad_norm": 2.8520348072052, |
|
"kl": 0.05461883544921875, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": -0.0068, |
|
"reward": 0.27612179331481457, |
|
"reward_std": 0.7261447310447693, |
|
"rewards/cosine_scaled_reward": 0.13806088734418154, |
|
"rewards/format_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2795.666748046875, |
|
"epoch": 0.4644387317909169, |
|
"grad_norm": 0.1723846048116684, |
|
"kl": 0.01483154296875, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0373, |
|
"reward": -0.03490264154970646, |
|
"reward_std": 0.6204687505960464, |
|
"rewards/cosine_scaled_reward": -0.017451307736337185, |
|
"rewards/format_reward": 0.0, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2526.1944580078125, |
|
"epoch": 0.4661525278491859, |
|
"grad_norm": 0.2960320711135864, |
|
"kl": 0.0132598876953125, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": 0.0815, |
|
"reward": 0.11341174505650997, |
|
"reward_std": 0.5083474740386009, |
|
"rewards/cosine_scaled_reward": 0.05670587276108563, |
|
"rewards/format_reward": 0.0, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2640.1944580078125, |
|
"epoch": 0.46786632390745503, |
|
"grad_norm": 0.17620131373405457, |
|
"kl": 0.009765625, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.0112, |
|
"reward": 0.11540575325489044, |
|
"reward_std": 0.5552510917186737, |
|
"rewards/cosine_scaled_reward": 0.05770287476480007, |
|
"rewards/format_reward": 0.0, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3208.8333129882812, |
|
"epoch": 0.4695801199657241, |
|
"grad_norm": 0.15742135047912598, |
|
"kl": 0.01263427734375, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": -0.0201, |
|
"reward": -0.4686981365084648, |
|
"reward_std": 0.3511890172958374, |
|
"rewards/cosine_scaled_reward": -0.2343490682542324, |
|
"rewards/format_reward": 0.0, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2885.9306030273438, |
|
"epoch": 0.4712939160239931, |
|
"grad_norm": 0.18644562363624573, |
|
"kl": 0.01094818115234375, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": 0.0224, |
|
"reward": -0.2238161340355873, |
|
"reward_std": 0.5779955387115479, |
|
"rewards/cosine_scaled_reward": -0.11190806701779366, |
|
"rewards/format_reward": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2641.7222290039062, |
|
"epoch": 0.4730077120822622, |
|
"grad_norm": 0.2060326635837555, |
|
"kl": 0.009002685546875, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0921, |
|
"reward": -0.10621737875044346, |
|
"reward_std": 0.572068989276886, |
|
"rewards/cosine_scaled_reward": -0.053108690306544304, |
|
"rewards/format_reward": 0.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3040.3193969726562, |
|
"epoch": 0.47472150814053127, |
|
"grad_norm": 0.15230870246887207, |
|
"kl": 0.009868621826171875, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.0243, |
|
"reward": -0.27920062592602335, |
|
"reward_std": 0.4912775382399559, |
|
"rewards/cosine_scaled_reward": -0.13960031296301167, |
|
"rewards/format_reward": 0.0, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2733.8472900390625, |
|
"epoch": 0.47643530419880037, |
|
"grad_norm": 0.16092219948768616, |
|
"kl": 0.0078125, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0214, |
|
"reward": 0.1801936998963356, |
|
"reward_std": 0.7019116431474686, |
|
"rewards/cosine_scaled_reward": 0.09009685181081295, |
|
"rewards/format_reward": 0.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3269.8194580078125, |
|
"epoch": 0.4781491002570694, |
|
"grad_norm": 0.13919058442115784, |
|
"kl": 0.01202392578125, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.0359, |
|
"reward": -0.2203904101625085, |
|
"reward_std": 0.5241215899586678, |
|
"rewards/cosine_scaled_reward": -0.11019521998241544, |
|
"rewards/format_reward": 0.0, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2767.611083984375, |
|
"epoch": 0.47986289631533846, |
|
"grad_norm": 0.17986002564430237, |
|
"kl": 0.01087188720703125, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.0177, |
|
"reward": 0.20984390750527382, |
|
"reward_std": 0.6492117866873741, |
|
"rewards/cosine_scaled_reward": 0.10492195282131433, |
|
"rewards/format_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2349.902801513672, |
|
"epoch": 0.48157669237360756, |
|
"grad_norm": 0.20755039155483246, |
|
"kl": 0.0091094970703125, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0718, |
|
"reward": 0.09011890506371856, |
|
"reward_std": 0.755554661154747, |
|
"rewards/cosine_scaled_reward": 0.04505945247365162, |
|
"rewards/format_reward": 0.0, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2690.3333740234375, |
|
"epoch": 0.4832904884318766, |
|
"grad_norm": 0.16312456130981445, |
|
"kl": 0.0081787109375, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": -0.0225, |
|
"reward": -0.1038619177415967, |
|
"reward_std": 0.6092793643474579, |
|
"rewards/cosine_scaled_reward": -0.05193095514550805, |
|
"rewards/format_reward": 0.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2992.4443969726562, |
|
"epoch": 0.48500428449014565, |
|
"grad_norm": 0.14668744802474976, |
|
"kl": 0.009429931640625, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.0094, |
|
"reward": -0.18053901614621282, |
|
"reward_std": 0.5393766239285469, |
|
"rewards/cosine_scaled_reward": -0.09026950527913868, |
|
"rewards/format_reward": 0.0, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.1666870117188, |
|
"epoch": 0.48671808054841476, |
|
"grad_norm": 0.18690791726112366, |
|
"kl": 0.01348876953125, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.0274, |
|
"reward": 0.05301067978143692, |
|
"reward_std": 0.8040451109409332, |
|
"rewards/cosine_scaled_reward": 0.026505338959395885, |
|
"rewards/format_reward": 0.0, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3109.9722290039062, |
|
"epoch": 0.4884318766066838, |
|
"grad_norm": 0.13096371293067932, |
|
"kl": 0.0122222900390625, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": -0.0039, |
|
"reward": -0.10303456708788872, |
|
"reward_std": 0.6089868098497391, |
|
"rewards/cosine_scaled_reward": -0.05151727236807346, |
|
"rewards/format_reward": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2716.9166259765625, |
|
"epoch": 0.49014567266495285, |
|
"grad_norm": 0.21381936967372894, |
|
"kl": 0.00853729248046875, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": -0.0087, |
|
"reward": -0.08554558828473091, |
|
"reward_std": 0.6172359138727188, |
|
"rewards/cosine_scaled_reward": -0.042772796005010605, |
|
"rewards/format_reward": 0.0, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3128.3195190429688, |
|
"epoch": 0.49185946872322195, |
|
"grad_norm": 0.20883357524871826, |
|
"kl": 0.0149688720703125, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.0885, |
|
"reward": -0.16452566534280777, |
|
"reward_std": 0.6313002184033394, |
|
"rewards/cosine_scaled_reward": -0.08226283825933933, |
|
"rewards/format_reward": 0.0, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2783.2777709960938, |
|
"epoch": 0.493573264781491, |
|
"grad_norm": 0.18023322522640228, |
|
"kl": 0.016357421875, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0778, |
|
"reward": -0.29250151151791215, |
|
"reward_std": 0.5800458639860153, |
|
"rewards/cosine_scaled_reward": -0.14625075762160122, |
|
"rewards/format_reward": 0.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2974.1944274902344, |
|
"epoch": 0.4952870608397601, |
|
"grad_norm": 0.226176917552948, |
|
"kl": 0.01458740234375, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.0448, |
|
"reward": -0.33544909581542015, |
|
"reward_std": 0.5062796398997307, |
|
"rewards/cosine_scaled_reward": -0.16772454418241978, |
|
"rewards/format_reward": 0.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2436.1944274902344, |
|
"epoch": 0.49700085689802914, |
|
"grad_norm": 0.1747155487537384, |
|
"kl": 0.012481689453125, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": -0.0449, |
|
"reward": -0.14955687522888184, |
|
"reward_std": 0.5533142015337944, |
|
"rewards/cosine_scaled_reward": -0.07477843947708607, |
|
"rewards/format_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3039.3611450195312, |
|
"epoch": 0.4987146529562982, |
|
"grad_norm": 0.1754036843776703, |
|
"kl": 0.01313018798828125, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0491, |
|
"reward": -0.44222037494182587, |
|
"reward_std": 0.49202967807650566, |
|
"rewards/cosine_scaled_reward": -0.22111019119620323, |
|
"rewards/format_reward": 0.0, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2386.4722595214844, |
|
"epoch": 0.5004284490145673, |
|
"grad_norm": 0.23209446668624878, |
|
"kl": 0.0133209228515625, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.1035, |
|
"reward": 0.11830113036558032, |
|
"reward_std": 0.7409112825989723, |
|
"rewards/cosine_scaled_reward": 0.059150564251467586, |
|
"rewards/format_reward": 0.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2481.1666870117188, |
|
"epoch": 0.5021422450728363, |
|
"grad_norm": 0.17551322281360626, |
|
"kl": 0.0098724365234375, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": 0.0432, |
|
"reward": -0.1329963468015194, |
|
"reward_std": 0.5577030703425407, |
|
"rewards/cosine_scaled_reward": -0.06649817898869514, |
|
"rewards/format_reward": 0.0, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2711.5000610351562, |
|
"epoch": 0.5038560411311054, |
|
"grad_norm": 0.18221919238567352, |
|
"kl": 0.01308441162109375, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": 0.0587, |
|
"reward": -0.21110662072896957, |
|
"reward_std": 0.5812349170446396, |
|
"rewards/cosine_scaled_reward": -0.10555331036448479, |
|
"rewards/format_reward": 0.0, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2619.5972290039062, |
|
"epoch": 0.5055698371893744, |
|
"grad_norm": 0.18888363242149353, |
|
"kl": 0.0113067626953125, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": -0.0068, |
|
"reward": 0.10603267699480057, |
|
"reward_std": 0.6550966873764992, |
|
"rewards/cosine_scaled_reward": 0.053016334772109985, |
|
"rewards/format_reward": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2851.4583129882812, |
|
"epoch": 0.5072836332476436, |
|
"grad_norm": 0.15981672704219818, |
|
"kl": 0.0122833251953125, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0399, |
|
"reward": 0.07413195073604584, |
|
"reward_std": 0.6663401573896408, |
|
"rewards/cosine_scaled_reward": 0.03706597909331322, |
|
"rewards/format_reward": 0.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.5833435058594, |
|
"epoch": 0.5089974293059126, |
|
"grad_norm": 0.18823187053203583, |
|
"kl": 0.0098419189453125, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0705, |
|
"reward": -0.035793907940387726, |
|
"reward_std": 0.5416731983423233, |
|
"rewards/cosine_scaled_reward": -0.017896955832839012, |
|
"rewards/format_reward": 0.0, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2439.0556030273438, |
|
"epoch": 0.5107112253641817, |
|
"grad_norm": 0.17318564653396606, |
|
"kl": 0.012298583984375, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0134, |
|
"reward": 0.026430480182170868, |
|
"reward_std": 0.5753844156861305, |
|
"rewards/cosine_scaled_reward": 0.013215240091085434, |
|
"rewards/format_reward": 0.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2576.9027709960938, |
|
"epoch": 0.5124250214224507, |
|
"grad_norm": 0.21229924261569977, |
|
"kl": 0.0172576904296875, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.0552, |
|
"reward": 0.3652267027646303, |
|
"reward_std": 0.6922546178102493, |
|
"rewards/cosine_scaled_reward": 0.18261335138231516, |
|
"rewards/format_reward": 0.0, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2756.9305419921875, |
|
"epoch": 0.5141388174807198, |
|
"grad_norm": 0.19316470623016357, |
|
"kl": 0.011383056640625, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.0299, |
|
"reward": 0.22619394585490227, |
|
"reward_std": 0.4907483011484146, |
|
"rewards/cosine_scaled_reward": 0.11309697106480598, |
|
"rewards/format_reward": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2826.9306640625, |
|
"epoch": 0.5158526135389888, |
|
"grad_norm": 0.233236625790596, |
|
"kl": 0.01507568359375, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0851, |
|
"reward": -0.13008400797843933, |
|
"reward_std": 0.7507277429103851, |
|
"rewards/cosine_scaled_reward": -0.06504200212657452, |
|
"rewards/format_reward": 0.0, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3145.986083984375, |
|
"epoch": 0.517566409597258, |
|
"grad_norm": 0.1533445417881012, |
|
"kl": 0.0132293701171875, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.0074, |
|
"reward": -0.00015814602375030518, |
|
"reward_std": 0.7809525281190872, |
|
"rewards/cosine_scaled_reward": -7.90674239397049e-05, |
|
"rewards/format_reward": 0.0, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3029.40283203125, |
|
"epoch": 0.519280205655527, |
|
"grad_norm": 0.1652766764163971, |
|
"kl": 0.0153656005859375, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0139, |
|
"reward": -0.06143874488770962, |
|
"reward_std": 0.7485700696706772, |
|
"rewards/cosine_scaled_reward": -0.030719374306499958, |
|
"rewards/format_reward": 0.0, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2729.52783203125, |
|
"epoch": 0.5209940017137961, |
|
"grad_norm": 0.18553942441940308, |
|
"kl": 0.01348876953125, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": -0.0083, |
|
"reward": -0.042102924548089504, |
|
"reward_std": 0.5168112218379974, |
|
"rewards/cosine_scaled_reward": -0.02105145249515772, |
|
"rewards/format_reward": 0.0, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2928.9444580078125, |
|
"epoch": 0.5227077977720651, |
|
"grad_norm": 0.24608513712882996, |
|
"kl": 0.01422119140625, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0859, |
|
"reward": -0.11776435747742653, |
|
"reward_std": 0.6116138771176338, |
|
"rewards/cosine_scaled_reward": -0.058882176876068115, |
|
"rewards/format_reward": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2568.7222290039062, |
|
"epoch": 0.5244215938303342, |
|
"grad_norm": 0.1760726422071457, |
|
"kl": 0.0126800537109375, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.0106, |
|
"reward": -0.2896502474322915, |
|
"reward_std": 0.542039155960083, |
|
"rewards/cosine_scaled_reward": -0.1448251255787909, |
|
"rewards/format_reward": 0.0, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2816.7916259765625, |
|
"epoch": 0.5261353898886033, |
|
"grad_norm": 0.20767201483249664, |
|
"kl": 0.015869140625, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0605, |
|
"reward": -0.07595526240766048, |
|
"reward_std": 0.7446087747812271, |
|
"rewards/cosine_scaled_reward": -0.03797762934118509, |
|
"rewards/format_reward": 0.0, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2619.0, |
|
"epoch": 0.5278491859468724, |
|
"grad_norm": 0.18840792775154114, |
|
"kl": 0.01406097412109375, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.0317, |
|
"reward": -0.13000392355024815, |
|
"reward_std": 0.5407935008406639, |
|
"rewards/cosine_scaled_reward": -0.0650019682943821, |
|
"rewards/format_reward": 0.0, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2706.4583740234375, |
|
"epoch": 0.5295629820051414, |
|
"grad_norm": 0.20385313034057617, |
|
"kl": 0.017333984375, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.0402, |
|
"reward": 0.09998160088434815, |
|
"reward_std": 0.6437982618808746, |
|
"rewards/cosine_scaled_reward": 0.049990794621407986, |
|
"rewards/format_reward": 0.0, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3076.0694580078125, |
|
"epoch": 0.5312767780634104, |
|
"grad_norm": 0.15351690351963043, |
|
"kl": 0.0145416259765625, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0112, |
|
"reward": -0.1288044311950216, |
|
"reward_std": 0.5119795873761177, |
|
"rewards/cosine_scaled_reward": -0.0644022131091333, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3078.3333129882812, |
|
"epoch": 0.5329905741216795, |
|
"grad_norm": 0.21041399240493774, |
|
"kl": 0.0153045654296875, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": 0.0569, |
|
"reward": -0.24592324905097485, |
|
"reward_std": 0.5915715545415878, |
|
"rewards/cosine_scaled_reward": -0.12296162731945515, |
|
"rewards/format_reward": 0.0, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2794.888916015625, |
|
"epoch": 0.5347043701799485, |
|
"grad_norm": 0.3006104528903961, |
|
"kl": 0.0116729736328125, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.1717, |
|
"reward": 0.2339099831879139, |
|
"reward_std": 0.6782252490520477, |
|
"rewards/cosine_scaled_reward": 0.1169549860060215, |
|
"rewards/format_reward": 0.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2454.65283203125, |
|
"epoch": 0.5364181662382177, |
|
"grad_norm": 0.213435098528862, |
|
"kl": 0.0183868408203125, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": -0.0051, |
|
"reward": -0.29112886637449265, |
|
"reward_std": 0.48665956407785416, |
|
"rewards/cosine_scaled_reward": -0.14556444063782692, |
|
"rewards/format_reward": 0.0, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2845.3194580078125, |
|
"epoch": 0.5381319622964867, |
|
"grad_norm": 0.23916800320148468, |
|
"kl": 0.0161285400390625, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": 0.0824, |
|
"reward": -0.16251583769917488, |
|
"reward_std": 0.4937269687652588, |
|
"rewards/cosine_scaled_reward": -0.08125792350620031, |
|
"rewards/format_reward": 0.0, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2697.9583740234375, |
|
"epoch": 0.5398457583547558, |
|
"grad_norm": 0.19882318377494812, |
|
"kl": 0.018157958984375, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": 0.0178, |
|
"reward": -0.25365344155579805, |
|
"reward_std": 0.5236896127462387, |
|
"rewards/cosine_scaled_reward": -0.12682672249502502, |
|
"rewards/format_reward": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2544.9861450195312, |
|
"epoch": 0.5415595544130248, |
|
"grad_norm": 0.20584873855113983, |
|
"kl": 0.014862060546875, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0389, |
|
"reward": -0.09484067000448704, |
|
"reward_std": 0.6149067878723145, |
|
"rewards/cosine_scaled_reward": -0.04742033500224352, |
|
"rewards/format_reward": 0.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2950.8333740234375, |
|
"epoch": 0.5432733504712939, |
|
"grad_norm": 0.15868861973285675, |
|
"kl": 0.018310546875, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": 0.0378, |
|
"reward": -0.39894504845142365, |
|
"reward_std": 0.4898769110441208, |
|
"rewards/cosine_scaled_reward": -0.19947252236306667, |
|
"rewards/format_reward": 0.0, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2152.3195190429688, |
|
"epoch": 0.5449871465295629, |
|
"grad_norm": 0.1994917094707489, |
|
"kl": 0.0172882080078125, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": -0.0091, |
|
"reward": 0.24264823482371867, |
|
"reward_std": 0.6610805988311768, |
|
"rewards/cosine_scaled_reward": 0.12132412963546813, |
|
"rewards/format_reward": 0.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2402.5556030273438, |
|
"epoch": 0.5467009425878321, |
|
"grad_norm": 0.2102198302745819, |
|
"kl": 0.01351165771484375, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": -0.0681, |
|
"reward": 0.05987721309065819, |
|
"reward_std": 0.5766515731811523, |
|
"rewards/cosine_scaled_reward": 0.029938601423054934, |
|
"rewards/format_reward": 0.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2677.4027709960938, |
|
"epoch": 0.5484147386461011, |
|
"grad_norm": 0.2358679324388504, |
|
"kl": 0.01690673828125, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": 0.0956, |
|
"reward": -0.05587568995542824, |
|
"reward_std": 0.6320854872465134, |
|
"rewards/cosine_scaled_reward": -0.02793784497771412, |
|
"rewards/format_reward": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3042.4722900390625, |
|
"epoch": 0.5501285347043702, |
|
"grad_norm": 0.18476322293281555, |
|
"kl": 0.017547607421875, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0512, |
|
"reward": -0.2119649334345013, |
|
"reward_std": 0.585174448788166, |
|
"rewards/cosine_scaled_reward": -0.1059824712574482, |
|
"rewards/format_reward": 0.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2080.375030517578, |
|
"epoch": 0.5518423307626392, |
|
"grad_norm": 0.18924832344055176, |
|
"kl": 0.0111083984375, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0098, |
|
"reward": 0.3428979776799679, |
|
"reward_std": 0.7396816238760948, |
|
"rewards/cosine_scaled_reward": 0.1714489795267582, |
|
"rewards/format_reward": 0.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2770.7916870117188, |
|
"epoch": 0.5535561268209083, |
|
"grad_norm": 0.17449912428855896, |
|
"kl": 0.0141143798828125, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0149, |
|
"reward": -0.15011528879404068, |
|
"reward_std": 0.5199657753109932, |
|
"rewards/cosine_scaled_reward": -0.07505764067173004, |
|
"rewards/format_reward": 0.0, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2524.4305725097656, |
|
"epoch": 0.5552699228791774, |
|
"grad_norm": 0.25161027908325195, |
|
"kl": 0.01303863525390625, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.1119, |
|
"reward": 0.2225971333682537, |
|
"reward_std": 0.7053848057985306, |
|
"rewards/cosine_scaled_reward": 0.11129856202751398, |
|
"rewards/format_reward": 0.0, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2823.5694580078125, |
|
"epoch": 0.5569837189374465, |
|
"grad_norm": 0.17407697439193726, |
|
"kl": 0.016265869140625, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": -0.0328, |
|
"reward": -0.11739783291704953, |
|
"reward_std": 0.6684166565537453, |
|
"rewards/cosine_scaled_reward": -0.058698914712294936, |
|
"rewards/format_reward": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2481.6944580078125, |
|
"epoch": 0.5586975149957155, |
|
"grad_norm": 0.16408374905586243, |
|
"kl": 0.01628875732421875, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0145, |
|
"reward": 0.05000840872526169, |
|
"reward_std": 0.4738306663930416, |
|
"rewards/cosine_scaled_reward": 0.025004200637340546, |
|
"rewards/format_reward": 0.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2850.8611450195312, |
|
"epoch": 0.5604113110539846, |
|
"grad_norm": 0.1830449402332306, |
|
"kl": 0.0183563232421875, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.054, |
|
"reward": -0.26191626861691475, |
|
"reward_std": 0.4200581759214401, |
|
"rewards/cosine_scaled_reward": -0.1309581445530057, |
|
"rewards/format_reward": 0.0, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2681.875030517578, |
|
"epoch": 0.5621251071122536, |
|
"grad_norm": 0.3444949984550476, |
|
"kl": 0.031097412109375, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.0002, |
|
"reward": 0.058326710015535355, |
|
"reward_std": 0.5914809927344322, |
|
"rewards/cosine_scaled_reward": 0.029163353145122528, |
|
"rewards/format_reward": 0.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2444.1805419921875, |
|
"epoch": 0.5638389031705227, |
|
"grad_norm": 0.20234812796115875, |
|
"kl": 0.0186004638671875, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0297, |
|
"reward": 0.015948079526424408, |
|
"reward_std": 0.5476803705096245, |
|
"rewards/cosine_scaled_reward": 0.00797403953038156, |
|
"rewards/format_reward": 0.0, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2751.6666870117188, |
|
"epoch": 0.5655526992287918, |
|
"grad_norm": 0.20875848829746246, |
|
"kl": 0.0170745849609375, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.0404, |
|
"reward": -0.1900151213631034, |
|
"reward_std": 0.552287369966507, |
|
"rewards/cosine_scaled_reward": -0.09500756207853556, |
|
"rewards/format_reward": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2700.7222900390625, |
|
"epoch": 0.5672664952870609, |
|
"grad_norm": 0.17264467477798462, |
|
"kl": 0.0172119140625, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0526, |
|
"reward": 0.03160311561077833, |
|
"reward_std": 0.5627969726920128, |
|
"rewards/cosine_scaled_reward": 0.015801557805389166, |
|
"rewards/format_reward": 0.0, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2822.8472900390625, |
|
"epoch": 0.5689802913453299, |
|
"grad_norm": 0.27976417541503906, |
|
"kl": 0.023895263671875, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.0355, |
|
"reward": 0.02845914661884308, |
|
"reward_std": 0.5001804158091545, |
|
"rewards/cosine_scaled_reward": 0.014229563996195793, |
|
"rewards/format_reward": 0.0, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2853.3333740234375, |
|
"epoch": 0.570694087403599, |
|
"grad_norm": 0.1514306664466858, |
|
"kl": 0.017669677734375, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": -0.0082, |
|
"reward": -0.13629086455330253, |
|
"reward_std": 0.4956332743167877, |
|
"rewards/cosine_scaled_reward": -0.06814542971551418, |
|
"rewards/format_reward": 0.0, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3072.9166870117188, |
|
"epoch": 0.572407883461868, |
|
"grad_norm": 0.14293867349624634, |
|
"kl": 0.0254974365234375, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.0174, |
|
"reward": 0.02665301039814949, |
|
"reward_std": 0.6765051260590553, |
|
"rewards/cosine_scaled_reward": 0.013326505199074745, |
|
"rewards/format_reward": 0.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2824.861114501953, |
|
"epoch": 0.5741216795201372, |
|
"grad_norm": 0.19958122074604034, |
|
"kl": 0.0171356201171875, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0929, |
|
"reward": -0.056068588979542255, |
|
"reward_std": 0.8257120847702026, |
|
"rewards/cosine_scaled_reward": -0.028034291230142117, |
|
"rewards/format_reward": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2580.0, |
|
"epoch": 0.5758354755784062, |
|
"grad_norm": 0.229178324341774, |
|
"kl": 0.019195556640625, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.006, |
|
"reward": -0.291859433054924, |
|
"reward_std": 0.4463714547455311, |
|
"rewards/cosine_scaled_reward": -0.1459297128021717, |
|
"rewards/format_reward": 0.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2450.138916015625, |
|
"epoch": 0.5775492716366752, |
|
"grad_norm": 0.27258360385894775, |
|
"kl": 0.0173187255859375, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0321, |
|
"reward": 0.07944206055253744, |
|
"reward_std": 0.6395395249128342, |
|
"rewards/cosine_scaled_reward": 0.03972102585248649, |
|
"rewards/format_reward": 0.0, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2720.1805725097656, |
|
"epoch": 0.5792630676949443, |
|
"grad_norm": 0.20289485156536102, |
|
"kl": 0.019683837890625, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.069, |
|
"reward": 0.48021042346954346, |
|
"reward_std": 0.7420852333307266, |
|
"rewards/cosine_scaled_reward": 0.24010521546006203, |
|
"rewards/format_reward": 0.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3105.0833740234375, |
|
"epoch": 0.5809768637532133, |
|
"grad_norm": 0.18909570574760437, |
|
"kl": 0.022308349609375, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0378, |
|
"reward": -0.22961215861141682, |
|
"reward_std": 0.5897372663021088, |
|
"rewards/cosine_scaled_reward": -0.11480608023703098, |
|
"rewards/format_reward": 0.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2911.2083129882812, |
|
"epoch": 0.5826906598114824, |
|
"grad_norm": 0.20473147928714752, |
|
"kl": 0.022918701171875, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": -0.0498, |
|
"reward": -0.1040644682943821, |
|
"reward_std": 0.57014200091362, |
|
"rewards/cosine_scaled_reward": -0.052032231353223324, |
|
"rewards/format_reward": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2422.777801513672, |
|
"epoch": 0.5844044558697515, |
|
"grad_norm": 0.1749623566865921, |
|
"kl": 0.019775390625, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.0118, |
|
"reward": -0.429408997297287, |
|
"reward_std": 0.39798443764448166, |
|
"rewards/cosine_scaled_reward": -0.2147044911980629, |
|
"rewards/format_reward": 0.0, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2946.736083984375, |
|
"epoch": 0.5861182519280206, |
|
"grad_norm": 0.20565366744995117, |
|
"kl": 0.01824951171875, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.0572, |
|
"reward": -0.09714518021792173, |
|
"reward_std": 0.6395711675286293, |
|
"rewards/cosine_scaled_reward": -0.048572588711977005, |
|
"rewards/format_reward": 0.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2240.361114501953, |
|
"epoch": 0.5878320479862896, |
|
"grad_norm": 0.19630080461502075, |
|
"kl": 0.013671875, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.0286, |
|
"reward": 0.09563972940668464, |
|
"reward_std": 0.5933751873672009, |
|
"rewards/cosine_scaled_reward": 0.047819861210882664, |
|
"rewards/format_reward": 0.0, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2915.2916870117188, |
|
"epoch": 0.5895458440445587, |
|
"grad_norm": 0.20998388528823853, |
|
"kl": 0.0269317626953125, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": -0.0348, |
|
"reward": -0.3282645223662257, |
|
"reward_std": 0.49101946130394936, |
|
"rewards/cosine_scaled_reward": -0.16413226234726608, |
|
"rewards/format_reward": 0.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2668.0277709960938, |
|
"epoch": 0.5912596401028277, |
|
"grad_norm": 0.25542527437210083, |
|
"kl": 0.020050048828125, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": 0.0754, |
|
"reward": 0.21342255361378193, |
|
"reward_std": 0.653385765850544, |
|
"rewards/cosine_scaled_reward": 0.10671127680689096, |
|
"rewards/format_reward": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2691.2638549804688, |
|
"epoch": 0.5929734361610969, |
|
"grad_norm": 0.21436557173728943, |
|
"kl": 0.0188446044921875, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0297, |
|
"reward": -0.08409620448946953, |
|
"reward_std": 0.6964321285486221, |
|
"rewards/cosine_scaled_reward": -0.04204810503870249, |
|
"rewards/format_reward": 0.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2423.2083740234375, |
|
"epoch": 0.5946872322193659, |
|
"grad_norm": 0.2174253612756729, |
|
"kl": 0.0170745849609375, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": -0.0145, |
|
"reward": 0.05711523536592722, |
|
"reward_std": 0.6910872906446457, |
|
"rewards/cosine_scaled_reward": 0.02855762024410069, |
|
"rewards/format_reward": 0.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2334.3194580078125, |
|
"epoch": 0.596401028277635, |
|
"grad_norm": 0.20871306955814362, |
|
"kl": 0.019622802734375, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": 0.0377, |
|
"reward": -0.29262126237154007, |
|
"reward_std": 0.5664101913571358, |
|
"rewards/cosine_scaled_reward": -0.14631063491106033, |
|
"rewards/format_reward": 0.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2684.4861450195312, |
|
"epoch": 0.598114824335904, |
|
"grad_norm": 0.2084410935640335, |
|
"kl": 0.0155181884765625, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": -0.0696, |
|
"reward": 0.2747867554426193, |
|
"reward_std": 0.6360199972987175, |
|
"rewards/cosine_scaled_reward": 0.13739337399601936, |
|
"rewards/format_reward": 0.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2624.9444580078125, |
|
"epoch": 0.5998286203941731, |
|
"grad_norm": 0.27559694647789, |
|
"kl": 0.01519775390625, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.0439, |
|
"reward": 0.16777711734175682, |
|
"reward_std": 0.6573140621185303, |
|
"rewards/cosine_scaled_reward": 0.08388857543468475, |
|
"rewards/format_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2644.0416564941406, |
|
"epoch": 0.6015424164524421, |
|
"grad_norm": 0.21829356253147125, |
|
"kl": 0.020263671875, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.019, |
|
"reward": 0.04395672678947449, |
|
"reward_std": 0.5275484099984169, |
|
"rewards/cosine_scaled_reward": 0.02197836432605982, |
|
"rewards/format_reward": 0.0, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2948.3056030273438, |
|
"epoch": 0.6032562125107113, |
|
"grad_norm": 0.15744946897029877, |
|
"kl": 0.0189056396484375, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": -0.0085, |
|
"reward": -0.41702286154031754, |
|
"reward_std": 0.5593557730317116, |
|
"rewards/cosine_scaled_reward": -0.20851144194602966, |
|
"rewards/format_reward": 0.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2635.7222595214844, |
|
"epoch": 0.6049700085689803, |
|
"grad_norm": 0.22034288942813873, |
|
"kl": 0.021209716796875, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": -0.0335, |
|
"reward": -0.30768171697854996, |
|
"reward_std": 0.5821868106722832, |
|
"rewards/cosine_scaled_reward": -0.15384084545075893, |
|
"rewards/format_reward": 0.0, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2137.3055725097656, |
|
"epoch": 0.6066838046272494, |
|
"grad_norm": 0.276947557926178, |
|
"kl": 0.015472412109375, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.0844, |
|
"reward": 0.3251216746866703, |
|
"reward_std": 0.716858297586441, |
|
"rewards/cosine_scaled_reward": 0.16256084106862545, |
|
"rewards/format_reward": 0.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2492.5555725097656, |
|
"epoch": 0.6083976006855184, |
|
"grad_norm": 0.2037208080291748, |
|
"kl": 0.0183258056640625, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": -0.0014, |
|
"reward": -0.21882931515574455, |
|
"reward_std": 0.47944844514131546, |
|
"rewards/cosine_scaled_reward": -0.10941465757787228, |
|
"rewards/format_reward": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2614.3472290039062, |
|
"epoch": 0.6101113967437874, |
|
"grad_norm": 0.19817198812961578, |
|
"kl": 0.0220947265625, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0141, |
|
"reward": -0.4298449754714966, |
|
"reward_std": 0.520567923784256, |
|
"rewards/cosine_scaled_reward": -0.2149224765598774, |
|
"rewards/format_reward": 0.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2584.777801513672, |
|
"epoch": 0.6118251928020566, |
|
"grad_norm": 0.18728262186050415, |
|
"kl": 0.021331787109375, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": 0.04, |
|
"reward": 0.04458676278591156, |
|
"reward_std": 0.49945997446775436, |
|
"rewards/cosine_scaled_reward": 0.02229338139295578, |
|
"rewards/format_reward": 0.0, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2934.513916015625, |
|
"epoch": 0.6135389888603257, |
|
"grad_norm": 0.17515863478183746, |
|
"kl": 0.017791748046875, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": -0.0046, |
|
"reward": -0.0155550935305655, |
|
"reward_std": 0.607760101556778, |
|
"rewards/cosine_scaled_reward": -0.007777547696605325, |
|
"rewards/format_reward": 0.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2598.2638549804688, |
|
"epoch": 0.6152527849185947, |
|
"grad_norm": 0.20000198483467102, |
|
"kl": 0.0205535888671875, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": -0.008, |
|
"reward": -0.21508236415684223, |
|
"reward_std": 0.4807446375489235, |
|
"rewards/cosine_scaled_reward": -0.10754118673503399, |
|
"rewards/format_reward": 0.0, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2585.3333435058594, |
|
"epoch": 0.6169665809768637, |
|
"grad_norm": 0.1761714369058609, |
|
"kl": 0.01947021484375, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.0011, |
|
"reward": -0.027444179635494947, |
|
"reward_std": 0.6417821869254112, |
|
"rewards/cosine_scaled_reward": -0.013722071889787912, |
|
"rewards/format_reward": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2367.3611450195312, |
|
"epoch": 0.6186803770351328, |
|
"grad_norm": 0.1938982903957367, |
|
"kl": 0.01788330078125, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": -0.023, |
|
"reward": 0.0992561224848032, |
|
"reward_std": 0.7357365190982819, |
|
"rewards/cosine_scaled_reward": 0.04962805658578873, |
|
"rewards/format_reward": 0.0, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3139.541748046875, |
|
"epoch": 0.6203941730934018, |
|
"grad_norm": 0.17356501519680023, |
|
"kl": 0.024200439453125, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": -0.0258, |
|
"reward": -0.128750279545784, |
|
"reward_std": 0.5727476924657822, |
|
"rewards/cosine_scaled_reward": -0.06437514536082745, |
|
"rewards/format_reward": 0.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.4306030273438, |
|
"epoch": 0.622107969151671, |
|
"grad_norm": 0.19220975041389465, |
|
"kl": 0.0243377685546875, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": -0.0247, |
|
"reward": -0.019596407189965248, |
|
"reward_std": 0.6233709305524826, |
|
"rewards/cosine_scaled_reward": -0.009798200335353613, |
|
"rewards/format_reward": 0.0, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2840.0555419921875, |
|
"epoch": 0.62382176520994, |
|
"grad_norm": 0.237908735871315, |
|
"kl": 0.02325439453125, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0684, |
|
"reward": -0.17538912501186132, |
|
"reward_std": 0.7643003761768341, |
|
"rewards/cosine_scaled_reward": -0.08769455272704363, |
|
"rewards/format_reward": 0.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2791.75, |
|
"epoch": 0.6255355612682091, |
|
"grad_norm": 0.1972544640302658, |
|
"kl": 0.022613525390625, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": -0.0356, |
|
"reward": -0.19943542033433914, |
|
"reward_std": 0.6234779357910156, |
|
"rewards/cosine_scaled_reward": -0.09971771761775017, |
|
"rewards/format_reward": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2936.7916564941406, |
|
"epoch": 0.6272493573264781, |
|
"grad_norm": 0.1693785935640335, |
|
"kl": 0.022491455078125, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0289, |
|
"reward": -0.07167929410934448, |
|
"reward_std": 0.41813354194164276, |
|
"rewards/cosine_scaled_reward": -0.035839639604091644, |
|
"rewards/format_reward": 0.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2579.8056030273438, |
|
"epoch": 0.6289631533847472, |
|
"grad_norm": 0.18452903628349304, |
|
"kl": 0.02313232421875, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.0131, |
|
"reward": 0.13851050520315766, |
|
"reward_std": 0.6860260739922523, |
|
"rewards/cosine_scaled_reward": 0.06925524887628853, |
|
"rewards/format_reward": 0.0, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3069.2361450195312, |
|
"epoch": 0.6306769494430163, |
|
"grad_norm": 0.207699254155159, |
|
"kl": 0.026519775390625, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.0636, |
|
"reward": -0.25442312750965357, |
|
"reward_std": 0.5900055021047592, |
|
"rewards/cosine_scaled_reward": -0.12721156049519777, |
|
"rewards/format_reward": 0.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2244.763916015625, |
|
"epoch": 0.6323907455012854, |
|
"grad_norm": 0.17845271527767181, |
|
"kl": 0.0156707763671875, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0488, |
|
"reward": 0.17914995457977057, |
|
"reward_std": 0.7317003160715103, |
|
"rewards/cosine_scaled_reward": 0.08957497263327241, |
|
"rewards/format_reward": 0.0, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2134.1944274902344, |
|
"epoch": 0.6341045415595544, |
|
"grad_norm": 0.2277487814426422, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": -0.0127, |
|
"reward": 0.11734075238928199, |
|
"reward_std": 0.5018965676426888, |
|
"rewards/cosine_scaled_reward": 0.05867037340067327, |
|
"rewards/format_reward": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2490.500030517578, |
|
"epoch": 0.6358183376178235, |
|
"grad_norm": 0.21075375378131866, |
|
"kl": 0.02069091796875, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0493, |
|
"reward": -0.03656116779893637, |
|
"reward_std": 0.4987756237387657, |
|
"rewards/cosine_scaled_reward": -0.018280583899468184, |
|
"rewards/format_reward": 0.0, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2664.625030517578, |
|
"epoch": 0.6375321336760925, |
|
"grad_norm": 0.22036224603652954, |
|
"kl": 0.01885986328125, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0736, |
|
"reward": -0.017365715699270368, |
|
"reward_std": 0.7068077325820923, |
|
"rewards/cosine_scaled_reward": -0.008682856685481966, |
|
"rewards/format_reward": 0.0, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2659.3889770507812, |
|
"epoch": 0.6392459297343616, |
|
"grad_norm": 0.2022118866443634, |
|
"kl": 0.0198822021484375, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": 0.011, |
|
"reward": -0.049437786685302854, |
|
"reward_std": 0.5779630020260811, |
|
"rewards/cosine_scaled_reward": -0.024718896602280438, |
|
"rewards/format_reward": 0.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2616.4306030273438, |
|
"epoch": 0.6409597257926307, |
|
"grad_norm": 0.1780145913362503, |
|
"kl": 0.0235595703125, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.0089, |
|
"reward": -0.017803641967475414, |
|
"reward_std": 0.6717728674411774, |
|
"rewards/cosine_scaled_reward": -0.00890181539580226, |
|
"rewards/format_reward": 0.0, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2676.3472290039062, |
|
"epoch": 0.6426735218508998, |
|
"grad_norm": 0.15247489511966705, |
|
"kl": 0.026458740234375, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.0205, |
|
"reward": -0.31310519203543663, |
|
"reward_std": 0.5878890082240105, |
|
"rewards/cosine_scaled_reward": -0.15655260160565376, |
|
"rewards/format_reward": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2356.2083129882812, |
|
"epoch": 0.6443873179091688, |
|
"grad_norm": 0.2001314014196396, |
|
"kl": 0.0235595703125, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0174, |
|
"reward": -0.2070534396916628, |
|
"reward_std": 0.4216439947485924, |
|
"rewards/cosine_scaled_reward": -0.1035267198458314, |
|
"rewards/format_reward": 0.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2577.5695190429688, |
|
"epoch": 0.6461011139674379, |
|
"grad_norm": 0.19885659217834473, |
|
"kl": 0.0179290771484375, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": -0.0114, |
|
"reward": -0.2356225922703743, |
|
"reward_std": 0.4705282226204872, |
|
"rewards/cosine_scaled_reward": -0.1178113017231226, |
|
"rewards/format_reward": 0.0, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2827.4305419921875, |
|
"epoch": 0.6478149100257069, |
|
"grad_norm": 0.16866172850131989, |
|
"kl": 0.022796630859375, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.0359, |
|
"reward": -0.2195772840641439, |
|
"reward_std": 0.7464367002248764, |
|
"rewards/cosine_scaled_reward": -0.1097886401694268, |
|
"rewards/format_reward": 0.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2001.8472595214844, |
|
"epoch": 0.6495287060839761, |
|
"grad_norm": 0.27339431643486023, |
|
"kl": 0.025421142578125, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": -0.0481, |
|
"reward": -0.053384889382869005, |
|
"reward_std": 0.7801851779222488, |
|
"rewards/cosine_scaled_reward": -0.026692438637837768, |
|
"rewards/format_reward": 0.0, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2380.8611450195312, |
|
"epoch": 0.6512425021422451, |
|
"grad_norm": 0.49418047070503235, |
|
"kl": 0.028839111328125, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.1507, |
|
"reward": -0.04335943330079317, |
|
"reward_std": 0.7678016275167465, |
|
"rewards/cosine_scaled_reward": -0.021679717116057873, |
|
"rewards/format_reward": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2910.1806640625, |
|
"epoch": 0.6529562982005142, |
|
"grad_norm": 0.19250288605690002, |
|
"kl": 0.022003173828125, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0447, |
|
"reward": -0.1130654625594616, |
|
"reward_std": 0.5473960787057877, |
|
"rewards/cosine_scaled_reward": -0.05653274059295654, |
|
"rewards/format_reward": 0.0, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.9861450195312, |
|
"epoch": 0.6546700942587832, |
|
"grad_norm": 0.1798926293849945, |
|
"kl": 0.019439697265625, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": -0.0224, |
|
"reward": -0.07310536503791809, |
|
"reward_std": 0.6817247718572617, |
|
"rewards/cosine_scaled_reward": -0.036552680656313896, |
|
"rewards/format_reward": 0.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2607.763916015625, |
|
"epoch": 0.6563838903170522, |
|
"grad_norm": 0.24983283877372742, |
|
"kl": 0.026153564453125, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": 0.0197, |
|
"reward": -0.24107037298381329, |
|
"reward_std": 0.6102746799588203, |
|
"rewards/cosine_scaled_reward": -0.12053518556058407, |
|
"rewards/format_reward": 0.0, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2681.277801513672, |
|
"epoch": 0.6580976863753213, |
|
"grad_norm": 0.21532803773880005, |
|
"kl": 0.0153350830078125, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": 0.0375, |
|
"reward": -0.2287786863744259, |
|
"reward_std": 0.5439959019422531, |
|
"rewards/cosine_scaled_reward": -0.11438935063779354, |
|
"rewards/format_reward": 0.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2749.4305725097656, |
|
"epoch": 0.6598114824335904, |
|
"grad_norm": 0.23645354807376862, |
|
"kl": 0.02471923828125, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": -0.0225, |
|
"reward": -0.1737481877207756, |
|
"reward_std": 0.5551631152629852, |
|
"rewards/cosine_scaled_reward": -0.0868740938603878, |
|
"rewards/format_reward": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3019.7361450195312, |
|
"epoch": 0.6615252784918595, |
|
"grad_norm": 0.18375760316848755, |
|
"kl": 0.019195556640625, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0429, |
|
"reward": -0.34039000049233437, |
|
"reward_std": 0.5544994547963142, |
|
"rewards/cosine_scaled_reward": -0.17019500210881233, |
|
"rewards/format_reward": 0.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2555.625030517578, |
|
"epoch": 0.6632390745501285, |
|
"grad_norm": 0.2520519196987152, |
|
"kl": 0.0201873779296875, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.0754, |
|
"reward": 0.06691954471170902, |
|
"reward_std": 0.4953342378139496, |
|
"rewards/cosine_scaled_reward": 0.03345977142453194, |
|
"rewards/format_reward": 0.0, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2198.75, |
|
"epoch": 0.6649528706083976, |
|
"grad_norm": 0.21169999241828918, |
|
"kl": 0.0220489501953125, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": -0.0328, |
|
"reward": 0.10132637619972229, |
|
"reward_std": 0.6322794482111931, |
|
"rewards/cosine_scaled_reward": 0.050663191825151443, |
|
"rewards/format_reward": 0.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2786.27783203125, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.18405954539775848, |
|
"kl": 0.0251617431640625, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": -0.0196, |
|
"reward": 0.03023771196603775, |
|
"reward_std": 0.46946871280670166, |
|
"rewards/cosine_scaled_reward": 0.015118852257728577, |
|
"rewards/format_reward": 0.0, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2521.5693969726562, |
|
"epoch": 0.6683804627249358, |
|
"grad_norm": 0.19272808730602264, |
|
"kl": 0.0200347900390625, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.0461, |
|
"reward": 0.00521535862935707, |
|
"reward_std": 0.616911455988884, |
|
"rewards/cosine_scaled_reward": 0.0026076845824718475, |
|
"rewards/format_reward": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2864.041748046875, |
|
"epoch": 0.6700942587832048, |
|
"grad_norm": 0.2623915672302246, |
|
"kl": 0.0235595703125, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": -0.002, |
|
"reward": -0.1861814223229885, |
|
"reward_std": 0.5339604392647743, |
|
"rewards/cosine_scaled_reward": -0.0930907130241394, |
|
"rewards/format_reward": 0.0, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2380.9583740234375, |
|
"epoch": 0.6718080548414739, |
|
"grad_norm": 0.25610801577568054, |
|
"kl": 0.02032470703125, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": -0.0448, |
|
"reward": -0.07657308876514435, |
|
"reward_std": 0.6799488365650177, |
|
"rewards/cosine_scaled_reward": -0.038286540657281876, |
|
"rewards/format_reward": 0.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2998.5139770507812, |
|
"epoch": 0.6735218508997429, |
|
"grad_norm": 0.19235925376415253, |
|
"kl": 0.0222015380859375, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.0569, |
|
"reward": -0.001154482364654541, |
|
"reward_std": 0.5102438926696777, |
|
"rewards/cosine_scaled_reward": -0.0005772355943918228, |
|
"rewards/format_reward": 0.0, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2115.1806030273438, |
|
"epoch": 0.675235646958012, |
|
"grad_norm": 0.2744181752204895, |
|
"kl": 0.027099609375, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.0221, |
|
"reward": 0.058095297776162624, |
|
"reward_std": 0.718009740114212, |
|
"rewards/cosine_scaled_reward": 0.029047648888081312, |
|
"rewards/format_reward": 0.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2774.1806030273438, |
|
"epoch": 0.676949443016281, |
|
"grad_norm": 0.19175058603286743, |
|
"kl": 0.025482177734375, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.0352, |
|
"reward": -0.136960469186306, |
|
"reward_std": 0.511358916759491, |
|
"rewards/cosine_scaled_reward": -0.0684802271425724, |
|
"rewards/format_reward": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3057.9584350585938, |
|
"epoch": 0.6786632390745502, |
|
"grad_norm": 0.16223175823688507, |
|
"kl": 0.019256591796875, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0288, |
|
"reward": -0.04862111946567893, |
|
"reward_std": 0.5186164565384388, |
|
"rewards/cosine_scaled_reward": -0.024310562410391867, |
|
"rewards/format_reward": 0.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2582.2916870117188, |
|
"epoch": 0.6803770351328192, |
|
"grad_norm": 0.287038654088974, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": -0.0666, |
|
"reward": -0.17554645985364914, |
|
"reward_std": 0.5096240639686584, |
|
"rewards/cosine_scaled_reward": -0.08777323365211487, |
|
"rewards/format_reward": 0.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3108.4444580078125, |
|
"epoch": 0.6820908311910883, |
|
"grad_norm": 0.1679101139307022, |
|
"kl": 0.0231781005859375, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": -0.0035, |
|
"reward": 0.038632214069366455, |
|
"reward_std": 0.7707736194133759, |
|
"rewards/cosine_scaled_reward": 0.019316108897328377, |
|
"rewards/format_reward": 0.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1893.5972442626953, |
|
"epoch": 0.6838046272493573, |
|
"grad_norm": 0.2708974778652191, |
|
"kl": 0.0231170654296875, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0065, |
|
"reward": 0.1442592293024063, |
|
"reward_std": 0.5131981894373894, |
|
"rewards/cosine_scaled_reward": 0.07212962210178375, |
|
"rewards/format_reward": 0.0, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2922.7916870117188, |
|
"epoch": 0.6855184233076264, |
|
"grad_norm": 0.3239152133464813, |
|
"kl": 0.023956298828125, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": -0.0809, |
|
"reward": -0.12957404926419258, |
|
"reward_std": 0.5665386915206909, |
|
"rewards/cosine_scaled_reward": -0.06478701997548342, |
|
"rewards/format_reward": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2859.4444580078125, |
|
"epoch": 0.6872322193658955, |
|
"grad_norm": 0.19043125212192535, |
|
"kl": 0.0223541259765625, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0588, |
|
"reward": -0.32605881802737713, |
|
"reward_std": 0.5183117464184761, |
|
"rewards/cosine_scaled_reward": -0.16302942391484976, |
|
"rewards/format_reward": 0.0, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2781.5556030273438, |
|
"epoch": 0.6889460154241646, |
|
"grad_norm": 0.26216545701026917, |
|
"kl": 0.026580810546875, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": -0.0263, |
|
"reward": -0.4961502104997635, |
|
"reward_std": 0.3931718245148659, |
|
"rewards/cosine_scaled_reward": -0.24807510524988174, |
|
"rewards/format_reward": 0.0, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2736.1111450195312, |
|
"epoch": 0.6906598114824336, |
|
"grad_norm": 0.21833109855651855, |
|
"kl": 0.02435302734375, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.0585, |
|
"reward": 0.3037844013888389, |
|
"reward_std": 0.5833063200116158, |
|
"rewards/cosine_scaled_reward": 0.1518922229297459, |
|
"rewards/format_reward": 0.0, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2712.861114501953, |
|
"epoch": 0.6923736075407027, |
|
"grad_norm": 0.2146695852279663, |
|
"kl": 0.025238037109375, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": -0.041, |
|
"reward": -0.21084421500563622, |
|
"reward_std": 0.4842342808842659, |
|
"rewards/cosine_scaled_reward": -0.10542210191488266, |
|
"rewards/format_reward": 0.0, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2936.3333129882812, |
|
"epoch": 0.6940874035989717, |
|
"grad_norm": 0.18586868047714233, |
|
"kl": 0.024658203125, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.005, |
|
"reward": 0.050316065549850464, |
|
"reward_std": 0.5316065326333046, |
|
"rewards/cosine_scaled_reward": 0.02515802625566721, |
|
"rewards/format_reward": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2209.0972595214844, |
|
"epoch": 0.6958011996572407, |
|
"grad_norm": 0.2406679093837738, |
|
"kl": 0.02728271484375, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": -0.0131, |
|
"reward": -0.021612104028463364, |
|
"reward_std": 0.5742413327097893, |
|
"rewards/cosine_scaled_reward": -0.010806052014231682, |
|
"rewards/format_reward": 0.0, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2988.3056640625, |
|
"epoch": 0.6975149957155099, |
|
"grad_norm": 0.23395532369613647, |
|
"kl": 0.027374267578125, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0598, |
|
"reward": -0.08433661237359047, |
|
"reward_std": 0.5562912449240685, |
|
"rewards/cosine_scaled_reward": -0.04216831736266613, |
|
"rewards/format_reward": 0.0, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2319.6111755371094, |
|
"epoch": 0.699228791773779, |
|
"grad_norm": 0.2508476972579956, |
|
"kl": 0.0188751220703125, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": -0.0462, |
|
"reward": 0.5069457921199501, |
|
"reward_std": 0.5437265560030937, |
|
"rewards/cosine_scaled_reward": 0.25347290316130966, |
|
"rewards/format_reward": 0.0, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2740.7777709960938, |
|
"epoch": 0.700942587832048, |
|
"grad_norm": 0.18038439750671387, |
|
"kl": 0.030517578125, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0239, |
|
"reward": 0.10421705152839422, |
|
"reward_std": 0.6194805726408958, |
|
"rewards/cosine_scaled_reward": 0.052108526695519686, |
|
"rewards/format_reward": 0.0, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2735.8472595214844, |
|
"epoch": 0.702656383890317, |
|
"grad_norm": 0.2515905201435089, |
|
"kl": 0.02587890625, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": -0.0311, |
|
"reward": -0.1710510030388832, |
|
"reward_std": 0.5620269253849983, |
|
"rewards/cosine_scaled_reward": -0.0855255089700222, |
|
"rewards/format_reward": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2493.388916015625, |
|
"epoch": 0.7043701799485861, |
|
"grad_norm": 0.21452462673187256, |
|
"kl": 0.0193328857421875, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": -0.0157, |
|
"reward": -0.25840797275304794, |
|
"reward_std": 0.4374122992157936, |
|
"rewards/cosine_scaled_reward": -0.12920398078858852, |
|
"rewards/format_reward": 0.0, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2339.2083740234375, |
|
"epoch": 0.7060839760068551, |
|
"grad_norm": 0.2920970320701599, |
|
"kl": 0.02203369140625, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": -0.0518, |
|
"reward": -0.25962352380156517, |
|
"reward_std": 0.5908957123756409, |
|
"rewards/cosine_scaled_reward": -0.129811754450202, |
|
"rewards/format_reward": 0.0, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2770.0694580078125, |
|
"epoch": 0.7077977720651243, |
|
"grad_norm": 0.2641778290271759, |
|
"kl": 0.026947021484375, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.061, |
|
"reward": -0.15783867985010147, |
|
"reward_std": 0.5947980135679245, |
|
"rewards/cosine_scaled_reward": -0.07891935110092163, |
|
"rewards/format_reward": 0.0, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2950.1945190429688, |
|
"epoch": 0.7095115681233933, |
|
"grad_norm": 0.2011335790157318, |
|
"kl": 0.024566650390625, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0014, |
|
"reward": -0.0009787320159375668, |
|
"reward_std": 0.7296510636806488, |
|
"rewards/cosine_scaled_reward": -0.0004893671721220016, |
|
"rewards/format_reward": 0.0, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2707.9305419921875, |
|
"epoch": 0.7112253641816624, |
|
"grad_norm": 0.3163622319698334, |
|
"kl": 0.027496337890625, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.0567, |
|
"reward": -0.3990987651050091, |
|
"reward_std": 0.43145136535167694, |
|
"rewards/cosine_scaled_reward": -0.1995493769645691, |
|
"rewards/format_reward": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2968.0833129882812, |
|
"epoch": 0.7129391602399314, |
|
"grad_norm": 0.23538915812969208, |
|
"kl": 0.02618408203125, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0018, |
|
"reward": -0.08291278406977654, |
|
"reward_std": 0.4231496602296829, |
|
"rewards/cosine_scaled_reward": -0.041456387378275394, |
|
"rewards/format_reward": 0.0, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2606.3333129882812, |
|
"epoch": 0.7146529562982005, |
|
"grad_norm": 0.3015352785587311, |
|
"kl": 0.0229949951171875, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0752, |
|
"reward": -0.3149372674524784, |
|
"reward_std": 0.6667703241109848, |
|
"rewards/cosine_scaled_reward": -0.1574686411768198, |
|
"rewards/format_reward": 0.0, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2380.4445190429688, |
|
"epoch": 0.7163667523564696, |
|
"grad_norm": 0.22723092138767242, |
|
"kl": 0.0203704833984375, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.0512, |
|
"reward": -0.34560693614184856, |
|
"reward_std": 0.4205815941095352, |
|
"rewards/cosine_scaled_reward": -0.17280346807092428, |
|
"rewards/format_reward": 0.0, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2430.125, |
|
"epoch": 0.7180805484147387, |
|
"grad_norm": 0.23899339139461517, |
|
"kl": 0.02789306640625, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": -0.0302, |
|
"reward": -0.18839553371071815, |
|
"reward_std": 0.4583168476819992, |
|
"rewards/cosine_scaled_reward": -0.09419775661081076, |
|
"rewards/format_reward": 0.0, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2552.4304809570312, |
|
"epoch": 0.7197943444730077, |
|
"grad_norm": 0.2333478480577469, |
|
"kl": 0.0173492431640625, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.0486, |
|
"reward": -0.1561539713293314, |
|
"reward_std": 0.6325561329722404, |
|
"rewards/cosine_scaled_reward": -0.07807699032127857, |
|
"rewards/format_reward": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2746.0694580078125, |
|
"epoch": 0.7215081405312768, |
|
"grad_norm": 0.2854664921760559, |
|
"kl": 0.0223388671875, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0759, |
|
"reward": -0.08271846733987331, |
|
"reward_std": 0.6506856456398964, |
|
"rewards/cosine_scaled_reward": -0.04135924857109785, |
|
"rewards/format_reward": 0.0, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2300.3055725097656, |
|
"epoch": 0.7232219365895458, |
|
"grad_norm": 0.17767125368118286, |
|
"kl": 0.0181427001953125, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0242, |
|
"reward": -0.16465576738119125, |
|
"reward_std": 0.5095989629626274, |
|
"rewards/cosine_scaled_reward": -0.08232788741588593, |
|
"rewards/format_reward": 0.0, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2779.0694580078125, |
|
"epoch": 0.7249357326478149, |
|
"grad_norm": 0.1646861433982849, |
|
"kl": 0.019927978515625, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0035, |
|
"reward": -0.05413434375077486, |
|
"reward_std": 0.7594424337148666, |
|
"rewards/cosine_scaled_reward": -0.02706717373803258, |
|
"rewards/format_reward": 0.0, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2956.7083129882812, |
|
"epoch": 0.726649528706084, |
|
"grad_norm": 0.22844961285591125, |
|
"kl": 0.03106689453125, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": -0.023, |
|
"reward": -0.16962197236716747, |
|
"reward_std": 0.6577330157160759, |
|
"rewards/cosine_scaled_reward": -0.08481098245829344, |
|
"rewards/format_reward": 0.0, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2825.4444580078125, |
|
"epoch": 0.7283633247643531, |
|
"grad_norm": 0.21431787312030792, |
|
"kl": 0.03460693359375, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": -0.0182, |
|
"reward": -0.06549269519746304, |
|
"reward_std": 0.6411803439259529, |
|
"rewards/cosine_scaled_reward": -0.032746341079473495, |
|
"rewards/format_reward": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2121.5556030273438, |
|
"epoch": 0.7300771208226221, |
|
"grad_norm": 0.28984084725379944, |
|
"kl": 0.0258026123046875, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": -0.0539, |
|
"reward": 0.15816697012633085, |
|
"reward_std": 0.5270659551024437, |
|
"rewards/cosine_scaled_reward": 0.07908349251374602, |
|
"rewards/format_reward": 0.0, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2789.486083984375, |
|
"epoch": 0.7317909168808912, |
|
"grad_norm": 0.20834913849830627, |
|
"kl": 0.022003173828125, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": -0.0337, |
|
"reward": -0.370651263743639, |
|
"reward_std": 0.526657946407795, |
|
"rewards/cosine_scaled_reward": -0.18532563000917435, |
|
"rewards/format_reward": 0.0, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2442.777801513672, |
|
"epoch": 0.7335047129391602, |
|
"grad_norm": 0.2500321567058563, |
|
"kl": 0.021209716796875, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": 0.0647, |
|
"reward": 0.2879646308720112, |
|
"reward_std": 0.6987240761518478, |
|
"rewards/cosine_scaled_reward": 0.1439823191612959, |
|
"rewards/format_reward": 0.0, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2859.0694580078125, |
|
"epoch": 0.7352185089974294, |
|
"grad_norm": 0.17108042538166046, |
|
"kl": 0.02716064453125, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0365, |
|
"reward": -0.21791245974600315, |
|
"reward_std": 0.5681828185915947, |
|
"rewards/cosine_scaled_reward": -0.10895622940734029, |
|
"rewards/format_reward": 0.0, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2675.52783203125, |
|
"epoch": 0.7369323050556984, |
|
"grad_norm": 0.18789908289909363, |
|
"kl": 0.022308349609375, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0154, |
|
"reward": -0.023968554101884365, |
|
"reward_std": 0.5900578051805496, |
|
"rewards/cosine_scaled_reward": -0.01198427053168416, |
|
"rewards/format_reward": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2358.041717529297, |
|
"epoch": 0.7386461011139674, |
|
"grad_norm": 0.24318993091583252, |
|
"kl": 0.022705078125, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0687, |
|
"reward": 0.029904491268098354, |
|
"reward_std": 0.7376819550991058, |
|
"rewards/cosine_scaled_reward": 0.01495224516838789, |
|
"rewards/format_reward": 0.0, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3008.9306030273438, |
|
"epoch": 0.7403598971722365, |
|
"grad_norm": 0.17112420499324799, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0164, |
|
"reward": -0.09738675877451897, |
|
"reward_std": 0.39827052876353264, |
|
"rewards/cosine_scaled_reward": -0.04869337775744498, |
|
"rewards/format_reward": 0.0, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2719.166717529297, |
|
"epoch": 0.7420736932305055, |
|
"grad_norm": 0.17819786071777344, |
|
"kl": 0.024810791015625, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": 0.0207, |
|
"reward": 0.0039961859583854675, |
|
"reward_std": 0.4406754970550537, |
|
"rewards/cosine_scaled_reward": 0.0019980808719992638, |
|
"rewards/format_reward": 0.0, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2033.6805419921875, |
|
"epoch": 0.7437874892887746, |
|
"grad_norm": 0.25923460721969604, |
|
"kl": 0.0193939208984375, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.0555, |
|
"reward": -0.2742752702906728, |
|
"reward_std": 0.617987684905529, |
|
"rewards/cosine_scaled_reward": -0.1371376351453364, |
|
"rewards/format_reward": 0.0, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2954.9027709960938, |
|
"epoch": 0.7455012853470437, |
|
"grad_norm": 0.21946591138839722, |
|
"kl": 0.021759033203125, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": 0.0302, |
|
"reward": 0.07878507301211357, |
|
"reward_std": 0.5823550596833229, |
|
"rewards/cosine_scaled_reward": 0.03939253278076649, |
|
"rewards/format_reward": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2508.8056030273438, |
|
"epoch": 0.7472150814053128, |
|
"grad_norm": 0.18389303982257843, |
|
"kl": 0.022979736328125, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0511, |
|
"reward": 0.11886966414749622, |
|
"reward_std": 0.6237533167004585, |
|
"rewards/cosine_scaled_reward": 0.05943482369184494, |
|
"rewards/format_reward": 0.0, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2480.375030517578, |
|
"epoch": 0.7489288774635818, |
|
"grad_norm": 0.18969161808490753, |
|
"kl": 0.022491455078125, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": -0.0202, |
|
"reward": -0.3488190211355686, |
|
"reward_std": 0.6528129577636719, |
|
"rewards/cosine_scaled_reward": -0.17440950870513916, |
|
"rewards/format_reward": 0.0, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2536.4444580078125, |
|
"epoch": 0.7506426735218509, |
|
"grad_norm": 0.2745197117328644, |
|
"kl": 0.0180511474609375, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": -0.0504, |
|
"reward": 0.2845611646771431, |
|
"reward_std": 0.4479832947254181, |
|
"rewards/cosine_scaled_reward": 0.14228056371212006, |
|
"rewards/format_reward": 0.0, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3101.6528930664062, |
|
"epoch": 0.7523564695801199, |
|
"grad_norm": 0.16732795536518097, |
|
"kl": 0.023773193359375, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": -0.0111, |
|
"reward": -0.2682619922561571, |
|
"reward_std": 0.6106480062007904, |
|
"rewards/cosine_scaled_reward": -0.134130991587881, |
|
"rewards/format_reward": 0.0, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2572.5555725097656, |
|
"epoch": 0.7540702656383891, |
|
"grad_norm": 0.19039277732372284, |
|
"kl": 0.023834228515625, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": -0.0026, |
|
"reward": 0.057762331794947386, |
|
"reward_std": 0.5597369149327278, |
|
"rewards/cosine_scaled_reward": 0.02888116310350597, |
|
"rewards/format_reward": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2865.7916259765625, |
|
"epoch": 0.7557840616966581, |
|
"grad_norm": 0.19560997188091278, |
|
"kl": 0.024139404296875, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0056, |
|
"reward": -0.09995577030349523, |
|
"reward_std": 0.6689890846610069, |
|
"rewards/cosine_scaled_reward": -0.049977882008533925, |
|
"rewards/format_reward": 0.0, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2008.4305419921875, |
|
"epoch": 0.7574978577549272, |
|
"grad_norm": 0.2906733751296997, |
|
"kl": 0.02508544921875, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": -0.0189, |
|
"reward": 0.1475011482834816, |
|
"reward_std": 0.6993541121482849, |
|
"rewards/cosine_scaled_reward": 0.07375057972967625, |
|
"rewards/format_reward": 0.0, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2263.736114501953, |
|
"epoch": 0.7592116538131962, |
|
"grad_norm": 0.27822345495224, |
|
"kl": 0.027435302734375, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": -0.0396, |
|
"reward": 0.08332556113600731, |
|
"reward_std": 0.692223846912384, |
|
"rewards/cosine_scaled_reward": 0.04166277777403593, |
|
"rewards/format_reward": 0.0, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2766.52783203125, |
|
"epoch": 0.7609254498714653, |
|
"grad_norm": 0.20186901092529297, |
|
"kl": 0.02777099609375, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.0635, |
|
"reward": 0.03594814520329237, |
|
"reward_std": 0.6744156032800674, |
|
"rewards/cosine_scaled_reward": 0.017974070739001036, |
|
"rewards/format_reward": 0.0, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3045.90283203125, |
|
"epoch": 0.7626392459297343, |
|
"grad_norm": 0.24945306777954102, |
|
"kl": 0.0214080810546875, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0559, |
|
"reward": -0.03293860936537385, |
|
"reward_std": 0.6841256394982338, |
|
"rewards/cosine_scaled_reward": -0.016469309804961085, |
|
"rewards/format_reward": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2726.013885498047, |
|
"epoch": 0.7643530419880035, |
|
"grad_norm": 0.267711341381073, |
|
"kl": 0.02545166015625, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": -0.0273, |
|
"reward": 0.17773457616567612, |
|
"reward_std": 0.47991518676280975, |
|
"rewards/cosine_scaled_reward": 0.08886728808283806, |
|
"rewards/format_reward": 0.0, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2652.3750610351562, |
|
"epoch": 0.7660668380462725, |
|
"grad_norm": 0.1955571472644806, |
|
"kl": 0.024688720703125, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": -0.0013, |
|
"reward": -0.21300538629293442, |
|
"reward_std": 0.5716921538114548, |
|
"rewards/cosine_scaled_reward": -0.10650269035249949, |
|
"rewards/format_reward": 0.0, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2869.0972290039062, |
|
"epoch": 0.7677806341045416, |
|
"grad_norm": 0.18771569430828094, |
|
"kl": 0.020751953125, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.0449, |
|
"reward": 0.015052955597639084, |
|
"reward_std": 0.6415582820773125, |
|
"rewards/cosine_scaled_reward": 0.007526477798819542, |
|
"rewards/format_reward": 0.0, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2951.72216796875, |
|
"epoch": 0.7694944301628106, |
|
"grad_norm": 0.186003640294075, |
|
"kl": 0.0264739990234375, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": -0.0117, |
|
"reward": -0.1732272356748581, |
|
"reward_std": 0.6033661440014839, |
|
"rewards/cosine_scaled_reward": -0.0866136197000742, |
|
"rewards/format_reward": 0.0, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2393.916717529297, |
|
"epoch": 0.7712082262210797, |
|
"grad_norm": 0.18100591003894806, |
|
"kl": 0.018280029296875, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.0174, |
|
"reward": -0.17222392931580544, |
|
"reward_std": 0.4759965166449547, |
|
"rewards/cosine_scaled_reward": -0.08611196093261242, |
|
"rewards/format_reward": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3247.5555419921875, |
|
"epoch": 0.7729220222793488, |
|
"grad_norm": 0.16927795112133026, |
|
"kl": 0.026092529296875, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0101, |
|
"reward": -0.1602705717086792, |
|
"reward_std": 0.5965098738670349, |
|
"rewards/cosine_scaled_reward": -0.0801352858543396, |
|
"rewards/format_reward": 0.0, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2699.2222900390625, |
|
"epoch": 0.7746358183376179, |
|
"grad_norm": 0.16316962242126465, |
|
"kl": 0.0219879150390625, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.055, |
|
"reward": -0.06079525873064995, |
|
"reward_std": 0.6986799910664558, |
|
"rewards/cosine_scaled_reward": -0.030397622846066952, |
|
"rewards/format_reward": 0.0, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2595.3472290039062, |
|
"epoch": 0.7763496143958869, |
|
"grad_norm": 0.5946676135063171, |
|
"kl": 0.029022216796875, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": -0.0538, |
|
"reward": -0.26486414577811956, |
|
"reward_std": 0.415864534676075, |
|
"rewards/cosine_scaled_reward": -0.13243207102641463, |
|
"rewards/format_reward": 0.0, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2008.9305725097656, |
|
"epoch": 0.778063410454156, |
|
"grad_norm": 0.19199617207050323, |
|
"kl": 0.0153961181640625, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.0063, |
|
"reward": 0.13748213648796082, |
|
"reward_std": 0.6150016859173775, |
|
"rewards/cosine_scaled_reward": 0.06874106079339981, |
|
"rewards/format_reward": 0.0, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2725.0000610351562, |
|
"epoch": 0.779777206512425, |
|
"grad_norm": 0.1928061991930008, |
|
"kl": 0.02520751953125, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0235, |
|
"reward": -0.0030081644654273987, |
|
"reward_std": 0.7155122309923172, |
|
"rewards/cosine_scaled_reward": -0.0015040775761008263, |
|
"rewards/format_reward": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2560.888916015625, |
|
"epoch": 0.781491002570694, |
|
"grad_norm": 0.2278081625699997, |
|
"kl": 0.0233154296875, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0422, |
|
"reward": -0.040069979906547815, |
|
"reward_std": 0.6579814180731773, |
|
"rewards/cosine_scaled_reward": -0.02003499452257529, |
|
"rewards/format_reward": 0.0, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2620.638916015625, |
|
"epoch": 0.7832047986289632, |
|
"grad_norm": 0.2580767571926117, |
|
"kl": 0.0252532958984375, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.033, |
|
"reward": 0.23012623190879822, |
|
"reward_std": 0.6976396143436432, |
|
"rewards/cosine_scaled_reward": 0.11506311595439911, |
|
"rewards/format_reward": 0.0, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2410.513946533203, |
|
"epoch": 0.7849185946872322, |
|
"grad_norm": 0.2296840250492096, |
|
"kl": 0.0242919921875, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": -0.0221, |
|
"reward": -0.00019283778965473175, |
|
"reward_std": 0.6016373038291931, |
|
"rewards/cosine_scaled_reward": -9.64207574725151e-05, |
|
"rewards/format_reward": 0.0, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2265.4583435058594, |
|
"epoch": 0.7866323907455013, |
|
"grad_norm": 0.23107987642288208, |
|
"kl": 0.0201568603515625, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": -0.0309, |
|
"reward": 0.18072006362490356, |
|
"reward_std": 0.5644106566905975, |
|
"rewards/cosine_scaled_reward": 0.09036003064829856, |
|
"rewards/format_reward": 0.0, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2700.541748046875, |
|
"epoch": 0.7883461868037703, |
|
"grad_norm": 0.21530865132808685, |
|
"kl": 0.024566650390625, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.0267, |
|
"reward": 0.030735374661162496, |
|
"reward_std": 0.7207788527011871, |
|
"rewards/cosine_scaled_reward": 0.01536769128870219, |
|
"rewards/format_reward": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2300.8333435058594, |
|
"epoch": 0.7900599828620394, |
|
"grad_norm": 0.31069549918174744, |
|
"kl": 0.023162841796875, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": -0.0563, |
|
"reward": 0.09233328700065613, |
|
"reward_std": 0.7090381979942322, |
|
"rewards/cosine_scaled_reward": 0.046166639775037766, |
|
"rewards/format_reward": 0.0, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2608.8056030273438, |
|
"epoch": 0.7917737789203085, |
|
"grad_norm": 0.20271840691566467, |
|
"kl": 0.0250244140625, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.0362, |
|
"reward": -0.07103721424937248, |
|
"reward_std": 0.7956888303160667, |
|
"rewards/cosine_scaled_reward": -0.03551860898733139, |
|
"rewards/format_reward": 0.0, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2780.4444580078125, |
|
"epoch": 0.7934875749785776, |
|
"grad_norm": 0.2081584334373474, |
|
"kl": 0.0209808349609375, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": -0.0471, |
|
"reward": 0.021411696448922157, |
|
"reward_std": 0.48744403570890427, |
|
"rewards/cosine_scaled_reward": 0.010705851949751377, |
|
"rewards/format_reward": 0.0, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2673.4861450195312, |
|
"epoch": 0.7952013710368466, |
|
"grad_norm": 0.19560140371322632, |
|
"kl": 0.0238037109375, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": -0.0409, |
|
"reward": 0.18470758572220802, |
|
"reward_std": 0.5649774596095085, |
|
"rewards/cosine_scaled_reward": 0.09235379751771688, |
|
"rewards/format_reward": 0.0, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3105.27783203125, |
|
"epoch": 0.7969151670951157, |
|
"grad_norm": 0.20849952101707458, |
|
"kl": 0.030609130859375, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.041, |
|
"reward": -0.284846730530262, |
|
"reward_std": 0.5418054684996605, |
|
"rewards/cosine_scaled_reward": -0.14242336247116327, |
|
"rewards/format_reward": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2965.8611450195312, |
|
"epoch": 0.7986289631533847, |
|
"grad_norm": 0.17242176830768585, |
|
"kl": 0.025604248046875, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0034, |
|
"reward": -0.3226154297590256, |
|
"reward_std": 0.5333989933133125, |
|
"rewards/cosine_scaled_reward": -0.16130771208554506, |
|
"rewards/format_reward": 0.0, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3079.6806030273438, |
|
"epoch": 0.8003427592116538, |
|
"grad_norm": 0.20415925979614258, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.0969, |
|
"reward": -0.24770671501755714, |
|
"reward_std": 0.5701889246702194, |
|
"rewards/cosine_scaled_reward": -0.12385335750877857, |
|
"rewards/format_reward": 0.0, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2234.986083984375, |
|
"epoch": 0.8020565552699229, |
|
"grad_norm": 0.3467549979686737, |
|
"kl": 0.024200439453125, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.0855, |
|
"reward": -0.009742069989442825, |
|
"reward_std": 0.591868631541729, |
|
"rewards/cosine_scaled_reward": -0.004871031269431114, |
|
"rewards/format_reward": 0.0, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.0833435058594, |
|
"epoch": 0.803770351328192, |
|
"grad_norm": 0.20191779732704163, |
|
"kl": 0.0237579345703125, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": 0.0622, |
|
"reward": 0.058277749456465244, |
|
"reward_std": 0.7684449702501297, |
|
"rewards/cosine_scaled_reward": 0.029138876125216484, |
|
"rewards/format_reward": 0.0, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2607.1388549804688, |
|
"epoch": 0.805484147386461, |
|
"grad_norm": 0.2015339732170105, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": -0.0145, |
|
"reward": -0.2184823751449585, |
|
"reward_std": 0.5644990280270576, |
|
"rewards/cosine_scaled_reward": -0.10924118757247925, |
|
"rewards/format_reward": 0.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3260.3333740234375, |
|
"epoch": 0.8071979434447301, |
|
"grad_norm": 0.2259937822818756, |
|
"kl": 0.02581787109375, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0565, |
|
"reward": -0.22493689320981503, |
|
"reward_std": 0.5675350055098534, |
|
"rewards/cosine_scaled_reward": -0.11246845219284296, |
|
"rewards/format_reward": 0.0, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2854.1806030273438, |
|
"epoch": 0.8089117395029991, |
|
"grad_norm": 0.21495820581912994, |
|
"kl": 0.028594970703125, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": -0.0313, |
|
"reward": 0.14072632044553757, |
|
"reward_std": 0.6395101621747017, |
|
"rewards/cosine_scaled_reward": 0.07036316394805908, |
|
"rewards/format_reward": 0.0, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2890.4444580078125, |
|
"epoch": 0.8106255355612683, |
|
"grad_norm": 0.18685470521450043, |
|
"kl": 0.023468017578125, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": -0.0026, |
|
"reward": -0.1240294948220253, |
|
"reward_std": 0.48969001322984695, |
|
"rewards/cosine_scaled_reward": -0.06201474368572235, |
|
"rewards/format_reward": 0.0, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2470.0972900390625, |
|
"epoch": 0.8123393316195373, |
|
"grad_norm": 0.2818465232849121, |
|
"kl": 0.029144287109375, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.0035, |
|
"reward": 0.24050107831135392, |
|
"reward_std": 0.5852163806557655, |
|
"rewards/cosine_scaled_reward": 0.12025054381228983, |
|
"rewards/format_reward": 0.0, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2933.541748046875, |
|
"epoch": 0.8140531276778064, |
|
"grad_norm": 0.19675259292125702, |
|
"kl": 0.027496337890625, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.0321, |
|
"reward": -0.22623535431921482, |
|
"reward_std": 0.6677599251270294, |
|
"rewards/cosine_scaled_reward": -0.1131176782073453, |
|
"rewards/format_reward": 0.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2794.9862060546875, |
|
"epoch": 0.8157669237360754, |
|
"grad_norm": 0.24406589567661285, |
|
"kl": 0.02947998046875, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": -0.0179, |
|
"reward": -0.3521595522761345, |
|
"reward_std": 0.5985631048679352, |
|
"rewards/cosine_scaled_reward": -0.17607977613806725, |
|
"rewards/format_reward": 0.0, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2567.4722290039062, |
|
"epoch": 0.8174807197943444, |
|
"grad_norm": 0.17105937004089355, |
|
"kl": 0.016357421875, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.0385, |
|
"reward": -0.2558911629021168, |
|
"reward_std": 0.4246537983417511, |
|
"rewards/cosine_scaled_reward": -0.12794558703899384, |
|
"rewards/format_reward": 0.0, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2355.6806030273438, |
|
"epoch": 0.8191945158526135, |
|
"grad_norm": 0.24741511046886444, |
|
"kl": 0.02423095703125, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0854, |
|
"reward": -0.1565770129673183, |
|
"reward_std": 0.6820876449346542, |
|
"rewards/cosine_scaled_reward": -0.07828850811347365, |
|
"rewards/format_reward": 0.0, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2624.0833435058594, |
|
"epoch": 0.8209083119108826, |
|
"grad_norm": 0.20646637678146362, |
|
"kl": 0.0313720703125, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.0146, |
|
"reward": 0.1560894399881363, |
|
"reward_std": 0.5770560130476952, |
|
"rewards/cosine_scaled_reward": 0.0780447069555521, |
|
"rewards/format_reward": 0.0, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2327.7222290039062, |
|
"epoch": 0.8226221079691517, |
|
"grad_norm": 0.23056431114673615, |
|
"kl": 0.01910400390625, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.0635, |
|
"reward": 0.149917745962739, |
|
"reward_std": 0.7103602811694145, |
|
"rewards/cosine_scaled_reward": 0.07495887111872435, |
|
"rewards/format_reward": 0.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2461.5972900390625, |
|
"epoch": 0.8243359040274207, |
|
"grad_norm": 0.24706712365150452, |
|
"kl": 0.03350830078125, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0029, |
|
"reward": -0.1800133902579546, |
|
"reward_std": 0.49367547780275345, |
|
"rewards/cosine_scaled_reward": -0.09000669163651764, |
|
"rewards/format_reward": 0.0, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2864.0694580078125, |
|
"epoch": 0.8260497000856898, |
|
"grad_norm": 0.25076547265052795, |
|
"kl": 0.021514892578125, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": 0.0445, |
|
"reward": -0.14674655348062515, |
|
"reward_std": 0.5242787301540375, |
|
"rewards/cosine_scaled_reward": -0.07337328046560287, |
|
"rewards/format_reward": 0.0, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2596.874969482422, |
|
"epoch": 0.8277634961439588, |
|
"grad_norm": 0.2056618481874466, |
|
"kl": 0.02398681640625, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.0449, |
|
"reward": -0.12835523579269648, |
|
"reward_std": 0.5452019795775414, |
|
"rewards/cosine_scaled_reward": -0.06417762162163854, |
|
"rewards/format_reward": 0.0, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2179.500030517578, |
|
"epoch": 0.829477292202228, |
|
"grad_norm": 0.17514649033546448, |
|
"kl": 0.0175018310546875, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.058, |
|
"reward": -0.154528075363487, |
|
"reward_std": 0.5336438938975334, |
|
"rewards/cosine_scaled_reward": -0.07726403628475964, |
|
"rewards/format_reward": 0.0, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2221.6666259765625, |
|
"epoch": 0.831191088260497, |
|
"grad_norm": 0.2613643705844879, |
|
"kl": 0.028350830078125, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.0496, |
|
"reward": -0.2054775208234787, |
|
"reward_std": 0.5721682235598564, |
|
"rewards/cosine_scaled_reward": -0.1027387659996748, |
|
"rewards/format_reward": 0.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2448.3193969726562, |
|
"epoch": 0.8329048843187661, |
|
"grad_norm": 0.2419876903295517, |
|
"kl": 0.028411865234375, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": -0.0397, |
|
"reward": -0.06807173043489456, |
|
"reward_std": 0.507116761058569, |
|
"rewards/cosine_scaled_reward": -0.03403585962951183, |
|
"rewards/format_reward": 0.0, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2735.7084045410156, |
|
"epoch": 0.8346186803770351, |
|
"grad_norm": 0.2406454235315323, |
|
"kl": 0.0244140625, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.0244, |
|
"reward": -0.2896123267710209, |
|
"reward_std": 0.5326507315039635, |
|
"rewards/cosine_scaled_reward": -0.1448061689734459, |
|
"rewards/format_reward": 0.0, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2310.27783203125, |
|
"epoch": 0.8363324764353042, |
|
"grad_norm": 0.252458781003952, |
|
"kl": 0.021209716796875, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.0239, |
|
"reward": 0.10779337584972382, |
|
"reward_std": 0.6982715576887131, |
|
"rewards/cosine_scaled_reward": 0.053896697354502976, |
|
"rewards/format_reward": 0.0, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2743.541748046875, |
|
"epoch": 0.8380462724935732, |
|
"grad_norm": 0.17434372007846832, |
|
"kl": 0.024749755859375, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.0124, |
|
"reward": -0.16761679388582706, |
|
"reward_std": 0.6089917570352554, |
|
"rewards/cosine_scaled_reward": -0.08380839880555868, |
|
"rewards/format_reward": 0.0, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2486.236114501953, |
|
"epoch": 0.8397600685518424, |
|
"grad_norm": 0.2090701311826706, |
|
"kl": 0.027984619140625, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.0036, |
|
"reward": -0.05014536017552018, |
|
"reward_std": 0.5763295590877533, |
|
"rewards/cosine_scaled_reward": -0.025072677060961723, |
|
"rewards/format_reward": 0.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2848.888916015625, |
|
"epoch": 0.8414738646101114, |
|
"grad_norm": 0.22741778194904327, |
|
"kl": 0.029052734375, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.0629, |
|
"reward": -0.16439465060830116, |
|
"reward_std": 0.6782207787036896, |
|
"rewards/cosine_scaled_reward": -0.08219731226563454, |
|
"rewards/format_reward": 0.0, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3189.6250610351562, |
|
"epoch": 0.8431876606683805, |
|
"grad_norm": 0.17802605032920837, |
|
"kl": 0.028839111328125, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.0435, |
|
"reward": -0.35712628811597824, |
|
"reward_std": 0.6088190823793411, |
|
"rewards/cosine_scaled_reward": -0.17856314033269882, |
|
"rewards/format_reward": 0.0, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2387.888885498047, |
|
"epoch": 0.8449014567266495, |
|
"grad_norm": 0.2299438714981079, |
|
"kl": 0.025177001953125, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.0576, |
|
"reward": -0.17006561160087585, |
|
"reward_std": 0.3991905003786087, |
|
"rewards/cosine_scaled_reward": -0.08503280207514763, |
|
"rewards/format_reward": 0.0, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2941.8611450195312, |
|
"epoch": 0.8466152527849186, |
|
"grad_norm": 0.18643692135810852, |
|
"kl": 0.0261383056640625, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.0474, |
|
"reward": -0.04825907852500677, |
|
"reward_std": 0.6627323552966118, |
|
"rewards/cosine_scaled_reward": -0.02412955043837428, |
|
"rewards/format_reward": 0.0, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2563.375030517578, |
|
"epoch": 0.8483290488431876, |
|
"grad_norm": 0.32022979855537415, |
|
"kl": 0.03094482421875, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": -0.0218, |
|
"reward": 0.03584544826298952, |
|
"reward_std": 0.5564405769109726, |
|
"rewards/cosine_scaled_reward": 0.017922731814906, |
|
"rewards/format_reward": 0.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2572.77783203125, |
|
"epoch": 0.8500428449014568, |
|
"grad_norm": 0.23663800954818726, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0756, |
|
"reward": 0.13385188579559326, |
|
"reward_std": 0.6874089986085892, |
|
"rewards/cosine_scaled_reward": 0.06692593172192574, |
|
"rewards/format_reward": 0.0, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2480.5694274902344, |
|
"epoch": 0.8517566409597258, |
|
"grad_norm": 0.2888961732387543, |
|
"kl": 0.0250244140625, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.1087, |
|
"reward": 0.2151249535381794, |
|
"reward_std": 0.7869587689638138, |
|
"rewards/cosine_scaled_reward": 0.10756248049438, |
|
"rewards/format_reward": 0.0, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3000.1666259765625, |
|
"epoch": 0.8534704370179949, |
|
"grad_norm": 0.16567862033843994, |
|
"kl": 0.0272216796875, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": -0.0254, |
|
"reward": -0.33249833807349205, |
|
"reward_std": 0.48314109444618225, |
|
"rewards/cosine_scaled_reward": -0.16624917834997177, |
|
"rewards/format_reward": 0.0, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2385.5416870117188, |
|
"epoch": 0.8551842330762639, |
|
"grad_norm": 0.20905697345733643, |
|
"kl": 0.027862548828125, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.017, |
|
"reward": 0.046394890174269676, |
|
"reward_std": 0.649334043264389, |
|
"rewards/cosine_scaled_reward": 0.02319744322448969, |
|
"rewards/format_reward": 0.0, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1828.2777862548828, |
|
"epoch": 0.856898029134533, |
|
"grad_norm": 0.25425171852111816, |
|
"kl": 0.0164031982421875, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": -0.0015, |
|
"reward": 0.1327105201780796, |
|
"reward_std": 0.5622994378209114, |
|
"rewards/cosine_scaled_reward": 0.06635526567697525, |
|
"rewards/format_reward": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.856898029134533, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.01978990149567835, |
|
"train_runtime": 91059.0796, |
|
"train_samples_per_second": 0.395, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|