{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2172088660709878, "eval_steps": 500, "global_step": 275, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3113.8125, "completions/max_terminated_length": 3113.8125, "completions/mean_length": 1242.390625, "completions/mean_terminated_length": 1679.933292388916, "completions/min_length": 0.0, "completions/min_terminated_length": 796.5, "epoch": 0.0003949252110381596, "grad_norm": 0.38875215842481353, "learning_rate": 0.0, "loss": 0.0278, "num_tokens": 333380.0, "reward": 0.328125, "reward_std": 0.3151498883962631, "rewards/reward_model/mean": 0.328125, "rewards/reward_model/std": 0.3151498921215534, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2443.5625, "completions/max_terminated_length": 2443.5625, "completions/mean_length": 1032.63671875, "completions/mean_terminated_length": 1376.8490009307861, "completions/min_length": 0.0, "completions/min_terminated_length": 708.5, "epoch": 0.0007898504220763192, "grad_norm": 0.3211156233676256, "learning_rate": 5e-08, "loss": -0.0277, "num_tokens": 615255.0, "reward": 0.29296875, "reward_std": 0.20371346175670624, "rewards/reward_model/mean": 0.29296875, "rewards/reward_model/std": 0.20371347293257713, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3028.5, "completions/max_terminated_length": 3028.5, "completions/mean_length": 1184.58984375, "completions/mean_terminated_length": 1579.4531784057617, "completions/min_length": 0.0, "completions/min_terminated_length": 793.5, "epoch": 0.001184775633114479, "grad_norm": 0.47798488969227343, "learning_rate": 1e-07, "loss": -0.031, "num_tokens": 937870.0, "reward": 0.296875, "reward_std": 0.32350922748446465, "rewards/reward_model/mean": 0.296875, "rewards/reward_model/std": 0.32350924238562584, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26953125, "completions/max_length": 2960.5625, "completions/max_terminated_length": 2960.5625, "completions/mean_length": 1169.2265625, "completions/mean_terminated_length": 1704.0990257263184, "completions/min_length": 0.0, "completions/min_terminated_length": 1018.1875, "epoch": 0.0015797008441526385, "grad_norm": 0.3707091421061334, "learning_rate": 1.5e-07, "loss": -0.0039, "num_tokens": 1255160.0, "reward": 0.24609375, "reward_std": 0.23525599017739296, "rewards/reward_model/mean": 0.24609375, "rewards/reward_model/std": 0.23525600880384445, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2851.5625, "completions/max_terminated_length": 2851.5625, "completions/mean_length": 1148.91796875, "completions/mean_terminated_length": 1531.8906707763672, "completions/min_length": 0.0, "completions/min_terminated_length": 758.25, "epoch": 0.0019746260551907983, "grad_norm": 0.4435512817720059, "learning_rate": 2e-07, "loss": -0.1241, "num_tokens": 1567747.0, "reward": 0.34375, "reward_std": 0.25076062977313995, "rewards/reward_model/mean": 0.34375, "rewards/reward_model/std": 0.2507606353610754, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1590.4375, "completions/max_terminated_length": 1590.4375, "completions/mean_length": 803.3203125, "completions/mean_terminated_length": 1071.0937881469727, "completions/min_length": 0.0, "completions/min_terminated_length": 719.5625, "epoch": 0.002369551266228958, "grad_norm": 0.503573864277716, "learning_rate": 2.5e-07, "loss": -0.0106, "num_tokens": 1789237.0, "reward": 0.37890625, "reward_std": 0.2071847803890705, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.2071847841143608, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2995.3125, "completions/max_terminated_length": 2995.3125, "completions/mean_length": 1147.73046875, "completions/mean_terminated_length": 1530.3073272705078, "completions/min_length": 0.0, "completions/min_terminated_length": 721.75, "epoch": 0.0027644764772671174, "grad_norm": 0.39602020984344516, "learning_rate": 3e-07, "loss": 0.0133, "num_tokens": 2098048.0, "reward": 0.41015625, "reward_std": 0.3286610618233681, "rewards/reward_model/mean": 0.41015625, "rewards/reward_model/std": 0.32866106927394867, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2684.3125, "completions/max_terminated_length": 2684.3125, "completions/mean_length": 1058.1484375, "completions/mean_terminated_length": 1428.1804428100586, "completions/min_length": 0.0, "completions/min_terminated_length": 787.375, "epoch": 0.003159401688305277, "grad_norm": 0.46088421956611847, "learning_rate": 3.5e-07, "loss": 0.0366, "num_tokens": 2385942.0, "reward": 0.40234375, "reward_std": 0.26605773344635963, "rewards/reward_model/mean": 0.40234375, "rewards/reward_model/std": 0.26605774462223053, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2071.375, "completions/max_terminated_length": 2071.375, "completions/mean_length": 860.8359375, "completions/mean_terminated_length": 1147.7812728881836, "completions/min_length": 0.0, "completions/min_terminated_length": 722.9375, "epoch": 0.003554326899343437, "grad_norm": 0.5226548579322207, "learning_rate": 4e-07, "loss": -0.034, "num_tokens": 2621052.0, "reward": 0.35546875, "reward_std": 0.33395427092909813, "rewards/reward_model/mean": 0.35546875, "rewards/reward_model/std": 0.3339542802423239, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2918.8125, "completions/max_terminated_length": 2918.8125, "completions/mean_length": 1189.6171875, "completions/mean_terminated_length": 1600.7117042541504, "completions/min_length": 0.0, "completions/min_terminated_length": 854.0625, "epoch": 0.0039492521103815965, "grad_norm": 0.4388231429071249, "learning_rate": 4.5e-07, "loss": -0.0418, "num_tokens": 2944218.0, "reward": 0.33203125, "reward_std": 0.29445402696728706, "rewards/reward_model/mean": 0.33203125, "rewards/reward_model/std": 0.2944540400058031, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3076.125, "completions/max_terminated_length": 3076.125, "completions/mean_length": 1289.80859375, "completions/mean_terminated_length": 1719.7448539733887, "completions/min_length": 0.0, "completions/min_terminated_length": 945.0625, "epoch": 0.0043441773214197565, "grad_norm": 0.3709169057319495, "learning_rate": 5e-07, "loss": 0.0011, "num_tokens": 3291225.0, "reward": 0.32421875, "reward_std": 0.27493180707097054, "rewards/reward_model/mean": 0.32421875, "rewards/reward_model/std": 0.27493181452155113, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 3346.5625, "completions/max_terminated_length": 3346.5625, "completions/mean_length": 1135.9609375, "completions/mean_terminated_length": 1643.177116394043, "completions/min_length": 0.0, "completions/min_terminated_length": 851.8125, "epoch": 0.004739102532457916, "grad_norm": 0.4428751873272317, "learning_rate": 5.5e-07, "loss": -0.0752, "num_tokens": 3599439.0, "reward": 0.30078125, "reward_std": 0.2841016612946987, "rewards/reward_model/mean": 0.30078125, "rewards/reward_model/std": 0.2841016724705696, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1006.58984375, "completions/mean_terminated_length": 1342.1198043823242, "completions/min_length": 0.0, "completions/min_terminated_length": 759.8125, "epoch": 0.005134027743496076, "grad_norm": 0.5080733092482914, "learning_rate": 6e-07, "loss": -0.0281, "num_tokens": 3874902.0, "reward": 0.44140625, "reward_std": 0.31689510121941566, "rewards/reward_model/mean": 0.44140625, "rewards/reward_model/std": 0.31689511984586716, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2509.5625, "completions/max_terminated_length": 2509.5625, "completions/mean_length": 1048.6796875, "completions/mean_terminated_length": 1398.239631652832, "completions/min_length": 0.0, "completions/min_terminated_length": 716.3125, "epoch": 0.005528952954534235, "grad_norm": 0.4862243267128402, "learning_rate": 6.5e-07, "loss": 0.0, "num_tokens": 4158660.0, "reward": 0.3671875, "reward_std": 0.3822711631655693, "rewards/reward_model/mean": 0.3671875, "rewards/reward_model/std": 0.38227117620408535, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2615.5625, "completions/max_terminated_length": 2615.5625, "completions/mean_length": 1101.92578125, "completions/mean_terminated_length": 1469.2344284057617, "completions/min_length": 0.0, "completions/min_terminated_length": 797.5, "epoch": 0.005923878165572395, "grad_norm": 0.4300337340669923, "learning_rate": 7e-07, "loss": -0.0138, "num_tokens": 4457921.0, "reward": 0.39453125, "reward_std": 0.22028234973549843, "rewards/reward_model/mean": 0.39453125, "rewards/reward_model/std": 0.22028235904872417, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2101.8125, "completions/max_terminated_length": 2101.8125, "completions/mean_length": 880.39453125, "completions/mean_terminated_length": 1184.8409461975098, "completions/min_length": 0.0, "completions/min_terminated_length": 738.5625, "epoch": 0.006318803376610554, "grad_norm": 0.5580163395677518, "learning_rate": 7.5e-07, "loss": -0.0358, "num_tokens": 4699654.0, "reward": 0.453125, "reward_std": 0.39126603677868843, "rewards/reward_model/mean": 0.453125, "rewards/reward_model/std": 0.3912660516798496, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2581.25, "completions/max_terminated_length": 2581.25, "completions/mean_length": 1041.62890625, "completions/mean_terminated_length": 1401.0099906921387, "completions/min_length": 0.0, "completions/min_terminated_length": 803.3125, "epoch": 0.006713728587648714, "grad_norm": 0.41730422528440014, "learning_rate": 8e-07, "loss": -0.0402, "num_tokens": 4982183.0, "reward": 0.1640625, "reward_std": 0.2380094714462757, "rewards/reward_model/mean": 0.1640625, "rewards/reward_model/std": 0.23800948448479176, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2158.625, "completions/max_terminated_length": 2158.625, "completions/mean_length": 952.8203125, "completions/mean_terminated_length": 1270.427116394043, "completions/min_length": 0.0, "completions/min_terminated_length": 710.625, "epoch": 0.007108653798686874, "grad_norm": 0.46220212855997156, "learning_rate": 8.499999999999999e-07, "loss": 0.0657, "num_tokens": 5241369.0, "reward": 0.5, "reward_std": 0.3255753889679909, "rewards/reward_model/mean": 0.5, "rewards/reward_model/std": 0.32557540014386177, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 2535.8125, "completions/max_terminated_length": 2535.8125, "completions/mean_length": 1061.421875, "completions/mean_terminated_length": 1463.4771423339844, "completions/min_length": 0.0, "completions/min_terminated_length": 723.875, "epoch": 0.007503579009725033, "grad_norm": 0.4356229559411334, "learning_rate": 9e-07, "loss": -0.0077, "num_tokens": 5527381.0, "reward": 0.3125, "reward_std": 0.259771503508091, "rewards/reward_model/mean": 0.3125, "rewards/reward_model/std": 0.2597715128213167, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2183.5, "completions/max_terminated_length": 2183.5, "completions/mean_length": 900.3828125, "completions/mean_terminated_length": 1205.612247467041, "completions/min_length": 0.0, "completions/min_terminated_length": 653.625, "epoch": 0.007898504220763193, "grad_norm": 0.33704962937077226, "learning_rate": 9.499999999999999e-07, "loss": -0.0344, "num_tokens": 5772135.0, "reward": 0.34765625, "reward_std": 0.20656243339180946, "rewards/reward_model/mean": 0.34765625, "rewards/reward_model/std": 0.20656244084239006, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2647.9375, "completions/max_terminated_length": 2647.9375, "completions/mean_length": 1001.4375, "completions/mean_terminated_length": 1335.2500381469727, "completions/min_length": 0.0, "completions/min_terminated_length": 714.75, "epoch": 0.008293429431801353, "grad_norm": 0.4437676020200186, "learning_rate": 1e-06, "loss": -0.028, "num_tokens": 6047127.0, "reward": 0.421875, "reward_std": 0.31513818353414536, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.31513819471001625, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2363.25, "completions/max_terminated_length": 2363.25, "completions/mean_length": 1020.97265625, "completions/mean_terminated_length": 1361.2969055175781, "completions/min_length": 0.0, "completions/min_terminated_length": 696.0, "epoch": 0.008688354642839513, "grad_norm": 0.4683664511149398, "learning_rate": 9.989795918367346e-07, "loss": -0.113, "num_tokens": 6325248.0, "reward": 0.33203125, "reward_std": 0.26558637991547585, "rewards/reward_model/mean": 0.33203125, "rewards/reward_model/std": 0.2655863892287016, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3520.8125, "completions/max_terminated_length": 3520.8125, "completions/mean_length": 1387.30078125, "completions/mean_terminated_length": 1849.734426498413, "completions/min_length": 0.0, "completions/min_terminated_length": 918.9375, "epoch": 0.009083279853877671, "grad_norm": 0.4031063983878768, "learning_rate": 9.979591836734694e-07, "loss": -0.0364, "num_tokens": 6698925.0, "reward": 0.42578125, "reward_std": 0.2965182662010193, "rewards/reward_model/mean": 0.42578125, "rewards/reward_model/std": 0.29651827923953533, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1845.5, "completions/max_terminated_length": 1845.5, "completions/mean_length": 803.28515625, "completions/mean_terminated_length": 1071.0469093322754, "completions/min_length": 0.0, "completions/min_terminated_length": 660.0625, "epoch": 0.009478205064915831, "grad_norm": 0.38849683099787563, "learning_rate": 9.96938775510204e-07, "loss": -0.022, "num_tokens": 6921318.0, "reward": 0.39453125, "reward_std": 0.1498619243502617, "rewards/reward_model/mean": 0.39453125, "rewards/reward_model/std": 0.14986192621290684, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 1014.53515625, "completions/mean_terminated_length": 1352.7135887145996, "completions/min_length": 0.0, "completions/min_terminated_length": 812.625, "epoch": 0.009873130275953991, "grad_norm": 0.4684514861326516, "learning_rate": 9.959183673469387e-07, "loss": -0.0412, "num_tokens": 7197119.0, "reward": 0.38671875, "reward_std": 0.35412105545401573, "rewards/reward_model/mean": 0.38671875, "rewards/reward_model/std": 0.3541210740804672, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2465.375, "completions/max_terminated_length": 2465.375, "completions/mean_length": 1086.890625, "completions/mean_terminated_length": 1449.1875305175781, "completions/min_length": 0.0, "completions/min_terminated_length": 814.5625, "epoch": 0.010268055486992151, "grad_norm": 0.3492244753302287, "learning_rate": 9.948979591836735e-07, "loss": 0.0349, "num_tokens": 7491363.0, "reward": 0.5, "reward_std": 0.18871554359793663, "rewards/reward_model/mean": 0.5, "rewards/reward_model/std": 0.18871555663645267, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2955.5625, "completions/max_terminated_length": 2955.5625, "completions/mean_length": 1103.4453125, "completions/mean_terminated_length": 1471.2604446411133, "completions/min_length": 0.0, "completions/min_terminated_length": 717.8125, "epoch": 0.010662980698030311, "grad_norm": 0.38296095372377004, "learning_rate": 9.938775510204081e-07, "loss": -0.0153, "num_tokens": 7789493.0, "reward": 0.28515625, "reward_std": 0.22996040806174278, "rewards/reward_model/mean": 0.28515625, "rewards/reward_model/std": 0.22996041178703308, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2207.8125, "completions/max_terminated_length": 2207.8125, "completions/mean_length": 912.5546875, "completions/mean_terminated_length": 1216.7396202087402, "completions/min_length": 0.0, "completions/min_terminated_length": 718.875, "epoch": 0.01105790590906847, "grad_norm": 0.5139390144372692, "learning_rate": 9.92857142857143e-07, "loss": -0.0645, "num_tokens": 8039139.0, "reward": 0.57421875, "reward_std": 0.3620787709951401, "rewards/reward_model/mean": 0.57421875, "rewards/reward_model/std": 0.3620787784457207, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2443.75, "completions/max_terminated_length": 2443.75, "completions/mean_length": 1040.1171875, "completions/mean_terminated_length": 1386.8229675292969, "completions/min_length": 0.0, "completions/min_terminated_length": 780.6875, "epoch": 0.01145283112010663, "grad_norm": 0.3883786829100392, "learning_rate": 9.918367346938776e-07, "loss": -0.0283, "num_tokens": 8321985.0, "reward": 0.390625, "reward_std": 0.2613418586552143, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.26134187169373035, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2958.875, "completions/max_terminated_length": 2958.875, "completions/mean_length": 1140.53125, "completions/mean_terminated_length": 1520.708381652832, "completions/min_length": 0.0, "completions/min_terminated_length": 733.9375, "epoch": 0.01184775633114479, "grad_norm": 0.4793822157843747, "learning_rate": 9.908163265306122e-07, "loss": -0.0342, "num_tokens": 8632569.0, "reward": 0.40625, "reward_std": 0.3856222741305828, "rewards/reward_model/mean": 0.40625, "rewards/reward_model/std": 0.38562229461967945, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2621.5625, "completions/max_terminated_length": 2621.5625, "completions/mean_length": 989.546875, "completions/mean_terminated_length": 1319.395866394043, "completions/min_length": 0.0, "completions/min_terminated_length": 687.375, "epoch": 0.01224268154218295, "grad_norm": 0.48114894543908415, "learning_rate": 9.897959183673468e-07, "loss": -0.0211, "num_tokens": 8901029.0, "reward": 0.3046875, "reward_std": 0.23733042925596237, "rewards/reward_model/mean": 0.3046875, "rewards/reward_model/std": 0.23733043298125267, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2875.5625, "completions/max_terminated_length": 2875.5625, "completions/mean_length": 1087.28515625, "completions/mean_terminated_length": 1449.7135734558105, "completions/min_length": 0.0, "completions/min_terminated_length": 775.6875, "epoch": 0.012637606753221108, "grad_norm": 0.4008155000776, "learning_rate": 9.887755102040816e-07, "loss": 0.0005, "num_tokens": 9196334.0, "reward": 0.3046875, "reward_std": 0.25789543241262436, "rewards/reward_model/mean": 0.3046875, "rewards/reward_model/std": 0.2578954380005598, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2073.3125, "completions/max_terminated_length": 2073.3125, "completions/mean_length": 947.82421875, "completions/mean_terminated_length": 1263.76566696167, "completions/min_length": 0.0, "completions/min_terminated_length": 752.25, "epoch": 0.013032531964259268, "grad_norm": 0.5303368240946322, "learning_rate": 9.877551020408163e-07, "loss": -0.0272, "num_tokens": 9455761.0, "reward": 0.515625, "reward_std": 0.4127383530139923, "rewards/reward_model/mean": 0.515625, "rewards/reward_model/std": 0.41273836605250835, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3055.5, "completions/max_terminated_length": 3055.5, "completions/mean_length": 1105.40234375, "completions/mean_terminated_length": 1713.9114799499512, "completions/min_length": 0.0, "completions/min_terminated_length": 938.8125, "epoch": 0.013427457175297428, "grad_norm": 0.3478699823141093, "learning_rate": 9.867346938775509e-07, "loss": 0.0241, "num_tokens": 9757416.0, "reward": 0.27734375, "reward_std": 0.19523243233561516, "rewards/reward_model/mean": 0.27734375, "rewards/reward_model/std": 0.1952324453741312, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2318.125, "completions/max_terminated_length": 2318.125, "completions/mean_length": 1060.30078125, "completions/mean_terminated_length": 1413.7344093322754, "completions/min_length": 0.0, "completions/min_terminated_length": 786.0625, "epoch": 0.013822382386335588, "grad_norm": 0.5088860431107934, "learning_rate": 9.857142857142857e-07, "loss": 0.0026, "num_tokens": 10047973.0, "reward": 0.49609375, "reward_std": 0.39163894578814507, "rewards/reward_model/mean": 0.49609375, "rewards/reward_model/std": 0.39163894951343536, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2345.1875, "completions/max_terminated_length": 2345.1875, "completions/mean_length": 1056.265625, "completions/mean_terminated_length": 1408.3542022705078, "completions/min_length": 0.0, "completions/min_terminated_length": 819.6875, "epoch": 0.014217307597373748, "grad_norm": 0.4698692851620345, "learning_rate": 9.846938775510203e-07, "loss": -0.0113, "num_tokens": 10334825.0, "reward": 0.3828125, "reward_std": 0.30952034518122673, "rewards/reward_model/mean": 0.3828125, "rewards/reward_model/std": 0.3095203619450331, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2361.5, "completions/max_terminated_length": 2361.5, "completions/mean_length": 986.6875, "completions/mean_terminated_length": 1315.5833702087402, "completions/min_length": 0.0, "completions/min_terminated_length": 727.4375, "epoch": 0.014612232808411906, "grad_norm": 0.46270967302847954, "learning_rate": 9.836734693877552e-07, "loss": 0.005, "num_tokens": 10605337.0, "reward": 0.40234375, "reward_std": 0.31677281856536865, "rewards/reward_model/mean": 0.40234375, "rewards/reward_model/std": 0.3167728278785944, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2962.3125, "completions/max_terminated_length": 2962.3125, "completions/mean_length": 1296.54296875, "completions/mean_terminated_length": 1728.724006652832, "completions/min_length": 0.0, "completions/min_terminated_length": 885.5, "epoch": 0.015007158019450066, "grad_norm": 0.41859639791102693, "learning_rate": 9.826530612244898e-07, "loss": -0.0254, "num_tokens": 10955092.0, "reward": 0.28515625, "reward_std": 0.2941643111407757, "rewards/reward_model/mean": 0.28515625, "rewards/reward_model/std": 0.2941643241792917, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2610.6875, "completions/max_terminated_length": 2610.6875, "completions/mean_length": 1062.70703125, "completions/mean_terminated_length": 1426.4280700683594, "completions/min_length": 0.0, "completions/min_terminated_length": 775.8125, "epoch": 0.015402083230488226, "grad_norm": 0.3913023432835488, "learning_rate": 9.816326530612244e-07, "loss": 0.0004, "num_tokens": 11244873.0, "reward": 0.421875, "reward_std": 0.2876000702381134, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.2876000739634037, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2776.375, "completions/max_terminated_length": 2776.375, "completions/mean_length": 1099.109375, "completions/mean_terminated_length": 1471.365577697754, "completions/min_length": 0.0, "completions/min_terminated_length": 765.5, "epoch": 0.015797008441526386, "grad_norm": 0.42719124486076776, "learning_rate": 9.80612244897959e-07, "loss": -0.0123, "num_tokens": 11543749.0, "reward": 0.390625, "reward_std": 0.3086152523756027, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.3086152598261833, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2463.9375, "completions/max_terminated_length": 2463.9375, "completions/mean_length": 1056.96484375, "completions/mean_terminated_length": 1409.2864952087402, "completions/min_length": 0.0, "completions/min_terminated_length": 815.5625, "epoch": 0.016191933652564546, "grad_norm": 0.42689843611464884, "learning_rate": 9.795918367346939e-07, "loss": -0.0392, "num_tokens": 11830972.0, "reward": 0.40625, "reward_std": 0.3572099804878235, "rewards/reward_model/mean": 0.40625, "rewards/reward_model/std": 0.35720999725162983, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3005.9375, "completions/max_terminated_length": 3005.9375, "completions/mean_length": 1241.65234375, "completions/mean_terminated_length": 1655.5364875793457, "completions/min_length": 0.0, "completions/min_terminated_length": 815.6875, "epoch": 0.016586858863602706, "grad_norm": 0.40796066312009754, "learning_rate": 9.785714285714285e-07, "loss": -0.0261, "num_tokens": 12164403.0, "reward": 0.48046875, "reward_std": 0.29127464443445206, "rewards/reward_model/mean": 0.48046875, "rewards/reward_model/std": 0.2912746500223875, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2500.8125, "completions/max_terminated_length": 2500.8125, "completions/mean_length": 937.578125, "completions/mean_terminated_length": 1250.1041831970215, "completions/min_length": 0.0, "completions/min_terminated_length": 675.625, "epoch": 0.016981784074640866, "grad_norm": 0.4540337224133811, "learning_rate": 9.775510204081631e-07, "loss": 0.0074, "num_tokens": 12419687.0, "reward": 0.3125, "reward_std": 0.2919924817979336, "rewards/reward_model/mean": 0.3125, "rewards/reward_model/std": 0.291992487385869, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3103.3125, "completions/max_terminated_length": 3103.3125, "completions/mean_length": 1175.0078125, "completions/mean_terminated_length": 1566.6771278381348, "completions/min_length": 0.0, "completions/min_terminated_length": 802.25, "epoch": 0.017376709285679026, "grad_norm": 0.4470939558322697, "learning_rate": 9.76530612244898e-07, "loss": -0.0003, "num_tokens": 12737545.0, "reward": 0.41796875, "reward_std": 0.31869372725486755, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.3186937365680933, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2927.3125, "completions/max_terminated_length": 2927.3125, "completions/mean_length": 1120.7578125, "completions/mean_terminated_length": 1494.3438186645508, "completions/min_length": 0.0, "completions/min_terminated_length": 768.6875, "epoch": 0.017771634496717183, "grad_norm": 0.4783474408152646, "learning_rate": 9.755102040816326e-07, "loss": 0.0353, "num_tokens": 13042283.0, "reward": 0.37890625, "reward_std": 0.362717118114233, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.36271712370216846, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2683.3125, "completions/max_terminated_length": 2683.3125, "completions/mean_length": 1146.03125, "completions/mean_terminated_length": 1528.0417003631592, "completions/min_length": 0.0, "completions/min_terminated_length": 866.625, "epoch": 0.018166559707755343, "grad_norm": 0.3403747404190247, "learning_rate": 9.744897959183674e-07, "loss": -0.0339, "num_tokens": 13350499.0, "reward": 0.49609375, "reward_std": 0.24526194483041763, "rewards/reward_model/mean": 0.49609375, "rewards/reward_model/std": 0.24526195228099823, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2709.375, "completions/max_terminated_length": 2709.375, "completions/mean_length": 1100.6953125, "completions/mean_terminated_length": 1467.59379196167, "completions/min_length": 0.0, "completions/min_terminated_length": 772.9375, "epoch": 0.018561484918793503, "grad_norm": 0.4092319901765194, "learning_rate": 9.73469387755102e-07, "loss": 0.0145, "num_tokens": 13648053.0, "reward": 0.23046875, "reward_std": 0.25637494027614594, "rewards/reward_model/mean": 0.23046875, "rewards/reward_model/std": 0.25637495517730713, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2276.375, "completions/max_terminated_length": 2276.375, "completions/mean_length": 961.87109375, "completions/mean_terminated_length": 1282.494831085205, "completions/min_length": 0.0, "completions/min_terminated_length": 732.375, "epoch": 0.018956410129831663, "grad_norm": 0.545002392422666, "learning_rate": 9.724489795918366e-07, "loss": -0.0749, "num_tokens": 13911332.0, "reward": 0.421875, "reward_std": 0.2844184674322605, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.28441847674548626, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2733.6875, "completions/max_terminated_length": 2733.6875, "completions/mean_length": 1105.51953125, "completions/mean_terminated_length": 1474.0260734558105, "completions/min_length": 0.0, "completions/min_terminated_length": 813.5, "epoch": 0.019351335340869823, "grad_norm": 0.5102063732280968, "learning_rate": 9.714285714285715e-07, "loss": -0.0563, "num_tokens": 14211433.0, "reward": 0.43359375, "reward_std": 0.3206084705889225, "rewards/reward_model/mean": 0.43359375, "rewards/reward_model/std": 0.3206084817647934, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2481.5625, "completions/max_terminated_length": 2481.5625, "completions/mean_length": 993.6875, "completions/mean_terminated_length": 1324.9167098999023, "completions/min_length": 0.0, "completions/min_terminated_length": 702.25, "epoch": 0.019746260551907983, "grad_norm": 0.4577675150233401, "learning_rate": 9.70408163265306e-07, "loss": -0.0252, "num_tokens": 14479289.0, "reward": 0.3359375, "reward_std": 0.3564271703362465, "rewards/reward_model/mean": 0.3359375, "rewards/reward_model/std": 0.35642718710005283, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2096.5, "completions/max_terminated_length": 2096.5, "completions/mean_length": 870.328125, "completions/mean_terminated_length": 1160.4375267028809, "completions/min_length": 0.0, "completions/min_terminated_length": 681.4375, "epoch": 0.020141185762946143, "grad_norm": 0.42287262631072486, "learning_rate": 9.693877551020407e-07, "loss": -0.0281, "num_tokens": 14719069.0, "reward": 0.6484375, "reward_std": 0.2636340707540512, "rewards/reward_model/mean": 0.6484375, "rewards/reward_model/std": 0.2636340782046318, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 2994.0625, "completions/max_terminated_length": 2994.0625, "completions/mean_length": 1256.76171875, "completions/mean_terminated_length": 1729.682331085205, "completions/min_length": 0.0, "completions/min_terminated_length": 814.0625, "epoch": 0.020536110973984303, "grad_norm": 0.4451408442812677, "learning_rate": 9.683673469387753e-07, "loss": -0.0587, "num_tokens": 15058016.0, "reward": 0.43359375, "reward_std": 0.3718233071267605, "rewards/reward_model/mean": 0.43359375, "rewards/reward_model/std": 0.3718233183026314, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2565.1875, "completions/max_terminated_length": 2565.1875, "completions/mean_length": 983.37890625, "completions/mean_terminated_length": 1311.1719131469727, "completions/min_length": 0.0, "completions/min_terminated_length": 691.75, "epoch": 0.020931036185022463, "grad_norm": 0.4752445572841864, "learning_rate": 9.673469387755102e-07, "loss": 0.0037, "num_tokens": 15326049.0, "reward": 0.48828125, "reward_std": 0.33044225722551346, "rewards/reward_model/mean": 0.48828125, "rewards/reward_model/std": 0.3304422628134489, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 2586.0625, "completions/max_terminated_length": 2586.0625, "completions/mean_length": 1027.03515625, "completions/mean_terminated_length": 1394.702693939209, "completions/min_length": 0.0, "completions/min_terminated_length": 653.0, "epoch": 0.021325961396060623, "grad_norm": 0.5037785975391642, "learning_rate": 9.663265306122448e-07, "loss": -0.0559, "num_tokens": 15603898.0, "reward": 0.3515625, "reward_std": 0.3275599628686905, "rewards/reward_model/mean": 0.3515625, "rewards/reward_model/std": 0.32755997218191624, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2315.9375, "completions/max_terminated_length": 2315.9375, "completions/mean_length": 1019.89453125, "completions/mean_terminated_length": 1359.859432220459, "completions/min_length": 0.0, "completions/min_terminated_length": 740.875, "epoch": 0.02172088660709878, "grad_norm": 0.4785010982162659, "learning_rate": 9.653061224489796e-07, "loss": -0.0083, "num_tokens": 15879679.0, "reward": 0.421875, "reward_std": 0.32756900787353516, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.3275690283626318, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 2851.875, "completions/max_terminated_length": 2851.875, "completions/mean_length": 1233.171875, "completions/mean_terminated_length": 1727.496597290039, "completions/min_length": 0.0, "completions/min_terminated_length": 916.25, "epoch": 0.02211581181813694, "grad_norm": 0.3895015208241316, "learning_rate": 9.642857142857142e-07, "loss": -0.0298, "num_tokens": 16212283.0, "reward": 0.33203125, "reward_std": 0.2791505679488182, "rewards/reward_model/mean": 0.33203125, "rewards/reward_model/std": 0.27915058098733425, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2611.875, "completions/max_terminated_length": 2611.875, "completions/mean_length": 1183.453125, "completions/mean_terminated_length": 1577.9375381469727, "completions/min_length": 0.0, "completions/min_terminated_length": 909.875, "epoch": 0.0225107370291751, "grad_norm": 0.436131631614301, "learning_rate": 9.632653061224489e-07, "loss": -0.041, "num_tokens": 16533391.0, "reward": 0.47265625, "reward_std": 0.3261702060699463, "rewards/reward_model/mean": 0.47265625, "rewards/reward_model/std": 0.32617022283375263, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2137.9375, "completions/max_terminated_length": 2137.9375, "completions/mean_length": 1010.98046875, "completions/mean_terminated_length": 1347.9740028381348, "completions/min_length": 0.0, "completions/min_terminated_length": 751.9375, "epoch": 0.02290566224021326, "grad_norm": 0.4153426290042216, "learning_rate": 9.622448979591837e-07, "loss": -0.0955, "num_tokens": 16809210.0, "reward": 0.453125, "reward_std": 0.2662929520010948, "rewards/reward_model/mean": 0.453125, "rewards/reward_model/std": 0.26629296503961086, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3056.5, "completions/max_terminated_length": 3056.5, "completions/mean_length": 1186.6328125, "completions/mean_terminated_length": 1582.1771278381348, "completions/min_length": 0.0, "completions/min_terminated_length": 851.375, "epoch": 0.02330058745125142, "grad_norm": 0.4599767345073827, "learning_rate": 9.612244897959183e-07, "loss": 0.0034, "num_tokens": 17129036.0, "reward": 0.33984375, "reward_std": 0.31171706691384315, "rewards/reward_model/mean": 0.33984375, "rewards/reward_model/std": 0.3117170762270689, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3138.625, "completions/max_terminated_length": 3138.625, "completions/mean_length": 1215.03125, "completions/mean_terminated_length": 1620.0417251586914, "completions/min_length": 0.0, "completions/min_terminated_length": 828.1875, "epoch": 0.02369551266228958, "grad_norm": 0.4358875612174143, "learning_rate": 9.60204081632653e-07, "loss": -0.0288, "num_tokens": 17458292.0, "reward": 0.32421875, "reward_std": 0.3734620623290539, "rewards/reward_model/mean": 0.32421875, "rewards/reward_model/std": 0.37346208095550537, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3345.25, "completions/max_terminated_length": 3345.25, "completions/mean_length": 1324.5078125, "completions/mean_terminated_length": 1766.0104446411133, "completions/min_length": 0.0, "completions/min_terminated_length": 914.0, "epoch": 0.02409043787332774, "grad_norm": 0.36960069304757304, "learning_rate": 9.591836734693876e-07, "loss": 0.0007, "num_tokens": 17811542.0, "reward": 0.38671875, "reward_std": 0.19586143642663956, "rewards/reward_model/mean": 0.38671875, "rewards/reward_model/std": 0.19586144387722015, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3133.875, "completions/max_terminated_length": 3133.875, "completions/mean_length": 1313.97265625, "completions/mean_terminated_length": 1751.9635963439941, "completions/min_length": 0.0, "completions/min_terminated_length": 917.375, "epoch": 0.0244853630843659, "grad_norm": 0.38253618768290565, "learning_rate": 9.581632653061224e-07, "loss": 0.0124, "num_tokens": 18164175.0, "reward": 0.296875, "reward_std": 0.33165570348501205, "rewards/reward_model/mean": 0.296875, "rewards/reward_model/std": 0.3316557239741087, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 971.515625, "completions/mean_terminated_length": 1295.3542098999023, "completions/min_length": 0.0, "completions/min_terminated_length": 764.3125, "epoch": 0.02488028829540406, "grad_norm": 0.4733127807420046, "learning_rate": 9.571428571428572e-07, "loss": -0.0287, "num_tokens": 18429539.0, "reward": 0.3515625, "reward_std": 0.3259189873933792, "rewards/reward_model/mean": 0.3515625, "rewards/reward_model/std": 0.32591900043189526, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3077.875, "completions/max_terminated_length": 3077.875, "completions/mean_length": 1226.21875, "completions/mean_terminated_length": 1634.9583930969238, "completions/min_length": 0.0, "completions/min_terminated_length": 825.125, "epoch": 0.025275213506442216, "grad_norm": 0.4072694897049612, "learning_rate": 9.561224489795919e-07, "loss": -0.0758, "num_tokens": 18762155.0, "reward": 0.4765625, "reward_std": 0.373446237295866, "rewards/reward_model/mean": 0.4765625, "rewards/reward_model/std": 0.3734462559223175, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2431.3125, "completions/max_terminated_length": 2431.3125, "completions/mean_length": 1068.81640625, "completions/mean_terminated_length": 1425.088565826416, "completions/min_length": 0.0, "completions/min_terminated_length": 804.125, "epoch": 0.025670138717480376, "grad_norm": 0.38570740824386585, "learning_rate": 9.551020408163265e-07, "loss": -0.0185, "num_tokens": 19050252.0, "reward": 0.390625, "reward_std": 0.27916165813803673, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.2791616693139076, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2736.875, "completions/max_terminated_length": 2736.875, "completions/mean_length": 1156.03515625, "completions/mean_terminated_length": 1541.3802642822266, "completions/min_length": 0.0, "completions/min_terminated_length": 796.375, "epoch": 0.026065063928518536, "grad_norm": 0.4446301610176241, "learning_rate": 9.54081632653061e-07, "loss": -0.0539, "num_tokens": 19361973.0, "reward": 0.46875, "reward_std": 0.2926078587770462, "rewards/reward_model/mean": 0.46875, "rewards/reward_model/std": 0.2926078736782074, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2861.0625, "completions/max_terminated_length": 2861.0625, "completions/mean_length": 1175.55859375, "completions/mean_terminated_length": 1567.4115142822266, "completions/min_length": 0.0, "completions/min_terminated_length": 855.9375, "epoch": 0.026459989139556696, "grad_norm": 0.4133582037236434, "learning_rate": 9.530612244897958e-07, "loss": 0.0774, "num_tokens": 19679236.0, "reward": 0.35546875, "reward_std": 0.23154202848672867, "rewards/reward_model/mean": 0.35546875, "rewards/reward_model/std": 0.23154203407466412, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2456.125, "completions/max_terminated_length": 2456.125, "completions/mean_length": 972.23046875, "completions/mean_terminated_length": 1296.3073463439941, "completions/min_length": 0.0, "completions/min_terminated_length": 655.9375, "epoch": 0.026854914350594856, "grad_norm": 0.4359483406384567, "learning_rate": 9.520408163265306e-07, "loss": -0.0093, "num_tokens": 19943439.0, "reward": 0.390625, "reward_std": 0.3038902096450329, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.3038902170956135, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2831.375, "completions/max_terminated_length": 2831.375, "completions/mean_length": 1136.53515625, "completions/mean_terminated_length": 1524.6922760009766, "completions/min_length": 0.0, "completions/min_terminated_length": 696.5625, "epoch": 0.027249839561633016, "grad_norm": 0.4701205463722084, "learning_rate": 9.510204081632653e-07, "loss": -0.0143, "num_tokens": 20253416.0, "reward": 0.22265625, "reward_std": 0.3121722787618637, "rewards/reward_model/mean": 0.22265625, "rewards/reward_model/std": 0.31217228434979916, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2660.25, "completions/max_terminated_length": 2660.25, "completions/mean_length": 983.11328125, "completions/mean_terminated_length": 1310.8177490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 787.375, "epoch": 0.027644764772671176, "grad_norm": 0.43114830273649984, "learning_rate": 9.499999999999999e-07, "loss": -0.0153, "num_tokens": 20521621.0, "reward": 0.390625, "reward_std": 0.28607041016221046, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.28607042133808136, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2736.6875, "completions/max_terminated_length": 2736.6875, "completions/mean_length": 1095.015625, "completions/mean_terminated_length": 1460.0208587646484, "completions/min_length": 0.0, "completions/min_terminated_length": 689.9375, "epoch": 0.028039689983709336, "grad_norm": 0.49551898860663585, "learning_rate": 9.489795918367347e-07, "loss": -0.0623, "num_tokens": 20816121.0, "reward": 0.421875, "reward_std": 0.36139145120978355, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.3613914605230093, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 1128.671875, "completions/mean_terminated_length": 1504.8958854675293, "completions/min_length": 0.0, "completions/min_terminated_length": 791.5625, "epoch": 0.028434615194747496, "grad_norm": 0.4778940076484883, "learning_rate": 9.479591836734694e-07, "loss": -0.0539, "num_tokens": 21120309.0, "reward": 0.375, "reward_std": 0.31379713118076324, "rewards/reward_model/mean": 0.375, "rewards/reward_model/std": 0.313797140493989, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2980.1875, "completions/max_terminated_length": 2980.1875, "completions/mean_length": 1214.85546875, "completions/mean_terminated_length": 1619.8073425292969, "completions/min_length": 0.0, "completions/min_terminated_length": 728.9375, "epoch": 0.028829540405785656, "grad_norm": 0.4517839218792732, "learning_rate": 9.469387755102041e-07, "loss": -0.1154, "num_tokens": 21447504.0, "reward": 0.3828125, "reward_std": 0.3672429025173187, "rewards/reward_model/mean": 0.3828125, "rewards/reward_model/std": 0.36724291555583477, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.27734375, "completions/max_length": 2536.0625, "completions/max_terminated_length": 2536.0625, "completions/mean_length": 1050.9140625, "completions/mean_terminated_length": 1575.5041999816895, "completions/min_length": 0.0, "completions/min_terminated_length": 875.5625, "epoch": 0.029224465616823812, "grad_norm": 0.37353204166080134, "learning_rate": 9.459183673469387e-07, "loss": 0.0193, "num_tokens": 21731354.0, "reward": 0.47265625, "reward_std": 0.29371413215994835, "rewards/reward_model/mean": 0.47265625, "rewards/reward_model/std": 0.29371414333581924, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2241.6875, "completions/max_terminated_length": 2241.6875, "completions/mean_length": 956.8203125, "completions/mean_terminated_length": 1275.7604446411133, "completions/min_length": 0.0, "completions/min_terminated_length": 742.6875, "epoch": 0.029619390827861972, "grad_norm": 0.5315641395265386, "learning_rate": 9.448979591836734e-07, "loss": -0.0418, "num_tokens": 21992780.0, "reward": 0.41796875, "reward_std": 0.3860640227794647, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.38606402836740017, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2341.9375, "completions/max_terminated_length": 2341.9375, "completions/mean_length": 1009.80078125, "completions/mean_terminated_length": 1346.4010734558105, "completions/min_length": 0.0, "completions/min_terminated_length": 830.5625, "epoch": 0.030014316038900132, "grad_norm": 0.4484435768754042, "learning_rate": 9.438775510204082e-07, "loss": -0.0281, "num_tokens": 22269273.0, "reward": 0.4609375, "reward_std": 0.33490245416760445, "rewards/reward_model/mean": 0.4609375, "rewards/reward_model/std": 0.3349024709314108, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2718.9375, "completions/max_terminated_length": 2718.9375, "completions/mean_length": 1110.296875, "completions/mean_terminated_length": 1480.3958930969238, "completions/min_length": 0.0, "completions/min_terminated_length": 746.3125, "epoch": 0.030409241249938292, "grad_norm": 0.4474579629692534, "learning_rate": 9.428571428571428e-07, "loss": -0.0145, "num_tokens": 22570693.0, "reward": 0.36328125, "reward_std": 0.36949291452765465, "rewards/reward_model/mean": 0.36328125, "rewards/reward_model/std": 0.36949293687939644, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3497.375, "completions/max_terminated_length": 3497.375, "completions/mean_length": 1462.66796875, "completions/mean_terminated_length": 1970.788402557373, "completions/min_length": 0.0, "completions/min_terminated_length": 900.6875, "epoch": 0.030804166460976452, "grad_norm": 0.4187479315723389, "learning_rate": 9.418367346938775e-07, "loss": -0.0941, "num_tokens": 22962032.0, "reward": 0.24609375, "reward_std": 0.3093888945877552, "rewards/reward_model/mean": 0.24609375, "rewards/reward_model/std": 0.30938890017569065, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2692.125, "completions/max_terminated_length": 2692.125, "completions/mean_length": 1222.08984375, "completions/mean_terminated_length": 1629.4531707763672, "completions/min_length": 0.0, "completions/min_terminated_length": 848.5, "epoch": 0.031199091672014612, "grad_norm": 0.443802324190681, "learning_rate": 9.408163265306121e-07, "loss": -0.0193, "num_tokens": 23294343.0, "reward": 0.44921875, "reward_std": 0.33076170086860657, "rewards/reward_model/mean": 0.44921875, "rewards/reward_model/std": 0.33076171204447746, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2374.1875, "completions/max_terminated_length": 2374.1875, "completions/mean_length": 1026.6640625, "completions/mean_terminated_length": 1368.8854484558105, "completions/min_length": 0.0, "completions/min_terminated_length": 726.375, "epoch": 0.03159401688305277, "grad_norm": 0.47247507472748973, "learning_rate": 9.39795918367347e-07, "loss": -0.0247, "num_tokens": 23572257.0, "reward": 0.42578125, "reward_std": 0.31041183322668076, "rewards/reward_model/mean": 0.42578125, "rewards/reward_model/std": 0.31041184440255165, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2839.625, "completions/max_terminated_length": 2839.625, "completions/mean_length": 1155.69140625, "completions/mean_terminated_length": 1540.9219207763672, "completions/min_length": 0.0, "completions/min_terminated_length": 927.125, "epoch": 0.03198894209409093, "grad_norm": 0.4010781432306865, "learning_rate": 9.387755102040816e-07, "loss": -0.042, "num_tokens": 23883618.0, "reward": 0.5078125, "reward_std": 0.3805668205022812, "rewards/reward_model/mean": 0.5078125, "rewards/reward_model/std": 0.38056682981550694, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2176.75, "completions/max_terminated_length": 2176.75, "completions/mean_length": 906.1796875, "completions/mean_terminated_length": 1220.406753540039, "completions/min_length": 0.0, "completions/min_terminated_length": 640.25, "epoch": 0.03238386730512909, "grad_norm": 0.4431661161399977, "learning_rate": 9.377551020408163e-07, "loss": -0.0195, "num_tokens": 24131760.0, "reward": 0.37890625, "reward_std": 0.2736629731953144, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.27366298623383045, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2718.25, "completions/max_terminated_length": 2718.25, "completions/mean_length": 1013.66015625, "completions/mean_terminated_length": 1351.5469131469727, "completions/min_length": 0.0, "completions/min_terminated_length": 747.375, "epoch": 0.03277879251616725, "grad_norm": 0.4233581322093806, "learning_rate": 9.36734693877551e-07, "loss": -0.0299, "num_tokens": 24406729.0, "reward": 0.3046875, "reward_std": 0.35749969631433487, "rewards/reward_model/mean": 0.3046875, "rewards/reward_model/std": 0.3574997130781412, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2624.5625, "completions/max_terminated_length": 2624.5625, "completions/mean_length": 1174.0859375, "completions/mean_terminated_length": 1565.4479675292969, "completions/min_length": 0.0, "completions/min_terminated_length": 810.5625, "epoch": 0.03317371772720541, "grad_norm": 0.33024087639616057, "learning_rate": 9.357142857142857e-07, "loss": -0.0769, "num_tokens": 24725519.0, "reward": 0.4609375, "reward_std": 0.23898237198591232, "rewards/reward_model/mean": 0.4609375, "rewards/reward_model/std": 0.23898237571120262, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2613.8125, "completions/max_terminated_length": 2613.8125, "completions/mean_length": 1091.75, "completions/mean_terminated_length": 1455.666706085205, "completions/min_length": 0.0, "completions/min_terminated_length": 667.1875, "epoch": 0.03356864293824357, "grad_norm": 0.5031405494631394, "learning_rate": 9.346938775510204e-07, "loss": -0.0415, "num_tokens": 25020351.0, "reward": 0.2890625, "reward_std": 0.33462444320321083, "rewards/reward_model/mean": 0.2890625, "rewards/reward_model/std": 0.3346244525164366, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2836.3125, "completions/max_terminated_length": 2836.3125, "completions/mean_length": 1123.06640625, "completions/mean_terminated_length": 1497.4219284057617, "completions/min_length": 0.0, "completions/min_terminated_length": 793.8125, "epoch": 0.03396356814928173, "grad_norm": 0.479348274608115, "learning_rate": 9.33673469387755e-07, "loss": -0.018, "num_tokens": 25325984.0, "reward": 0.3515625, "reward_std": 0.3567710667848587, "rewards/reward_model/mean": 0.3515625, "rewards/reward_model/std": 0.35677107982337475, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3108.1875, "completions/max_terminated_length": 3108.1875, "completions/mean_length": 1402.20703125, "completions/mean_terminated_length": 1869.6094131469727, "completions/min_length": 0.0, "completions/min_terminated_length": 980.625, "epoch": 0.03435849336031989, "grad_norm": 0.4262571257733803, "learning_rate": 9.326530612244897e-07, "loss": -0.0496, "num_tokens": 25703269.0, "reward": 0.29296875, "reward_std": 0.28962551429867744, "rewards/reward_model/mean": 0.29296875, "rewards/reward_model/std": 0.28962552919983864, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2681.9375, "completions/max_terminated_length": 2681.9375, "completions/mean_length": 1145.85546875, "completions/mean_terminated_length": 1527.8073425292969, "completions/min_length": 0.0, "completions/min_terminated_length": 821.9375, "epoch": 0.03475341857135805, "grad_norm": 0.36120713482803624, "learning_rate": 9.316326530612244e-07, "loss": 0.0078, "num_tokens": 26013424.0, "reward": 0.359375, "reward_std": 0.30246713012456894, "rewards/reward_model/mean": 0.359375, "rewards/reward_model/std": 0.30246714502573013, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 1130.08203125, "completions/mean_terminated_length": 1506.776081085205, "completions/min_length": 0.0, "completions/min_terminated_length": 815.25, "epoch": 0.03514834378239621, "grad_norm": 0.4058228508166392, "learning_rate": 9.306122448979591e-07, "loss": -0.0732, "num_tokens": 26320165.0, "reward": 0.36328125, "reward_std": 0.27133672311902046, "rewards/reward_model/mean": 0.36328125, "rewards/reward_model/std": 0.2713367287069559, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2688.8125, "completions/max_terminated_length": 2688.8125, "completions/mean_length": 1150.484375, "completions/mean_terminated_length": 1533.9792137145996, "completions/min_length": 0.0, "completions/min_terminated_length": 813.125, "epoch": 0.035543268993434365, "grad_norm": 0.4014676584172598, "learning_rate": 9.295918367346939e-07, "loss": 0.0542, "num_tokens": 26631297.0, "reward": 0.4140625, "reward_std": 0.2729204408824444, "rewards/reward_model/mean": 0.4140625, "rewards/reward_model/std": 0.27292044274508953, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2324.25, "completions/max_terminated_length": 2324.25, "completions/mean_length": 897.91796875, "completions/mean_terminated_length": 1197.2239837646484, "completions/min_length": 0.0, "completions/min_terminated_length": 637.0625, "epoch": 0.03593819420447253, "grad_norm": 0.4289774394684098, "learning_rate": 9.285714285714285e-07, "loss": -0.0384, "num_tokens": 26875836.0, "reward": 0.34375, "reward_std": 0.287880003452301, "rewards/reward_model/mean": 0.34375, "rewards/reward_model/std": 0.2878800090402365, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2863.5625, "completions/max_terminated_length": 2863.5625, "completions/mean_length": 1205.60546875, "completions/mean_terminated_length": 1607.4740104675293, "completions/min_length": 0.0, "completions/min_terminated_length": 853.3125, "epoch": 0.036333119415510685, "grad_norm": 0.4169532124588293, "learning_rate": 9.275510204081633e-07, "loss": 0.0171, "num_tokens": 27202311.0, "reward": 0.453125, "reward_std": 0.31624314934015274, "rewards/reward_model/mean": 0.453125, "rewards/reward_model/std": 0.31624315679073334, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2579.8125, "completions/max_terminated_length": 2579.8125, "completions/mean_length": 1080.72265625, "completions/mean_terminated_length": 1440.963581085205, "completions/min_length": 0.0, "completions/min_terminated_length": 746.6875, "epoch": 0.03672804462654885, "grad_norm": 0.35987285388885126, "learning_rate": 9.265306122448979e-07, "loss": 0.0054, "num_tokens": 27495648.0, "reward": 0.3359375, "reward_std": 0.248325876891613, "rewards/reward_model/mean": 0.3359375, "rewards/reward_model/std": 0.24832588247954845, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2299.75, "completions/max_terminated_length": 2299.75, "completions/mean_length": 1021.8203125, "completions/mean_terminated_length": 1362.4271202087402, "completions/min_length": 0.0, "completions/min_terminated_length": 745.625, "epoch": 0.037122969837587005, "grad_norm": 0.36862624581937065, "learning_rate": 9.255102040816326e-07, "loss": 0.0306, "num_tokens": 27774082.0, "reward": 0.32421875, "reward_std": 0.192562285810709, "rewards/reward_model/mean": 0.32421875, "rewards/reward_model/std": 0.19256229512393475, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2425.3125, "completions/max_terminated_length": 2425.3125, "completions/mean_length": 1012.15234375, "completions/mean_terminated_length": 1349.5364837646484, "completions/min_length": 0.0, "completions/min_terminated_length": 736.9375, "epoch": 0.03751789504862517, "grad_norm": 0.46717403874244773, "learning_rate": 9.244897959183672e-07, "loss": -0.0411, "num_tokens": 28052377.0, "reward": 0.375, "reward_std": 0.29533424973487854, "rewards/reward_model/mean": 0.375, "rewards/reward_model/std": 0.2953342590481043, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2930.9375, "completions/max_terminated_length": 2930.9375, "completions/mean_length": 1334.1640625, "completions/mean_terminated_length": 1778.8854637145996, "completions/min_length": 0.0, "completions/min_terminated_length": 875.75, "epoch": 0.037912820259663325, "grad_norm": 0.46463544821341213, "learning_rate": 9.23469387755102e-07, "loss": 0.0114, "num_tokens": 28410323.0, "reward": 0.3984375, "reward_std": 0.2601151019334793, "rewards/reward_model/mean": 0.3984375, "rewards/reward_model/std": 0.2601151131093502, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2812.125, "completions/max_terminated_length": 2812.125, "completions/mean_length": 1132.73046875, "completions/mean_terminated_length": 1510.3073425292969, "completions/min_length": 0.0, "completions/min_terminated_length": 811.375, "epoch": 0.03830774547070149, "grad_norm": 0.40553471048300116, "learning_rate": 9.224489795918367e-07, "loss": -0.0365, "num_tokens": 28719198.0, "reward": 0.4453125, "reward_std": 0.33587343245744705, "rewards/reward_model/mean": 0.4453125, "rewards/reward_model/std": 0.3358734454959631, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2843.9375, "completions/max_terminated_length": 2843.9375, "completions/mean_length": 1178.55859375, "completions/mean_terminated_length": 1571.4115180969238, "completions/min_length": 0.0, "completions/min_terminated_length": 841.6875, "epoch": 0.038702670681739645, "grad_norm": 0.37521263178952563, "learning_rate": 9.214285714285713e-07, "loss": 0.0444, "num_tokens": 29037005.0, "reward": 0.29296875, "reward_std": 0.2461310774087906, "rewards/reward_model/mean": 0.29296875, "rewards/reward_model/std": 0.24613108485937119, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1791.75, "completions/max_terminated_length": 1791.75, "completions/mean_length": 868.07421875, "completions/mean_terminated_length": 1157.4323120117188, "completions/min_length": 0.0, "completions/min_terminated_length": 719.75, "epoch": 0.0390975958927778, "grad_norm": 0.48107192594360537, "learning_rate": 9.204081632653062e-07, "loss": 0.0481, "num_tokens": 29276224.0, "reward": 0.38671875, "reward_std": 0.3714906759560108, "rewards/reward_model/mean": 0.38671875, "rewards/reward_model/std": 0.37149068899452686, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2269.75, "completions/max_terminated_length": 2269.75, "completions/mean_length": 1016.234375, "completions/mean_terminated_length": 1354.9792213439941, "completions/min_length": 0.0, "completions/min_terminated_length": 718.625, "epoch": 0.039492521103815965, "grad_norm": 0.42098806702233293, "learning_rate": 9.193877551020408e-07, "loss": 0.0361, "num_tokens": 29551900.0, "reward": 0.265625, "reward_std": 0.2784963957965374, "rewards/reward_model/mean": 0.265625, "rewards/reward_model/std": 0.2784964106976986, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1699.4375, "completions/max_terminated_length": 1699.4375, "completions/mean_length": 801.02734375, "completions/mean_terminated_length": 1068.0364875793457, "completions/min_length": 0.0, "completions/min_terminated_length": 615.25, "epoch": 0.03988744631485412, "grad_norm": 0.5356731405537598, "learning_rate": 9.183673469387755e-07, "loss": 0.0143, "num_tokens": 29773171.0, "reward": 0.58203125, "reward_std": 0.2822255901992321, "rewards/reward_model/mean": 0.58203125, "rewards/reward_model/std": 0.2822255976498127, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2557.875, "completions/max_terminated_length": 2557.875, "completions/mean_length": 1085.97265625, "completions/mean_terminated_length": 1447.9635734558105, "completions/min_length": 0.0, "completions/min_terminated_length": 770.25, "epoch": 0.040282371525892285, "grad_norm": 0.44091561537904517, "learning_rate": 9.173469387755101e-07, "loss": 0.0121, "num_tokens": 30068236.0, "reward": 0.46875, "reward_std": 0.2547520101070404, "rewards/reward_model/mean": 0.46875, "rewards/reward_model/std": 0.25475201569497585, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2980.6875, "completions/max_terminated_length": 2980.6875, "completions/mean_length": 1190.3359375, "completions/mean_terminated_length": 1612.0847969055176, "completions/min_length": 0.0, "completions/min_terminated_length": 834.375, "epoch": 0.04067729673693044, "grad_norm": 0.49479342656288827, "learning_rate": 9.163265306122449e-07, "loss": -0.0361, "num_tokens": 30388818.0, "reward": 0.44921875, "reward_std": 0.37670881301164627, "rewards/reward_model/mean": 0.44921875, "rewards/reward_model/std": 0.3767088260501623, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2447.5, "completions/max_terminated_length": 2447.5, "completions/mean_length": 1034.96484375, "completions/mean_terminated_length": 1384.8844947814941, "completions/min_length": 0.0, "completions/min_terminated_length": 756.1875, "epoch": 0.041072221947968605, "grad_norm": 0.48855279773349714, "learning_rate": 9.153061224489796e-07, "loss": -0.0089, "num_tokens": 30669609.0, "reward": 0.30859375, "reward_std": 0.3532862849533558, "rewards/reward_model/mean": 0.30859375, "rewards/reward_model/std": 0.35328629426658154, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2900.875, "completions/max_terminated_length": 2900.875, "completions/mean_length": 1099.765625, "completions/mean_terminated_length": 1466.3542213439941, "completions/min_length": 0.0, "completions/min_terminated_length": 758.0, "epoch": 0.04146714715900676, "grad_norm": 0.40640441435013686, "learning_rate": 9.142857142857142e-07, "loss": 0.0486, "num_tokens": 30969853.0, "reward": 0.37890625, "reward_std": 0.23018453642725945, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.23018454387784004, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2007.875, "completions/max_terminated_length": 2007.875, "completions/mean_length": 899.08984375, "completions/mean_terminated_length": 1202.248622894287, "completions/min_length": 0.0, "completions/min_terminated_length": 671.9375, "epoch": 0.041862072370044925, "grad_norm": 0.5163012650271843, "learning_rate": 9.132653061224489e-07, "loss": -0.0549, "num_tokens": 31215556.0, "reward": 0.46875, "reward_std": 0.3723689764738083, "rewards/reward_model/mean": 0.46875, "rewards/reward_model/std": 0.3723689913749695, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3412.0625, "completions/max_terminated_length": 3412.0625, "completions/mean_length": 1399.8203125, "completions/mean_terminated_length": 1866.4271354675293, "completions/min_length": 0.0, "completions/min_terminated_length": 884.0, "epoch": 0.04225699758108308, "grad_norm": 0.44932212245128544, "learning_rate": 9.122448979591835e-07, "loss": -0.0249, "num_tokens": 31588166.0, "reward": 0.25390625, "reward_std": 0.29127464443445206, "rewards/reward_model/mean": 0.25390625, "rewards/reward_model/std": 0.2912746500223875, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2494.4375, "completions/max_terminated_length": 2494.4375, "completions/mean_length": 1024.80859375, "completions/mean_terminated_length": 1366.411491394043, "completions/min_length": 0.0, "completions/min_terminated_length": 739.9375, "epoch": 0.042651922792121245, "grad_norm": 0.42009129269041123, "learning_rate": 9.112244897959184e-07, "loss": -0.0207, "num_tokens": 31865589.0, "reward": 0.57421875, "reward_std": 0.3143235482275486, "rewards/reward_model/mean": 0.57421875, "rewards/reward_model/std": 0.31432355754077435, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2632.3125, "completions/max_terminated_length": 2632.3125, "completions/mean_length": 1067.3359375, "completions/mean_terminated_length": 1614.0080680847168, "completions/min_length": 0.0, "completions/min_terminated_length": 831.9375, "epoch": 0.0430468480031594, "grad_norm": 0.4399062557285054, "learning_rate": 9.10204081632653e-07, "loss": -0.0433, "num_tokens": 32155611.0, "reward": 0.4453125, "reward_std": 0.2784854285418987, "rewards/reward_model/mean": 0.4453125, "rewards/reward_model/std": 0.2784854378551245, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3017.125, "completions/max_terminated_length": 3017.125, "completions/mean_length": 1251.1953125, "completions/mean_terminated_length": 1671.0199279785156, "completions/min_length": 0.0, "completions/min_terminated_length": 875.5, "epoch": 0.04344177321419756, "grad_norm": 0.32539885119679146, "learning_rate": 9.091836734693877e-07, "loss": 0.0111, "num_tokens": 32490285.0, "reward": 0.29296875, "reward_std": 0.19979893416166306, "rewards/reward_model/mean": 0.29296875, "rewards/reward_model/std": 0.19979893788695335, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2326.1875, "completions/max_terminated_length": 2326.1875, "completions/mean_length": 996.17578125, "completions/mean_terminated_length": 1328.2344055175781, "completions/min_length": 0.0, "completions/min_terminated_length": 720.25, "epoch": 0.04383669842523572, "grad_norm": 0.46167054793521056, "learning_rate": 9.081632653061225e-07, "loss": 0.0419, "num_tokens": 32760730.0, "reward": 0.390625, "reward_std": 0.3078703247010708, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.3078703358769417, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2016.375, "completions/max_terminated_length": 2016.375, "completions/mean_length": 863.66015625, "completions/mean_terminated_length": 1151.5468978881836, "completions/min_length": 0.0, "completions/min_terminated_length": 700.6875, "epoch": 0.04423162363627388, "grad_norm": 0.3370210396995935, "learning_rate": 9.071428571428571e-07, "loss": -0.0158, "num_tokens": 32997795.0, "reward": 0.45703125, "reward_std": 0.1732594594359398, "rewards/reward_model/mean": 0.45703125, "rewards/reward_model/std": 0.1732594631612301, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2496.1875, "completions/max_terminated_length": 2496.1875, "completions/mean_length": 1090.4921875, "completions/mean_terminated_length": 1453.989631652832, "completions/min_length": 0.0, "completions/min_terminated_length": 812.5, "epoch": 0.04462654884731204, "grad_norm": 0.4688326207711077, "learning_rate": 9.061224489795918e-07, "loss": -0.0507, "num_tokens": 33293969.0, "reward": 0.5078125, "reward_std": 0.41381561383605003, "rewards/reward_model/mean": 0.5078125, "rewards/reward_model/std": 0.4138156287372112, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2717.9375, "completions/max_terminated_length": 2717.9375, "completions/mean_length": 1239.0, "completions/mean_terminated_length": 1652.00004196167, "completions/min_length": 0.0, "completions/min_terminated_length": 760.0625, "epoch": 0.0450214740583502, "grad_norm": 0.40450163660717947, "learning_rate": 9.051020408163264e-07, "loss": 0.0098, "num_tokens": 33630529.0, "reward": 0.41796875, "reward_std": 0.31750796362757683, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.3175079748034477, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2516.5625, "completions/max_terminated_length": 2516.5625, "completions/mean_length": 959.01171875, "completions/mean_terminated_length": 1278.6823348999023, "completions/min_length": 0.0, "completions/min_terminated_length": 725.4375, "epoch": 0.04541639926938836, "grad_norm": 0.4983959297736443, "learning_rate": 9.040816326530612e-07, "loss": -0.0782, "num_tokens": 33892628.0, "reward": 0.48046875, "reward_std": 0.4071151651442051, "rewards/reward_model/mean": 0.48046875, "rewards/reward_model/std": 0.4071151837706566, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 1014.65625, "completions/mean_terminated_length": 1352.8750267028809, "completions/min_length": 0.0, "completions/min_terminated_length": 739.625, "epoch": 0.04581132448042652, "grad_norm": 0.5045873648198765, "learning_rate": 9.030612244897958e-07, "loss": -0.0479, "num_tokens": 34168972.0, "reward": 0.3984375, "reward_std": 0.3047933802008629, "rewards/reward_model/mean": 0.3984375, "rewards/reward_model/std": 0.3047934006899595, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2344.3125, "completions/max_terminated_length": 2344.3125, "completions/mean_length": 1055.10546875, "completions/mean_terminated_length": 1406.8073272705078, "completions/min_length": 0.0, "completions/min_terminated_length": 714.5625, "epoch": 0.04620624969146468, "grad_norm": 0.45288245176967895, "learning_rate": 9.020408163265306e-07, "loss": 0.0195, "num_tokens": 34453495.0, "reward": 0.43359375, "reward_std": 0.331695981323719, "rewards/reward_model/mean": 0.43359375, "rewards/reward_model/std": 0.33169599436223507, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.24609375, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 1130.890625, "completions/mean_terminated_length": 1502.9888343811035, "completions/min_length": 0.0, "completions/min_terminated_length": 738.9375, "epoch": 0.04660117490250284, "grad_norm": 0.4589161632122673, "learning_rate": 9.010204081632653e-07, "loss": -0.005, "num_tokens": 34760315.0, "reward": 0.2734375, "reward_std": 0.3281276896595955, "rewards/reward_model/mean": 0.2734375, "rewards/reward_model/std": 0.32812770642340183, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2859.875, "completions/max_terminated_length": 2859.875, "completions/mean_length": 1195.96875, "completions/mean_terminated_length": 1594.6250648498535, "completions/min_length": 0.0, "completions/min_terminated_length": 858.8125, "epoch": 0.046996100113540995, "grad_norm": 0.38614089600444496, "learning_rate": 9e-07, "loss": -0.0667, "num_tokens": 35084211.0, "reward": 0.41796875, "reward_std": 0.2678224891424179, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.2678225040435791, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2545.625, "completions/max_terminated_length": 2545.625, "completions/mean_length": 1057.8515625, "completions/mean_terminated_length": 1410.4687881469727, "completions/min_length": 0.0, "completions/min_terminated_length": 768.625, "epoch": 0.04739102532457916, "grad_norm": 0.509307305571101, "learning_rate": 8.989795918367347e-07, "loss": -0.0192, "num_tokens": 35372829.0, "reward": 0.40625, "reward_std": 0.3106517866253853, "rewards/reward_model/mean": 0.40625, "rewards/reward_model/std": 0.3106518015265465, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2066.5625, "completions/max_terminated_length": 2066.5625, "completions/mean_length": 881.765625, "completions/mean_terminated_length": 1175.6875267028809, "completions/min_length": 0.0, "completions/min_terminated_length": 680.125, "epoch": 0.047785950535617315, "grad_norm": 0.38606970158630194, "learning_rate": 8.979591836734693e-07, "loss": -0.0876, "num_tokens": 35612065.0, "reward": 0.328125, "reward_std": 0.24551982060074806, "rewards/reward_model/mean": 0.328125, "rewards/reward_model/std": 0.24551982805132866, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2727.375, "completions/max_terminated_length": 2727.375, "completions/mean_length": 1081.9296875, "completions/mean_terminated_length": 1442.572946548462, "completions/min_length": 0.0, "completions/min_terminated_length": 675.5625, "epoch": 0.04818087574665548, "grad_norm": 0.3867753853816423, "learning_rate": 8.96938775510204e-07, "loss": 0.0492, "num_tokens": 35902783.0, "reward": 0.26171875, "reward_std": 0.21003375202417374, "rewards/reward_model/mean": 0.26171875, "rewards/reward_model/std": 0.21003375202417374, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2536.25, "completions/max_terminated_length": 2536.25, "completions/mean_length": 1050.08203125, "completions/mean_terminated_length": 1400.1094207763672, "completions/min_length": 0.0, "completions/min_terminated_length": 792.125, "epoch": 0.048575800957693635, "grad_norm": 0.364775281482599, "learning_rate": 8.959183673469387e-07, "loss": -0.0018, "num_tokens": 36190132.0, "reward": 0.609375, "reward_std": 0.2532769702374935, "rewards/reward_model/mean": 0.609375, "rewards/reward_model/std": 0.2532769739627838, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2901.6875, "completions/max_terminated_length": 2901.6875, "completions/mean_length": 1150.96484375, "completions/mean_terminated_length": 1534.6198348999023, "completions/min_length": 0.0, "completions/min_terminated_length": 794.125, "epoch": 0.0489707261687318, "grad_norm": 0.37823262548356656, "learning_rate": 8.948979591836734e-07, "loss": 0.0011, "num_tokens": 36500651.0, "reward": 0.359375, "reward_std": 0.249229047447443, "rewards/reward_model/mean": 0.359375, "rewards/reward_model/std": 0.2492290660738945, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2384.125, "completions/max_terminated_length": 2384.125, "completions/mean_length": 1068.90234375, "completions/mean_terminated_length": 1425.2031898498535, "completions/min_length": 0.0, "completions/min_terminated_length": 732.5, "epoch": 0.049365651379769955, "grad_norm": 0.4786280040497525, "learning_rate": 8.938775510204081e-07, "loss": 0.0224, "num_tokens": 36789490.0, "reward": 0.453125, "reward_std": 0.37902116030454636, "rewards/reward_model/mean": 0.453125, "rewards/reward_model/std": 0.37902117520570755, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2315.25, "completions/max_terminated_length": 2315.25, "completions/mean_length": 996.640625, "completions/mean_terminated_length": 1328.854206085205, "completions/min_length": 0.0, "completions/min_terminated_length": 789.375, "epoch": 0.04976057659080812, "grad_norm": 0.5094833565989169, "learning_rate": 8.928571428571428e-07, "loss": -0.0207, "num_tokens": 37059510.0, "reward": 0.3828125, "reward_std": 0.3802800588309765, "rewards/reward_model/mean": 0.3828125, "rewards/reward_model/std": 0.3802800793200731, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 4187.3125, "completions/max_terminated_length": 4187.3125, "completions/mean_length": 1534.63671875, "completions/mean_terminated_length": 2062.1155967712402, "completions/min_length": 0.0, "completions/min_terminated_length": 981.0625, "epoch": 0.050155501801846275, "grad_norm": 0.48313342136084464, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 37475561.0, "reward": 0.359375, "reward_std": 0.32084842398762703, "rewards/reward_model/mean": 0.359375, "rewards/reward_model/std": 0.3208484388887882, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2664.5625, "completions/max_terminated_length": 2664.5625, "completions/mean_length": 1071.0859375, "completions/mean_terminated_length": 1428.1146278381348, "completions/min_length": 0.0, "completions/min_terminated_length": 715.125, "epoch": 0.05055042701288443, "grad_norm": 0.5154650550043586, "learning_rate": 1e-06, "loss": -0.0843, "num_tokens": 37764703.0, "reward": 0.37890625, "reward_std": 0.3597511909902096, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.3597512189298868, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2739.9375, "completions/max_terminated_length": 2739.9375, "completions/mean_length": 1171.546875, "completions/mean_terminated_length": 1582.4607429504395, "completions/min_length": 0.0, "completions/min_terminated_length": 821.5, "epoch": 0.050945352223922595, "grad_norm": 0.46528605364172715, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 38082011.0, "reward": 0.3671875, "reward_std": 0.3630910590291023, "rewards/reward_model/mean": 0.3671875, "rewards/reward_model/std": 0.36309106834232807, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 3829.5625, "completions/max_terminated_length": 3829.5625, "completions/mean_length": 1392.203125, "completions/mean_terminated_length": 1954.7986526489258, "completions/min_length": 0.0, "completions/min_terminated_length": 843.3125, "epoch": 0.05134027743496075, "grad_norm": 0.42172962153313576, "learning_rate": 1e-06, "loss": -0.0337, "num_tokens": 38456095.0, "reward": 0.34375, "reward_std": 0.2944811172783375, "rewards/reward_model/mean": 0.34375, "rewards/reward_model/std": 0.2944811284542084, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2466.875, "completions/max_terminated_length": 2466.875, "completions/mean_length": 1070.1796875, "completions/mean_terminated_length": 1426.9062976837158, "completions/min_length": 0.0, "completions/min_terminated_length": 648.3125, "epoch": 0.051735202645998915, "grad_norm": 0.5216407563334101, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 38743069.0, "reward": 0.41796875, "reward_std": 0.26440105587244034, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.26440106332302094, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2742.625, "completions/max_terminated_length": 2742.625, "completions/mean_length": 1066.734375, "completions/mean_terminated_length": 1422.3125343322754, "completions/min_length": 0.0, "completions/min_terminated_length": 693.375, "epoch": 0.05213012785703707, "grad_norm": 0.38694242931154627, "learning_rate": 1e-06, "loss": 0.063, "num_tokens": 39034953.0, "reward": 0.484375, "reward_std": 0.2525683008134365, "rewards/reward_model/mean": 0.484375, "rewards/reward_model/std": 0.25256830640137196, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3409.0625, "completions/max_terminated_length": 3409.0625, "completions/mean_length": 1262.94921875, "completions/mean_terminated_length": 1683.9323425292969, "completions/min_length": 0.0, "completions/min_terminated_length": 769.9375, "epoch": 0.052525053068075235, "grad_norm": 0.38850844762726466, "learning_rate": 1e-06, "loss": -0.036, "num_tokens": 39374924.0, "reward": 0.33984375, "reward_std": 0.2286141812801361, "rewards/reward_model/mean": 0.33984375, "rewards/reward_model/std": 0.2286141850054264, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2759.5, "completions/max_terminated_length": 2759.5, "completions/mean_length": 1125.703125, "completions/mean_terminated_length": 1500.9375305175781, "completions/min_length": 0.0, "completions/min_terminated_length": 812.0625, "epoch": 0.05291997827911339, "grad_norm": 0.49738629043664667, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 39678640.0, "reward": 0.37890625, "reward_std": 0.3845343627035618, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.384534377604723, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.24609375, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 1169.6171875, "completions/mean_terminated_length": 1549.5208854675293, "completions/min_length": 0.0, "completions/min_terminated_length": 857.0, "epoch": 0.053314903490151555, "grad_norm": 0.4532024349224113, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 39995310.0, "reward": 0.3515625, "reward_std": 0.28411275148391724, "rewards/reward_model/mean": 0.3515625, "rewards/reward_model/std": 0.284112760797143, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2512.0625, "completions/max_terminated_length": 2512.0625, "completions/mean_length": 1093.28125, "completions/mean_terminated_length": 1457.7083702087402, "completions/min_length": 0.0, "completions/min_terminated_length": 712.6875, "epoch": 0.05370982870118971, "grad_norm": 0.4511384241720582, "learning_rate": 1e-06, "loss": -0.0387, "num_tokens": 40290726.0, "reward": 0.359375, "reward_std": 0.27137478068470955, "rewards/reward_model/mean": 0.359375, "rewards/reward_model/std": 0.2713747899979353, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2809.625, "completions/max_terminated_length": 2809.625, "completions/mean_length": 1201.13671875, "completions/mean_terminated_length": 1601.5156784057617, "completions/min_length": 0.0, "completions/min_terminated_length": 817.625, "epoch": 0.054104753912227875, "grad_norm": 0.4490113226312359, "learning_rate": 1e-06, "loss": -0.0394, "num_tokens": 40611865.0, "reward": 0.3828125, "reward_std": 0.34982413426041603, "rewards/reward_model/mean": 0.3828125, "rewards/reward_model/std": 0.3498241528868675, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26953125, "completions/max_length": 2841.9375, "completions/max_terminated_length": 2841.9375, "completions/mean_length": 1213.546875, "completions/mean_terminated_length": 1785.834114074707, "completions/min_length": 0.0, "completions/min_terminated_length": 902.0625, "epoch": 0.05449967912326603, "grad_norm": 0.46740582747152537, "learning_rate": 1e-06, "loss": -0.0236, "num_tokens": 40936565.0, "reward": 0.4375, "reward_std": 0.31345976516604424, "rewards/reward_model/mean": 0.4375, "rewards/reward_model/std": 0.31345977261662483, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2246.625, "completions/max_terminated_length": 2246.625, "completions/mean_length": 988.6328125, "completions/mean_terminated_length": 1318.1771240234375, "completions/min_length": 0.0, "completions/min_terminated_length": 730.8125, "epoch": 0.05489460433430419, "grad_norm": 0.46870088742933874, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 41204135.0, "reward": 0.30859375, "reward_std": 0.2869618982076645, "rewards/reward_model/mean": 0.30859375, "rewards/reward_model/std": 0.28696190379559994, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2527.125, "completions/max_terminated_length": 2527.125, "completions/mean_length": 989.8203125, "completions/mean_terminated_length": 1319.7604484558105, "completions/min_length": 0.0, "completions/min_terminated_length": 649.75, "epoch": 0.05528952954534235, "grad_norm": 0.4839387733202585, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 41474329.0, "reward": 0.4765625, "reward_std": 0.3026951029896736, "rewards/reward_model/mean": 0.4765625, "rewards/reward_model/std": 0.3026951104402542, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2610.9375, "completions/max_terminated_length": 2610.9375, "completions/mean_length": 1018.578125, "completions/mean_terminated_length": 1358.1042175292969, "completions/min_length": 0.0, "completions/min_terminated_length": 709.0, "epoch": 0.05568445475638051, "grad_norm": 0.4751891305645857, "learning_rate": 1e-06, "loss": -0.0215, "num_tokens": 41752589.0, "reward": 0.43359375, "reward_std": 0.34416690841317177, "rewards/reward_model/mean": 0.43359375, "rewards/reward_model/std": 0.34416691958904266, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2778.4375, "completions/max_terminated_length": 2778.4375, "completions/mean_length": 1071.609375, "completions/mean_terminated_length": 1428.8125457763672, "completions/min_length": 0.0, "completions/min_terminated_length": 770.9375, "epoch": 0.05607937996741867, "grad_norm": 0.45451506953701365, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 42043497.0, "reward": 0.41015625, "reward_std": 0.32799431681632996, "rewards/reward_model/mean": 0.41015625, "rewards/reward_model/std": 0.32799432799220085, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2628.375, "completions/max_terminated_length": 2628.375, "completions/mean_length": 1057.328125, "completions/mean_terminated_length": 1409.770881652832, "completions/min_length": 0.0, "completions/min_terminated_length": 704.875, "epoch": 0.05647430517845683, "grad_norm": 0.4186438326591692, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 42329661.0, "reward": 0.4453125, "reward_std": 0.32963043451309204, "rewards/reward_model/mean": 0.4453125, "rewards/reward_model/std": 0.32963044568896294, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2528.5625, "completions/max_terminated_length": 2528.5625, "completions/mean_length": 1029.796875, "completions/mean_terminated_length": 1373.0625495910645, "completions/min_length": 0.0, "completions/min_terminated_length": 712.0, "epoch": 0.05686923038949499, "grad_norm": 0.4996261438311891, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 42607977.0, "reward": 0.4921875, "reward_std": 0.3331216983497143, "rewards/reward_model/mean": 0.4921875, "rewards/reward_model/std": 0.3331217113882303, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3285.8125, "completions/max_terminated_length": 3285.8125, "completions/mean_length": 1344.9609375, "completions/mean_terminated_length": 1793.2812995910645, "completions/min_length": 0.0, "completions/min_terminated_length": 860.1875, "epoch": 0.05726415560053315, "grad_norm": 0.40924620907342835, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 42968703.0, "reward": 0.328125, "reward_std": 0.26544174179434776, "rewards/reward_model/mean": 0.328125, "rewards/reward_model/std": 0.26544174924492836, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2870.125, "completions/max_terminated_length": 2870.125, "completions/mean_length": 1185.18359375, "completions/mean_terminated_length": 1590.6577033996582, "completions/min_length": 0.0, "completions/min_terminated_length": 831.25, "epoch": 0.05765908081157131, "grad_norm": 0.4808442174492176, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 43290110.0, "reward": 0.4765625, "reward_std": 0.35032568126916885, "rewards/reward_model/mean": 0.4765625, "rewards/reward_model/std": 0.35032568871974945, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2531.4375, "completions/max_terminated_length": 2531.4375, "completions/mean_length": 1030.9921875, "completions/mean_terminated_length": 1374.6562728881836, "completions/min_length": 0.0, "completions/min_terminated_length": 700.0625, "epoch": 0.05805400602260947, "grad_norm": 0.33103497654946706, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 43570124.0, "reward": 0.4765625, "reward_std": 0.18871080875396729, "rewards/reward_model/mean": 0.4765625, "rewards/reward_model/std": 0.18871081806719303, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2930.9375, "completions/max_terminated_length": 2930.9375, "completions/mean_length": 1153.4375, "completions/mean_terminated_length": 1537.9166984558105, "completions/min_length": 0.0, "completions/min_terminated_length": 722.875, "epoch": 0.058448931233647625, "grad_norm": 0.4250925437092233, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 43882316.0, "reward": 0.34765625, "reward_std": 0.31599000841379166, "rewards/reward_model/mean": 0.34765625, "rewards/reward_model/std": 0.31599001958966255, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3360.375, "completions/max_terminated_length": 3360.375, "completions/mean_length": 1375.11328125, "completions/mean_terminated_length": 1855.9702110290527, "completions/min_length": 0.0, "completions/min_terminated_length": 892.5625, "epoch": 0.05884385644468579, "grad_norm": 0.34508707914775194, "learning_rate": 1e-06, "loss": -0.0431, "num_tokens": 44255033.0, "reward": 0.19921875, "reward_std": 0.2534216083586216, "rewards/reward_model/mean": 0.19921875, "rewards/reward_model/std": 0.2534216158092022, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2799.625, "completions/max_terminated_length": 2799.625, "completions/mean_length": 1095.80859375, "completions/mean_terminated_length": 1461.0781707763672, "completions/min_length": 0.0, "completions/min_terminated_length": 751.1875, "epoch": 0.059238781655723945, "grad_norm": 0.47145005289822633, "learning_rate": 1e-06, "loss": -0.0517, "num_tokens": 44552872.0, "reward": 0.32421875, "reward_std": 0.2846716083586216, "rewards/reward_model/mean": 0.32421875, "rewards/reward_model/std": 0.28467161394655704, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2701.25, "completions/max_terminated_length": 2701.25, "completions/mean_length": 1100.515625, "completions/mean_terminated_length": 1467.3541946411133, "completions/min_length": 0.0, "completions/min_terminated_length": 797.8125, "epoch": 0.05963370686676211, "grad_norm": 0.5028390011386312, "learning_rate": 1e-06, "loss": -0.0823, "num_tokens": 44850940.0, "reward": 0.44140625, "reward_std": 0.38099464401602745, "rewards/reward_model/mean": 0.44140625, "rewards/reward_model/std": 0.3809946607798338, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 3279.0, "completions/max_terminated_length": 3279.0, "completions/mean_length": 1318.3359375, "completions/mean_terminated_length": 1821.4250793457031, "completions/min_length": 0.0, "completions/min_terminated_length": 926.0, "epoch": 0.060028632077800265, "grad_norm": 0.3795048618491077, "learning_rate": 1e-06, "loss": -0.0308, "num_tokens": 45206498.0, "reward": 0.45703125, "reward_std": 0.25273411348462105, "rewards/reward_model/mean": 0.45703125, "rewards/reward_model/std": 0.25273412093520164, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2596.0625, "completions/max_terminated_length": 2596.0625, "completions/mean_length": 984.88671875, "completions/mean_terminated_length": 1313.1823234558105, "completions/min_length": 0.0, "completions/min_terminated_length": 703.4375, "epoch": 0.06042355728883843, "grad_norm": 0.4609130029648073, "learning_rate": 1e-06, "loss": -0.0414, "num_tokens": 45473813.0, "reward": 0.55078125, "reward_std": 0.2807977758347988, "rewards/reward_model/mean": 0.55078125, "rewards/reward_model/std": 0.2807977870106697, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2415.25, "completions/max_terminated_length": 2415.25, "completions/mean_length": 988.49609375, "completions/mean_terminated_length": 1317.9948196411133, "completions/min_length": 0.0, "completions/min_terminated_length": 709.0625, "epoch": 0.060818482499876585, "grad_norm": 0.4301394773381465, "learning_rate": 1e-06, "loss": -0.023, "num_tokens": 45745092.0, "reward": 0.484375, "reward_std": 0.2663580998778343, "rewards/reward_model/mean": 0.484375, "rewards/reward_model/std": 0.26635811291635036, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2274.8125, "completions/max_terminated_length": 2274.8125, "completions/mean_length": 1049.625, "completions/mean_terminated_length": 1399.5000457763672, "completions/min_length": 0.0, "completions/min_terminated_length": 794.5, "epoch": 0.06121340771091475, "grad_norm": 0.45041141716477745, "learning_rate": 1e-06, "loss": -0.0228, "num_tokens": 46029364.0, "reward": 0.53515625, "reward_std": 0.3153516612946987, "rewards/reward_model/mean": 0.53515625, "rewards/reward_model/std": 0.31535167433321476, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2724.8125, "completions/max_terminated_length": 2724.8125, "completions/mean_length": 1200.0625, "completions/mean_terminated_length": 1600.083381652832, "completions/min_length": 0.0, "completions/min_terminated_length": 856.25, "epoch": 0.061608332921952905, "grad_norm": 0.3965844039416519, "learning_rate": 1e-06, "loss": 0.0478, "num_tokens": 46352756.0, "reward": 0.45703125, "reward_std": 0.24823319911956787, "rewards/reward_model/mean": 0.45703125, "rewards/reward_model/std": 0.24823321029543877, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2397.8125, "completions/max_terminated_length": 2397.8125, "completions/mean_length": 1048.0234375, "completions/mean_terminated_length": 1397.364631652832, "completions/min_length": 0.0, "completions/min_terminated_length": 723.5625, "epoch": 0.06200325813299107, "grad_norm": 0.38739862252270196, "learning_rate": 1e-06, "loss": -0.0175, "num_tokens": 46639114.0, "reward": 0.36328125, "reward_std": 0.2608758546411991, "rewards/reward_model/mean": 0.36328125, "rewards/reward_model/std": 0.26087586581707, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 3484.3125, "completions/max_terminated_length": 3484.3125, "completions/mean_length": 1414.3203125, "completions/mean_terminated_length": 2096.885456085205, "completions/min_length": 0.0, "completions/min_terminated_length": 1029.3125, "epoch": 0.062398183344029225, "grad_norm": 0.3239370638548311, "learning_rate": 1e-06, "loss": -0.0372, "num_tokens": 47017228.0, "reward": 0.39453125, "reward_std": 0.16505969315767288, "rewards/reward_model/mean": 0.39453125, "rewards/reward_model/std": 0.16505970805883408, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3123.875, "completions/max_terminated_length": 3123.875, "completions/mean_length": 1213.8984375, "completions/mean_terminated_length": 1865.5729637145996, "completions/min_length": 0.0, "completions/min_terminated_length": 866.1875, "epoch": 0.06279310855506738, "grad_norm": 0.35351743907111244, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 47343090.0, "reward": 0.41796875, "reward_std": 0.30438540130853653, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.30438540503382683, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 3776.6875, "completions/max_terminated_length": 3776.6875, "completions/mean_length": 1516.65625, "completions/mean_terminated_length": 2072.5786666870117, "completions/min_length": 0.0, "completions/min_terminated_length": 998.0625, "epoch": 0.06318803376610554, "grad_norm": 0.36363244869844125, "learning_rate": 1e-06, "loss": -0.0581, "num_tokens": 47748586.0, "reward": 0.4140625, "reward_std": 0.24650176614522934, "rewards/reward_model/mean": 0.4140625, "rewards/reward_model/std": 0.24650177545845509, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3287.4375, "completions/max_terminated_length": 3287.4375, "completions/mean_length": 1472.8984375, "completions/mean_terminated_length": 1995.2571716308594, "completions/min_length": 0.0, "completions/min_terminated_length": 1108.5, "epoch": 0.06358295897714371, "grad_norm": 0.416963146677633, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 48142848.0, "reward": 0.578125, "reward_std": 0.3702508546411991, "rewards/reward_model/mean": 0.578125, "rewards/reward_model/std": 0.37025086581707, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 4534.3125, "completions/max_terminated_length": 4534.3125, "completions/mean_length": 1905.32421875, "completions/mean_terminated_length": 2540.4323654174805, "completions/min_length": 0.0, "completions/min_terminated_length": 983.9375, "epoch": 0.06397788418818186, "grad_norm": 0.24072158681653213, "learning_rate": 1e-06, "loss": 0.026, "num_tokens": 48651123.0, "reward": 0.12890625, "reward_std": 0.17947598174214363, "rewards/reward_model/mean": 0.12890625, "rewards/reward_model/std": 0.17947598360478878, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2362.5625, "completions/max_terminated_length": 2362.5625, "completions/mean_length": 1033.29296875, "completions/mean_terminated_length": 1377.7239952087402, "completions/min_length": 0.0, "completions/min_terminated_length": 722.4375, "epoch": 0.06437280939922002, "grad_norm": 0.4149098683362285, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 48933230.0, "reward": 0.4765625, "reward_std": 0.2732148915529251, "rewards/reward_model/mean": 0.4765625, "rewards/reward_model/std": 0.27321489714086056, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 1255.48828125, "completions/mean_terminated_length": 1673.9844245910645, "completions/min_length": 0.0, "completions/min_terminated_length": 713.875, "epoch": 0.06476773461025818, "grad_norm": 0.39186835343234233, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 49268763.0, "reward": 0.46484375, "reward_std": 0.2536344714462757, "rewards/reward_model/mean": 0.46484375, "rewards/reward_model/std": 0.25363448448479176, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2885.1875, "completions/max_terminated_length": 2885.1875, "completions/mean_length": 1181.8671875, "completions/mean_terminated_length": 1575.8229637145996, "completions/min_length": 0.0, "completions/min_terminated_length": 753.8125, "epoch": 0.06516265982129635, "grad_norm": 0.38670982988020114, "learning_rate": 1e-06, "loss": -0.038, "num_tokens": 49590601.0, "reward": 0.5234375, "reward_std": 0.2536615617573261, "rewards/reward_model/mean": 0.5234375, "rewards/reward_model/std": 0.2536615710705519, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 3760.875, "completions/max_terminated_length": 3760.875, "completions/mean_length": 1776.86328125, "completions/mean_terminated_length": 2432.289665222168, "completions/min_length": 0.0, "completions/min_terminated_length": 1166.875, "epoch": 0.0655575850323345, "grad_norm": 0.29533181399958297, "learning_rate": 1e-06, "loss": -0.0337, "num_tokens": 50063238.0, "reward": 0.30859375, "reward_std": 0.19807013869285583, "rewards/reward_model/mean": 0.30859375, "rewards/reward_model/std": 0.19807014986872673, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2985.375, "completions/max_terminated_length": 2985.375, "completions/mean_length": 1198.50390625, "completions/mean_terminated_length": 1598.0052604675293, "completions/min_length": 0.0, "completions/min_terminated_length": 848.5, "epoch": 0.06595251024337266, "grad_norm": 0.3374303689873649, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 50385575.0, "reward": 0.3984375, "reward_std": 0.21547781676054, "rewards/reward_model/mean": 0.3984375, "rewards/reward_model/std": 0.21547782607376575, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2712.25, "completions/max_terminated_length": 2712.25, "completions/mean_length": 1096.6171875, "completions/mean_terminated_length": 1462.1562881469727, "completions/min_length": 0.0, "completions/min_terminated_length": 720.6875, "epoch": 0.06634743545441082, "grad_norm": 0.4827379063759844, "learning_rate": 1e-06, "loss": -0.0887, "num_tokens": 50684533.0, "reward": 0.515625, "reward_std": 0.39798038825392723, "rewards/reward_model/mean": 0.515625, "rewards/reward_model/std": 0.3979804050177336, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.27734375, "completions/max_length": 2681.3125, "completions/max_terminated_length": 2681.3125, "completions/mean_length": 1148.46484375, "completions/mean_terminated_length": 1790.1771354675293, "completions/min_length": 0.0, "completions/min_terminated_length": 957.6875, "epoch": 0.06674236066544897, "grad_norm": 0.464684415743529, "learning_rate": 1e-06, "loss": -0.0436, "num_tokens": 50995772.0, "reward": 0.5390625, "reward_std": 0.2997266612946987, "rewards/reward_model/mean": 0.5390625, "rewards/reward_model/std": 0.2997266724705696, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2385.75, "completions/max_terminated_length": 2385.75, "completions/mean_length": 1078.94921875, "completions/mean_terminated_length": 1438.599006652832, "completions/min_length": 0.0, "completions/min_terminated_length": 749.25, "epoch": 0.06713728587648714, "grad_norm": 0.4770198158759528, "learning_rate": 1e-06, "loss": -0.04, "num_tokens": 51287311.0, "reward": 0.47265625, "reward_std": 0.23007603362202644, "rewards/reward_model/mean": 0.47265625, "rewards/reward_model/std": 0.2300760466605425, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2588.4375, "completions/max_terminated_length": 2588.4375, "completions/mean_length": 1082.37109375, "completions/mean_terminated_length": 1443.1615028381348, "completions/min_length": 0.0, "completions/min_terminated_length": 765.125, "epoch": 0.0675322110875253, "grad_norm": 0.49559425728235257, "learning_rate": 1e-06, "loss": -0.0489, "num_tokens": 51579822.0, "reward": 0.37109375, "reward_std": 0.32105864956974983, "rewards/reward_model/mean": 0.37109375, "rewards/reward_model/std": 0.321058664470911, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3009.875, "completions/max_terminated_length": 3009.875, "completions/mean_length": 1244.50390625, "completions/mean_terminated_length": 1659.338581085205, "completions/min_length": 0.0, "completions/min_terminated_length": 787.9375, "epoch": 0.06792713629856346, "grad_norm": 0.39292093644958104, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 51914959.0, "reward": 0.4453125, "reward_std": 0.3355427235364914, "rewards/reward_model/mean": 0.4453125, "rewards/reward_model/std": 0.3355427347123623, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 4266.0625, "completions/max_terminated_length": 4266.0625, "completions/mean_length": 1512.8203125, "completions/mean_terminated_length": 2302.0208892822266, "completions/min_length": 0.0, "completions/min_terminated_length": 1104.375, "epoch": 0.06832206150960161, "grad_norm": 0.35228218901910247, "learning_rate": 1e-06, "loss": -0.0534, "num_tokens": 52322465.0, "reward": 0.33203125, "reward_std": 0.23490957915782928, "rewards/reward_model/mean": 0.33203125, "rewards/reward_model/std": 0.23490958660840988, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3131.125, "completions/max_terminated_length": 3131.125, "completions/mean_length": 1294.859375, "completions/mean_terminated_length": 1726.47922706604, "completions/min_length": 0.0, "completions/min_terminated_length": 853.25, "epoch": 0.06871698672063978, "grad_norm": 0.49320953244409305, "learning_rate": 1e-06, "loss": -0.0509, "num_tokens": 52669181.0, "reward": 0.4375, "reward_std": 0.3958844989538193, "rewards/reward_model/mean": 0.4375, "rewards/reward_model/std": 0.3958845157176256, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2684.625, "completions/max_terminated_length": 2684.625, "completions/mean_length": 1189.8203125, "completions/mean_terminated_length": 1611.9981670379639, "completions/min_length": 0.0, "completions/min_terminated_length": 786.0625, "epoch": 0.06911191193167794, "grad_norm": 0.4277355062246785, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 52990831.0, "reward": 0.66015625, "reward_std": 0.273744560778141, "rewards/reward_model/mean": 0.66015625, "rewards/reward_model/std": 0.27374457009136677, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3250.5625, "completions/max_terminated_length": 3250.5625, "completions/mean_length": 1325.19140625, "completions/mean_terminated_length": 1766.9219207763672, "completions/min_length": 0.0, "completions/min_terminated_length": 837.875, "epoch": 0.0695068371427161, "grad_norm": 0.39696603411672826, "learning_rate": 1e-06, "loss": -0.0263, "num_tokens": 53347040.0, "reward": 0.48828125, "reward_std": 0.2808406911790371, "rewards/reward_model/mean": 0.48828125, "rewards/reward_model/std": 0.28084070049226284, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3665.8125, "completions/max_terminated_length": 3665.8125, "completions/mean_length": 1570.984375, "completions/mean_terminated_length": 2094.6458835601807, "completions/min_length": 0.0, "completions/min_terminated_length": 967.5, "epoch": 0.06990176235375425, "grad_norm": 0.2907433233710465, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 53765884.0, "reward": 0.33984375, "reward_std": 0.22710951417684555, "rewards/reward_model/mean": 0.33984375, "rewards/reward_model/std": 0.2271095272153616, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3259.125, "completions/max_terminated_length": 3259.125, "completions/mean_length": 1384.65625, "completions/mean_terminated_length": 1846.2084007263184, "completions/min_length": 0.0, "completions/min_terminated_length": 954.0, "epoch": 0.07029668756479242, "grad_norm": 0.4105058956248929, "learning_rate": 1e-06, "loss": -0.0764, "num_tokens": 54136196.0, "reward": 0.421875, "reward_std": 0.3762268088757992, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.3762268181890249, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3061.375, "completions/max_terminated_length": 3061.375, "completions/mean_length": 1280.22265625, "completions/mean_terminated_length": 1706.9635848999023, "completions/min_length": 0.0, "completions/min_terminated_length": 854.875, "epoch": 0.07069161277583058, "grad_norm": 0.39108077204504693, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 54477901.0, "reward": 0.33203125, "reward_std": 0.29475974291563034, "rewards/reward_model/mean": 0.33203125, "rewards/reward_model/std": 0.2947597559541464, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3606.375, "completions/max_terminated_length": 3606.375, "completions/mean_length": 1301.125, "completions/mean_terminated_length": 1734.8333835601807, "completions/min_length": 0.0, "completions/min_terminated_length": 685.4375, "epoch": 0.07108653798686873, "grad_norm": 0.3856262600304978, "learning_rate": 1e-06, "loss": -0.0722, "num_tokens": 54824989.0, "reward": 0.375, "reward_std": 0.2804402746260166, "rewards/reward_model/mean": 0.375, "rewards/reward_model/std": 0.28044027648866177, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2894.625, "completions/max_terminated_length": 2894.625, "completions/mean_length": 1100.44921875, "completions/mean_terminated_length": 1467.2656555175781, "completions/min_length": 0.0, "completions/min_terminated_length": 842.5, "epoch": 0.0714814631979069, "grad_norm": 0.3108676376468146, "learning_rate": 1e-06, "loss": -0.0099, "num_tokens": 55122336.0, "reward": 0.42578125, "reward_std": 0.19916532188653946, "rewards/reward_model/mean": 0.42578125, "rewards/reward_model/std": 0.1991653311997652, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3599.3125, "completions/max_terminated_length": 3599.3125, "completions/mean_length": 1557.05859375, "completions/mean_terminated_length": 2076.0781745910645, "completions/min_length": 0.0, "completions/min_terminated_length": 920.0, "epoch": 0.07187638840894506, "grad_norm": 0.33281875483155804, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 55538991.0, "reward": 0.38671875, "reward_std": 0.3362460434436798, "rewards/reward_model/mean": 0.38671875, "rewards/reward_model/std": 0.33624605275690556, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 4117.0625, "completions/max_terminated_length": 4117.0625, "completions/mean_length": 1649.85546875, "completions/mean_terminated_length": 2249.206344604492, "completions/min_length": 0.0, "completions/min_terminated_length": 1060.25, "epoch": 0.07227131361998322, "grad_norm": 0.33606342318381793, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 55977258.0, "reward": 0.25, "reward_std": 0.2787766270339489, "rewards/reward_model/mean": 0.25, "rewards/reward_model/std": 0.27877663634717464, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3735.8125, "completions/max_terminated_length": 3735.8125, "completions/mean_length": 1540.27734375, "completions/mean_terminated_length": 2053.7031860351562, "completions/min_length": 0.0, "completions/min_terminated_length": 941.5625, "epoch": 0.07266623883102137, "grad_norm": 0.3797364839171857, "learning_rate": 1e-06, "loss": -0.0513, "num_tokens": 56388113.0, "reward": 0.41015625, "reward_std": 0.36518238857388496, "rewards/reward_model/mean": 0.41015625, "rewards/reward_model/std": 0.36518239602446556, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3023.1875, "completions/max_terminated_length": 3023.1875, "completions/mean_length": 1188.66796875, "completions/mean_terminated_length": 1603.5014762878418, "completions/min_length": 0.0, "completions/min_terminated_length": 732.125, "epoch": 0.07306116404205953, "grad_norm": 0.3232396795948379, "learning_rate": 1e-06, "loss": -0.038, "num_tokens": 56706348.0, "reward": 0.3984375, "reward_std": 0.26693129912018776, "rewards/reward_model/mean": 0.3984375, "rewards/reward_model/std": 0.26693131029605865, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3694.625, "completions/max_terminated_length": 3694.625, "completions/mean_length": 1701.984375, "completions/mean_terminated_length": 2298.463146209717, "completions/min_length": 0.0, "completions/min_terminated_length": 1157.875, "epoch": 0.0734560892530977, "grad_norm": 0.3560249266908414, "learning_rate": 1e-06, "loss": 0.0454, "num_tokens": 57160488.0, "reward": 0.27734375, "reward_std": 0.28267858177423477, "rewards/reward_model/mean": 0.27734375, "rewards/reward_model/std": 0.28267860040068626, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2918.0625, "completions/max_terminated_length": 2918.0625, "completions/mean_length": 1143.90625, "completions/mean_terminated_length": 1533.192295074463, "completions/min_length": 0.0, "completions/min_terminated_length": 700.25, "epoch": 0.07385101446413585, "grad_norm": 0.4133440090514402, "learning_rate": 1e-06, "loss": -0.0458, "num_tokens": 57469568.0, "reward": 0.4140625, "reward_std": 0.3513515740633011, "rewards/reward_model/mean": 0.4140625, "rewards/reward_model/std": 0.351351598277688, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2857.5625, "completions/max_terminated_length": 2857.5625, "completions/mean_length": 1191.71484375, "completions/mean_terminated_length": 1588.9531898498535, "completions/min_length": 0.0, "completions/min_terminated_length": 803.25, "epoch": 0.07424593967517401, "grad_norm": 0.36160503393096677, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 57791047.0, "reward": 0.37890625, "reward_std": 0.26598741114139557, "rewards/reward_model/mean": 0.37890625, "rewards/reward_model/std": 0.26598742231726646, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2815.125, "completions/max_terminated_length": 2815.125, "completions/mean_length": 1150.765625, "completions/mean_terminated_length": 1534.3542098999023, "completions/min_length": 0.0, "completions/min_terminated_length": 823.6875, "epoch": 0.07464086488621217, "grad_norm": 0.423460483552251, "learning_rate": 1e-06, "loss": -0.0473, "num_tokens": 58104123.0, "reward": 0.4609375, "reward_std": 0.30655382573604584, "rewards/reward_model/mean": 0.4609375, "rewards/reward_model/std": 0.3065538424998522, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3269.0, "completions/max_terminated_length": 3269.0, "completions/mean_length": 1258.640625, "completions/mean_terminated_length": 1697.592845916748, "completions/min_length": 0.0, "completions/min_terminated_length": 755.625, "epoch": 0.07503579009725034, "grad_norm": 0.4335992347328897, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 58442431.0, "reward": 0.4296875, "reward_std": 0.270793866366148, "rewards/reward_model/mean": 0.4296875, "rewards/reward_model/std": 0.27079387567937374, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 1197.84765625, "completions/mean_terminated_length": 1597.130271911621, "completions/min_length": 0.0, "completions/min_terminated_length": 795.9375, "epoch": 0.07543071530828849, "grad_norm": 0.4242733638235913, "learning_rate": 1e-06, "loss": -0.0294, "num_tokens": 58766760.0, "reward": 0.453125, "reward_std": 0.2724410742521286, "rewards/reward_model/mean": 0.453125, "rewards/reward_model/std": 0.27244107984006405, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 3824.0, "completions/max_terminated_length": 3824.0, "completions/mean_length": 1414.73046875, "completions/mean_terminated_length": 1963.2452392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 769.8125, "epoch": 0.07582564051932665, "grad_norm": 0.44327845358071705, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 59148451.0, "reward": 0.56640625, "reward_std": 0.3298723101615906, "rewards/reward_model/mean": 0.56640625, "rewards/reward_model/std": 0.32987232133746147, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3283.3125, "completions/max_terminated_length": 3283.3125, "completions/mean_length": 1380.71875, "completions/mean_terminated_length": 1840.9583702087402, "completions/min_length": 0.0, "completions/min_terminated_length": 862.9375, "epoch": 0.07622056573036481, "grad_norm": 0.41646779617957325, "learning_rate": 1e-06, "loss": -0.1413, "num_tokens": 59520075.0, "reward": 0.5546875, "reward_std": 0.36467741429805756, "rewards/reward_model/mean": 0.5546875, "rewards/reward_model/std": 0.3646774273365736, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2614.3125, "completions/max_terminated_length": 2614.3125, "completions/mean_length": 1178.1640625, "completions/mean_terminated_length": 1570.8854751586914, "completions/min_length": 0.0, "completions/min_terminated_length": 829.75, "epoch": 0.07661549094140298, "grad_norm": 0.45344175139667314, "learning_rate": 1e-06, "loss": -0.0634, "num_tokens": 59836261.0, "reward": 0.4140625, "reward_std": 0.26316745206713676, "rewards/reward_model/mean": 0.4140625, "rewards/reward_model/std": 0.2631674613803625, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3239.625, "completions/max_terminated_length": 3239.625, "completions/mean_length": 1310.88671875, "completions/mean_terminated_length": 1747.8489875793457, "completions/min_length": 0.0, "completions/min_terminated_length": 788.9375, "epoch": 0.07701041615244113, "grad_norm": 0.4128336333603415, "learning_rate": 1e-06, "loss": -0.0697, "num_tokens": 60189432.0, "reward": 0.48828125, "reward_std": 0.26890478283166885, "rewards/reward_model/mean": 0.48828125, "rewards/reward_model/std": 0.26890479400753975, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2940.5, "completions/max_terminated_length": 2940.5, "completions/mean_length": 1162.89453125, "completions/mean_terminated_length": 1550.5260925292969, "completions/min_length": 0.0, "completions/min_terminated_length": 775.0, "epoch": 0.07740534136347929, "grad_norm": 0.3904582517701028, "learning_rate": 1e-06, "loss": 0.0474, "num_tokens": 60503933.0, "reward": 0.4609375, "reward_std": 0.3146974891424179, "rewards/reward_model/mean": 0.4609375, "rewards/reward_model/std": 0.3146975040435791, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3488.125, "completions/max_terminated_length": 3488.125, "completions/mean_length": 1490.92578125, "completions/mean_terminated_length": 2005.0081176757812, "completions/min_length": 0.0, "completions/min_terminated_length": 928.125, "epoch": 0.07780026657451745, "grad_norm": 0.31440037243493746, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 60899850.0, "reward": 0.33984375, "reward_std": 0.22272183746099472, "rewards/reward_model/mean": 0.33984375, "rewards/reward_model/std": 0.22272185236215591, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3410.5625, "completions/max_terminated_length": 3410.5625, "completions/mean_length": 1434.4765625, "completions/mean_terminated_length": 1912.635482788086, "completions/min_length": 0.0, "completions/min_terminated_length": 853.875, "epoch": 0.0781951917855556, "grad_norm": 0.3805637312779271, "learning_rate": 1e-06, "loss": -0.0194, "num_tokens": 61282420.0, "reward": 0.4140625, "reward_std": 0.30746113881468773, "rewards/reward_model/mean": 0.4140625, "rewards/reward_model/std": 0.3074611499905586, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3067.75, "completions/max_terminated_length": 3067.75, "completions/mean_length": 1346.63671875, "completions/mean_terminated_length": 1795.51566696167, "completions/min_length": 0.0, "completions/min_terminated_length": 915.8125, "epoch": 0.07859011699659377, "grad_norm": 0.3649745126243681, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 61644567.0, "reward": 0.44921875, "reward_std": 0.31945154443383217, "rewards/reward_model/mean": 0.44921875, "rewards/reward_model/std": 0.31945155188441277, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2711.5, "completions/max_terminated_length": 2711.5, "completions/mean_length": 1081.79296875, "completions/mean_terminated_length": 1442.390682220459, "completions/min_length": 0.0, "completions/min_terminated_length": 766.8125, "epoch": 0.07898504220763193, "grad_norm": 0.46654675909116666, "learning_rate": 1e-06, "loss": -0.0222, "num_tokens": 61934914.0, "reward": 0.375, "reward_std": 0.38181402906775475, "rewards/reward_model/mean": 0.375, "rewards/reward_model/std": 0.3818140495568514, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3178.4375, "completions/max_terminated_length": 3178.4375, "completions/mean_length": 1431.98828125, "completions/mean_terminated_length": 1909.3177471160889, "completions/min_length": 0.0, "completions/min_terminated_length": 833.75, "epoch": 0.1587599348373402, "grad_norm": 0.2814049141741808, "learning_rate": 1e-06, "loss": -0.034, "num_tokens": 62703372.0, "reward": 0.466796875, "reward_std": 0.27657976746559143, "rewards/reward_model/mean": 0.466796875, "rewards/reward_model/std": 0.2765797758474946, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3299.03125, "completions/max_terminated_length": 3299.03125, "completions/mean_length": 1360.28125, "completions/mean_terminated_length": 1817.4189891815186, "completions/min_length": 0.0, "completions/min_terminated_length": 889.6875, "epoch": 0.1595497852594165, "grad_norm": 0.30942571037588973, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 63434844.0, "reward": 0.3984375, "reward_std": 0.36201424337923527, "rewards/reward_model/mean": 0.3984375, "rewards/reward_model/std": 0.36201425548642874, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2683.28125, "completions/max_terminated_length": 2683.28125, "completions/mean_length": 1140.6640625, "completions/mean_terminated_length": 1520.885456085205, "completions/min_length": 0.0, "completions/min_terminated_length": 740.5625, "epoch": 0.16033963568149281, "grad_norm": 0.2760033603265308, "learning_rate": 1e-06, "loss": -0.0486, "num_tokens": 64050592.0, "reward": 0.5, "reward_std": 0.26983178593218327, "rewards/reward_model/mean": 0.5, "rewards/reward_model/std": 0.26983179431408644, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3202.53125, "completions/max_terminated_length": 3202.53125, "completions/mean_length": 1333.94921875, "completions/mean_terminated_length": 1790.0149688720703, "completions/min_length": 0.0, "completions/min_terminated_length": 812.5, "epoch": 0.16112948610356914, "grad_norm": 0.2969151890337655, "learning_rate": 1e-06, "loss": -0.0421, "num_tokens": 64765590.0, "reward": 0.447265625, "reward_std": 0.31724646501243114, "rewards/reward_model/mean": 0.447265625, "rewards/reward_model/std": 0.3172464733943343, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 2805.875, "completions/max_terminated_length": 2805.875, "completions/mean_length": 1149.75390625, "completions/mean_terminated_length": 1544.6072883605957, "completions/min_length": 0.0, "completions/min_terminated_length": 727.59375, "epoch": 0.16191933652564547, "grad_norm": 0.2779571574671361, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 65388392.0, "reward": 0.5390625, "reward_std": 0.3033733330667019, "rewards/reward_model/mean": 0.5390625, "rewards/reward_model/std": 0.30337334889918566, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3383.8125, "completions/max_terminated_length": 3383.8125, "completions/mean_length": 1444.13671875, "completions/mean_terminated_length": 1935.851152420044, "completions/min_length": 0.0, "completions/min_terminated_length": 804.5625, "epoch": 0.16270918694772177, "grad_norm": 0.28730628043213724, "learning_rate": 1e-06, "loss": -0.0208, "num_tokens": 66158350.0, "reward": 0.41015625, "reward_std": 0.23379137553274632, "rewards/reward_model/mean": 0.41015625, "rewards/reward_model/std": 0.23379138484597206, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 2980.28125, "completions/max_terminated_length": 2980.28125, "completions/mean_length": 1296.38671875, "completions/mean_terminated_length": 1759.1370449066162, "completions/min_length": 0.0, "completions/min_terminated_length": 905.21875, "epoch": 0.1634990373697981, "grad_norm": 0.2909873838175628, "learning_rate": 1e-06, "loss": -0.0346, "num_tokens": 66857940.0, "reward": 0.513671875, "reward_std": 0.3290997166186571, "rewards/reward_model/mean": 0.513671875, "rewards/reward_model/std": 0.329099727794528, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3814.125, "completions/max_terminated_length": 3814.125, "completions/mean_length": 1571.580078125, "completions/mean_terminated_length": 2105.598768234253, "completions/min_length": 0.0, "completions/min_terminated_length": 906.84375, "epoch": 0.16428888779187442, "grad_norm": 0.26345285027745713, "learning_rate": 1e-06, "loss": -0.0327, "num_tokens": 67698189.0, "reward": 0.41015625, "reward_std": 0.28272905945777893, "rewards/reward_model/mean": 0.41015625, "rewards/reward_model/std": 0.28272906970232725, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.263671875, "completions/max_length": 3598.46875, "completions/max_terminated_length": 3598.46875, "completions/mean_length": 1326.0546875, "completions/mean_terminated_length": 1842.4229402542114, "completions/min_length": 0.0, "completions/min_terminated_length": 848.3125, "epoch": 0.16507873821395072, "grad_norm": 0.2927463318977004, "learning_rate": 1e-06, "loss": -0.0469, "num_tokens": 68413577.0, "reward": 0.34765625, "reward_std": 0.28284349106252193, "rewards/reward_model/mean": 0.34765625, "rewards/reward_model/std": 0.28284350503236055, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1277.693359375, "completions/mean_terminated_length": 1703.5912075042725, "completions/min_length": 0.0, "completions/min_terminated_length": 837.5625, "epoch": 0.16586858863602705, "grad_norm": 0.2929468518291329, "learning_rate": 1e-06, "loss": -0.0287, "num_tokens": 69097196.0, "reward": 0.517578125, "reward_std": 0.3065659645944834, "rewards/reward_model/mean": 0.517578125, "rewards/reward_model/std": 0.3065659748390317, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3346.625, "completions/max_terminated_length": 3346.625, "completions/mean_length": 1447.90234375, "completions/mean_terminated_length": 1937.6612739562988, "completions/min_length": 0.0, "completions/min_terminated_length": 893.34375, "epoch": 0.16665843905810337, "grad_norm": 0.2674649613328794, "learning_rate": 1e-06, "loss": -0.0261, "num_tokens": 69875034.0, "reward": 0.353515625, "reward_std": 0.27468436025083065, "rewards/reward_model/mean": 0.353515625, "rewards/reward_model/std": 0.2746843649074435, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2655.1875, "completions/max_terminated_length": 2655.1875, "completions/mean_length": 1153.390625, "completions/mean_terminated_length": 1537.854196548462, "completions/min_length": 0.0, "completions/min_terminated_length": 847.375, "epoch": 0.1674482894801797, "grad_norm": 0.34788227198087907, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 70497554.0, "reward": 0.4765625, "reward_std": 0.33552716858685017, "rewards/reward_model/mean": 0.4765625, "rewards/reward_model/std": 0.3355271853506565, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3487.1875, "completions/max_terminated_length": 3487.1875, "completions/mean_length": 1387.255859375, "completions/mean_terminated_length": 1855.5393562316895, "completions/min_length": 0.0, "completions/min_terminated_length": 895.8125, "epoch": 0.168238139902256, "grad_norm": 0.2327186232121818, "learning_rate": 1e-06, "loss": -0.023, "num_tokens": 71244101.0, "reward": 0.478515625, "reward_std": 0.28044041246175766, "rewards/reward_model/mean": 0.478515625, "rewards/reward_model/std": 0.28044042363762856, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 2912.84375, "completions/max_terminated_length": 2912.84375, "completions/mean_length": 1303.65234375, "completions/mean_terminated_length": 1780.163803100586, "completions/min_length": 0.0, "completions/min_terminated_length": 910.25, "epoch": 0.16902799032433233, "grad_norm": 0.27395388973670814, "learning_rate": 1e-06, "loss": -0.0429, "num_tokens": 71948627.0, "reward": 0.587890625, "reward_std": 0.26359099708497524, "rewards/reward_model/mean": 0.587890625, "rewards/reward_model/std": 0.26359100453555584, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3103.84375, "completions/max_terminated_length": 3103.84375, "completions/mean_length": 1314.623046875, "completions/mean_terminated_length": 1764.2043533325195, "completions/min_length": 0.0, "completions/min_terminated_length": 852.96875, "epoch": 0.16981784074640865, "grad_norm": 0.25394356717766065, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 72656162.0, "reward": 0.337890625, "reward_std": 0.23936727084219456, "rewards/reward_model/mean": 0.337890625, "rewards/reward_model/std": 0.23936727829277515, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3798.15625, "completions/max_terminated_length": 3798.15625, "completions/mean_length": 1534.83984375, "completions/mean_terminated_length": 2058.8421573638916, "completions/min_length": 0.0, "completions/min_terminated_length": 929.09375, "epoch": 0.17060769116848498, "grad_norm": 0.2841932449099531, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 73474336.0, "reward": 0.361328125, "reward_std": 0.3034173045307398, "rewards/reward_model/mean": 0.361328125, "rewards/reward_model/std": 0.3034173157066107, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3091.75, "completions/max_terminated_length": 3091.75, "completions/mean_length": 1276.23046875, "completions/mean_terminated_length": 1701.6406784057617, "completions/min_length": 0.0, "completions/min_terminated_length": 802.34375, "epoch": 0.17139754159056128, "grad_norm": 0.2805585872661958, "learning_rate": 1e-06, "loss": -0.02, "num_tokens": 74157238.0, "reward": 0.359375, "reward_std": 0.2801475264132023, "rewards/reward_model/mean": 0.359375, "rewards/reward_model/std": 0.2801475403830409, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 2974.96875, "completions/max_terminated_length": 2974.96875, "completions/mean_length": 1198.87109375, "completions/mean_terminated_length": 1611.803544998169, "completions/min_length": 0.0, "completions/min_terminated_length": 786.71875, "epoch": 0.1721873920126376, "grad_norm": 0.3083129126050061, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 74803188.0, "reward": 0.375, "reward_std": 0.29777604155242443, "rewards/reward_model/mean": 0.375, "rewards/reward_model/std": 0.2977760564535856, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3106.40625, "completions/max_terminated_length": 3106.40625, "completions/mean_length": 1300.642578125, "completions/mean_terminated_length": 1734.1901607513428, "completions/min_length": 0.0, "completions/min_terminated_length": 797.875, "epoch": 0.17297724243471393, "grad_norm": 0.288052675793488, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 75501085.0, "reward": 0.458984375, "reward_std": 0.3351223673671484, "rewards/reward_model/mean": 0.458984375, "rewards/reward_model/std": 0.33512238040566444, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3137.21875, "completions/max_terminated_length": 3137.21875, "completions/mean_length": 1352.900390625, "completions/mean_terminated_length": 1824.8489971160889, "completions/min_length": 0.0, "completions/min_terminated_length": 807.25, "epoch": 0.17376709285679023, "grad_norm": 0.29974353478029364, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 76224074.0, "reward": 0.458984375, "reward_std": 0.2986700553447008, "rewards/reward_model/mean": 0.458984375, "rewards/reward_model/std": 0.2986700674518943, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.248046875, "completions/max_length": 3299.75, "completions/max_terminated_length": 3299.75, "completions/mean_length": 1406.54296875, "completions/mean_terminated_length": 1872.8500022888184, "completions/min_length": 0.0, "completions/min_terminated_length": 966.34375, "epoch": 0.17455694327886656, "grad_norm": 0.28246657660179864, "learning_rate": 1e-06, "loss": -0.0167, "num_tokens": 76971152.0, "reward": 0.44921875, "reward_std": 0.33251417241990566, "rewards/reward_model/mean": 0.44921875, "rewards/reward_model/std": 0.332514182664454, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3414.875, "completions/max_terminated_length": 3414.875, "completions/mean_length": 1361.201171875, "completions/mean_terminated_length": 1814.9349422454834, "completions/min_length": 0.0, "completions/min_terminated_length": 877.1875, "epoch": 0.1753467937009429, "grad_norm": 0.26626162576843426, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 77703015.0, "reward": 0.34375, "reward_std": 0.2526954747736454, "rewards/reward_model/mean": 0.34375, "rewards/reward_model/std": 0.25269548688083887, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3561.28125, "completions/max_terminated_length": 3561.28125, "completions/mean_length": 1527.4609375, "completions/mean_terminated_length": 2051.8052082061768, "completions/min_length": 0.0, "completions/min_terminated_length": 959.28125, "epoch": 0.17613664412301921, "grad_norm": 0.2907279904884026, "learning_rate": 1e-06, "loss": -0.018, "num_tokens": 78522227.0, "reward": 0.42578125, "reward_std": 0.34299319609999657, "rewards/reward_model/mean": 0.42578125, "rewards/reward_model/std": 0.3429932054132223, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3390.96875, "completions/max_terminated_length": 3390.96875, "completions/mean_length": 1426.935546875, "completions/mean_terminated_length": 1897.7429447174072, "completions/min_length": 0.0, "completions/min_terminated_length": 923.1875, "epoch": 0.17692649454509551, "grad_norm": 0.2768968499967981, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 79282930.0, "reward": 0.439453125, "reward_std": 0.31962933391332626, "rewards/reward_model/mean": 0.439453125, "rewards/reward_model/std": 0.31962934881448746, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2636.3125, "completions/max_terminated_length": 2636.3125, "completions/mean_length": 1180.75390625, "completions/mean_terminated_length": 1574.3385944366455, "completions/min_length": 0.0, "completions/min_terminated_length": 811.71875, "epoch": 0.17771634496717184, "grad_norm": 0.2711190320719356, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 79919796.0, "reward": 0.515625, "reward_std": 0.271996159106493, "rewards/reward_model/mean": 0.515625, "rewards/reward_model/std": 0.27199616841971874, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3436.96875, "completions/max_terminated_length": 3436.96875, "completions/mean_length": 1397.9765625, "completions/mean_terminated_length": 1863.9687986373901, "completions/min_length": 0.0, "completions/min_terminated_length": 882.5625, "epoch": 0.17850619538924817, "grad_norm": 0.2912651553704893, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 80667848.0, "reward": 0.490234375, "reward_std": 0.36662535928189754, "rewards/reward_model/mean": 0.490234375, "rewards/reward_model/std": 0.36662537325173616, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3785.28125, "completions/max_terminated_length": 3785.28125, "completions/mean_length": 1605.783203125, "completions/mean_terminated_length": 2173.9302520751953, "completions/min_length": 0.0, "completions/min_terminated_length": 998.125, "epoch": 0.17929604581132447, "grad_norm": 0.25778491294190986, "learning_rate": 1e-06, "loss": -0.0725, "num_tokens": 81520073.0, "reward": 0.34765625, "reward_std": 0.2923587951809168, "rewards/reward_model/mean": 0.34765625, "rewards/reward_model/std": 0.29235881101340055, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.263671875, "completions/max_length": 4171.59375, "completions/max_terminated_length": 4171.59375, "completions/mean_length": 1613.1640625, "completions/mean_terminated_length": 2236.1502952575684, "completions/min_length": 0.0, "completions/min_terminated_length": 881.28125, "epoch": 0.1800858962334008, "grad_norm": 0.24560564704563545, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 82381837.0, "reward": 0.3671875, "reward_std": 0.30040896870195866, "rewards/reward_model/mean": 0.3671875, "rewards/reward_model/std": 0.3004089882597327, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2885.40625, "completions/max_terminated_length": 2885.40625, "completions/mean_length": 1242.787109375, "completions/mean_terminated_length": 1657.049518585205, "completions/min_length": 0.0, "completions/min_terminated_length": 883.9375, "epoch": 0.18087574665547712, "grad_norm": 0.2997984543800761, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 83052752.0, "reward": 0.44140625, "reward_std": 0.304364213719964, "rewards/reward_model/mean": 0.44140625, "rewards/reward_model/std": 0.30436422396451235, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3351.1875, "completions/max_terminated_length": 3351.1875, "completions/mean_length": 1389.552734375, "completions/mean_terminated_length": 1852.73703956604, "completions/min_length": 0.0, "completions/min_terminated_length": 821.65625, "epoch": 0.18166559707755345, "grad_norm": 0.2828624054535828, "learning_rate": 1e-06, "loss": -0.0239, "num_tokens": 83799515.0, "reward": 0.4453125, "reward_std": 0.3331186156719923, "rewards/reward_model/mean": 0.4453125, "rewards/reward_model/std": 0.33311863243579865, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3656.8125, "completions/max_terminated_length": 3656.8125, "completions/mean_length": 1472.830078125, "completions/mean_terminated_length": 1966.0911979675293, "completions/min_length": 0.0, "completions/min_terminated_length": 821.0625, "epoch": 0.18245544749962975, "grad_norm": 0.23508703064139858, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 84586580.0, "reward": 0.318359375, "reward_std": 0.27943449281156063, "rewards/reward_model/mean": 0.318359375, "rewards/reward_model/std": 0.27943450678139925, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3263.34375, "completions/max_terminated_length": 3263.34375, "completions/mean_length": 1398.359375, "completions/mean_terminated_length": 1864.4792079925537, "completions/min_length": 0.0, "completions/min_terminated_length": 956.125, "epoch": 0.18324529792170607, "grad_norm": 0.29197416700959045, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 85332556.0, "reward": 0.435546875, "reward_std": 0.33099472895264626, "rewards/reward_model/mean": 0.435546875, "rewards/reward_model/std": 0.33099474478513, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3364.1875, "completions/max_terminated_length": 3364.1875, "completions/mean_length": 1465.591796875, "completions/mean_terminated_length": 1954.1224632263184, "completions/min_length": 0.0, "completions/min_terminated_length": 940.53125, "epoch": 0.1840351483437824, "grad_norm": 0.28812183479294723, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 86113611.0, "reward": 0.431640625, "reward_std": 0.3106132224202156, "rewards/reward_model/mean": 0.431640625, "rewards/reward_model/std": 0.3106132326647639, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 3015.53125, "completions/max_terminated_length": 3015.53125, "completions/mean_length": 1161.607421875, "completions/mean_terminated_length": 1598.7617492675781, "completions/min_length": 0.0, "completions/min_terminated_length": 754.78125, "epoch": 0.18482499876585873, "grad_norm": 0.2970474207840448, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 86742722.0, "reward": 0.44921875, "reward_std": 0.28219635784626007, "rewards/reward_model/mean": 0.44921875, "rewards/reward_model/std": 0.2821963746100664, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3468.03125, "completions/max_terminated_length": 3468.03125, "completions/mean_length": 1447.220703125, "completions/mean_terminated_length": 1939.758108139038, "completions/min_length": 0.0, "completions/min_terminated_length": 849.46875, "epoch": 0.18561484918793503, "grad_norm": 0.26618996649128596, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 87513539.0, "reward": 0.41796875, "reward_std": 0.3304977659136057, "rewards/reward_model/mean": 0.41796875, "rewards/reward_model/std": 0.3304977770894766, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3533.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 1500.783203125, "completions/mean_terminated_length": 2001.0443210601807, "completions/min_length": 0.0, "completions/min_terminated_length": 918.6875, "epoch": 0.18640469961001135, "grad_norm": 0.29613507564014113, "learning_rate": 1e-06, "loss": -0.0567, "num_tokens": 88311796.0, "reward": 0.486328125, "reward_std": 0.3116069156676531, "rewards/reward_model/mean": 0.486328125, "rewards/reward_model/std": 0.31160692870616913, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 4057.78125, "completions/max_terminated_length": 4057.78125, "completions/mean_length": 1587.142578125, "completions/mean_terminated_length": 2138.856294631958, "completions/min_length": 0.0, "completions/min_terminated_length": 864.875, "epoch": 0.18719455003208768, "grad_norm": 0.24944102337784735, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 89159613.0, "reward": 0.3984375, "reward_std": 0.2852395474910736, "rewards/reward_model/mean": 0.3984375, "rewards/reward_model/std": 0.2852395586669445, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3401.4375, "completions/max_terminated_length": 3401.4375, "completions/mean_length": 1463.185546875, "completions/mean_terminated_length": 1953.7472114562988, "completions/min_length": 0.0, "completions/min_terminated_length": 863.125, "epoch": 0.18798440045416398, "grad_norm": 0.2915045393341782, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 89940588.0, "reward": 0.431640625, "reward_std": 0.33506107702851295, "rewards/reward_model/mean": 0.431640625, "rewards/reward_model/std": 0.3350610891357064, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.263671875, "completions/max_length": 4945.46875, "completions/max_terminated_length": 4945.46875, "completions/mean_length": 1991.697265625, "completions/mean_terminated_length": 2738.3905601501465, "completions/min_length": 0.0, "completions/min_terminated_length": 1088.53125, "epoch": 0.1887742508762403, "grad_norm": 0.2374119640622859, "learning_rate": 1e-06, "loss": -0.0225, "num_tokens": 90999105.0, "reward": 0.322265625, "reward_std": 0.2961189989000559, "rewards/reward_model/mean": 0.322265625, "rewards/reward_model/std": 0.29611900355666876, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3640.0625, "completions/max_terminated_length": 3640.0625, "completions/mean_length": 1574.607421875, "completions/mean_terminated_length": 2099.4766387939453, "completions/min_length": 0.0, "completions/min_terminated_length": 975.375, "epoch": 0.18956410129831663, "grad_norm": 0.2652046469032235, "learning_rate": 1e-06, "loss": -0.0483, "num_tokens": 91844648.0, "reward": 0.353515625, "reward_std": 0.3121224623173475, "rewards/reward_model/mean": 0.353515625, "rewards/reward_model/std": 0.3121224772185087, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3264.96875, "completions/max_terminated_length": 3264.96875, "completions/mean_length": 1318.0625, "completions/mean_terminated_length": 1757.4167156219482, "completions/min_length": 0.0, "completions/min_terminated_length": 809.53125, "epoch": 0.19035395172039296, "grad_norm": 0.2729599591931354, "learning_rate": 1e-06, "loss": -0.0411, "num_tokens": 92552232.0, "reward": 0.421875, "reward_std": 0.2657746188342571, "rewards/reward_model/mean": 0.421875, "rewards/reward_model/std": 0.2657746262848377, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3538.9375, "completions/max_terminated_length": 3538.9375, "completions/mean_length": 1527.865234375, "completions/mean_terminated_length": 2043.749807357788, "completions/min_length": 0.0, "completions/min_terminated_length": 940.34375, "epoch": 0.19114380214246926, "grad_norm": 0.2634091029231154, "learning_rate": 1e-06, "loss": -0.0437, "num_tokens": 93365955.0, "reward": 0.4296875, "reward_std": 0.2709125243127346, "rewards/reward_model/mean": 0.4296875, "rewards/reward_model/std": 0.2709125317633152, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3043.4375, "completions/max_terminated_length": 3043.4375, "completions/mean_length": 1247.07421875, "completions/mean_terminated_length": 1662.7656726837158, "completions/min_length": 0.0, "completions/min_terminated_length": 841.09375, "epoch": 0.1919336525645456, "grad_norm": 0.3088905654921969, "learning_rate": 1e-06, "loss": -0.0389, "num_tokens": 94037001.0, "reward": 0.416015625, "reward_std": 0.36305131390690804, "rewards/reward_model/mean": 0.416015625, "rewards/reward_model/std": 0.3630513232201338, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3579.0625, "completions/max_terminated_length": 3579.0625, "completions/mean_length": 1442.791015625, "completions/mean_terminated_length": 1923.7213916778564, "completions/min_length": 0.0, "completions/min_terminated_length": 817.90625, "epoch": 0.19272350298662191, "grad_norm": 0.30849947044846293, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 94809630.0, "reward": 0.48046875, "reward_std": 0.35072560235857964, "rewards/reward_model/mean": 0.48046875, "rewards/reward_model/std": 0.35072562005370855, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3650.65625, "completions/max_terminated_length": 3650.65625, "completions/mean_length": 1532.7421875, "completions/mean_terminated_length": 2043.6562976837158, "completions/min_length": 0.0, "completions/min_terminated_length": 917.40625, "epoch": 0.19351335340869824, "grad_norm": 0.2785650762333097, "learning_rate": 1e-06, "loss": -0.0372, "num_tokens": 95627194.0, "reward": 0.42578125, "reward_std": 0.3126500640064478, "rewards/reward_model/mean": 0.42578125, "rewards/reward_model/std": 0.3126500779762864, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3407.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 1454.650390625, "completions/mean_terminated_length": 1939.5339126586914, "completions/min_length": 0.0, "completions/min_terminated_length": 920.5, "epoch": 0.19430320383077454, "grad_norm": 0.28226736091385346, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 96406215.0, "reward": 0.3828125, "reward_std": 0.3126484379172325, "rewards/reward_model/mean": 0.3828125, "rewards/reward_model/std": 0.312648450024426, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3844.6875, "completions/max_terminated_length": 3844.6875, "completions/mean_length": 1497.228515625, "completions/mean_terminated_length": 2013.922981262207, "completions/min_length": 0.0, "completions/min_terminated_length": 896.21875, "epoch": 0.19509305425285087, "grad_norm": 0.2985930060118006, "learning_rate": 1e-06, "loss": -0.0579, "num_tokens": 97205132.0, "reward": 0.486328125, "reward_std": 0.34764265455305576, "rewards/reward_model/mean": 0.486328125, "rewards/reward_model/std": 0.3476426647976041, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 4134.5, "completions/max_terminated_length": 4134.5, "completions/mean_length": 1672.998046875, "completions/mean_terminated_length": 2247.149663925171, "completions/min_length": 0.0, "completions/min_terminated_length": 1125.34375, "epoch": 0.1958829046749272, "grad_norm": 0.2463096423077944, "learning_rate": 1e-06, "loss": -0.0225, "num_tokens": 98095387.0, "reward": 0.35546875, "reward_std": 0.22585793770849705, "rewards/reward_model/mean": 0.35546875, "rewards/reward_model/std": 0.22585794795304537, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3649.03125, "completions/max_terminated_length": 3649.03125, "completions/mean_length": 1443.771484375, "completions/mean_terminated_length": 1925.0287075042725, "completions/min_length": 0.0, "completions/min_terminated_length": 906.375, "epoch": 0.1966727550970035, "grad_norm": 0.2773169418269596, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 98866054.0, "reward": 0.380859375, "reward_std": 0.2450924515724182, "rewards/reward_model/mean": 0.380859375, "rewards/reward_model/std": 0.2450924627482891, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3274.09375, "completions/max_terminated_length": 3274.09375, "completions/mean_length": 1428.625, "completions/mean_terminated_length": 1913.7323036193848, "completions/min_length": 0.0, "completions/min_terminated_length": 932.59375, "epoch": 0.19746260551907982, "grad_norm": 0.30796291257363656, "learning_rate": 1e-06, "loss": -0.0235, "num_tokens": 99630774.0, "reward": 0.4375, "reward_std": 0.31107109412550926, "rewards/reward_model/mean": 0.4375, "rewards/reward_model/std": 0.3110711080953479, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3950.4375, "completions/max_terminated_length": 3950.4375, "completions/mean_length": 1593.66015625, "completions/mean_terminated_length": 2141.592878341675, "completions/min_length": 0.0, "completions/min_terminated_length": 961.5, "epoch": 0.19825245594115615, "grad_norm": 0.2800433062937126, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 100476744.0, "reward": 0.458984375, "reward_std": 0.3172136712819338, "rewards/reward_model/mean": 0.458984375, "rewards/reward_model/std": 0.31721368059515953, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3160.96875, "completions/max_terminated_length": 3160.96875, "completions/mean_length": 1411.583984375, "completions/mean_terminated_length": 1882.1120262145996, "completions/min_length": 0.0, "completions/min_terminated_length": 890.40625, "epoch": 0.19904230636323247, "grad_norm": 0.2545960004536864, "learning_rate": 1e-06, "loss": -0.0393, "num_tokens": 101235251.0, "reward": 0.390625, "reward_std": 0.28924927674233913, "rewards/reward_model/mean": 0.390625, "rewards/reward_model/std": 0.2892492860555649, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3066.78125, "completions/max_terminated_length": 3066.78125, "completions/mean_length": 1272.595703125, "completions/mean_terminated_length": 1705.7346591949463, "completions/min_length": 0.0, "completions/min_terminated_length": 806.09375, "epoch": 0.19983215678530877, "grad_norm": 0.2848909833954553, "learning_rate": 1e-06, "loss": -0.0245, "num_tokens": 101917044.0, "reward": 0.37109375, "reward_std": 0.2635906897485256, "rewards/reward_model/mean": 0.37109375, "rewards/reward_model/std": 0.26359069999307394, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 3542.25, "completions/max_terminated_length": 3542.25, "completions/mean_length": 1460.732421875, "completions/mean_terminated_length": 1985.3203601837158, "completions/min_length": 0.0, "completions/min_terminated_length": 919.0625, "epoch": 0.2006220072073851, "grad_norm": 0.2546429895287337, "learning_rate": 1e-06, "loss": -0.0465, "num_tokens": 102699755.0, "reward": 0.396484375, "reward_std": 0.29285567440092564, "rewards/reward_model/mean": 0.396484375, "rewards/reward_model/std": 0.29285568464547396, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3456.15625, "completions/max_terminated_length": 3456.15625, "completions/mean_length": 1429.978515625, "completions/mean_terminated_length": 1921.5654010772705, "completions/min_length": 0.0, "completions/min_terminated_length": 908.40625, "epoch": 0.20141185762946143, "grad_norm": 0.29274084446592236, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 103463984.0, "reward": 0.369140625, "reward_std": 0.33413926139473915, "rewards/reward_model/mean": 0.369140625, "rewards/reward_model/std": 0.3341392697766423, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 3417.46875, "completions/max_terminated_length": 3417.46875, "completions/mean_length": 1436.517578125, "completions/mean_terminated_length": 1951.438808441162, "completions/min_length": 0.0, "completions/min_terminated_length": 869.25, "epoch": 0.20220170805153773, "grad_norm": 0.30259238492469176, "learning_rate": 1e-06, "loss": -0.0353, "num_tokens": 104227097.0, "reward": 0.44140625, "reward_std": 0.3677856754511595, "rewards/reward_model/mean": 0.44140625, "rewards/reward_model/std": 0.3677856894209981, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3364.5625, "completions/max_terminated_length": 3364.5625, "completions/mean_length": 1448.3125, "completions/mean_terminated_length": 1931.0833778381348, "completions/min_length": 0.0, "completions/min_terminated_length": 914.4375, "epoch": 0.20299155847361405, "grad_norm": 0.2537836915470743, "learning_rate": 1e-06, "loss": -0.0185, "num_tokens": 105003033.0, "reward": 0.46484375, "reward_std": 0.30860251002013683, "rewards/reward_model/mean": 0.46484375, "rewards/reward_model/std": 0.3086025146767497, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.259765625, "completions/max_length": 3713.5625, "completions/max_terminated_length": 3713.5625, "completions/mean_length": 1565.392578125, "completions/mean_terminated_length": 2138.0735931396484, "completions/min_length": 0.0, "completions/min_terminated_length": 942.21875, "epoch": 0.20378140889569038, "grad_norm": 0.2373742432881273, "learning_rate": 1e-06, "loss": -0.0455, "num_tokens": 105840242.0, "reward": 0.423828125, "reward_std": 0.2807525470852852, "rewards/reward_model/mean": 0.423828125, "rewards/reward_model/std": 0.2807525545358658, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3552.28125, "completions/max_terminated_length": 3552.28125, "completions/mean_length": 1495.025390625, "completions/mean_terminated_length": 2004.6477870941162, "completions/min_length": 0.0, "completions/min_terminated_length": 900.21875, "epoch": 0.2045712593177667, "grad_norm": 0.28497460760627613, "learning_rate": 1e-06, "loss": -0.0253, "num_tokens": 106635535.0, "reward": 0.375, "reward_std": 0.28569342382252216, "rewards/reward_model/mean": 0.375, "rewards/reward_model/std": 0.2856934303417802, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3877.28125, "completions/max_terminated_length": 3877.28125, "completions/mean_length": 1598.38671875, "completions/mean_terminated_length": 2131.182357788086, "completions/min_length": 0.0, "completions/min_terminated_length": 1012.0, "epoch": 0.205361109739843, "grad_norm": 0.2459788289129746, "learning_rate": 1e-06, "loss": -0.0282, "num_tokens": 107488485.0, "reward": 0.384765625, "reward_std": 0.26424265280365944, "rewards/reward_model/mean": 0.384765625, "rewards/reward_model/std": 0.2642426611855626, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3799.03125, "completions/max_terminated_length": 3799.03125, "completions/mean_length": 1548.203125, "completions/mean_terminated_length": 2074.1873092651367, "completions/min_length": 0.0, "completions/min_terminated_length": 860.875, "epoch": 0.20615096016191933, "grad_norm": 0.2675784896859238, "learning_rate": 1e-06, "loss": -0.0313, "num_tokens": 108311997.0, "reward": 0.40625, "reward_std": 0.27714776806533337, "rewards/reward_model/mean": 0.40625, "rewards/reward_model/std": 0.2771477773785591, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3489.9375, "completions/max_terminated_length": 3489.9375, "completions/mean_length": 1428.130859375, "completions/mean_terminated_length": 1934.5771522521973, "completions/min_length": 0.0, "completions/min_terminated_length": 976.53125, "epoch": 0.20694081058399566, "grad_norm": 0.28298942876055977, "learning_rate": 1e-06, "loss": -0.0211, "num_tokens": 109079200.0, "reward": 0.505859375, "reward_std": 0.2774980217218399, "rewards/reward_model/mean": 0.505859375, "rewards/reward_model/std": 0.2774980263784528, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3526.15625, "completions/max_terminated_length": 3526.15625, "completions/mean_length": 1537.171875, "completions/mean_terminated_length": 2080.155279159546, "completions/min_length": 0.0, "completions/min_terminated_length": 1031.34375, "epoch": 0.207730661006072, "grad_norm": 0.23750534485941546, "learning_rate": 1e-06, "loss": -0.0448, "num_tokens": 109899624.0, "reward": 0.38671875, "reward_std": 0.26891973055899143, "rewards/reward_model/mean": 0.38671875, "rewards/reward_model/std": 0.26891974080353975, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3044.6875, "completions/max_terminated_length": 3044.6875, "completions/mean_length": 1179.916015625, "completions/mean_terminated_length": 1573.2214069366455, "completions/min_length": 0.0, "completions/min_terminated_length": 781.5625, "epoch": 0.2085205114281483, "grad_norm": 0.33131176316982236, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 110534461.0, "reward": 0.359375, "reward_std": 0.29812896996736526, "rewards/reward_model/mean": 0.359375, "rewards/reward_model/std": 0.29812897834926844, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3133.625, "completions/max_terminated_length": 3133.625, "completions/mean_length": 1395.275390625, "completions/mean_terminated_length": 1870.9751930236816, "completions/min_length": 0.0, "completions/min_terminated_length": 962.21875, "epoch": 0.2093103618502246, "grad_norm": 0.27701051408611654, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 111282858.0, "reward": 0.517578125, "reward_std": 0.3270687162876129, "rewards/reward_model/mean": 0.517578125, "rewards/reward_model/std": 0.3270687311887741, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3408.8125, "completions/max_terminated_length": 3408.8125, "completions/mean_length": 1439.201171875, "completions/mean_terminated_length": 1942.8696174621582, "completions/min_length": 0.0, "completions/min_terminated_length": 912.1875, "epoch": 0.21010021227230094, "grad_norm": 0.2670395682435996, "learning_rate": 1e-06, "loss": -0.032, "num_tokens": 112051281.0, "reward": 0.552734375, "reward_std": 0.26090834103524685, "rewards/reward_model/mean": 0.552734375, "rewards/reward_model/std": 0.2609083531424403, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3432.25, "completions/max_terminated_length": 3432.25, "completions/mean_length": 1521.814453125, "completions/mean_terminated_length": 2029.0860080718994, "completions/min_length": 0.0, "completions/min_terminated_length": 936.0625, "epoch": 0.21089006269437724, "grad_norm": 0.2592838023483326, "learning_rate": 1e-06, "loss": -0.0694, "num_tokens": 112860962.0, "reward": 0.49609375, "reward_std": 0.2947755679488182, "rewards/reward_model/mean": 0.49609375, "rewards/reward_model/std": 0.2947755819186568, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3215.875, "completions/max_terminated_length": 3215.875, "completions/mean_length": 1378.390625, "completions/mean_terminated_length": 1851.4571895599365, "completions/min_length": 0.0, "completions/min_terminated_length": 944.0, "epoch": 0.21167991311645357, "grad_norm": 0.2647843109530461, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 113599642.0, "reward": 0.45703125, "reward_std": 0.32947898283600807, "rewards/reward_model/mean": 0.45703125, "rewards/reward_model/std": 0.3294789995998144, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 3656.875, "completions/max_terminated_length": 3656.875, "completions/mean_length": 1539.095703125, "completions/mean_terminated_length": 2056.8378944396973, "completions/min_length": 0.0, "completions/min_terminated_length": 1031.96875, "epoch": 0.2124697635385299, "grad_norm": 0.2610783051135598, "learning_rate": 1e-06, "loss": -0.0231, "num_tokens": 114420939.0, "reward": 0.416015625, "reward_std": 0.27093155309557915, "rewards/reward_model/mean": 0.416015625, "rewards/reward_model/std": 0.2709315614774823, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 3471.28125, "completions/max_terminated_length": 3471.28125, "completions/mean_length": 1451.01953125, "completions/mean_terminated_length": 1969.757480621338, "completions/min_length": 0.0, "completions/min_terminated_length": 847.375, "epoch": 0.21325961396060622, "grad_norm": 0.3047833224155384, "learning_rate": 1e-06, "loss": -0.0163, "num_tokens": 115195045.0, "reward": 0.494140625, "reward_std": 0.3730160482227802, "rewards/reward_model/mean": 0.494140625, "rewards/reward_model/std": 0.373016064055264, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3513.4375, "completions/max_terminated_length": 3513.4375, "completions/mean_length": 1571.177734375, "completions/mean_terminated_length": 2094.903715133667, "completions/min_length": 0.0, "completions/min_terminated_length": 1026.84375, "epoch": 0.21404946438268252, "grad_norm": 0.21531175878810543, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 116031936.0, "reward": 0.40234375, "reward_std": 0.2819677069783211, "rewards/reward_model/mean": 0.40234375, "rewards/reward_model/std": 0.28196771536022425, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 3680.0, "completions/max_terminated_length": 3680.0, "completions/mean_length": 1597.91796875, "completions/mean_terminated_length": 2148.3401622772217, "completions/min_length": 0.0, "completions/min_terminated_length": 1009.4375, "epoch": 0.21483931480475885, "grad_norm": 0.2439959181457894, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 116881862.0, "reward": 0.443359375, "reward_std": 0.2679084148257971, "rewards/reward_model/mean": 0.443359375, "rewards/reward_model/std": 0.2679084250703454, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2912.59375, "completions/max_terminated_length": 2912.59375, "completions/mean_length": 1250.9609375, "completions/mean_terminated_length": 1667.9479751586914, "completions/min_length": 0.0, "completions/min_terminated_length": 925.84375, "epoch": 0.21562916522683517, "grad_norm": 0.24898759248880245, "learning_rate": 1e-06, "loss": -0.0371, "num_tokens": 117553666.0, "reward": 0.666015625, "reward_std": 0.28103455156087875, "rewards/reward_model/mean": 0.666015625, "rewards/reward_model/std": 0.28103456925600767, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 4176.90625, "completions/max_terminated_length": 4176.90625, "completions/mean_length": 1703.861328125, "completions/mean_terminated_length": 2323.725841522217, "completions/min_length": 0.0, "completions/min_terminated_length": 1065.84375, "epoch": 0.2164190156489115, "grad_norm": 0.2496264386653868, "learning_rate": 1e-06, "loss": -0.0196, "num_tokens": 118461499.0, "reward": 0.32421875, "reward_std": 0.24254059232771397, "rewards/reward_model/mean": 0.32421875, "rewards/reward_model/std": 0.24254059977829456, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 4180.25, "completions/max_terminated_length": 4180.25, "completions/mean_length": 1673.841796875, "completions/mean_terminated_length": 2242.9465761184692, "completions/min_length": 0.0, "completions/min_terminated_length": 863.1875, "epoch": 0.2172088660709878, "grad_norm": 0.29854877770695404, "learning_rate": 1e-06, "loss": -0.0124, "num_tokens": 119350874.0, "reward": 0.306640625, "reward_std": 0.32521410286426544, "rewards/reward_model/mean": 0.306640625, "rewards/reward_model/std": 0.3252141121774912, "step": 275 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 119350874, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }