| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.018465515649524512, | |
| "eval_steps": 10000, | |
| "global_step": 50, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00036931031299049027, | |
| "grad_norm": 0.05093964371235935, | |
| "learning_rate": 0.0, | |
| "loss": 0.0176, | |
| "reward/mean": 0.4305254817008972, | |
| "reward/std": 0.023368891328573227, | |
| "rewards/correct_answer_reward_func/mean": 0.8723958134651184, | |
| "rewards/correct_answer_reward_func/std": 0.3338659703731537, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.939539909362793, | |
| "rewards/correct_extract_func/std": 0.2306855320930481, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4062052965164185, | |
| "rewards/format_reward_func/std": 0.023711344227194786, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0007386206259809805, | |
| "grad_norm": 0.05093885614710434, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0176, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0011079309389714707, | |
| "grad_norm": 0.049709599363636135, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0177, | |
| "reward/mean": 0.43362969160079956, | |
| "reward/std": 0.024930372834205627, | |
| "rewards/correct_answer_reward_func/mean": 0.8841145634651184, | |
| "rewards/correct_answer_reward_func/std": 0.3202960789203644, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9216580390930176, | |
| "rewards/correct_extract_func/std": 0.25809445977211, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4074559211730957, | |
| "rewards/format_reward_func/std": 0.018833689391613007, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.001477241251961961, | |
| "grad_norm": 0.04919887971444478, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0177, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0018465515649524512, | |
| "grad_norm": 0.04609056285217934, | |
| "learning_rate": 4e-07, | |
| "loss": 0.017, | |
| "reward/mean": 0.43184012174606323, | |
| "reward/std": 0.016295205801725388, | |
| "rewards/correct_answer_reward_func/mean": 0.87890625, | |
| "rewards/correct_answer_reward_func/std": 0.32644879817962646, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9188368320465088, | |
| "rewards/correct_extract_func/std": 0.2631855309009552, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4060311317443848, | |
| "rewards/format_reward_func/std": 0.019542310386896133, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0022158618779429414, | |
| "grad_norm": 0.04790138249587581, | |
| "learning_rate": 5e-07, | |
| "loss": 0.017, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.002585172190933432, | |
| "grad_norm": 0.07345664511629879, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0171, | |
| "reward/mean": 0.4144955277442932, | |
| "reward/std": 0.025130389258265495, | |
| "rewards/correct_answer_reward_func/mean": 0.8255208134651184, | |
| "rewards/correct_answer_reward_func/std": 0.37976834177970886, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.880946159362793, | |
| "rewards/correct_extract_func/std": 0.31111887097358704, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4041085243225098, | |
| "rewards/format_reward_func/std": 0.03701591119170189, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.002954482503923922, | |
| "grad_norm": 0.07610320387658168, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0171, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0033237928169144123, | |
| "grad_norm": 0.05112137950522487, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0181, | |
| "reward/mean": 0.4387373626232147, | |
| "reward/std": 0.019346633926033974, | |
| "rewards/correct_answer_reward_func/mean": 0.9036458134651184, | |
| "rewards/correct_answer_reward_func/std": 0.2952686548233032, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9055989384651184, | |
| "rewards/correct_extract_func/std": 0.28282052278518677, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4047561883926392, | |
| "rewards/format_reward_func/std": 0.06315362453460693, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0036931031299049025, | |
| "grad_norm": 0.05094809364589051, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0181, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.004062413442895393, | |
| "grad_norm": 0.05275996751985643, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "reward/mean": 0.43490076065063477, | |
| "reward/std": 0.02377907559275627, | |
| "rewards/correct_answer_reward_func/mean": 0.8893229365348816, | |
| "rewards/correct_answer_reward_func/std": 0.3139362931251526, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9134114384651184, | |
| "rewards/correct_extract_func/std": 0.2700866460800171, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.407015323638916, | |
| "rewards/format_reward_func/std": 0.01986781507730484, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.004431723755885883, | |
| "grad_norm": 0.051170250158850315, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.004801034068876373, | |
| "grad_norm": 0.05767247415587884, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0181, | |
| "reward/mean": 0.4389788508415222, | |
| "reward/std": 0.02013307623565197, | |
| "rewards/correct_answer_reward_func/mean": 0.9049479365348816, | |
| "rewards/correct_answer_reward_func/std": 0.29347798228263855, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.8903212547302246, | |
| "rewards/correct_extract_func/std": 0.2998242974281311, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4078478813171387, | |
| "rewards/format_reward_func/std": 0.014290675520896912, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.005170344381866864, | |
| "grad_norm": 0.05451251011427531, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0181, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.005539654694857354, | |
| "grad_norm": 0.0548131395951059, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0181, | |
| "reward/mean": 0.4339887797832489, | |
| "reward/std": 0.018782436847686768, | |
| "rewards/correct_answer_reward_func/mean": 0.88671875, | |
| "rewards/correct_answer_reward_func/std": 0.3171428442001343, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9032118320465088, | |
| "rewards/correct_extract_func/std": 0.2824605405330658, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4087677001953125, | |
| "rewards/format_reward_func/std": 0.012980460189282894, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.005908965007847844, | |
| "grad_norm": 0.05240983246508732, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0181, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.006278275320838334, | |
| "grad_norm": 0.05134459409849994, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0182, | |
| "reward/mean": 0.43516525626182556, | |
| "reward/std": 0.02198929898440838, | |
| "rewards/correct_answer_reward_func/mean": 0.89453125, | |
| "rewards/correct_answer_reward_func/std": 0.3073566257953644, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.8657768368721008, | |
| "rewards/correct_extract_func/std": 0.3262862265110016, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4083256721496582, | |
| "rewards/format_reward_func/std": 0.02241017296910286, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.006647585633828825, | |
| "grad_norm": 0.050341338341445184, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0182, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.007016895946819315, | |
| "grad_norm": 0.0432942729059209, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "reward/mean": 0.43305787444114685, | |
| "reward/std": 0.017742186784744263, | |
| "rewards/correct_answer_reward_func/mean": 0.8854166865348816, | |
| "rewards/correct_answer_reward_func/std": 0.3187260329723358, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.8930990099906921, | |
| "rewards/correct_extract_func/std": 0.29494285583496094, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4063993692398071, | |
| "rewards/format_reward_func/std": 0.03480615094304085, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.007386206259809805, | |
| "grad_norm": 0.043736628776102855, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0077555165728002955, | |
| "grad_norm": 0.05359119220833686, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0171, | |
| "reward/mean": 0.4271976351737976, | |
| "reward/std": 0.02278582751750946, | |
| "rewards/correct_answer_reward_func/mean": 0.8619791865348816, | |
| "rewards/correct_answer_reward_func/std": 0.34514662623405457, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9271919131278992, | |
| "rewards/correct_extract_func/std": 0.2506465017795563, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4078807830810547, | |
| "rewards/format_reward_func/std": 0.011555412784218788, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.008124826885790786, | |
| "grad_norm": 0.053384876112853016, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0171, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.008494137198781277, | |
| "grad_norm": 0.05457105190222447, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "reward/mean": 0.4344092011451721, | |
| "reward/std": 0.018806444481015205, | |
| "rewards/correct_answer_reward_func/mean": 0.8893229365348816, | |
| "rewards/correct_answer_reward_func/std": 0.3139362931251526, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.8963108062744141, | |
| "rewards/correct_extract_func/std": 0.29719239473342896, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4072299003601074, | |
| "rewards/format_reward_func/std": 0.013067901134490967, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.008863447511771766, | |
| "grad_norm": 0.051790764460388904, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.009232757824762256, | |
| "grad_norm": 0.06455557806308003, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0177, | |
| "reward/mean": 0.44281578063964844, | |
| "reward/std": 0.016280503943562508, | |
| "rewards/correct_answer_reward_func/mean": 0.9153645634651184, | |
| "rewards/correct_answer_reward_func/std": 0.27851977944374084, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9203993678092957, | |
| "rewards/correct_extract_func/std": 0.25632622838020325, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4059442281723022, | |
| "rewards/format_reward_func/std": 0.02205752208828926, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.009602068137752747, | |
| "grad_norm": 0.054460571323261056, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0177, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.009971378450743237, | |
| "grad_norm": 0.045474497731843734, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "reward/mean": 0.4445436894893646, | |
| "reward/std": 0.014738515019416809, | |
| "rewards/correct_answer_reward_func/mean": 0.9192708134651184, | |
| "rewards/correct_answer_reward_func/std": 0.27259624004364014, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9342448115348816, | |
| "rewards/correct_extract_func/std": 0.23349910974502563, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4073508977890015, | |
| "rewards/format_reward_func/std": 0.014469039626419544, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.010340688763733728, | |
| "grad_norm": 0.04518446989067636, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.010709999076724217, | |
| "grad_norm": 0.05758722247738054, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0183, | |
| "reward/mean": 0.44419676065444946, | |
| "reward/std": 0.02353046089410782, | |
| "rewards/correct_answer_reward_func/mean": 0.9192708134651184, | |
| "rewards/correct_answer_reward_func/std": 0.27259624004364014, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.923828125, | |
| "rewards/correct_extract_func/std": 0.25748127698898315, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4070063829421997, | |
| "rewards/format_reward_func/std": 0.013799347914755344, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.011079309389714707, | |
| "grad_norm": 0.04951275155135316, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0183, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.011448619702705198, | |
| "grad_norm": 0.047494845238651454, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0173, | |
| "reward/mean": 0.44647669792175293, | |
| "reward/std": 0.014447808265686035, | |
| "rewards/correct_answer_reward_func/mean": 0.92578125, | |
| "rewards/correct_answer_reward_func/std": 0.2622973620891571, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9365885257720947, | |
| "rewards/correct_extract_func/std": 0.24006153643131256, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4064464569091797, | |
| "rewards/format_reward_func/std": 0.02209184132516384, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.011817930015695689, | |
| "grad_norm": 0.046822948788372606, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0173, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.01218724032868618, | |
| "grad_norm": 0.04380397393536513, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0174, | |
| "reward/mean": 0.4473347067832947, | |
| "reward/std": 0.01410503126680851, | |
| "rewards/correct_answer_reward_func/mean": 0.9309895634651184, | |
| "rewards/correct_answer_reward_func/std": 0.25363701581954956, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.911241352558136, | |
| "rewards/correct_extract_func/std": 0.28002622723579407, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4070053100585938, | |
| "rewards/format_reward_func/std": 0.007169181946665049, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.012556550641676668, | |
| "grad_norm": 0.04264955092023057, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0174, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.012925860954667159, | |
| "grad_norm": 0.04313995996563885, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0172, | |
| "reward/mean": 0.4411402642726898, | |
| "reward/std": 0.00797030795365572, | |
| "rewards/correct_answer_reward_func/mean": 0.9088541865348816, | |
| "rewards/correct_answer_reward_func/std": 0.28800395131111145, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9279513359069824, | |
| "rewards/correct_extract_func/std": 0.25393322110176086, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4064545631408691, | |
| "rewards/format_reward_func/std": 0.017624543979763985, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.01329517126765765, | |
| "grad_norm": 0.04127535963471486, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0172, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.01366448158064814, | |
| "grad_norm": 0.060380213829128886, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "reward/mean": 0.4418516159057617, | |
| "reward/std": 0.016873031854629517, | |
| "rewards/correct_answer_reward_func/mean": 0.9075520634651184, | |
| "rewards/correct_answer_reward_func/std": 0.2898460030555725, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.961718738079071, | |
| "rewards/correct_extract_func/std": 0.18086452782154083, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4073443412780762, | |
| "rewards/format_reward_func/std": 0.013318442739546299, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.01403379189363863, | |
| "grad_norm": 0.05602980229868504, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.01440310220662912, | |
| "grad_norm": 0.05184766134610364, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0177, | |
| "reward/mean": 0.43869584798812866, | |
| "reward/std": 0.019008934497833252, | |
| "rewards/correct_answer_reward_func/mean": 0.9036458134651184, | |
| "rewards/correct_answer_reward_func/std": 0.2952686548233032, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9003472328186035, | |
| "rewards/correct_extract_func/std": 0.2904185354709625, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.405916690826416, | |
| "rewards/format_reward_func/std": 0.0255013108253479, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.01477241251961961, | |
| "grad_norm": 0.05148310080801445, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0151417228326101, | |
| "grad_norm": 0.05581834264976948, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "reward/mean": 0.4319329261779785, | |
| "reward/std": 0.020871102809906006, | |
| "rewards/correct_answer_reward_func/mean": 0.8802083134651184, | |
| "rewards/correct_answer_reward_func/std": 0.3249293863773346, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.907118022441864, | |
| "rewards/correct_extract_func/std": 0.2817213237285614, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.406569004058838, | |
| "rewards/format_reward_func/std": 0.02022281102836132, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.015511033145600591, | |
| "grad_norm": 0.05268407384500521, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.01588034345859108, | |
| "grad_norm": 0.09367675201948025, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "reward/mean": 0.4355073869228363, | |
| "reward/std": 0.01918705925345421, | |
| "rewards/correct_answer_reward_func/mean": 0.8919270634651184, | |
| "rewards/correct_answer_reward_func/std": 0.3106748163700104, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9130207896232605, | |
| "rewards/correct_extract_func/std": 0.27445390820503235, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.405385971069336, | |
| "rewards/format_reward_func/std": 0.026785731315612793, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.016249653771581572, | |
| "grad_norm": 0.05227366508331771, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.01661896408457206, | |
| "grad_norm": 0.04497808527453425, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0179, | |
| "reward/mean": 0.4444352984428406, | |
| "reward/std": 0.01481956522911787, | |
| "rewards/correct_answer_reward_func/mean": 0.9192708134651184, | |
| "rewards/correct_answer_reward_func/std": 0.27259624004364014, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9329426884651184, | |
| "rewards/correct_extract_func/std": 0.24013008177280426, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.406657338142395, | |
| "rewards/format_reward_func/std": 0.015218171291053295, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.016988274397562553, | |
| "grad_norm": 0.055837167119088496, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0179, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.017357584710553042, | |
| "grad_norm": 0.05873478813480533, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0178, | |
| "reward/mean": 0.44062528014183044, | |
| "reward/std": 0.01832752674818039, | |
| "rewards/correct_answer_reward_func/mean": 0.9088541865348816, | |
| "rewards/correct_answer_reward_func/std": 0.28800395131111145, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9090712070465088, | |
| "rewards/correct_extract_func/std": 0.27737653255462646, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.4069687128067017, | |
| "rewards/format_reward_func/std": 0.02851445972919464, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.01772689502354353, | |
| "grad_norm": 0.05518986438955003, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0178, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.018096205336534023, | |
| "grad_norm": 0.04474697485013119, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0182, | |
| "reward/mean": 0.44659337401390076, | |
| "reward/std": 0.013036997988820076, | |
| "rewards/correct_answer_reward_func/mean": 0.9296875, | |
| "rewards/correct_answer_reward_func/std": 0.2558395564556122, | |
| "rewards/correct_crop_func/mean": 0.0, | |
| "rewards/correct_crop_func/std": 0.0, | |
| "rewards/correct_extract_func/mean": 0.9007161259651184, | |
| "rewards/correct_extract_func/std": 0.2947409451007843, | |
| "rewards/correct_find_color/mean": 0.0, | |
| "rewards/correct_find_color/std": 0.0, | |
| "rewards/format_reward_func/mean": 1.406656265258789, | |
| "rewards/format_reward_func/std": 0.00936658214777708, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.018465515649524512, | |
| "grad_norm": 0.043438923657574534, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0181, | |
| "step": 50 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 2708, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |