{ "best_metric": 0.7502281069755554, "best_model_checkpoint": "./checkpoints/llava-v1.6-34b-chatml_direct-anyres/checkpoint-115", "epoch": 3.75, "eval_steps": 1.0, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 0.3345325833952841, "learning_rate": 0.0, "loss": 1.2677, "step": 1 }, { "epoch": 0.03125, "eval_loss": 1.3042412996292114, "eval_runtime": 195.8656, "eval_samples_per_second": 1.021, "eval_steps_per_second": 0.128, "step": 1 }, { "epoch": 0.0625, "grad_norm": 0.35890614982474045, "learning_rate": 8.613531161467863e-06, "loss": 1.3505, "step": 2 }, { "epoch": 0.0625, "eval_loss": 1.3042412996292114, "eval_runtime": 190.4145, "eval_samples_per_second": 1.05, "eval_steps_per_second": 0.131, "step": 2 }, { "epoch": 0.09375, "grad_norm": 0.3316701046946783, "learning_rate": 1.3652123889719709e-05, "loss": 1.2569, "step": 3 }, { "epoch": 0.09375, "eval_loss": 1.2942205667495728, "eval_runtime": 190.6475, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.131, "step": 3 }, { "epoch": 0.125, "grad_norm": 0.32188096642751235, "learning_rate": 1.7227062322935725e-05, "loss": 1.2323, "step": 4 }, { "epoch": 0.125, "eval_loss": 1.2789555788040161, "eval_runtime": 189.7666, "eval_samples_per_second": 1.054, "eval_steps_per_second": 0.132, "step": 4 }, { "epoch": 0.15625, "grad_norm": 0.3767527705004001, "learning_rate": 2e-05, "loss": 1.2785, "step": 5 }, { "epoch": 0.15625, "eval_loss": 1.258152723312378, "eval_runtime": 189.5935, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.132, "step": 5 }, { "epoch": 0.1875, "grad_norm": 0.3287126070774628, "learning_rate": 2e-05, "loss": 1.2151, "step": 6 }, { "epoch": 0.1875, "eval_loss": 1.2347216606140137, "eval_runtime": 190.4111, "eval_samples_per_second": 1.05, "eval_steps_per_second": 0.131, "step": 6 }, { "epoch": 0.21875, "grad_norm": 0.34451127286331007, "learning_rate": 2e-05, "loss": 1.2968, "step": 7 }, { "epoch": 0.21875, "eval_loss": 1.210167646408081, "eval_runtime": 190.9799, "eval_samples_per_second": 1.047, "eval_steps_per_second": 0.131, "step": 7 }, { "epoch": 0.25, "grad_norm": 0.36105870692958336, "learning_rate": 2e-05, "loss": 1.2277, "step": 8 }, { "epoch": 0.25, "eval_loss": 1.1862907409667969, "eval_runtime": 190.685, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.131, "step": 8 }, { "epoch": 0.28125, "grad_norm": 0.35460549637546845, "learning_rate": 2e-05, "loss": 1.2101, "step": 9 }, { "epoch": 0.28125, "eval_loss": 1.1649302244186401, "eval_runtime": 190.1569, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.131, "step": 9 }, { "epoch": 0.3125, "grad_norm": 0.3134923556618721, "learning_rate": 2e-05, "loss": 1.1163, "step": 10 }, { "epoch": 0.3125, "eval_loss": 1.144965410232544, "eval_runtime": 190.0982, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.132, "step": 10 }, { "epoch": 0.34375, "grad_norm": 0.3069481492118633, "learning_rate": 2e-05, "loss": 1.1483, "step": 11 }, { "epoch": 0.34375, "eval_loss": 1.124668002128601, "eval_runtime": 192.0572, "eval_samples_per_second": 1.041, "eval_steps_per_second": 0.13, "step": 11 }, { "epoch": 0.375, "grad_norm": 0.2801324709168811, "learning_rate": 2e-05, "loss": 1.1172, "step": 12 }, { "epoch": 0.375, "eval_loss": 1.1061824560165405, "eval_runtime": 192.6406, "eval_samples_per_second": 1.038, "eval_steps_per_second": 0.13, "step": 12 }, { "epoch": 0.40625, "grad_norm": 0.33156251919932406, "learning_rate": 2e-05, "loss": 1.1902, "step": 13 }, { "epoch": 0.40625, "eval_loss": 1.0897018909454346, "eval_runtime": 192.8064, "eval_samples_per_second": 1.037, "eval_steps_per_second": 0.13, "step": 13 }, { "epoch": 0.4375, "grad_norm": 0.3307149375898363, "learning_rate": 2e-05, "loss": 1.1014, "step": 14 }, { "epoch": 0.4375, "eval_loss": 1.075058937072754, "eval_runtime": 192.1353, "eval_samples_per_second": 1.041, "eval_steps_per_second": 0.13, "step": 14 }, { "epoch": 0.46875, "grad_norm": 0.31999611930431227, "learning_rate": 2e-05, "loss": 1.0847, "step": 15 }, { "epoch": 0.46875, "eval_loss": 1.0613062381744385, "eval_runtime": 192.0586, "eval_samples_per_second": 1.041, "eval_steps_per_second": 0.13, "step": 15 }, { "epoch": 0.5, "grad_norm": 0.2494159223446848, "learning_rate": 2e-05, "loss": 1.0428, "step": 16 }, { "epoch": 0.5, "eval_loss": 1.0484405755996704, "eval_runtime": 192.52, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 16 }, { "epoch": 0.53125, "grad_norm": 0.2899303168196212, "learning_rate": 2e-05, "loss": 1.122, "step": 17 }, { "epoch": 0.53125, "eval_loss": 1.036120891571045, "eval_runtime": 192.5716, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 17 }, { "epoch": 0.5625, "grad_norm": 0.2995776829874209, "learning_rate": 2e-05, "loss": 1.0425, "step": 18 }, { "epoch": 0.5625, "eval_loss": 1.0226774215698242, "eval_runtime": 192.5256, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 18 }, { "epoch": 0.59375, "grad_norm": 0.28709859243892955, "learning_rate": 2e-05, "loss": 1.0098, "step": 19 }, { "epoch": 0.59375, "eval_loss": 1.0081837177276611, "eval_runtime": 192.3486, "eval_samples_per_second": 1.04, "eval_steps_per_second": 0.13, "step": 19 }, { "epoch": 0.625, "grad_norm": 0.27612474678791227, "learning_rate": 2e-05, "loss": 1.0563, "step": 20 }, { "epoch": 0.625, "eval_loss": 0.994163990020752, "eval_runtime": 191.9782, "eval_samples_per_second": 1.042, "eval_steps_per_second": 0.13, "step": 20 }, { "epoch": 0.65625, "grad_norm": 0.24260720679126926, "learning_rate": 2e-05, "loss": 1.0355, "step": 21 }, { "epoch": 0.65625, "eval_loss": 0.9819543361663818, "eval_runtime": 191.9306, "eval_samples_per_second": 1.042, "eval_steps_per_second": 0.13, "step": 21 }, { "epoch": 0.6875, "grad_norm": 0.25336536603884946, "learning_rate": 2e-05, "loss": 1.0525, "step": 22 }, { "epoch": 0.6875, "eval_loss": 0.9709838032722473, "eval_runtime": 192.9913, "eval_samples_per_second": 1.036, "eval_steps_per_second": 0.13, "step": 22 }, { "epoch": 0.71875, "grad_norm": 0.24820839136364292, "learning_rate": 2e-05, "loss": 1.1392, "step": 23 }, { "epoch": 0.71875, "eval_loss": 0.9616628885269165, "eval_runtime": 192.6673, "eval_samples_per_second": 1.038, "eval_steps_per_second": 0.13, "step": 23 }, { "epoch": 0.75, "grad_norm": 0.24589291203527217, "learning_rate": 2e-05, "loss": 1.058, "step": 24 }, { "epoch": 0.75, "eval_loss": 0.9531083703041077, "eval_runtime": 193.0994, "eval_samples_per_second": 1.036, "eval_steps_per_second": 0.129, "step": 24 }, { "epoch": 0.78125, "grad_norm": 0.249532642718915, "learning_rate": 2e-05, "loss": 0.938, "step": 25 }, { "epoch": 0.78125, "eval_loss": 0.9455437660217285, "eval_runtime": 191.9941, "eval_samples_per_second": 1.042, "eval_steps_per_second": 0.13, "step": 25 }, { "epoch": 0.8125, "grad_norm": 0.28034242585086017, "learning_rate": 2e-05, "loss": 0.9387, "step": 26 }, { "epoch": 0.8125, "eval_loss": 0.93752121925354, "eval_runtime": 195.4083, "eval_samples_per_second": 1.023, "eval_steps_per_second": 0.128, "step": 26 }, { "epoch": 0.84375, "grad_norm": 0.2692565070546352, "learning_rate": 2e-05, "loss": 1.0474, "step": 27 }, { "epoch": 0.84375, "eval_loss": 0.9300512075424194, "eval_runtime": 195.3651, "eval_samples_per_second": 1.024, "eval_steps_per_second": 0.128, "step": 27 }, { "epoch": 0.875, "grad_norm": 0.24705041646949316, "learning_rate": 2e-05, "loss": 0.9596, "step": 28 }, { "epoch": 0.875, "eval_loss": 0.9226720929145813, "eval_runtime": 195.4848, "eval_samples_per_second": 1.023, "eval_steps_per_second": 0.128, "step": 28 }, { "epoch": 0.90625, "grad_norm": 0.24799871352606165, "learning_rate": 2e-05, "loss": 1.0172, "step": 29 }, { "epoch": 0.90625, "eval_loss": 0.9159422516822815, "eval_runtime": 196.157, "eval_samples_per_second": 1.02, "eval_steps_per_second": 0.127, "step": 29 }, { "epoch": 0.9375, "grad_norm": 0.29755264040904106, "learning_rate": 2e-05, "loss": 0.9324, "step": 30 }, { "epoch": 0.9375, "eval_loss": 0.9090733528137207, "eval_runtime": 196.5295, "eval_samples_per_second": 1.018, "eval_steps_per_second": 0.127, "step": 30 }, { "epoch": 0.96875, "grad_norm": 0.2629221961008751, "learning_rate": 2e-05, "loss": 0.9265, "step": 31 }, { "epoch": 0.96875, "eval_loss": 0.9027940630912781, "eval_runtime": 196.281, "eval_samples_per_second": 1.019, "eval_steps_per_second": 0.127, "step": 31 }, { "epoch": 1.0, "grad_norm": 0.2901110704218056, "learning_rate": 2e-05, "loss": 0.9933, "step": 32 }, { "epoch": 1.0, "eval_loss": 0.8970211148262024, "eval_runtime": 190.2702, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 32 }, { "epoch": 1.03125, "grad_norm": 0.27746608883483487, "learning_rate": 2e-05, "loss": 0.9339, "step": 33 }, { "epoch": 1.03125, "eval_loss": 0.8916085958480835, "eval_runtime": 189.4543, "eval_samples_per_second": 1.056, "eval_steps_per_second": 0.132, "step": 33 }, { "epoch": 1.0625, "grad_norm": 0.26134437145600353, "learning_rate": 2e-05, "loss": 0.9438, "step": 34 }, { "epoch": 1.0625, "eval_loss": 0.8867039680480957, "eval_runtime": 189.6926, "eval_samples_per_second": 1.054, "eval_steps_per_second": 0.132, "step": 34 }, { "epoch": 1.09375, "grad_norm": 0.252882507519195, "learning_rate": 2e-05, "loss": 0.8979, "step": 35 }, { "epoch": 1.09375, "eval_loss": 0.8824067711830139, "eval_runtime": 189.9217, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.132, "step": 35 }, { "epoch": 1.125, "grad_norm": 0.25443025949474585, "learning_rate": 2e-05, "loss": 0.9411, "step": 36 }, { "epoch": 1.125, "eval_loss": 0.8788293600082397, "eval_runtime": 191.4083, "eval_samples_per_second": 1.045, "eval_steps_per_second": 0.131, "step": 36 }, { "epoch": 1.15625, "grad_norm": 0.2559343621244427, "learning_rate": 2e-05, "loss": 0.9827, "step": 37 }, { "epoch": 1.15625, "eval_loss": 0.8760793805122375, "eval_runtime": 191.2732, "eval_samples_per_second": 1.046, "eval_steps_per_second": 0.131, "step": 37 }, { "epoch": 1.1875, "grad_norm": 0.25403189851366254, "learning_rate": 2e-05, "loss": 0.8658, "step": 38 }, { "epoch": 1.1875, "eval_loss": 0.8727380633354187, "eval_runtime": 190.4281, "eval_samples_per_second": 1.05, "eval_steps_per_second": 0.131, "step": 38 }, { "epoch": 1.21875, "grad_norm": 0.2493777578005398, "learning_rate": 2e-05, "loss": 1.0053, "step": 39 }, { "epoch": 1.21875, "eval_loss": 0.869698703289032, "eval_runtime": 190.3431, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 39 }, { "epoch": 1.25, "grad_norm": 0.24823573574563138, "learning_rate": 2e-05, "loss": 0.8967, "step": 40 }, { "epoch": 1.25, "eval_loss": 0.8664910793304443, "eval_runtime": 189.9802, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.132, "step": 40 }, { "epoch": 1.28125, "grad_norm": 0.25462243237743476, "learning_rate": 2e-05, "loss": 1.0064, "step": 41 }, { "epoch": 1.28125, "eval_loss": 0.8638657927513123, "eval_runtime": 195.3373, "eval_samples_per_second": 1.024, "eval_steps_per_second": 0.128, "step": 41 }, { "epoch": 1.3125, "grad_norm": 0.2604089386111215, "learning_rate": 2e-05, "loss": 0.9898, "step": 42 }, { "epoch": 1.3125, "eval_loss": 0.8607734441757202, "eval_runtime": 195.219, "eval_samples_per_second": 1.024, "eval_steps_per_second": 0.128, "step": 42 }, { "epoch": 1.34375, "grad_norm": 0.27139202440805793, "learning_rate": 2e-05, "loss": 1.0539, "step": 43 }, { "epoch": 1.34375, "eval_loss": 0.8573687672615051, "eval_runtime": 195.8828, "eval_samples_per_second": 1.021, "eval_steps_per_second": 0.128, "step": 43 }, { "epoch": 1.375, "grad_norm": 0.27474433057157854, "learning_rate": 2e-05, "loss": 0.86, "step": 44 }, { "epoch": 1.375, "eval_loss": 0.8537396192550659, "eval_runtime": 194.9741, "eval_samples_per_second": 1.026, "eval_steps_per_second": 0.128, "step": 44 }, { "epoch": 1.40625, "grad_norm": 0.2537208760747199, "learning_rate": 2e-05, "loss": 0.9562, "step": 45 }, { "epoch": 1.40625, "eval_loss": 0.8497809767723083, "eval_runtime": 194.9162, "eval_samples_per_second": 1.026, "eval_steps_per_second": 0.128, "step": 45 }, { "epoch": 1.4375, "grad_norm": 0.27560461131090846, "learning_rate": 2e-05, "loss": 0.8767, "step": 46 }, { "epoch": 1.4375, "eval_loss": 0.8458660244941711, "eval_runtime": 195.1157, "eval_samples_per_second": 1.025, "eval_steps_per_second": 0.128, "step": 46 }, { "epoch": 1.46875, "grad_norm": 0.2594536794112662, "learning_rate": 2e-05, "loss": 0.9256, "step": 47 }, { "epoch": 1.46875, "eval_loss": 0.8429936766624451, "eval_runtime": 195.048, "eval_samples_per_second": 1.025, "eval_steps_per_second": 0.128, "step": 47 }, { "epoch": 1.5, "grad_norm": 0.28583207453838866, "learning_rate": 2e-05, "loss": 0.9858, "step": 48 }, { "epoch": 1.5, "eval_loss": 0.84013831615448, "eval_runtime": 194.3046, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 48 }, { "epoch": 1.53125, "grad_norm": 0.28118976636788506, "learning_rate": 2e-05, "loss": 0.9158, "step": 49 }, { "epoch": 1.53125, "eval_loss": 0.8369531035423279, "eval_runtime": 194.521, "eval_samples_per_second": 1.028, "eval_steps_per_second": 0.129, "step": 49 }, { "epoch": 1.5625, "grad_norm": 0.29276573776696546, "learning_rate": 2e-05, "loss": 0.8745, "step": 50 }, { "epoch": 1.5625, "eval_loss": 0.8341982960700989, "eval_runtime": 194.1114, "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.129, "step": 50 }, { "epoch": 1.59375, "grad_norm": 0.2860638141439372, "learning_rate": 2e-05, "loss": 0.854, "step": 51 }, { "epoch": 1.59375, "eval_loss": 0.8317239284515381, "eval_runtime": 198.1029, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.126, "step": 51 }, { "epoch": 1.625, "grad_norm": 0.29960349722496704, "learning_rate": 2e-05, "loss": 0.8399, "step": 52 }, { "epoch": 1.625, "eval_loss": 0.8290513753890991, "eval_runtime": 198.0764, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.126, "step": 52 }, { "epoch": 1.65625, "grad_norm": 0.2964234305808419, "learning_rate": 2e-05, "loss": 0.9694, "step": 53 }, { "epoch": 1.65625, "eval_loss": 0.8267760276794434, "eval_runtime": 197.8284, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.126, "step": 53 }, { "epoch": 1.6875, "grad_norm": 0.26183932644077784, "learning_rate": 2e-05, "loss": 0.8153, "step": 54 }, { "epoch": 1.6875, "eval_loss": 0.824044942855835, "eval_runtime": 198.1694, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.126, "step": 54 }, { "epoch": 1.71875, "grad_norm": 0.3067024314453144, "learning_rate": 2e-05, "loss": 0.883, "step": 55 }, { "epoch": 1.71875, "eval_loss": 0.8216392397880554, "eval_runtime": 198.0249, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.126, "step": 55 }, { "epoch": 1.75, "grad_norm": 0.27888658705355013, "learning_rate": 2e-05, "loss": 0.8771, "step": 56 }, { "epoch": 1.75, "eval_loss": 0.8194215297698975, "eval_runtime": 195.4688, "eval_samples_per_second": 1.023, "eval_steps_per_second": 0.128, "step": 56 }, { "epoch": 1.78125, "grad_norm": 0.32571765544245934, "learning_rate": 2e-05, "loss": 0.897, "step": 57 }, { "epoch": 1.78125, "eval_loss": 0.8167170882225037, "eval_runtime": 189.6243, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.132, "step": 57 }, { "epoch": 1.8125, "grad_norm": 0.292216058855145, "learning_rate": 2e-05, "loss": 0.9277, "step": 58 }, { "epoch": 1.8125, "eval_loss": 0.8145509958267212, "eval_runtime": 190.2429, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 58 }, { "epoch": 1.84375, "grad_norm": 0.29002612820437024, "learning_rate": 2e-05, "loss": 0.8971, "step": 59 }, { "epoch": 1.84375, "eval_loss": 0.8122230768203735, "eval_runtime": 189.9403, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.132, "step": 59 }, { "epoch": 1.875, "grad_norm": 0.2926088029288858, "learning_rate": 2e-05, "loss": 0.9225, "step": 60 }, { "epoch": 1.875, "eval_loss": 0.8100479245185852, "eval_runtime": 190.2569, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 60 }, { "epoch": 1.90625, "grad_norm": 0.30068993111077397, "learning_rate": 2e-05, "loss": 0.9134, "step": 61 }, { "epoch": 1.90625, "eval_loss": 0.808087944984436, "eval_runtime": 192.4896, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 61 }, { "epoch": 1.9375, "grad_norm": 0.3157573686768343, "learning_rate": 2e-05, "loss": 0.8965, "step": 62 }, { "epoch": 1.9375, "eval_loss": 0.8057371377944946, "eval_runtime": 190.0158, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.132, "step": 62 }, { "epoch": 1.96875, "grad_norm": 0.31215592754506605, "learning_rate": 2e-05, "loss": 0.7828, "step": 63 }, { "epoch": 1.96875, "eval_loss": 0.8031384944915771, "eval_runtime": 189.5204, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.132, "step": 63 }, { "epoch": 2.0, "grad_norm": 0.29422828766227993, "learning_rate": 2e-05, "loss": 0.8196, "step": 64 }, { "epoch": 2.0, "eval_loss": 0.8012601733207703, "eval_runtime": 189.7041, "eval_samples_per_second": 1.054, "eval_steps_per_second": 0.132, "step": 64 }, { "epoch": 2.03125, "grad_norm": 0.2885449518895793, "learning_rate": 2e-05, "loss": 0.9715, "step": 65 }, { "epoch": 2.03125, "eval_loss": 0.8001161813735962, "eval_runtime": 189.57, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.132, "step": 65 }, { "epoch": 2.0625, "grad_norm": 0.30260184063348483, "learning_rate": 2e-05, "loss": 0.7912, "step": 66 }, { "epoch": 2.0625, "eval_loss": 0.7989436388015747, "eval_runtime": 193.0193, "eval_samples_per_second": 1.036, "eval_steps_per_second": 0.13, "step": 66 }, { "epoch": 2.09375, "grad_norm": 0.32650294605024255, "learning_rate": 2e-05, "loss": 0.8176, "step": 67 }, { "epoch": 2.09375, "eval_loss": 0.7972333431243896, "eval_runtime": 193.2225, "eval_samples_per_second": 1.035, "eval_steps_per_second": 0.129, "step": 67 }, { "epoch": 2.125, "grad_norm": 0.3382679480741134, "learning_rate": 2e-05, "loss": 0.8141, "step": 68 }, { "epoch": 2.125, "eval_loss": 0.7950598001480103, "eval_runtime": 193.2781, "eval_samples_per_second": 1.035, "eval_steps_per_second": 0.129, "step": 68 }, { "epoch": 2.15625, "grad_norm": 0.3094090784935889, "learning_rate": 2e-05, "loss": 0.796, "step": 69 }, { "epoch": 2.15625, "eval_loss": 0.7932476997375488, "eval_runtime": 193.0282, "eval_samples_per_second": 1.036, "eval_steps_per_second": 0.13, "step": 69 }, { "epoch": 2.1875, "grad_norm": 0.30209558834780514, "learning_rate": 2e-05, "loss": 0.927, "step": 70 }, { "epoch": 2.1875, "eval_loss": 0.7922118902206421, "eval_runtime": 192.8284, "eval_samples_per_second": 1.037, "eval_steps_per_second": 0.13, "step": 70 }, { "epoch": 2.21875, "grad_norm": 0.35958652905266686, "learning_rate": 2e-05, "loss": 0.8224, "step": 71 }, { "epoch": 2.21875, "eval_loss": 0.7909810543060303, "eval_runtime": 202.3128, "eval_samples_per_second": 0.989, "eval_steps_per_second": 0.124, "step": 71 }, { "epoch": 2.25, "grad_norm": 0.356338004067507, "learning_rate": 2e-05, "loss": 0.8376, "step": 72 }, { "epoch": 2.25, "eval_loss": 0.7894656658172607, "eval_runtime": 195.1481, "eval_samples_per_second": 1.025, "eval_steps_per_second": 0.128, "step": 72 }, { "epoch": 2.28125, "grad_norm": 0.31886905989727465, "learning_rate": 2e-05, "loss": 0.8688, "step": 73 }, { "epoch": 2.28125, "eval_loss": 0.7889463901519775, "eval_runtime": 194.8418, "eval_samples_per_second": 1.026, "eval_steps_per_second": 0.128, "step": 73 }, { "epoch": 2.3125, "grad_norm": 0.35606342918056466, "learning_rate": 2e-05, "loss": 0.835, "step": 74 }, { "epoch": 2.3125, "eval_loss": 0.7875587344169617, "eval_runtime": 194.7701, "eval_samples_per_second": 1.027, "eval_steps_per_second": 0.128, "step": 74 }, { "epoch": 2.34375, "grad_norm": 0.3161858862696026, "learning_rate": 2e-05, "loss": 0.873, "step": 75 }, { "epoch": 2.34375, "eval_loss": 0.7863460779190063, "eval_runtime": 195.4811, "eval_samples_per_second": 1.023, "eval_steps_per_second": 0.128, "step": 75 }, { "epoch": 2.375, "grad_norm": 0.35771781884741477, "learning_rate": 2e-05, "loss": 0.9021, "step": 76 }, { "epoch": 2.375, "eval_loss": 0.7847577929496765, "eval_runtime": 195.3724, "eval_samples_per_second": 1.024, "eval_steps_per_second": 0.128, "step": 76 }, { "epoch": 2.40625, "grad_norm": 0.3549789155823785, "learning_rate": 2e-05, "loss": 0.9195, "step": 77 }, { "epoch": 2.40625, "eval_loss": 0.783415675163269, "eval_runtime": 190.226, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 77 }, { "epoch": 2.4375, "grad_norm": 0.34734314309709374, "learning_rate": 2e-05, "loss": 0.8386, "step": 78 }, { "epoch": 2.4375, "eval_loss": 0.7814657688140869, "eval_runtime": 190.3177, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 78 }, { "epoch": 2.46875, "grad_norm": 0.35540762574897183, "learning_rate": 2e-05, "loss": 0.851, "step": 79 }, { "epoch": 2.46875, "eval_loss": 0.7798058390617371, "eval_runtime": 190.7447, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.131, "step": 79 }, { "epoch": 2.5, "grad_norm": 0.3844458514717174, "learning_rate": 2e-05, "loss": 0.767, "step": 80 }, { "epoch": 2.5, "eval_loss": 0.7777827978134155, "eval_runtime": 190.8548, "eval_samples_per_second": 1.048, "eval_steps_per_second": 0.131, "step": 80 }, { "epoch": 2.53125, "grad_norm": 0.36232344175264375, "learning_rate": 2e-05, "loss": 0.8508, "step": 81 }, { "epoch": 2.53125, "eval_loss": 0.7763205170631409, "eval_runtime": 190.335, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 81 }, { "epoch": 2.5625, "grad_norm": 0.36279843147857743, "learning_rate": 2e-05, "loss": 0.8331, "step": 82 }, { "epoch": 2.5625, "eval_loss": 0.7757676839828491, "eval_runtime": 190.9559, "eval_samples_per_second": 1.047, "eval_steps_per_second": 0.131, "step": 82 }, { "epoch": 2.59375, "grad_norm": 0.395360566032837, "learning_rate": 2e-05, "loss": 0.847, "step": 83 }, { "epoch": 2.59375, "eval_loss": 0.7743326425552368, "eval_runtime": 190.6372, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.131, "step": 83 }, { "epoch": 2.625, "grad_norm": 0.4268568783791123, "learning_rate": 2e-05, "loss": 0.869, "step": 84 }, { "epoch": 2.625, "eval_loss": 0.772053062915802, "eval_runtime": 190.2072, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.131, "step": 84 }, { "epoch": 2.65625, "grad_norm": 0.3581495538253167, "learning_rate": 2e-05, "loss": 0.8591, "step": 85 }, { "epoch": 2.65625, "eval_loss": 0.7711917757987976, "eval_runtime": 190.4392, "eval_samples_per_second": 1.05, "eval_steps_per_second": 0.131, "step": 85 }, { "epoch": 2.6875, "grad_norm": 0.3952841797586726, "learning_rate": 2e-05, "loss": 0.8167, "step": 86 }, { "epoch": 2.6875, "eval_loss": 0.7714033722877502, "eval_runtime": 193.8038, "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.129, "step": 86 }, { "epoch": 2.71875, "grad_norm": 0.41820009905687616, "learning_rate": 2e-05, "loss": 0.8165, "step": 87 }, { "epoch": 2.71875, "eval_loss": 0.771486759185791, "eval_runtime": 194.4791, "eval_samples_per_second": 1.028, "eval_steps_per_second": 0.129, "step": 87 }, { "epoch": 2.75, "grad_norm": 0.3852566717747202, "learning_rate": 2e-05, "loss": 0.8459, "step": 88 }, { "epoch": 2.75, "eval_loss": 0.7710732817649841, "eval_runtime": 194.3404, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 88 }, { "epoch": 2.78125, "grad_norm": 0.39909292055831935, "learning_rate": 2e-05, "loss": 0.8945, "step": 89 }, { "epoch": 2.78125, "eval_loss": 0.7708308696746826, "eval_runtime": 194.4483, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 89 }, { "epoch": 2.8125, "grad_norm": 0.3916487629667217, "learning_rate": 2e-05, "loss": 0.8029, "step": 90 }, { "epoch": 2.8125, "eval_loss": 0.7713395953178406, "eval_runtime": 194.6045, "eval_samples_per_second": 1.028, "eval_steps_per_second": 0.128, "step": 90 }, { "epoch": 2.84375, "grad_norm": 0.36969072235715195, "learning_rate": 2e-05, "loss": 0.7704, "step": 91 }, { "epoch": 2.84375, "eval_loss": 0.7713618278503418, "eval_runtime": 194.3895, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 91 }, { "epoch": 2.875, "grad_norm": 0.3853248559868725, "learning_rate": 2e-05, "loss": 0.8247, "step": 92 }, { "epoch": 2.875, "eval_loss": 0.7703633308410645, "eval_runtime": 194.0457, "eval_samples_per_second": 1.031, "eval_steps_per_second": 0.129, "step": 92 }, { "epoch": 2.90625, "grad_norm": 0.38111471762069055, "learning_rate": 2e-05, "loss": 0.855, "step": 93 }, { "epoch": 2.90625, "eval_loss": 0.7690189480781555, "eval_runtime": 194.3506, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 93 }, { "epoch": 2.9375, "grad_norm": 0.3701270310997752, "learning_rate": 2e-05, "loss": 0.7518, "step": 94 }, { "epoch": 2.9375, "eval_loss": 0.7675644159317017, "eval_runtime": 194.3756, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 94 }, { "epoch": 2.96875, "grad_norm": 0.40489524752286055, "learning_rate": 2e-05, "loss": 0.8559, "step": 95 }, { "epoch": 2.96875, "eval_loss": 0.766002357006073, "eval_runtime": 193.9472, "eval_samples_per_second": 1.031, "eval_steps_per_second": 0.129, "step": 95 }, { "epoch": 3.0, "grad_norm": 0.39220887464051457, "learning_rate": 2e-05, "loss": 0.8629, "step": 96 }, { "epoch": 3.0, "eval_loss": 0.7644355893135071, "eval_runtime": 196.1686, "eval_samples_per_second": 1.02, "eval_steps_per_second": 0.127, "step": 96 }, { "epoch": 3.03125, "grad_norm": 0.3644925708419195, "learning_rate": 2e-05, "loss": 0.7434, "step": 97 }, { "epoch": 3.03125, "eval_loss": 0.7628399133682251, "eval_runtime": 196.1515, "eval_samples_per_second": 1.02, "eval_steps_per_second": 0.127, "step": 97 }, { "epoch": 3.0625, "grad_norm": 0.407089942317534, "learning_rate": 2e-05, "loss": 0.8038, "step": 98 }, { "epoch": 3.0625, "eval_loss": 0.7609645128250122, "eval_runtime": 196.8662, "eval_samples_per_second": 1.016, "eval_steps_per_second": 0.127, "step": 98 }, { "epoch": 3.09375, "grad_norm": 0.38849177572880716, "learning_rate": 2e-05, "loss": 0.8106, "step": 99 }, { "epoch": 3.09375, "eval_loss": 0.7598288059234619, "eval_runtime": 196.1846, "eval_samples_per_second": 1.019, "eval_steps_per_second": 0.127, "step": 99 }, { "epoch": 3.125, "grad_norm": 0.41885563528617265, "learning_rate": 2e-05, "loss": 0.808, "step": 100 }, { "epoch": 3.125, "eval_loss": 0.7587143778800964, "eval_runtime": 195.7296, "eval_samples_per_second": 1.022, "eval_steps_per_second": 0.128, "step": 100 }, { "epoch": 3.15625, "grad_norm": 0.4003909227323588, "learning_rate": 2e-05, "loss": 0.791, "step": 101 }, { "epoch": 3.15625, "eval_loss": 0.7578326463699341, "eval_runtime": 195.2831, "eval_samples_per_second": 1.024, "eval_steps_per_second": 0.128, "step": 101 }, { "epoch": 3.1875, "grad_norm": 0.4014550365826672, "learning_rate": 2e-05, "loss": 0.7402, "step": 102 }, { "epoch": 3.1875, "eval_loss": 0.7573958039283752, "eval_runtime": 189.5234, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.132, "step": 102 }, { "epoch": 3.21875, "grad_norm": 0.4018554316691014, "learning_rate": 2e-05, "loss": 0.8165, "step": 103 }, { "epoch": 3.21875, "eval_loss": 0.7571737766265869, "eval_runtime": 190.0146, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.132, "step": 103 }, { "epoch": 3.25, "grad_norm": 0.39691385018938347, "learning_rate": 2e-05, "loss": 0.7806, "step": 104 }, { "epoch": 3.25, "eval_loss": 0.7581367492675781, "eval_runtime": 190.1851, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.131, "step": 104 }, { "epoch": 3.28125, "grad_norm": 0.390373263306042, "learning_rate": 2e-05, "loss": 0.7454, "step": 105 }, { "epoch": 3.28125, "eval_loss": 0.7590533494949341, "eval_runtime": 190.1255, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.131, "step": 105 }, { "epoch": 3.3125, "grad_norm": 0.45093404603350434, "learning_rate": 2e-05, "loss": 0.8598, "step": 106 }, { "epoch": 3.3125, "eval_loss": 0.7584137916564941, "eval_runtime": 193.5956, "eval_samples_per_second": 1.033, "eval_steps_per_second": 0.129, "step": 106 }, { "epoch": 3.34375, "grad_norm": 0.4112664411035318, "learning_rate": 2e-05, "loss": 0.8612, "step": 107 }, { "epoch": 3.34375, "eval_loss": 0.757759690284729, "eval_runtime": 191.7864, "eval_samples_per_second": 1.043, "eval_steps_per_second": 0.13, "step": 107 }, { "epoch": 3.375, "grad_norm": 0.4158875890717671, "learning_rate": 2e-05, "loss": 0.7916, "step": 108 }, { "epoch": 3.375, "eval_loss": 0.756908655166626, "eval_runtime": 190.1833, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.131, "step": 108 }, { "epoch": 3.40625, "grad_norm": 0.4234644828447959, "learning_rate": 2e-05, "loss": 0.798, "step": 109 }, { "epoch": 3.40625, "eval_loss": 0.7559736371040344, "eval_runtime": 190.0515, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.132, "step": 109 }, { "epoch": 3.4375, "grad_norm": 0.480506693884699, "learning_rate": 2e-05, "loss": 0.7964, "step": 110 }, { "epoch": 3.4375, "eval_loss": 0.7547717094421387, "eval_runtime": 189.9579, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.132, "step": 110 }, { "epoch": 3.46875, "grad_norm": 0.4540166631930203, "learning_rate": 2e-05, "loss": 0.8172, "step": 111 }, { "epoch": 3.46875, "eval_loss": 0.7538467645645142, "eval_runtime": 194.2136, "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.129, "step": 111 }, { "epoch": 3.5, "grad_norm": 0.46782505650509537, "learning_rate": 2e-05, "loss": 0.7826, "step": 112 }, { "epoch": 3.5, "eval_loss": 0.7537503838539124, "eval_runtime": 194.396, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.129, "step": 112 }, { "epoch": 3.53125, "grad_norm": 0.5212465522615498, "learning_rate": 2e-05, "loss": 0.7715, "step": 113 }, { "epoch": 3.53125, "eval_loss": 0.7527998089790344, "eval_runtime": 193.1123, "eval_samples_per_second": 1.036, "eval_steps_per_second": 0.129, "step": 113 }, { "epoch": 3.5625, "grad_norm": 0.4869709286188453, "learning_rate": 2e-05, "loss": 0.8007, "step": 114 }, { "epoch": 3.5625, "eval_loss": 0.7510444521903992, "eval_runtime": 193.3613, "eval_samples_per_second": 1.034, "eval_steps_per_second": 0.129, "step": 114 }, { "epoch": 3.59375, "grad_norm": 0.45218808224844204, "learning_rate": 2e-05, "loss": 0.7183, "step": 115 }, { "epoch": 3.59375, "eval_loss": 0.7502281069755554, "eval_runtime": 192.9756, "eval_samples_per_second": 1.036, "eval_steps_per_second": 0.13, "step": 115 }, { "epoch": 3.625, "grad_norm": 0.4563394511077237, "learning_rate": 2e-05, "loss": 0.7457, "step": 116 }, { "epoch": 3.625, "eval_loss": 0.75026535987854, "eval_runtime": 196.5666, "eval_samples_per_second": 1.017, "eval_steps_per_second": 0.127, "step": 116 }, { "epoch": 3.65625, "grad_norm": 0.4439962576867383, "learning_rate": 2e-05, "loss": 0.8091, "step": 117 }, { "epoch": 3.65625, "eval_loss": 0.7515027523040771, "eval_runtime": 196.6311, "eval_samples_per_second": 1.017, "eval_steps_per_second": 0.127, "step": 117 }, { "epoch": 3.6875, "grad_norm": 0.44782841352683267, "learning_rate": 2e-05, "loss": 0.7636, "step": 118 }, { "epoch": 3.6875, "eval_loss": 0.7535241842269897, "eval_runtime": 196.4625, "eval_samples_per_second": 1.018, "eval_steps_per_second": 0.127, "step": 118 }, { "epoch": 3.71875, "grad_norm": 0.5651906616770451, "learning_rate": 2e-05, "loss": 0.8106, "step": 119 }, { "epoch": 3.71875, "eval_loss": 0.7535383701324463, "eval_runtime": 196.0073, "eval_samples_per_second": 1.02, "eval_steps_per_second": 0.128, "step": 119 }, { "epoch": 3.75, "grad_norm": 0.44715700121507296, "learning_rate": 2e-05, "loss": 0.7991, "step": 120 }, { "epoch": 3.75, "eval_loss": 0.75335294008255, "eval_runtime": 195.9837, "eval_samples_per_second": 1.02, "eval_steps_per_second": 0.128, "step": 120 }, { "epoch": 3.75, "step": 120, "total_flos": 266165567225856.0, "train_loss": 0.0, "train_runtime": 1.4333, "train_samples_per_second": 2790.753, "train_steps_per_second": 44.652 } ], "logging_steps": 1.0, "max_steps": 64, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 266165567225856.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }