{ "best_global_step": 20000, "best_metric": 0.14082255959510803, "best_model_checkpoint": "/media/user/Expansion1/deberta-v3-base-zyda-2-v2-text-quality-v3/checkpoint-20000", "epoch": 10.0, "eval_steps": 500, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 11.949178695678711, "learning_rate": 4.97505e-05, "loss": 0.3835, "num_input_tokens_seen": 512000, "step": 500 }, { "epoch": 0.1, "grad_norm": 3.623898506164551, "learning_rate": 4.95005e-05, "loss": 0.2484, "num_input_tokens_seen": 1024000, "step": 1000 }, { "epoch": 0.15, "grad_norm": 3.0655770301818848, "learning_rate": 4.9250500000000006e-05, "loss": 0.2332, "num_input_tokens_seen": 1536000, "step": 1500 }, { "epoch": 0.2, "grad_norm": 8.194499015808105, "learning_rate": 4.9000500000000006e-05, "loss": 0.2097, "num_input_tokens_seen": 2048000, "step": 2000 }, { "epoch": 0.25, "grad_norm": 1.8507510423660278, "learning_rate": 4.875050000000001e-05, "loss": 0.1988, "num_input_tokens_seen": 2560000, "step": 2500 }, { "epoch": 0.3, "grad_norm": 2.8679802417755127, "learning_rate": 4.85005e-05, "loss": 0.1957, "num_input_tokens_seen": 3072000, "step": 3000 }, { "epoch": 0.35, "grad_norm": 3.2234883308410645, "learning_rate": 4.82505e-05, "loss": 0.1793, "num_input_tokens_seen": 3584000, "step": 3500 }, { "epoch": 0.4, "grad_norm": 4.360517978668213, "learning_rate": 4.80005e-05, "loss": 0.1816, "num_input_tokens_seen": 4096000, "step": 4000 }, { "epoch": 0.45, "grad_norm": 5.652502536773682, "learning_rate": 4.77505e-05, "loss": 0.1855, "num_input_tokens_seen": 4608000, "step": 4500 }, { "epoch": 0.5, "grad_norm": 3.757875919342041, "learning_rate": 4.7500500000000004e-05, "loss": 0.1751, "num_input_tokens_seen": 5120000, "step": 5000 }, { "epoch": 0.55, "grad_norm": 3.0092484951019287, "learning_rate": 4.7250500000000004e-05, "loss": 0.1785, "num_input_tokens_seen": 5632000, "step": 5500 }, { "epoch": 0.6, "grad_norm": 7.830347061157227, "learning_rate": 4.7000500000000005e-05, "loss": 0.1711, "num_input_tokens_seen": 6144000, "step": 6000 }, { "epoch": 0.65, "grad_norm": 2.926468849182129, "learning_rate": 4.6750500000000006e-05, "loss": 0.168, "num_input_tokens_seen": 6656000, "step": 6500 }, { "epoch": 0.7, "grad_norm": 3.43612003326416, "learning_rate": 4.65005e-05, "loss": 0.1772, "num_input_tokens_seen": 7168000, "step": 7000 }, { "epoch": 0.75, "grad_norm": 2.3997323513031006, "learning_rate": 4.62505e-05, "loss": 0.1632, "num_input_tokens_seen": 7680000, "step": 7500 }, { "epoch": 0.8, "grad_norm": 12.628423690795898, "learning_rate": 4.60005e-05, "loss": 0.1714, "num_input_tokens_seen": 8192000, "step": 8000 }, { "epoch": 0.85, "grad_norm": 1.8220003843307495, "learning_rate": 4.57505e-05, "loss": 0.1613, "num_input_tokens_seen": 8704000, "step": 8500 }, { "epoch": 0.9, "grad_norm": 2.2584903240203857, "learning_rate": 4.55005e-05, "loss": 0.1547, "num_input_tokens_seen": 9216000, "step": 9000 }, { "epoch": 0.95, "grad_norm": 1.5416566133499146, "learning_rate": 4.52505e-05, "loss": 0.1594, "num_input_tokens_seen": 9728000, "step": 9500 }, { "epoch": 1.0, "grad_norm": 2.472825288772583, "learning_rate": 4.5000500000000004e-05, "loss": 0.1635, "num_input_tokens_seen": 10239872, "step": 10000 }, { "epoch": 1.0, "eval_combined_score": 0.18538867612314003, "eval_loss": 0.18538866937160492, "eval_mse": 0.18538868287467514, "eval_runtime": 29.5714, "eval_samples_per_second": 676.329, "eval_steps_per_second": 84.541, "num_input_tokens_seen": 10239872, "step": 10000 }, { "epoch": 1.05, "grad_norm": 2.986963987350464, "learning_rate": 4.47505e-05, "loss": 0.1226, "num_input_tokens_seen": 10751872, "step": 10500 }, { "epoch": 1.1, "grad_norm": 0.7588199973106384, "learning_rate": 4.45005e-05, "loss": 0.1172, "num_input_tokens_seen": 11263872, "step": 11000 }, { "epoch": 1.15, "grad_norm": 0.8432678580284119, "learning_rate": 4.42505e-05, "loss": 0.1186, "num_input_tokens_seen": 11775872, "step": 11500 }, { "epoch": 1.2, "grad_norm": 12.563228607177734, "learning_rate": 4.40005e-05, "loss": 0.1139, "num_input_tokens_seen": 12287872, "step": 12000 }, { "epoch": 1.25, "grad_norm": 2.207587242126465, "learning_rate": 4.37505e-05, "loss": 0.121, "num_input_tokens_seen": 12799872, "step": 12500 }, { "epoch": 1.3, "grad_norm": 1.978637456893921, "learning_rate": 4.35005e-05, "loss": 0.1114, "num_input_tokens_seen": 13311872, "step": 13000 }, { "epoch": 1.35, "grad_norm": 5.6478729248046875, "learning_rate": 4.32505e-05, "loss": 0.1182, "num_input_tokens_seen": 13823872, "step": 13500 }, { "epoch": 1.4, "grad_norm": 3.0157413482666016, "learning_rate": 4.30005e-05, "loss": 0.1099, "num_input_tokens_seen": 14335872, "step": 14000 }, { "epoch": 1.45, "grad_norm": 2.2837512493133545, "learning_rate": 4.2750500000000003e-05, "loss": 0.1154, "num_input_tokens_seen": 14847872, "step": 14500 }, { "epoch": 1.5, "grad_norm": 2.124837875366211, "learning_rate": 4.2500500000000004e-05, "loss": 0.1163, "num_input_tokens_seen": 15359872, "step": 15000 }, { "epoch": 1.55, "grad_norm": 1.8782966136932373, "learning_rate": 4.2250500000000005e-05, "loss": 0.1167, "num_input_tokens_seen": 15871872, "step": 15500 }, { "epoch": 1.6, "grad_norm": 1.085688591003418, "learning_rate": 4.2000500000000006e-05, "loss": 0.1156, "num_input_tokens_seen": 16383872, "step": 16000 }, { "epoch": 1.65, "grad_norm": 1.9874955415725708, "learning_rate": 4.1750500000000006e-05, "loss": 0.1183, "num_input_tokens_seen": 16895872, "step": 16500 }, { "epoch": 1.7, "grad_norm": 2.6902706623077393, "learning_rate": 4.15005e-05, "loss": 0.1112, "num_input_tokens_seen": 17407872, "step": 17000 }, { "epoch": 1.75, "grad_norm": 3.0735440254211426, "learning_rate": 4.12505e-05, "loss": 0.1159, "num_input_tokens_seen": 17919872, "step": 17500 }, { "epoch": 1.8, "grad_norm": 2.936267614364624, "learning_rate": 4.10005e-05, "loss": 0.1187, "num_input_tokens_seen": 18431872, "step": 18000 }, { "epoch": 1.85, "grad_norm": 3.598895311355591, "learning_rate": 4.07505e-05, "loss": 0.1147, "num_input_tokens_seen": 18943872, "step": 18500 }, { "epoch": 1.9, "grad_norm": 3.655381917953491, "learning_rate": 4.05005e-05, "loss": 0.1387, "num_input_tokens_seen": 19455872, "step": 19000 }, { "epoch": 1.95, "grad_norm": 9.855778694152832, "learning_rate": 4.0250500000000004e-05, "loss": 0.1238, "num_input_tokens_seen": 19967872, "step": 19500 }, { "epoch": 2.0, "grad_norm": 2.558746337890625, "learning_rate": 4.0000500000000004e-05, "loss": 0.1241, "num_input_tokens_seen": 20479744, "step": 20000 }, { "epoch": 2.0, "eval_combined_score": 0.14082256163832602, "eval_loss": 0.14082255959510803, "eval_mse": 0.14082256368154403, "eval_runtime": 30.1283, "eval_samples_per_second": 663.828, "eval_steps_per_second": 82.979, "num_input_tokens_seen": 20479744, "step": 20000 }, { "epoch": 2.05, "grad_norm": 1.3755764961242676, "learning_rate": 3.97505e-05, "loss": 0.0804, "num_input_tokens_seen": 20991744, "step": 20500 }, { "epoch": 2.1, "grad_norm": 3.242955207824707, "learning_rate": 3.95005e-05, "loss": 0.0795, "num_input_tokens_seen": 21503744, "step": 21000 }, { "epoch": 2.15, "grad_norm": 2.4045000076293945, "learning_rate": 3.92505e-05, "loss": 0.0814, "num_input_tokens_seen": 22015744, "step": 21500 }, { "epoch": 2.2, "grad_norm": 2.5508718490600586, "learning_rate": 3.90005e-05, "loss": 0.0848, "num_input_tokens_seen": 22527744, "step": 22000 }, { "epoch": 2.25, "grad_norm": 2.8529911041259766, "learning_rate": 3.87505e-05, "loss": 0.081, "num_input_tokens_seen": 23039744, "step": 22500 }, { "epoch": 2.3, "grad_norm": 10.657905578613281, "learning_rate": 3.85005e-05, "loss": 0.0786, "num_input_tokens_seen": 23551744, "step": 23000 }, { "epoch": 2.35, "grad_norm": 2.378411293029785, "learning_rate": 3.82505e-05, "loss": 0.0823, "num_input_tokens_seen": 24063744, "step": 23500 }, { "epoch": 2.4, "grad_norm": 2.6125261783599854, "learning_rate": 3.80005e-05, "loss": 0.0787, "num_input_tokens_seen": 24575744, "step": 24000 }, { "epoch": 2.45, "grad_norm": 1.3133174180984497, "learning_rate": 3.77505e-05, "loss": 0.0761, "num_input_tokens_seen": 25087744, "step": 24500 }, { "epoch": 2.5, "grad_norm": 3.3419981002807617, "learning_rate": 3.75005e-05, "loss": 0.0775, "num_input_tokens_seen": 25599744, "step": 25000 }, { "epoch": 2.55, "grad_norm": 2.1734654903411865, "learning_rate": 3.72505e-05, "loss": 0.0846, "num_input_tokens_seen": 26111744, "step": 25500 }, { "epoch": 2.6, "grad_norm": 3.2352869510650635, "learning_rate": 3.70005e-05, "loss": 0.0817, "num_input_tokens_seen": 26623744, "step": 26000 }, { "epoch": 2.65, "grad_norm": 3.37646746635437, "learning_rate": 3.675050000000001e-05, "loss": 0.0816, "num_input_tokens_seen": 27135744, "step": 26500 }, { "epoch": 2.7, "grad_norm": 2.5875842571258545, "learning_rate": 3.650050000000001e-05, "loss": 0.0843, "num_input_tokens_seen": 27647744, "step": 27000 }, { "epoch": 2.75, "grad_norm": 7.768916606903076, "learning_rate": 3.62505e-05, "loss": 0.089, "num_input_tokens_seen": 28159744, "step": 27500 }, { "epoch": 2.8, "grad_norm": 2.6333940029144287, "learning_rate": 3.60005e-05, "loss": 0.1209, "num_input_tokens_seen": 28671744, "step": 28000 }, { "epoch": 2.85, "grad_norm": 3.4022088050842285, "learning_rate": 3.57505e-05, "loss": 0.082, "num_input_tokens_seen": 29183744, "step": 28500 }, { "epoch": 2.9, "grad_norm": 1.5310307741165161, "learning_rate": 3.5500500000000003e-05, "loss": 0.0813, "num_input_tokens_seen": 29695744, "step": 29000 }, { "epoch": 2.95, "grad_norm": 3.3515617847442627, "learning_rate": 3.5250500000000004e-05, "loss": 0.0856, "num_input_tokens_seen": 30207744, "step": 29500 }, { "epoch": 3.0, "grad_norm": 1.5893547534942627, "learning_rate": 3.5000500000000005e-05, "loss": 0.0882, "num_input_tokens_seen": 30719616, "step": 30000 }, { "epoch": 3.0, "eval_combined_score": 0.1746896443902683, "eval_loss": 0.1746896207332611, "eval_mse": 0.17468963824495307, "eval_runtime": 29.4701, "eval_samples_per_second": 678.654, "eval_steps_per_second": 84.832, "num_input_tokens_seen": 30719616, "step": 30000 }, { "epoch": 3.05, "grad_norm": 1.333294153213501, "learning_rate": 3.4750500000000006e-05, "loss": 0.0562, "num_input_tokens_seen": 31231616, "step": 30500 }, { "epoch": 3.1, "grad_norm": 0.8254738450050354, "learning_rate": 3.45005e-05, "loss": 0.053, "num_input_tokens_seen": 31743616, "step": 31000 }, { "epoch": 3.15, "grad_norm": 1.7611359357833862, "learning_rate": 3.42505e-05, "loss": 0.0533, "num_input_tokens_seen": 32255616, "step": 31500 }, { "epoch": 3.2, "grad_norm": 1.1055493354797363, "learning_rate": 3.40005e-05, "loss": 0.0557, "num_input_tokens_seen": 32767616, "step": 32000 }, { "epoch": 3.25, "grad_norm": 1.6912920475006104, "learning_rate": 3.37505e-05, "loss": 0.0557, "num_input_tokens_seen": 33279616, "step": 32500 }, { "epoch": 3.3, "grad_norm": 2.5604867935180664, "learning_rate": 3.35005e-05, "loss": 0.0619, "num_input_tokens_seen": 33791616, "step": 33000 }, { "epoch": 3.35, "grad_norm": 1.7852438688278198, "learning_rate": 3.32505e-05, "loss": 0.0564, "num_input_tokens_seen": 34303616, "step": 33500 }, { "epoch": 3.4, "grad_norm": 2.84659481048584, "learning_rate": 3.3000500000000004e-05, "loss": 0.0563, "num_input_tokens_seen": 34815616, "step": 34000 }, { "epoch": 3.45, "grad_norm": 2.0315301418304443, "learning_rate": 3.27505e-05, "loss": 0.054, "num_input_tokens_seen": 35327616, "step": 34500 }, { "epoch": 3.5, "grad_norm": 1.9043070077896118, "learning_rate": 3.25005e-05, "loss": 0.0583, "num_input_tokens_seen": 35839616, "step": 35000 }, { "epoch": 3.55, "grad_norm": 1.7389405965805054, "learning_rate": 3.22505e-05, "loss": 0.0544, "num_input_tokens_seen": 36351616, "step": 35500 }, { "epoch": 3.6, "grad_norm": 0.8132746815681458, "learning_rate": 3.20005e-05, "loss": 0.055, "num_input_tokens_seen": 36863616, "step": 36000 }, { "epoch": 3.65, "grad_norm": 1.8100671768188477, "learning_rate": 3.17505e-05, "loss": 0.0558, "num_input_tokens_seen": 37375616, "step": 36500 }, { "epoch": 3.7, "grad_norm": 10.433902740478516, "learning_rate": 3.15005e-05, "loss": 0.0568, "num_input_tokens_seen": 37887616, "step": 37000 }, { "epoch": 3.75, "grad_norm": 0.7512624263763428, "learning_rate": 3.12505e-05, "loss": 0.0541, "num_input_tokens_seen": 38399616, "step": 37500 }, { "epoch": 3.8, "grad_norm": 1.3957535028457642, "learning_rate": 3.1000499999999996e-05, "loss": 0.0546, "num_input_tokens_seen": 38911616, "step": 38000 }, { "epoch": 3.85, "grad_norm": 1.069032073020935, "learning_rate": 3.0750499999999996e-05, "loss": 0.0547, "num_input_tokens_seen": 39423616, "step": 38500 }, { "epoch": 3.9, "grad_norm": 3.4046223163604736, "learning_rate": 3.0500500000000004e-05, "loss": 0.0567, "num_input_tokens_seen": 39935616, "step": 39000 }, { "epoch": 3.95, "grad_norm": 1.5711253881454468, "learning_rate": 3.0250500000000005e-05, "loss": 0.0571, "num_input_tokens_seen": 40447616, "step": 39500 }, { "epoch": 4.0, "grad_norm": 2.8568646907806396, "learning_rate": 3.0000500000000005e-05, "loss": 0.054, "num_input_tokens_seen": 40959488, "step": 40000 }, { "epoch": 4.0, "eval_combined_score": 0.1528494923779644, "eval_loss": 0.152849480509758, "eval_mse": 0.15284948934500966, "eval_runtime": 29.495, "eval_samples_per_second": 678.08, "eval_steps_per_second": 84.76, "num_input_tokens_seen": 40959488, "step": 40000 }, { "epoch": 4.05, "grad_norm": 1.1214642524719238, "learning_rate": 2.9750500000000003e-05, "loss": 0.0365, "num_input_tokens_seen": 41471488, "step": 40500 }, { "epoch": 4.1, "grad_norm": 2.6408936977386475, "learning_rate": 2.9500500000000003e-05, "loss": 0.0361, "num_input_tokens_seen": 41983488, "step": 41000 }, { "epoch": 4.15, "grad_norm": 1.0093015432357788, "learning_rate": 2.9250500000000004e-05, "loss": 0.0361, "num_input_tokens_seen": 42495488, "step": 41500 }, { "epoch": 4.2, "grad_norm": 2.0412521362304688, "learning_rate": 2.90005e-05, "loss": 0.0377, "num_input_tokens_seen": 43007488, "step": 42000 }, { "epoch": 4.25, "grad_norm": 2.0059244632720947, "learning_rate": 2.8750500000000002e-05, "loss": 0.0354, "num_input_tokens_seen": 43519488, "step": 42500 }, { "epoch": 4.3, "grad_norm": 3.214423179626465, "learning_rate": 2.8500500000000003e-05, "loss": 0.0373, "num_input_tokens_seen": 44031488, "step": 43000 }, { "epoch": 4.35, "grad_norm": 2.101541519165039, "learning_rate": 2.8250500000000003e-05, "loss": 0.0381, "num_input_tokens_seen": 44543488, "step": 43500 }, { "epoch": 4.4, "grad_norm": 0.8797721862792969, "learning_rate": 2.80005e-05, "loss": 0.0381, "num_input_tokens_seen": 45055488, "step": 44000 }, { "epoch": 4.45, "grad_norm": 2.0589728355407715, "learning_rate": 2.77505e-05, "loss": 0.036, "num_input_tokens_seen": 45567488, "step": 44500 }, { "epoch": 4.5, "grad_norm": 2.5758140087127686, "learning_rate": 2.7500500000000002e-05, "loss": 0.0372, "num_input_tokens_seen": 46079488, "step": 45000 }, { "epoch": 4.55, "grad_norm": 1.531252145767212, "learning_rate": 2.72505e-05, "loss": 0.0381, "num_input_tokens_seen": 46591488, "step": 45500 }, { "epoch": 4.6, "grad_norm": 1.053691029548645, "learning_rate": 2.70005e-05, "loss": 0.0396, "num_input_tokens_seen": 47103488, "step": 46000 }, { "epoch": 4.65, "grad_norm": 1.031100869178772, "learning_rate": 2.67505e-05, "loss": 0.0376, "num_input_tokens_seen": 47615488, "step": 46500 }, { "epoch": 4.7, "grad_norm": 0.8592771887779236, "learning_rate": 2.65005e-05, "loss": 0.0381, "num_input_tokens_seen": 48127488, "step": 47000 }, { "epoch": 4.75, "grad_norm": 3.529454231262207, "learning_rate": 2.62505e-05, "loss": 0.0406, "num_input_tokens_seen": 48639488, "step": 47500 }, { "epoch": 4.8, "grad_norm": 1.2595094442367554, "learning_rate": 2.60005e-05, "loss": 0.044, "num_input_tokens_seen": 49151488, "step": 48000 }, { "epoch": 4.85, "grad_norm": 1.0460163354873657, "learning_rate": 2.57505e-05, "loss": 0.0411, "num_input_tokens_seen": 49663488, "step": 48500 }, { "epoch": 4.9, "grad_norm": 0.7415432333946228, "learning_rate": 2.55005e-05, "loss": 0.0376, "num_input_tokens_seen": 50175488, "step": 49000 }, { "epoch": 4.95, "grad_norm": 0.9863350987434387, "learning_rate": 2.5250499999999998e-05, "loss": 0.039, "num_input_tokens_seen": 50687488, "step": 49500 }, { "epoch": 5.0, "grad_norm": 2.2840659618377686, "learning_rate": 2.50005e-05, "loss": 0.0372, "num_input_tokens_seen": 51199360, "step": 50000 }, { "epoch": 5.0, "eval_combined_score": 0.14802570643204935, "eval_loss": 0.14802570641040802, "eval_mse": 0.14802570645369068, "eval_runtime": 29.5199, "eval_samples_per_second": 677.508, "eval_steps_per_second": 84.689, "num_input_tokens_seen": 51199360, "step": 50000 }, { "epoch": 5.05, "grad_norm": 1.0202912092208862, "learning_rate": 2.4750500000000003e-05, "loss": 0.0253, "num_input_tokens_seen": 51711360, "step": 50500 }, { "epoch": 5.1, "grad_norm": 1.1298741102218628, "learning_rate": 2.45005e-05, "loss": 0.0269, "num_input_tokens_seen": 52223360, "step": 51000 }, { "epoch": 5.15, "grad_norm": 1.2378206253051758, "learning_rate": 2.42505e-05, "loss": 0.0258, "num_input_tokens_seen": 52735360, "step": 51500 }, { "epoch": 5.2, "grad_norm": 1.6293431520462036, "learning_rate": 2.4000500000000002e-05, "loss": 0.0272, "num_input_tokens_seen": 53247360, "step": 52000 }, { "epoch": 5.25, "grad_norm": 3.9734299182891846, "learning_rate": 2.37505e-05, "loss": 0.0272, "num_input_tokens_seen": 53759360, "step": 52500 }, { "epoch": 5.3, "grad_norm": 0.6598159074783325, "learning_rate": 2.35005e-05, "loss": 0.0262, "num_input_tokens_seen": 54271360, "step": 53000 }, { "epoch": 5.35, "grad_norm": 0.6012576818466187, "learning_rate": 2.32505e-05, "loss": 0.027, "num_input_tokens_seen": 54783360, "step": 53500 }, { "epoch": 5.4, "grad_norm": 2.462887763977051, "learning_rate": 2.30005e-05, "loss": 0.0268, "num_input_tokens_seen": 55295360, "step": 54000 }, { "epoch": 5.45, "grad_norm": 2.0268304347991943, "learning_rate": 2.2750500000000002e-05, "loss": 0.0263, "num_input_tokens_seen": 55807360, "step": 54500 }, { "epoch": 5.5, "grad_norm": 0.8000567555427551, "learning_rate": 2.2500500000000003e-05, "loss": 0.0282, "num_input_tokens_seen": 56319360, "step": 55000 }, { "epoch": 5.55, "grad_norm": 1.5781893730163574, "learning_rate": 2.2250500000000003e-05, "loss": 0.0265, "num_input_tokens_seen": 56831360, "step": 55500 }, { "epoch": 5.6, "grad_norm": 1.2630614042282104, "learning_rate": 2.20005e-05, "loss": 0.0257, "num_input_tokens_seen": 57343360, "step": 56000 }, { "epoch": 5.65, "grad_norm": 1.3778091669082642, "learning_rate": 2.17505e-05, "loss": 0.0271, "num_input_tokens_seen": 57855360, "step": 56500 }, { "epoch": 5.7, "grad_norm": 1.0909324884414673, "learning_rate": 2.1500500000000002e-05, "loss": 0.026, "num_input_tokens_seen": 58367360, "step": 57000 }, { "epoch": 5.75, "grad_norm": 3.5209500789642334, "learning_rate": 2.1250500000000003e-05, "loss": 0.0264, "num_input_tokens_seen": 58879360, "step": 57500 }, { "epoch": 5.8, "grad_norm": 1.4671865701675415, "learning_rate": 2.10005e-05, "loss": 0.0265, "num_input_tokens_seen": 59391360, "step": 58000 }, { "epoch": 5.85, "grad_norm": 1.6409125328063965, "learning_rate": 2.07505e-05, "loss": 0.0268, "num_input_tokens_seen": 59903360, "step": 58500 }, { "epoch": 5.9, "grad_norm": 1.4418998956680298, "learning_rate": 2.05005e-05, "loss": 0.0262, "num_input_tokens_seen": 60415360, "step": 59000 }, { "epoch": 5.95, "grad_norm": 1.3441293239593506, "learning_rate": 2.02505e-05, "loss": 0.0257, "num_input_tokens_seen": 60927360, "step": 59500 }, { "epoch": 6.0, "grad_norm": 0.8778462409973145, "learning_rate": 2.00005e-05, "loss": 0.0263, "num_input_tokens_seen": 61439232, "step": 60000 }, { "epoch": 6.0, "eval_combined_score": 0.15236617343673117, "eval_loss": 0.15236616134643555, "eval_mse": 0.1523661706258656, "eval_runtime": 29.4149, "eval_samples_per_second": 679.926, "eval_steps_per_second": 84.991, "num_input_tokens_seen": 61439232, "step": 60000 }, { "epoch": 6.05, "grad_norm": 1.4335697889328003, "learning_rate": 1.97505e-05, "loss": 0.0181, "num_input_tokens_seen": 61951232, "step": 60500 }, { "epoch": 6.1, "grad_norm": 1.1381551027297974, "learning_rate": 1.95005e-05, "loss": 0.0197, "num_input_tokens_seen": 62463232, "step": 61000 }, { "epoch": 6.15, "grad_norm": 0.7046132683753967, "learning_rate": 1.92505e-05, "loss": 0.0186, "num_input_tokens_seen": 62975232, "step": 61500 }, { "epoch": 6.2, "grad_norm": 1.008306860923767, "learning_rate": 1.9000500000000002e-05, "loss": 0.0192, "num_input_tokens_seen": 63487232, "step": 62000 }, { "epoch": 6.25, "grad_norm": 2.0765221118927, "learning_rate": 1.8750500000000003e-05, "loss": 0.0185, "num_input_tokens_seen": 63999232, "step": 62500 }, { "epoch": 6.3, "grad_norm": 1.2361551523208618, "learning_rate": 1.85005e-05, "loss": 0.0179, "num_input_tokens_seen": 64511232, "step": 63000 }, { "epoch": 6.35, "grad_norm": 0.7231354117393494, "learning_rate": 1.82505e-05, "loss": 0.0194, "num_input_tokens_seen": 65023232, "step": 63500 }, { "epoch": 6.4, "grad_norm": 0.779230535030365, "learning_rate": 1.80005e-05, "loss": 0.0198, "num_input_tokens_seen": 65535232, "step": 64000 }, { "epoch": 6.45, "grad_norm": 0.7320069074630737, "learning_rate": 1.77505e-05, "loss": 0.0187, "num_input_tokens_seen": 66047232, "step": 64500 }, { "epoch": 6.5, "grad_norm": 0.8597579598426819, "learning_rate": 1.75005e-05, "loss": 0.0191, "num_input_tokens_seen": 66559232, "step": 65000 }, { "epoch": 6.55, "grad_norm": 1.4109529256820679, "learning_rate": 1.72505e-05, "loss": 0.0192, "num_input_tokens_seen": 67071232, "step": 65500 }, { "epoch": 6.6, "grad_norm": 1.4900848865509033, "learning_rate": 1.70005e-05, "loss": 0.0173, "num_input_tokens_seen": 67583232, "step": 66000 }, { "epoch": 6.65, "grad_norm": 1.3828743696212769, "learning_rate": 1.6750499999999998e-05, "loss": 0.0176, "num_input_tokens_seen": 68095232, "step": 66500 }, { "epoch": 6.7, "grad_norm": 0.6733376383781433, "learning_rate": 1.6500500000000002e-05, "loss": 0.019, "num_input_tokens_seen": 68607232, "step": 67000 }, { "epoch": 6.75, "grad_norm": 0.4570697546005249, "learning_rate": 1.6250500000000003e-05, "loss": 0.0181, "num_input_tokens_seen": 69119232, "step": 67500 }, { "epoch": 6.8, "grad_norm": 0.9463149309158325, "learning_rate": 1.60005e-05, "loss": 0.0174, "num_input_tokens_seen": 69631232, "step": 68000 }, { "epoch": 6.85, "grad_norm": 0.9304377436637878, "learning_rate": 1.57505e-05, "loss": 0.0185, "num_input_tokens_seen": 70143232, "step": 68500 }, { "epoch": 6.9, "grad_norm": 0.8526313304901123, "learning_rate": 1.5500500000000002e-05, "loss": 0.0185, "num_input_tokens_seen": 70655232, "step": 69000 }, { "epoch": 6.95, "grad_norm": 1.6793274879455566, "learning_rate": 1.52505e-05, "loss": 0.0194, "num_input_tokens_seen": 71167232, "step": 69500 }, { "epoch": 7.0, "grad_norm": 1.2873644828796387, "learning_rate": 1.5000500000000001e-05, "loss": 0.0203, "num_input_tokens_seen": 71679104, "step": 70000 }, { "epoch": 7.0, "eval_combined_score": 0.1494929350818927, "eval_loss": 0.14949294924736023, "eval_mse": 0.14949293581758635, "eval_runtime": 29.5374, "eval_samples_per_second": 677.107, "eval_steps_per_second": 84.638, "num_input_tokens_seen": 71679104, "step": 70000 }, { "epoch": 7.05, "grad_norm": 0.8493014574050903, "learning_rate": 1.47505e-05, "loss": 0.014, "num_input_tokens_seen": 72191104, "step": 70500 }, { "epoch": 7.1, "grad_norm": 0.6162556409835815, "learning_rate": 1.45005e-05, "loss": 0.0145, "num_input_tokens_seen": 72703104, "step": 71000 }, { "epoch": 7.15, "grad_norm": 0.6198768019676208, "learning_rate": 1.42505e-05, "loss": 0.0135, "num_input_tokens_seen": 73215104, "step": 71500 }, { "epoch": 7.2, "grad_norm": 0.6122292876243591, "learning_rate": 1.40005e-05, "loss": 0.0138, "num_input_tokens_seen": 73727104, "step": 72000 }, { "epoch": 7.25, "grad_norm": 0.8132468461990356, "learning_rate": 1.37505e-05, "loss": 0.0136, "num_input_tokens_seen": 74239104, "step": 72500 }, { "epoch": 7.3, "grad_norm": 0.791746973991394, "learning_rate": 1.3500499999999999e-05, "loss": 0.0136, "num_input_tokens_seen": 74751104, "step": 73000 }, { "epoch": 7.35, "grad_norm": 1.6126739978790283, "learning_rate": 1.3250500000000001e-05, "loss": 0.0139, "num_input_tokens_seen": 75263104, "step": 73500 }, { "epoch": 7.4, "grad_norm": 1.348046898841858, "learning_rate": 1.3000500000000002e-05, "loss": 0.0149, "num_input_tokens_seen": 75775104, "step": 74000 }, { "epoch": 7.45, "grad_norm": 1.5154032707214355, "learning_rate": 1.2750500000000001e-05, "loss": 0.0133, "num_input_tokens_seen": 76287104, "step": 74500 }, { "epoch": 7.5, "grad_norm": 1.3086836338043213, "learning_rate": 1.2500500000000002e-05, "loss": 0.0134, "num_input_tokens_seen": 76799104, "step": 75000 }, { "epoch": 7.55, "grad_norm": 1.3077424764633179, "learning_rate": 1.22505e-05, "loss": 0.0134, "num_input_tokens_seen": 77311104, "step": 75500 }, { "epoch": 7.6, "grad_norm": 1.377185344696045, "learning_rate": 1.2000500000000001e-05, "loss": 0.0128, "num_input_tokens_seen": 77823104, "step": 76000 }, { "epoch": 7.65, "grad_norm": 1.2250688076019287, "learning_rate": 1.17505e-05, "loss": 0.0146, "num_input_tokens_seen": 78335104, "step": 76500 }, { "epoch": 7.7, "grad_norm": 0.8044687509536743, "learning_rate": 1.15005e-05, "loss": 0.0132, "num_input_tokens_seen": 78847104, "step": 77000 }, { "epoch": 7.75, "grad_norm": 0.8126741647720337, "learning_rate": 1.12505e-05, "loss": 0.0134, "num_input_tokens_seen": 79359104, "step": 77500 }, { "epoch": 7.8, "grad_norm": 0.6075248122215271, "learning_rate": 1.10005e-05, "loss": 0.0131, "num_input_tokens_seen": 79871104, "step": 78000 }, { "epoch": 7.85, "grad_norm": 1.874189853668213, "learning_rate": 1.0750500000000002e-05, "loss": 0.0134, "num_input_tokens_seen": 80383104, "step": 78500 }, { "epoch": 7.9, "grad_norm": 0.5488854646682739, "learning_rate": 1.05005e-05, "loss": 0.0137, "num_input_tokens_seen": 80895104, "step": 79000 }, { "epoch": 7.95, "grad_norm": 1.5739060640335083, "learning_rate": 1.0250500000000001e-05, "loss": 0.0131, "num_input_tokens_seen": 81407104, "step": 79500 }, { "epoch": 8.0, "grad_norm": 1.897755742073059, "learning_rate": 1.00005e-05, "loss": 0.0135, "num_input_tokens_seen": 81918976, "step": 80000 }, { "epoch": 8.0, "eval_combined_score": 0.1482119562218898, "eval_loss": 0.14821195602416992, "eval_mse": 0.14821195641960966, "eval_runtime": 29.5069, "eval_samples_per_second": 677.807, "eval_steps_per_second": 84.726, "num_input_tokens_seen": 81918976, "step": 80000 }, { "epoch": 8.05, "grad_norm": 0.39859962463378906, "learning_rate": 9.7505e-06, "loss": 0.0107, "num_input_tokens_seen": 82430976, "step": 80500 }, { "epoch": 8.1, "grad_norm": 1.8892147541046143, "learning_rate": 9.500500000000002e-06, "loss": 0.01, "num_input_tokens_seen": 82942976, "step": 81000 }, { "epoch": 8.15, "grad_norm": 0.7789964079856873, "learning_rate": 9.2505e-06, "loss": 0.0111, "num_input_tokens_seen": 83454976, "step": 81500 }, { "epoch": 8.2, "grad_norm": 0.6423227787017822, "learning_rate": 9.000500000000001e-06, "loss": 0.011, "num_input_tokens_seen": 83966976, "step": 82000 }, { "epoch": 8.25, "grad_norm": 0.6862022876739502, "learning_rate": 8.7505e-06, "loss": 0.0105, "num_input_tokens_seen": 84478976, "step": 82500 }, { "epoch": 8.3, "grad_norm": 0.6521459817886353, "learning_rate": 8.5005e-06, "loss": 0.011, "num_input_tokens_seen": 84990976, "step": 83000 }, { "epoch": 8.35, "grad_norm": 1.0782101154327393, "learning_rate": 8.2505e-06, "loss": 0.01, "num_input_tokens_seen": 85502976, "step": 83500 }, { "epoch": 8.4, "grad_norm": 0.32573211193084717, "learning_rate": 8.0005e-06, "loss": 0.0102, "num_input_tokens_seen": 86014976, "step": 84000 }, { "epoch": 8.45, "grad_norm": 0.4790741205215454, "learning_rate": 7.750500000000001e-06, "loss": 0.0097, "num_input_tokens_seen": 86526976, "step": 84500 }, { "epoch": 8.5, "grad_norm": 5.938267230987549, "learning_rate": 7.5005000000000004e-06, "loss": 0.0099, "num_input_tokens_seen": 87038976, "step": 85000 }, { "epoch": 8.55, "grad_norm": 0.3625955283641815, "learning_rate": 7.2505e-06, "loss": 0.0101, "num_input_tokens_seen": 87550976, "step": 85500 }, { "epoch": 8.6, "grad_norm": 1.664149522781372, "learning_rate": 7.0005e-06, "loss": 0.0103, "num_input_tokens_seen": 88062976, "step": 86000 }, { "epoch": 8.65, "grad_norm": 0.35580164194107056, "learning_rate": 6.7505e-06, "loss": 0.0097, "num_input_tokens_seen": 88574976, "step": 86500 }, { "epoch": 8.7, "grad_norm": 0.814786434173584, "learning_rate": 6.5005e-06, "loss": 0.0099, "num_input_tokens_seen": 89086976, "step": 87000 }, { "epoch": 8.75, "grad_norm": 0.479640930891037, "learning_rate": 6.2505000000000005e-06, "loss": 0.0101, "num_input_tokens_seen": 89598976, "step": 87500 }, { "epoch": 8.8, "grad_norm": 0.4606671929359436, "learning_rate": 6.0005e-06, "loss": 0.0094, "num_input_tokens_seen": 90110976, "step": 88000 }, { "epoch": 8.85, "grad_norm": 2.0643467903137207, "learning_rate": 5.7505e-06, "loss": 0.0099, "num_input_tokens_seen": 90622976, "step": 88500 }, { "epoch": 8.9, "grad_norm": 0.6785427331924438, "learning_rate": 5.5005e-06, "loss": 0.0103, "num_input_tokens_seen": 91134976, "step": 89000 }, { "epoch": 8.95, "grad_norm": 0.6333959102630615, "learning_rate": 5.250500000000001e-06, "loss": 0.01, "num_input_tokens_seen": 91646976, "step": 89500 }, { "epoch": 9.0, "grad_norm": 0.8463544249534607, "learning_rate": 5.000500000000001e-06, "loss": 0.0098, "num_input_tokens_seen": 92158848, "step": 90000 }, { "epoch": 9.0, "eval_combined_score": 0.14495953552467267, "eval_loss": 0.14495953917503357, "eval_mse": 0.1449595318743118, "eval_runtime": 29.5073, "eval_samples_per_second": 677.799, "eval_steps_per_second": 84.725, "num_input_tokens_seen": 92158848, "step": 90000 }, { "epoch": 9.05, "grad_norm": 0.39637425541877747, "learning_rate": 4.7505000000000005e-06, "loss": 0.0082, "num_input_tokens_seen": 92670848, "step": 90500 }, { "epoch": 9.1, "grad_norm": 0.7424957752227783, "learning_rate": 4.5005e-06, "loss": 0.0085, "num_input_tokens_seen": 93182848, "step": 91000 }, { "epoch": 9.15, "grad_norm": 0.8151483535766602, "learning_rate": 4.2505e-06, "loss": 0.008, "num_input_tokens_seen": 93694848, "step": 91500 }, { "epoch": 9.2, "grad_norm": 1.604078769683838, "learning_rate": 4.0005e-06, "loss": 0.0086, "num_input_tokens_seen": 94206848, "step": 92000 }, { "epoch": 9.25, "grad_norm": 0.42909368872642517, "learning_rate": 3.7505e-06, "loss": 0.0084, "num_input_tokens_seen": 94718848, "step": 92500 }, { "epoch": 9.3, "grad_norm": 0.6759423017501831, "learning_rate": 3.5005e-06, "loss": 0.0077, "num_input_tokens_seen": 95230848, "step": 93000 }, { "epoch": 9.35, "grad_norm": 0.5954917669296265, "learning_rate": 3.2505e-06, "loss": 0.0081, "num_input_tokens_seen": 95742848, "step": 93500 }, { "epoch": 9.4, "grad_norm": 0.6435306072235107, "learning_rate": 3.0005000000000003e-06, "loss": 0.0079, "num_input_tokens_seen": 96254848, "step": 94000 }, { "epoch": 9.45, "grad_norm": 0.8906601071357727, "learning_rate": 2.7505e-06, "loss": 0.008, "num_input_tokens_seen": 96766848, "step": 94500 }, { "epoch": 9.5, "grad_norm": 1.4101794958114624, "learning_rate": 2.5005e-06, "loss": 0.0075, "num_input_tokens_seen": 97278848, "step": 95000 }, { "epoch": 9.55, "grad_norm": 0.7406792044639587, "learning_rate": 2.2505000000000003e-06, "loss": 0.0078, "num_input_tokens_seen": 97790848, "step": 95500 }, { "epoch": 9.6, "grad_norm": 1.437361240386963, "learning_rate": 2.0004999999999997e-06, "loss": 0.0077, "num_input_tokens_seen": 98302848, "step": 96000 }, { "epoch": 9.65, "grad_norm": 0.4781911373138428, "learning_rate": 1.7505e-06, "loss": 0.0078, "num_input_tokens_seen": 98814848, "step": 96500 }, { "epoch": 9.7, "grad_norm": 0.5876700282096863, "learning_rate": 1.5005e-06, "loss": 0.0075, "num_input_tokens_seen": 99326848, "step": 97000 }, { "epoch": 9.75, "grad_norm": 0.933368980884552, "learning_rate": 1.2505000000000001e-06, "loss": 0.008, "num_input_tokens_seen": 99838848, "step": 97500 }, { "epoch": 9.8, "grad_norm": 0.7791544198989868, "learning_rate": 1.0005e-06, "loss": 0.0075, "num_input_tokens_seen": 100350848, "step": 98000 }, { "epoch": 9.85, "grad_norm": 0.45317134261131287, "learning_rate": 7.505000000000001e-07, "loss": 0.0078, "num_input_tokens_seen": 100862848, "step": 98500 }, { "epoch": 9.9, "grad_norm": 1.5439448356628418, "learning_rate": 5.005e-07, "loss": 0.0074, "num_input_tokens_seen": 101374848, "step": 99000 }, { "epoch": 9.95, "grad_norm": 0.5587248206138611, "learning_rate": 2.5049999999999997e-07, "loss": 0.0079, "num_input_tokens_seen": 101886848, "step": 99500 }, { "epoch": 10.0, "grad_norm": 0.6633381247520447, "learning_rate": 5e-10, "loss": 0.0073, "num_input_tokens_seen": 102398720, "step": 100000 }, { "epoch": 10.0, "eval_combined_score": 0.14527450438803524, "eval_loss": 0.14527450501918793, "eval_mse": 0.14527450375688256, "eval_runtime": 29.5752, "eval_samples_per_second": 676.241, "eval_steps_per_second": 84.53, "num_input_tokens_seen": 102398720, "step": 100000 }, { "epoch": 10.0, "num_input_tokens_seen": 102398720, "step": 100000, "total_flos": 5.262202453327104e+16, "train_loss": 0.056572345192432406, "train_runtime": 7202.8043, "train_samples_per_second": 111.066, "train_steps_per_second": 13.883, "train_tokens_per_second": 14216.507 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 102398720, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.262202453327104e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }