|
{ |
|
"best_global_step": 20000, |
|
"best_metric": 0.14082255959510803, |
|
"best_model_checkpoint": "/media/user/Expansion1/deberta-v3-base-zyda-2-v2-text-quality-v3/checkpoint-20000", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 100000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 11.949178695678711, |
|
"learning_rate": 4.97505e-05, |
|
"loss": 0.3835, |
|
"num_input_tokens_seen": 512000, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.623898506164551, |
|
"learning_rate": 4.95005e-05, |
|
"loss": 0.2484, |
|
"num_input_tokens_seen": 1024000, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.0655770301818848, |
|
"learning_rate": 4.9250500000000006e-05, |
|
"loss": 0.2332, |
|
"num_input_tokens_seen": 1536000, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8.194499015808105, |
|
"learning_rate": 4.9000500000000006e-05, |
|
"loss": 0.2097, |
|
"num_input_tokens_seen": 2048000, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.8507510423660278, |
|
"learning_rate": 4.875050000000001e-05, |
|
"loss": 0.1988, |
|
"num_input_tokens_seen": 2560000, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.8679802417755127, |
|
"learning_rate": 4.85005e-05, |
|
"loss": 0.1957, |
|
"num_input_tokens_seen": 3072000, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.2234883308410645, |
|
"learning_rate": 4.82505e-05, |
|
"loss": 0.1793, |
|
"num_input_tokens_seen": 3584000, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.360517978668213, |
|
"learning_rate": 4.80005e-05, |
|
"loss": 0.1816, |
|
"num_input_tokens_seen": 4096000, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 5.652502536773682, |
|
"learning_rate": 4.77505e-05, |
|
"loss": 0.1855, |
|
"num_input_tokens_seen": 4608000, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.757875919342041, |
|
"learning_rate": 4.7500500000000004e-05, |
|
"loss": 0.1751, |
|
"num_input_tokens_seen": 5120000, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.0092484951019287, |
|
"learning_rate": 4.7250500000000004e-05, |
|
"loss": 0.1785, |
|
"num_input_tokens_seen": 5632000, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 7.830347061157227, |
|
"learning_rate": 4.7000500000000005e-05, |
|
"loss": 0.1711, |
|
"num_input_tokens_seen": 6144000, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.926468849182129, |
|
"learning_rate": 4.6750500000000006e-05, |
|
"loss": 0.168, |
|
"num_input_tokens_seen": 6656000, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.43612003326416, |
|
"learning_rate": 4.65005e-05, |
|
"loss": 0.1772, |
|
"num_input_tokens_seen": 7168000, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.3997323513031006, |
|
"learning_rate": 4.62505e-05, |
|
"loss": 0.1632, |
|
"num_input_tokens_seen": 7680000, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 12.628423690795898, |
|
"learning_rate": 4.60005e-05, |
|
"loss": 0.1714, |
|
"num_input_tokens_seen": 8192000, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.8220003843307495, |
|
"learning_rate": 4.57505e-05, |
|
"loss": 0.1613, |
|
"num_input_tokens_seen": 8704000, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.2584903240203857, |
|
"learning_rate": 4.55005e-05, |
|
"loss": 0.1547, |
|
"num_input_tokens_seen": 9216000, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.5416566133499146, |
|
"learning_rate": 4.52505e-05, |
|
"loss": 0.1594, |
|
"num_input_tokens_seen": 9728000, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.472825288772583, |
|
"learning_rate": 4.5000500000000004e-05, |
|
"loss": 0.1635, |
|
"num_input_tokens_seen": 10239872, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_combined_score": 0.18538867612314003, |
|
"eval_loss": 0.18538866937160492, |
|
"eval_mse": 0.18538868287467514, |
|
"eval_runtime": 29.5714, |
|
"eval_samples_per_second": 676.329, |
|
"eval_steps_per_second": 84.541, |
|
"num_input_tokens_seen": 10239872, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.986963987350464, |
|
"learning_rate": 4.47505e-05, |
|
"loss": 0.1226, |
|
"num_input_tokens_seen": 10751872, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.7588199973106384, |
|
"learning_rate": 4.45005e-05, |
|
"loss": 0.1172, |
|
"num_input_tokens_seen": 11263872, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.8432678580284119, |
|
"learning_rate": 4.42505e-05, |
|
"loss": 0.1186, |
|
"num_input_tokens_seen": 11775872, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 12.563228607177734, |
|
"learning_rate": 4.40005e-05, |
|
"loss": 0.1139, |
|
"num_input_tokens_seen": 12287872, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.207587242126465, |
|
"learning_rate": 4.37505e-05, |
|
"loss": 0.121, |
|
"num_input_tokens_seen": 12799872, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.978637456893921, |
|
"learning_rate": 4.35005e-05, |
|
"loss": 0.1114, |
|
"num_input_tokens_seen": 13311872, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 5.6478729248046875, |
|
"learning_rate": 4.32505e-05, |
|
"loss": 0.1182, |
|
"num_input_tokens_seen": 13823872, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.0157413482666016, |
|
"learning_rate": 4.30005e-05, |
|
"loss": 0.1099, |
|
"num_input_tokens_seen": 14335872, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.2837512493133545, |
|
"learning_rate": 4.2750500000000003e-05, |
|
"loss": 0.1154, |
|
"num_input_tokens_seen": 14847872, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.124837875366211, |
|
"learning_rate": 4.2500500000000004e-05, |
|
"loss": 0.1163, |
|
"num_input_tokens_seen": 15359872, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.8782966136932373, |
|
"learning_rate": 4.2250500000000005e-05, |
|
"loss": 0.1167, |
|
"num_input_tokens_seen": 15871872, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.085688591003418, |
|
"learning_rate": 4.2000500000000006e-05, |
|
"loss": 0.1156, |
|
"num_input_tokens_seen": 16383872, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.9874955415725708, |
|
"learning_rate": 4.1750500000000006e-05, |
|
"loss": 0.1183, |
|
"num_input_tokens_seen": 16895872, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.6902706623077393, |
|
"learning_rate": 4.15005e-05, |
|
"loss": 0.1112, |
|
"num_input_tokens_seen": 17407872, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.0735440254211426, |
|
"learning_rate": 4.12505e-05, |
|
"loss": 0.1159, |
|
"num_input_tokens_seen": 17919872, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.936267614364624, |
|
"learning_rate": 4.10005e-05, |
|
"loss": 0.1187, |
|
"num_input_tokens_seen": 18431872, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 3.598895311355591, |
|
"learning_rate": 4.07505e-05, |
|
"loss": 0.1147, |
|
"num_input_tokens_seen": 18943872, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 3.655381917953491, |
|
"learning_rate": 4.05005e-05, |
|
"loss": 0.1387, |
|
"num_input_tokens_seen": 19455872, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 9.855778694152832, |
|
"learning_rate": 4.0250500000000004e-05, |
|
"loss": 0.1238, |
|
"num_input_tokens_seen": 19967872, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.558746337890625, |
|
"learning_rate": 4.0000500000000004e-05, |
|
"loss": 0.1241, |
|
"num_input_tokens_seen": 20479744, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_combined_score": 0.14082256163832602, |
|
"eval_loss": 0.14082255959510803, |
|
"eval_mse": 0.14082256368154403, |
|
"eval_runtime": 30.1283, |
|
"eval_samples_per_second": 663.828, |
|
"eval_steps_per_second": 82.979, |
|
"num_input_tokens_seen": 20479744, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.3755764961242676, |
|
"learning_rate": 3.97505e-05, |
|
"loss": 0.0804, |
|
"num_input_tokens_seen": 20991744, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 3.242955207824707, |
|
"learning_rate": 3.95005e-05, |
|
"loss": 0.0795, |
|
"num_input_tokens_seen": 21503744, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.4045000076293945, |
|
"learning_rate": 3.92505e-05, |
|
"loss": 0.0814, |
|
"num_input_tokens_seen": 22015744, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.5508718490600586, |
|
"learning_rate": 3.90005e-05, |
|
"loss": 0.0848, |
|
"num_input_tokens_seen": 22527744, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.8529911041259766, |
|
"learning_rate": 3.87505e-05, |
|
"loss": 0.081, |
|
"num_input_tokens_seen": 23039744, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 10.657905578613281, |
|
"learning_rate": 3.85005e-05, |
|
"loss": 0.0786, |
|
"num_input_tokens_seen": 23551744, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.378411293029785, |
|
"learning_rate": 3.82505e-05, |
|
"loss": 0.0823, |
|
"num_input_tokens_seen": 24063744, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.6125261783599854, |
|
"learning_rate": 3.80005e-05, |
|
"loss": 0.0787, |
|
"num_input_tokens_seen": 24575744, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 1.3133174180984497, |
|
"learning_rate": 3.77505e-05, |
|
"loss": 0.0761, |
|
"num_input_tokens_seen": 25087744, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.3419981002807617, |
|
"learning_rate": 3.75005e-05, |
|
"loss": 0.0775, |
|
"num_input_tokens_seen": 25599744, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.1734654903411865, |
|
"learning_rate": 3.72505e-05, |
|
"loss": 0.0846, |
|
"num_input_tokens_seen": 26111744, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 3.2352869510650635, |
|
"learning_rate": 3.70005e-05, |
|
"loss": 0.0817, |
|
"num_input_tokens_seen": 26623744, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 3.37646746635437, |
|
"learning_rate": 3.675050000000001e-05, |
|
"loss": 0.0816, |
|
"num_input_tokens_seen": 27135744, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.5875842571258545, |
|
"learning_rate": 3.650050000000001e-05, |
|
"loss": 0.0843, |
|
"num_input_tokens_seen": 27647744, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 7.768916606903076, |
|
"learning_rate": 3.62505e-05, |
|
"loss": 0.089, |
|
"num_input_tokens_seen": 28159744, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.6333940029144287, |
|
"learning_rate": 3.60005e-05, |
|
"loss": 0.1209, |
|
"num_input_tokens_seen": 28671744, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 3.4022088050842285, |
|
"learning_rate": 3.57505e-05, |
|
"loss": 0.082, |
|
"num_input_tokens_seen": 29183744, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.5310307741165161, |
|
"learning_rate": 3.5500500000000003e-05, |
|
"loss": 0.0813, |
|
"num_input_tokens_seen": 29695744, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 3.3515617847442627, |
|
"learning_rate": 3.5250500000000004e-05, |
|
"loss": 0.0856, |
|
"num_input_tokens_seen": 30207744, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.5893547534942627, |
|
"learning_rate": 3.5000500000000005e-05, |
|
"loss": 0.0882, |
|
"num_input_tokens_seen": 30719616, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_combined_score": 0.1746896443902683, |
|
"eval_loss": 0.1746896207332611, |
|
"eval_mse": 0.17468963824495307, |
|
"eval_runtime": 29.4701, |
|
"eval_samples_per_second": 678.654, |
|
"eval_steps_per_second": 84.832, |
|
"num_input_tokens_seen": 30719616, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.333294153213501, |
|
"learning_rate": 3.4750500000000006e-05, |
|
"loss": 0.0562, |
|
"num_input_tokens_seen": 31231616, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.8254738450050354, |
|
"learning_rate": 3.45005e-05, |
|
"loss": 0.053, |
|
"num_input_tokens_seen": 31743616, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.7611359357833862, |
|
"learning_rate": 3.42505e-05, |
|
"loss": 0.0533, |
|
"num_input_tokens_seen": 32255616, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.1055493354797363, |
|
"learning_rate": 3.40005e-05, |
|
"loss": 0.0557, |
|
"num_input_tokens_seen": 32767616, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.6912920475006104, |
|
"learning_rate": 3.37505e-05, |
|
"loss": 0.0557, |
|
"num_input_tokens_seen": 33279616, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 2.5604867935180664, |
|
"learning_rate": 3.35005e-05, |
|
"loss": 0.0619, |
|
"num_input_tokens_seen": 33791616, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.7852438688278198, |
|
"learning_rate": 3.32505e-05, |
|
"loss": 0.0564, |
|
"num_input_tokens_seen": 34303616, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.84659481048584, |
|
"learning_rate": 3.3000500000000004e-05, |
|
"loss": 0.0563, |
|
"num_input_tokens_seen": 34815616, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 2.0315301418304443, |
|
"learning_rate": 3.27505e-05, |
|
"loss": 0.054, |
|
"num_input_tokens_seen": 35327616, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.9043070077896118, |
|
"learning_rate": 3.25005e-05, |
|
"loss": 0.0583, |
|
"num_input_tokens_seen": 35839616, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 1.7389405965805054, |
|
"learning_rate": 3.22505e-05, |
|
"loss": 0.0544, |
|
"num_input_tokens_seen": 36351616, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.8132746815681458, |
|
"learning_rate": 3.20005e-05, |
|
"loss": 0.055, |
|
"num_input_tokens_seen": 36863616, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 1.8100671768188477, |
|
"learning_rate": 3.17505e-05, |
|
"loss": 0.0558, |
|
"num_input_tokens_seen": 37375616, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 10.433902740478516, |
|
"learning_rate": 3.15005e-05, |
|
"loss": 0.0568, |
|
"num_input_tokens_seen": 37887616, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.7512624263763428, |
|
"learning_rate": 3.12505e-05, |
|
"loss": 0.0541, |
|
"num_input_tokens_seen": 38399616, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.3957535028457642, |
|
"learning_rate": 3.1000499999999996e-05, |
|
"loss": 0.0546, |
|
"num_input_tokens_seen": 38911616, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.069032073020935, |
|
"learning_rate": 3.0750499999999996e-05, |
|
"loss": 0.0547, |
|
"num_input_tokens_seen": 39423616, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 3.4046223163604736, |
|
"learning_rate": 3.0500500000000004e-05, |
|
"loss": 0.0567, |
|
"num_input_tokens_seen": 39935616, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 1.5711253881454468, |
|
"learning_rate": 3.0250500000000005e-05, |
|
"loss": 0.0571, |
|
"num_input_tokens_seen": 40447616, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.8568646907806396, |
|
"learning_rate": 3.0000500000000005e-05, |
|
"loss": 0.054, |
|
"num_input_tokens_seen": 40959488, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_combined_score": 0.1528494923779644, |
|
"eval_loss": 0.152849480509758, |
|
"eval_mse": 0.15284948934500966, |
|
"eval_runtime": 29.495, |
|
"eval_samples_per_second": 678.08, |
|
"eval_steps_per_second": 84.76, |
|
"num_input_tokens_seen": 40959488, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 1.1214642524719238, |
|
"learning_rate": 2.9750500000000003e-05, |
|
"loss": 0.0365, |
|
"num_input_tokens_seen": 41471488, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 2.6408936977386475, |
|
"learning_rate": 2.9500500000000003e-05, |
|
"loss": 0.0361, |
|
"num_input_tokens_seen": 41983488, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 1.0093015432357788, |
|
"learning_rate": 2.9250500000000004e-05, |
|
"loss": 0.0361, |
|
"num_input_tokens_seen": 42495488, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 2.0412521362304688, |
|
"learning_rate": 2.90005e-05, |
|
"loss": 0.0377, |
|
"num_input_tokens_seen": 43007488, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 2.0059244632720947, |
|
"learning_rate": 2.8750500000000002e-05, |
|
"loss": 0.0354, |
|
"num_input_tokens_seen": 43519488, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 3.214423179626465, |
|
"learning_rate": 2.8500500000000003e-05, |
|
"loss": 0.0373, |
|
"num_input_tokens_seen": 44031488, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 2.101541519165039, |
|
"learning_rate": 2.8250500000000003e-05, |
|
"loss": 0.0381, |
|
"num_input_tokens_seen": 44543488, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.8797721862792969, |
|
"learning_rate": 2.80005e-05, |
|
"loss": 0.0381, |
|
"num_input_tokens_seen": 45055488, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 2.0589728355407715, |
|
"learning_rate": 2.77505e-05, |
|
"loss": 0.036, |
|
"num_input_tokens_seen": 45567488, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 2.5758140087127686, |
|
"learning_rate": 2.7500500000000002e-05, |
|
"loss": 0.0372, |
|
"num_input_tokens_seen": 46079488, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 1.531252145767212, |
|
"learning_rate": 2.72505e-05, |
|
"loss": 0.0381, |
|
"num_input_tokens_seen": 46591488, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.053691029548645, |
|
"learning_rate": 2.70005e-05, |
|
"loss": 0.0396, |
|
"num_input_tokens_seen": 47103488, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 1.031100869178772, |
|
"learning_rate": 2.67505e-05, |
|
"loss": 0.0376, |
|
"num_input_tokens_seen": 47615488, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.8592771887779236, |
|
"learning_rate": 2.65005e-05, |
|
"loss": 0.0381, |
|
"num_input_tokens_seen": 48127488, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 3.529454231262207, |
|
"learning_rate": 2.62505e-05, |
|
"loss": 0.0406, |
|
"num_input_tokens_seen": 48639488, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.2595094442367554, |
|
"learning_rate": 2.60005e-05, |
|
"loss": 0.044, |
|
"num_input_tokens_seen": 49151488, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 1.0460163354873657, |
|
"learning_rate": 2.57505e-05, |
|
"loss": 0.0411, |
|
"num_input_tokens_seen": 49663488, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.7415432333946228, |
|
"learning_rate": 2.55005e-05, |
|
"loss": 0.0376, |
|
"num_input_tokens_seen": 50175488, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.9863350987434387, |
|
"learning_rate": 2.5250499999999998e-05, |
|
"loss": 0.039, |
|
"num_input_tokens_seen": 50687488, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 2.2840659618377686, |
|
"learning_rate": 2.50005e-05, |
|
"loss": 0.0372, |
|
"num_input_tokens_seen": 51199360, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_combined_score": 0.14802570643204935, |
|
"eval_loss": 0.14802570641040802, |
|
"eval_mse": 0.14802570645369068, |
|
"eval_runtime": 29.5199, |
|
"eval_samples_per_second": 677.508, |
|
"eval_steps_per_second": 84.689, |
|
"num_input_tokens_seen": 51199360, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 1.0202912092208862, |
|
"learning_rate": 2.4750500000000003e-05, |
|
"loss": 0.0253, |
|
"num_input_tokens_seen": 51711360, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 1.1298741102218628, |
|
"learning_rate": 2.45005e-05, |
|
"loss": 0.0269, |
|
"num_input_tokens_seen": 52223360, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"grad_norm": 1.2378206253051758, |
|
"learning_rate": 2.42505e-05, |
|
"loss": 0.0258, |
|
"num_input_tokens_seen": 52735360, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 1.6293431520462036, |
|
"learning_rate": 2.4000500000000002e-05, |
|
"loss": 0.0272, |
|
"num_input_tokens_seen": 53247360, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 3.9734299182891846, |
|
"learning_rate": 2.37505e-05, |
|
"loss": 0.0272, |
|
"num_input_tokens_seen": 53759360, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 0.6598159074783325, |
|
"learning_rate": 2.35005e-05, |
|
"loss": 0.0262, |
|
"num_input_tokens_seen": 54271360, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 0.6012576818466187, |
|
"learning_rate": 2.32505e-05, |
|
"loss": 0.027, |
|
"num_input_tokens_seen": 54783360, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 2.462887763977051, |
|
"learning_rate": 2.30005e-05, |
|
"loss": 0.0268, |
|
"num_input_tokens_seen": 55295360, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 2.0268304347991943, |
|
"learning_rate": 2.2750500000000002e-05, |
|
"loss": 0.0263, |
|
"num_input_tokens_seen": 55807360, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.8000567555427551, |
|
"learning_rate": 2.2500500000000003e-05, |
|
"loss": 0.0282, |
|
"num_input_tokens_seen": 56319360, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 1.5781893730163574, |
|
"learning_rate": 2.2250500000000003e-05, |
|
"loss": 0.0265, |
|
"num_input_tokens_seen": 56831360, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.2630614042282104, |
|
"learning_rate": 2.20005e-05, |
|
"loss": 0.0257, |
|
"num_input_tokens_seen": 57343360, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"grad_norm": 1.3778091669082642, |
|
"learning_rate": 2.17505e-05, |
|
"loss": 0.0271, |
|
"num_input_tokens_seen": 57855360, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 1.0909324884414673, |
|
"learning_rate": 2.1500500000000002e-05, |
|
"loss": 0.026, |
|
"num_input_tokens_seen": 58367360, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 3.5209500789642334, |
|
"learning_rate": 2.1250500000000003e-05, |
|
"loss": 0.0264, |
|
"num_input_tokens_seen": 58879360, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 1.4671865701675415, |
|
"learning_rate": 2.10005e-05, |
|
"loss": 0.0265, |
|
"num_input_tokens_seen": 59391360, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 5.85, |
|
"grad_norm": 1.6409125328063965, |
|
"learning_rate": 2.07505e-05, |
|
"loss": 0.0268, |
|
"num_input_tokens_seen": 59903360, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"grad_norm": 1.4418998956680298, |
|
"learning_rate": 2.05005e-05, |
|
"loss": 0.0262, |
|
"num_input_tokens_seen": 60415360, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 5.95, |
|
"grad_norm": 1.3441293239593506, |
|
"learning_rate": 2.02505e-05, |
|
"loss": 0.0257, |
|
"num_input_tokens_seen": 60927360, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.8778462409973145, |
|
"learning_rate": 2.00005e-05, |
|
"loss": 0.0263, |
|
"num_input_tokens_seen": 61439232, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_combined_score": 0.15236617343673117, |
|
"eval_loss": 0.15236616134643555, |
|
"eval_mse": 0.1523661706258656, |
|
"eval_runtime": 29.4149, |
|
"eval_samples_per_second": 679.926, |
|
"eval_steps_per_second": 84.991, |
|
"num_input_tokens_seen": 61439232, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 1.4335697889328003, |
|
"learning_rate": 1.97505e-05, |
|
"loss": 0.0181, |
|
"num_input_tokens_seen": 61951232, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 1.1381551027297974, |
|
"learning_rate": 1.95005e-05, |
|
"loss": 0.0197, |
|
"num_input_tokens_seen": 62463232, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 0.7046132683753967, |
|
"learning_rate": 1.92505e-05, |
|
"loss": 0.0186, |
|
"num_input_tokens_seen": 62975232, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 1.008306860923767, |
|
"learning_rate": 1.9000500000000002e-05, |
|
"loss": 0.0192, |
|
"num_input_tokens_seen": 63487232, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 2.0765221118927, |
|
"learning_rate": 1.8750500000000003e-05, |
|
"loss": 0.0185, |
|
"num_input_tokens_seen": 63999232, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 1.2361551523208618, |
|
"learning_rate": 1.85005e-05, |
|
"loss": 0.0179, |
|
"num_input_tokens_seen": 64511232, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.7231354117393494, |
|
"learning_rate": 1.82505e-05, |
|
"loss": 0.0194, |
|
"num_input_tokens_seen": 65023232, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.779230535030365, |
|
"learning_rate": 1.80005e-05, |
|
"loss": 0.0198, |
|
"num_input_tokens_seen": 65535232, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.7320069074630737, |
|
"learning_rate": 1.77505e-05, |
|
"loss": 0.0187, |
|
"num_input_tokens_seen": 66047232, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.8597579598426819, |
|
"learning_rate": 1.75005e-05, |
|
"loss": 0.0191, |
|
"num_input_tokens_seen": 66559232, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 1.4109529256820679, |
|
"learning_rate": 1.72505e-05, |
|
"loss": 0.0192, |
|
"num_input_tokens_seen": 67071232, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 1.4900848865509033, |
|
"learning_rate": 1.70005e-05, |
|
"loss": 0.0173, |
|
"num_input_tokens_seen": 67583232, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"grad_norm": 1.3828743696212769, |
|
"learning_rate": 1.6750499999999998e-05, |
|
"loss": 0.0176, |
|
"num_input_tokens_seen": 68095232, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"grad_norm": 0.6733376383781433, |
|
"learning_rate": 1.6500500000000002e-05, |
|
"loss": 0.019, |
|
"num_input_tokens_seen": 68607232, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.4570697546005249, |
|
"learning_rate": 1.6250500000000003e-05, |
|
"loss": 0.0181, |
|
"num_input_tokens_seen": 69119232, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.9463149309158325, |
|
"learning_rate": 1.60005e-05, |
|
"loss": 0.0174, |
|
"num_input_tokens_seen": 69631232, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"grad_norm": 0.9304377436637878, |
|
"learning_rate": 1.57505e-05, |
|
"loss": 0.0185, |
|
"num_input_tokens_seen": 70143232, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"grad_norm": 0.8526313304901123, |
|
"learning_rate": 1.5500500000000002e-05, |
|
"loss": 0.0185, |
|
"num_input_tokens_seen": 70655232, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"grad_norm": 1.6793274879455566, |
|
"learning_rate": 1.52505e-05, |
|
"loss": 0.0194, |
|
"num_input_tokens_seen": 71167232, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 1.2873644828796387, |
|
"learning_rate": 1.5000500000000001e-05, |
|
"loss": 0.0203, |
|
"num_input_tokens_seen": 71679104, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_combined_score": 0.1494929350818927, |
|
"eval_loss": 0.14949294924736023, |
|
"eval_mse": 0.14949293581758635, |
|
"eval_runtime": 29.5374, |
|
"eval_samples_per_second": 677.107, |
|
"eval_steps_per_second": 84.638, |
|
"num_input_tokens_seen": 71679104, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 0.8493014574050903, |
|
"learning_rate": 1.47505e-05, |
|
"loss": 0.014, |
|
"num_input_tokens_seen": 72191104, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.6162556409835815, |
|
"learning_rate": 1.45005e-05, |
|
"loss": 0.0145, |
|
"num_input_tokens_seen": 72703104, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.6198768019676208, |
|
"learning_rate": 1.42505e-05, |
|
"loss": 0.0135, |
|
"num_input_tokens_seen": 73215104, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.6122292876243591, |
|
"learning_rate": 1.40005e-05, |
|
"loss": 0.0138, |
|
"num_input_tokens_seen": 73727104, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.8132468461990356, |
|
"learning_rate": 1.37505e-05, |
|
"loss": 0.0136, |
|
"num_input_tokens_seen": 74239104, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"grad_norm": 0.791746973991394, |
|
"learning_rate": 1.3500499999999999e-05, |
|
"loss": 0.0136, |
|
"num_input_tokens_seen": 74751104, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"grad_norm": 1.6126739978790283, |
|
"learning_rate": 1.3250500000000001e-05, |
|
"loss": 0.0139, |
|
"num_input_tokens_seen": 75263104, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 1.348046898841858, |
|
"learning_rate": 1.3000500000000002e-05, |
|
"loss": 0.0149, |
|
"num_input_tokens_seen": 75775104, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"grad_norm": 1.5154032707214355, |
|
"learning_rate": 1.2750500000000001e-05, |
|
"loss": 0.0133, |
|
"num_input_tokens_seen": 76287104, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 1.3086836338043213, |
|
"learning_rate": 1.2500500000000002e-05, |
|
"loss": 0.0134, |
|
"num_input_tokens_seen": 76799104, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 1.3077424764633179, |
|
"learning_rate": 1.22505e-05, |
|
"loss": 0.0134, |
|
"num_input_tokens_seen": 77311104, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 1.377185344696045, |
|
"learning_rate": 1.2000500000000001e-05, |
|
"loss": 0.0128, |
|
"num_input_tokens_seen": 77823104, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"grad_norm": 1.2250688076019287, |
|
"learning_rate": 1.17505e-05, |
|
"loss": 0.0146, |
|
"num_input_tokens_seen": 78335104, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"grad_norm": 0.8044687509536743, |
|
"learning_rate": 1.15005e-05, |
|
"loss": 0.0132, |
|
"num_input_tokens_seen": 78847104, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.8126741647720337, |
|
"learning_rate": 1.12505e-05, |
|
"loss": 0.0134, |
|
"num_input_tokens_seen": 79359104, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.6075248122215271, |
|
"learning_rate": 1.10005e-05, |
|
"loss": 0.0131, |
|
"num_input_tokens_seen": 79871104, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 1.874189853668213, |
|
"learning_rate": 1.0750500000000002e-05, |
|
"loss": 0.0134, |
|
"num_input_tokens_seen": 80383104, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 0.5488854646682739, |
|
"learning_rate": 1.05005e-05, |
|
"loss": 0.0137, |
|
"num_input_tokens_seen": 80895104, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 7.95, |
|
"grad_norm": 1.5739060640335083, |
|
"learning_rate": 1.0250500000000001e-05, |
|
"loss": 0.0131, |
|
"num_input_tokens_seen": 81407104, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.897755742073059, |
|
"learning_rate": 1.00005e-05, |
|
"loss": 0.0135, |
|
"num_input_tokens_seen": 81918976, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_combined_score": 0.1482119562218898, |
|
"eval_loss": 0.14821195602416992, |
|
"eval_mse": 0.14821195641960966, |
|
"eval_runtime": 29.5069, |
|
"eval_samples_per_second": 677.807, |
|
"eval_steps_per_second": 84.726, |
|
"num_input_tokens_seen": 81918976, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"grad_norm": 0.39859962463378906, |
|
"learning_rate": 9.7505e-06, |
|
"loss": 0.0107, |
|
"num_input_tokens_seen": 82430976, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 1.8892147541046143, |
|
"learning_rate": 9.500500000000002e-06, |
|
"loss": 0.01, |
|
"num_input_tokens_seen": 82942976, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 0.7789964079856873, |
|
"learning_rate": 9.2505e-06, |
|
"loss": 0.0111, |
|
"num_input_tokens_seen": 83454976, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 0.6423227787017822, |
|
"learning_rate": 9.000500000000001e-06, |
|
"loss": 0.011, |
|
"num_input_tokens_seen": 83966976, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.6862022876739502, |
|
"learning_rate": 8.7505e-06, |
|
"loss": 0.0105, |
|
"num_input_tokens_seen": 84478976, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 8.3, |
|
"grad_norm": 0.6521459817886353, |
|
"learning_rate": 8.5005e-06, |
|
"loss": 0.011, |
|
"num_input_tokens_seen": 84990976, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"grad_norm": 1.0782101154327393, |
|
"learning_rate": 8.2505e-06, |
|
"loss": 0.01, |
|
"num_input_tokens_seen": 85502976, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.32573211193084717, |
|
"learning_rate": 8.0005e-06, |
|
"loss": 0.0102, |
|
"num_input_tokens_seen": 86014976, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"grad_norm": 0.4790741205215454, |
|
"learning_rate": 7.750500000000001e-06, |
|
"loss": 0.0097, |
|
"num_input_tokens_seen": 86526976, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 5.938267230987549, |
|
"learning_rate": 7.5005000000000004e-06, |
|
"loss": 0.0099, |
|
"num_input_tokens_seen": 87038976, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.3625955283641815, |
|
"learning_rate": 7.2505e-06, |
|
"loss": 0.0101, |
|
"num_input_tokens_seen": 87550976, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 1.664149522781372, |
|
"learning_rate": 7.0005e-06, |
|
"loss": 0.0103, |
|
"num_input_tokens_seen": 88062976, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"grad_norm": 0.35580164194107056, |
|
"learning_rate": 6.7505e-06, |
|
"loss": 0.0097, |
|
"num_input_tokens_seen": 88574976, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"grad_norm": 0.814786434173584, |
|
"learning_rate": 6.5005e-06, |
|
"loss": 0.0099, |
|
"num_input_tokens_seen": 89086976, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.479640930891037, |
|
"learning_rate": 6.2505000000000005e-06, |
|
"loss": 0.0101, |
|
"num_input_tokens_seen": 89598976, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 0.4606671929359436, |
|
"learning_rate": 6.0005e-06, |
|
"loss": 0.0094, |
|
"num_input_tokens_seen": 90110976, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 8.85, |
|
"grad_norm": 2.0643467903137207, |
|
"learning_rate": 5.7505e-06, |
|
"loss": 0.0099, |
|
"num_input_tokens_seen": 90622976, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"grad_norm": 0.6785427331924438, |
|
"learning_rate": 5.5005e-06, |
|
"loss": 0.0103, |
|
"num_input_tokens_seen": 91134976, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 8.95, |
|
"grad_norm": 0.6333959102630615, |
|
"learning_rate": 5.250500000000001e-06, |
|
"loss": 0.01, |
|
"num_input_tokens_seen": 91646976, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.8463544249534607, |
|
"learning_rate": 5.000500000000001e-06, |
|
"loss": 0.0098, |
|
"num_input_tokens_seen": 92158848, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_combined_score": 0.14495953552467267, |
|
"eval_loss": 0.14495953917503357, |
|
"eval_mse": 0.1449595318743118, |
|
"eval_runtime": 29.5073, |
|
"eval_samples_per_second": 677.799, |
|
"eval_steps_per_second": 84.725, |
|
"num_input_tokens_seen": 92158848, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"grad_norm": 0.39637425541877747, |
|
"learning_rate": 4.7505000000000005e-06, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 92670848, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 0.7424957752227783, |
|
"learning_rate": 4.5005e-06, |
|
"loss": 0.0085, |
|
"num_input_tokens_seen": 93182848, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"grad_norm": 0.8151483535766602, |
|
"learning_rate": 4.2505e-06, |
|
"loss": 0.008, |
|
"num_input_tokens_seen": 93694848, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 1.604078769683838, |
|
"learning_rate": 4.0005e-06, |
|
"loss": 0.0086, |
|
"num_input_tokens_seen": 94206848, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.42909368872642517, |
|
"learning_rate": 3.7505e-06, |
|
"loss": 0.0084, |
|
"num_input_tokens_seen": 94718848, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 0.6759423017501831, |
|
"learning_rate": 3.5005e-06, |
|
"loss": 0.0077, |
|
"num_input_tokens_seen": 95230848, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"grad_norm": 0.5954917669296265, |
|
"learning_rate": 3.2505e-06, |
|
"loss": 0.0081, |
|
"num_input_tokens_seen": 95742848, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 0.6435306072235107, |
|
"learning_rate": 3.0005000000000003e-06, |
|
"loss": 0.0079, |
|
"num_input_tokens_seen": 96254848, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.45, |
|
"grad_norm": 0.8906601071357727, |
|
"learning_rate": 2.7505e-06, |
|
"loss": 0.008, |
|
"num_input_tokens_seen": 96766848, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 1.4101794958114624, |
|
"learning_rate": 2.5005e-06, |
|
"loss": 0.0075, |
|
"num_input_tokens_seen": 97278848, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 0.7406792044639587, |
|
"learning_rate": 2.2505000000000003e-06, |
|
"loss": 0.0078, |
|
"num_input_tokens_seen": 97790848, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 1.437361240386963, |
|
"learning_rate": 2.0004999999999997e-06, |
|
"loss": 0.0077, |
|
"num_input_tokens_seen": 98302848, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 0.4781911373138428, |
|
"learning_rate": 1.7505e-06, |
|
"loss": 0.0078, |
|
"num_input_tokens_seen": 98814848, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"grad_norm": 0.5876700282096863, |
|
"learning_rate": 1.5005e-06, |
|
"loss": 0.0075, |
|
"num_input_tokens_seen": 99326848, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 0.933368980884552, |
|
"learning_rate": 1.2505000000000001e-06, |
|
"loss": 0.008, |
|
"num_input_tokens_seen": 99838848, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 0.7791544198989868, |
|
"learning_rate": 1.0005e-06, |
|
"loss": 0.0075, |
|
"num_input_tokens_seen": 100350848, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 9.85, |
|
"grad_norm": 0.45317134261131287, |
|
"learning_rate": 7.505000000000001e-07, |
|
"loss": 0.0078, |
|
"num_input_tokens_seen": 100862848, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"grad_norm": 1.5439448356628418, |
|
"learning_rate": 5.005e-07, |
|
"loss": 0.0074, |
|
"num_input_tokens_seen": 101374848, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"grad_norm": 0.5587248206138611, |
|
"learning_rate": 2.5049999999999997e-07, |
|
"loss": 0.0079, |
|
"num_input_tokens_seen": 101886848, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.6633381247520447, |
|
"learning_rate": 5e-10, |
|
"loss": 0.0073, |
|
"num_input_tokens_seen": 102398720, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_combined_score": 0.14527450438803524, |
|
"eval_loss": 0.14527450501918793, |
|
"eval_mse": 0.14527450375688256, |
|
"eval_runtime": 29.5752, |
|
"eval_samples_per_second": 676.241, |
|
"eval_steps_per_second": 84.53, |
|
"num_input_tokens_seen": 102398720, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"num_input_tokens_seen": 102398720, |
|
"step": 100000, |
|
"total_flos": 5.262202453327104e+16, |
|
"train_loss": 0.056572345192432406, |
|
"train_runtime": 7202.8043, |
|
"train_samples_per_second": 111.066, |
|
"train_steps_per_second": 13.883, |
|
"train_tokens_per_second": 14216.507 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 102398720, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.262202453327104e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|