{
"best_global_step": 20000,
"best_metric": 0.14082255959510803,
"best_model_checkpoint": "/media/user/Expansion1/deberta-v3-base-zyda-2-v2-text-quality-v3/checkpoint-20000",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"grad_norm": 11.949178695678711,
"learning_rate": 4.97505e-05,
"loss": 0.3835,
"num_input_tokens_seen": 512000,
"step": 500
},
{
"epoch": 0.1,
"grad_norm": 3.623898506164551,
"learning_rate": 4.95005e-05,
"loss": 0.2484,
"num_input_tokens_seen": 1024000,
"step": 1000
},
{
"epoch": 0.15,
"grad_norm": 3.0655770301818848,
"learning_rate": 4.9250500000000006e-05,
"loss": 0.2332,
"num_input_tokens_seen": 1536000,
"step": 1500
},
{
"epoch": 0.2,
"grad_norm": 8.194499015808105,
"learning_rate": 4.9000500000000006e-05,
"loss": 0.2097,
"num_input_tokens_seen": 2048000,
"step": 2000
},
{
"epoch": 0.25,
"grad_norm": 1.8507510423660278,
"learning_rate": 4.875050000000001e-05,
"loss": 0.1988,
"num_input_tokens_seen": 2560000,
"step": 2500
},
{
"epoch": 0.3,
"grad_norm": 2.8679802417755127,
"learning_rate": 4.85005e-05,
"loss": 0.1957,
"num_input_tokens_seen": 3072000,
"step": 3000
},
{
"epoch": 0.35,
"grad_norm": 3.2234883308410645,
"learning_rate": 4.82505e-05,
"loss": 0.1793,
"num_input_tokens_seen": 3584000,
"step": 3500
},
{
"epoch": 0.4,
"grad_norm": 4.360517978668213,
"learning_rate": 4.80005e-05,
"loss": 0.1816,
"num_input_tokens_seen": 4096000,
"step": 4000
},
{
"epoch": 0.45,
"grad_norm": 5.652502536773682,
"learning_rate": 4.77505e-05,
"loss": 0.1855,
"num_input_tokens_seen": 4608000,
"step": 4500
},
{
"epoch": 0.5,
"grad_norm": 3.757875919342041,
"learning_rate": 4.7500500000000004e-05,
"loss": 0.1751,
"num_input_tokens_seen": 5120000,
"step": 5000
},
{
"epoch": 0.55,
"grad_norm": 3.0092484951019287,
"learning_rate": 4.7250500000000004e-05,
"loss": 0.1785,
"num_input_tokens_seen": 5632000,
"step": 5500
},
{
"epoch": 0.6,
"grad_norm": 7.830347061157227,
"learning_rate": 4.7000500000000005e-05,
"loss": 0.1711,
"num_input_tokens_seen": 6144000,
"step": 6000
},
{
"epoch": 0.65,
"grad_norm": 2.926468849182129,
"learning_rate": 4.6750500000000006e-05,
"loss": 0.168,
"num_input_tokens_seen": 6656000,
"step": 6500
},
{
"epoch": 0.7,
"grad_norm": 3.43612003326416,
"learning_rate": 4.65005e-05,
"loss": 0.1772,
"num_input_tokens_seen": 7168000,
"step": 7000
},
{
"epoch": 0.75,
"grad_norm": 2.3997323513031006,
"learning_rate": 4.62505e-05,
"loss": 0.1632,
"num_input_tokens_seen": 7680000,
"step": 7500
},
{
"epoch": 0.8,
"grad_norm": 12.628423690795898,
"learning_rate": 4.60005e-05,
"loss": 0.1714,
"num_input_tokens_seen": 8192000,
"step": 8000
},
{
"epoch": 0.85,
"grad_norm": 1.8220003843307495,
"learning_rate": 4.57505e-05,
"loss": 0.1613,
"num_input_tokens_seen": 8704000,
"step": 8500
},
{
"epoch": 0.9,
"grad_norm": 2.2584903240203857,
"learning_rate": 4.55005e-05,
"loss": 0.1547,
"num_input_tokens_seen": 9216000,
"step": 9000
},
{
"epoch": 0.95,
"grad_norm": 1.5416566133499146,
"learning_rate": 4.52505e-05,
"loss": 0.1594,
"num_input_tokens_seen": 9728000,
"step": 9500
},
{
"epoch": 1.0,
"grad_norm": 2.472825288772583,
"learning_rate": 4.5000500000000004e-05,
"loss": 0.1635,
"num_input_tokens_seen": 10239872,
"step": 10000
},
{
"epoch": 1.0,
"eval_combined_score": 0.18538867612314003,
"eval_loss": 0.18538866937160492,
"eval_mse": 0.18538868287467514,
"eval_runtime": 29.5714,
"eval_samples_per_second": 676.329,
"eval_steps_per_second": 84.541,
"num_input_tokens_seen": 10239872,
"step": 10000
},
{
"epoch": 1.05,
"grad_norm": 2.986963987350464,
"learning_rate": 4.47505e-05,
"loss": 0.1226,
"num_input_tokens_seen": 10751872,
"step": 10500
},
{
"epoch": 1.1,
"grad_norm": 0.7588199973106384,
"learning_rate": 4.45005e-05,
"loss": 0.1172,
"num_input_tokens_seen": 11263872,
"step": 11000
},
{
"epoch": 1.15,
"grad_norm": 0.8432678580284119,
"learning_rate": 4.42505e-05,
"loss": 0.1186,
"num_input_tokens_seen": 11775872,
"step": 11500
},
{
"epoch": 1.2,
"grad_norm": 12.563228607177734,
"learning_rate": 4.40005e-05,
"loss": 0.1139,
"num_input_tokens_seen": 12287872,
"step": 12000
},
{
"epoch": 1.25,
"grad_norm": 2.207587242126465,
"learning_rate": 4.37505e-05,
"loss": 0.121,
"num_input_tokens_seen": 12799872,
"step": 12500
},
{
"epoch": 1.3,
"grad_norm": 1.978637456893921,
"learning_rate": 4.35005e-05,
"loss": 0.1114,
"num_input_tokens_seen": 13311872,
"step": 13000
},
{
"epoch": 1.35,
"grad_norm": 5.6478729248046875,
"learning_rate": 4.32505e-05,
"loss": 0.1182,
"num_input_tokens_seen": 13823872,
"step": 13500
},
{
"epoch": 1.4,
"grad_norm": 3.0157413482666016,
"learning_rate": 4.30005e-05,
"loss": 0.1099,
"num_input_tokens_seen": 14335872,
"step": 14000
},
{
"epoch": 1.45,
"grad_norm": 2.2837512493133545,
"learning_rate": 4.2750500000000003e-05,
"loss": 0.1154,
"num_input_tokens_seen": 14847872,
"step": 14500
},
{
"epoch": 1.5,
"grad_norm": 2.124837875366211,
"learning_rate": 4.2500500000000004e-05,
"loss": 0.1163,
"num_input_tokens_seen": 15359872,
"step": 15000
},
{
"epoch": 1.55,
"grad_norm": 1.8782966136932373,
"learning_rate": 4.2250500000000005e-05,
"loss": 0.1167,
"num_input_tokens_seen": 15871872,
"step": 15500
},
{
"epoch": 1.6,
"grad_norm": 1.085688591003418,
"learning_rate": 4.2000500000000006e-05,
"loss": 0.1156,
"num_input_tokens_seen": 16383872,
"step": 16000
},
{
"epoch": 1.65,
"grad_norm": 1.9874955415725708,
"learning_rate": 4.1750500000000006e-05,
"loss": 0.1183,
"num_input_tokens_seen": 16895872,
"step": 16500
},
{
"epoch": 1.7,
"grad_norm": 2.6902706623077393,
"learning_rate": 4.15005e-05,
"loss": 0.1112,
"num_input_tokens_seen": 17407872,
"step": 17000
},
{
"epoch": 1.75,
"grad_norm": 3.0735440254211426,
"learning_rate": 4.12505e-05,
"loss": 0.1159,
"num_input_tokens_seen": 17919872,
"step": 17500
},
{
"epoch": 1.8,
"grad_norm": 2.936267614364624,
"learning_rate": 4.10005e-05,
"loss": 0.1187,
"num_input_tokens_seen": 18431872,
"step": 18000
},
{
"epoch": 1.85,
"grad_norm": 3.598895311355591,
"learning_rate": 4.07505e-05,
"loss": 0.1147,
"num_input_tokens_seen": 18943872,
"step": 18500
},
{
"epoch": 1.9,
"grad_norm": 3.655381917953491,
"learning_rate": 4.05005e-05,
"loss": 0.1387,
"num_input_tokens_seen": 19455872,
"step": 19000
},
{
"epoch": 1.95,
"grad_norm": 9.855778694152832,
"learning_rate": 4.0250500000000004e-05,
"loss": 0.1238,
"num_input_tokens_seen": 19967872,
"step": 19500
},
{
"epoch": 2.0,
"grad_norm": 2.558746337890625,
"learning_rate": 4.0000500000000004e-05,
"loss": 0.1241,
"num_input_tokens_seen": 20479744,
"step": 20000
},
{
"epoch": 2.0,
"eval_combined_score": 0.14082256163832602,
"eval_loss": 0.14082255959510803,
"eval_mse": 0.14082256368154403,
"eval_runtime": 30.1283,
"eval_samples_per_second": 663.828,
"eval_steps_per_second": 82.979,
"num_input_tokens_seen": 20479744,
"step": 20000
},
{
"epoch": 2.05,
"grad_norm": 1.3755764961242676,
"learning_rate": 3.97505e-05,
"loss": 0.0804,
"num_input_tokens_seen": 20991744,
"step": 20500
},
{
"epoch": 2.1,
"grad_norm": 3.242955207824707,
"learning_rate": 3.95005e-05,
"loss": 0.0795,
"num_input_tokens_seen": 21503744,
"step": 21000
},
{
"epoch": 2.15,
"grad_norm": 2.4045000076293945,
"learning_rate": 3.92505e-05,
"loss": 0.0814,
"num_input_tokens_seen": 22015744,
"step": 21500
},
{
"epoch": 2.2,
"grad_norm": 2.5508718490600586,
"learning_rate": 3.90005e-05,
"loss": 0.0848,
"num_input_tokens_seen": 22527744,
"step": 22000
},
{
"epoch": 2.25,
"grad_norm": 2.8529911041259766,
"learning_rate": 3.87505e-05,
"loss": 0.081,
"num_input_tokens_seen": 23039744,
"step": 22500
},
{
"epoch": 2.3,
"grad_norm": 10.657905578613281,
"learning_rate": 3.85005e-05,
"loss": 0.0786,
"num_input_tokens_seen": 23551744,
"step": 23000
},
{
"epoch": 2.35,
"grad_norm": 2.378411293029785,
"learning_rate": 3.82505e-05,
"loss": 0.0823,
"num_input_tokens_seen": 24063744,
"step": 23500
},
{
"epoch": 2.4,
"grad_norm": 2.6125261783599854,
"learning_rate": 3.80005e-05,
"loss": 0.0787,
"num_input_tokens_seen": 24575744,
"step": 24000
},
{
"epoch": 2.45,
"grad_norm": 1.3133174180984497,
"learning_rate": 3.77505e-05,
"loss": 0.0761,
"num_input_tokens_seen": 25087744,
"step": 24500
},
{
"epoch": 2.5,
"grad_norm": 3.3419981002807617,
"learning_rate": 3.75005e-05,
"loss": 0.0775,
"num_input_tokens_seen": 25599744,
"step": 25000
},
{
"epoch": 2.55,
"grad_norm": 2.1734654903411865,
"learning_rate": 3.72505e-05,
"loss": 0.0846,
"num_input_tokens_seen": 26111744,
"step": 25500
},
{
"epoch": 2.6,
"grad_norm": 3.2352869510650635,
"learning_rate": 3.70005e-05,
"loss": 0.0817,
"num_input_tokens_seen": 26623744,
"step": 26000
},
{
"epoch": 2.65,
"grad_norm": 3.37646746635437,
"learning_rate": 3.675050000000001e-05,
"loss": 0.0816,
"num_input_tokens_seen": 27135744,
"step": 26500
},
{
"epoch": 2.7,
"grad_norm": 2.5875842571258545,
"learning_rate": 3.650050000000001e-05,
"loss": 0.0843,
"num_input_tokens_seen": 27647744,
"step": 27000
},
{
"epoch": 2.75,
"grad_norm": 7.768916606903076,
"learning_rate": 3.62505e-05,
"loss": 0.089,
"num_input_tokens_seen": 28159744,
"step": 27500
},
{
"epoch": 2.8,
"grad_norm": 2.6333940029144287,
"learning_rate": 3.60005e-05,
"loss": 0.1209,
"num_input_tokens_seen": 28671744,
"step": 28000
},
{
"epoch": 2.85,
"grad_norm": 3.4022088050842285,
"learning_rate": 3.57505e-05,
"loss": 0.082,
"num_input_tokens_seen": 29183744,
"step": 28500
},
{
"epoch": 2.9,
"grad_norm": 1.5310307741165161,
"learning_rate": 3.5500500000000003e-05,
"loss": 0.0813,
"num_input_tokens_seen": 29695744,
"step": 29000
},
{
"epoch": 2.95,
"grad_norm": 3.3515617847442627,
"learning_rate": 3.5250500000000004e-05,
"loss": 0.0856,
"num_input_tokens_seen": 30207744,
"step": 29500
},
{
"epoch": 3.0,
"grad_norm": 1.5893547534942627,
"learning_rate": 3.5000500000000005e-05,
"loss": 0.0882,
"num_input_tokens_seen": 30719616,
"step": 30000
},
{
"epoch": 3.0,
"eval_combined_score": 0.1746896443902683,
"eval_loss": 0.1746896207332611,
"eval_mse": 0.17468963824495307,
"eval_runtime": 29.4701,
"eval_samples_per_second": 678.654,
"eval_steps_per_second": 84.832,
"num_input_tokens_seen": 30719616,
"step": 30000
},
{
"epoch": 3.05,
"grad_norm": 1.333294153213501,
"learning_rate": 3.4750500000000006e-05,
"loss": 0.0562,
"num_input_tokens_seen": 31231616,
"step": 30500
},
{
"epoch": 3.1,
"grad_norm": 0.8254738450050354,
"learning_rate": 3.45005e-05,
"loss": 0.053,
"num_input_tokens_seen": 31743616,
"step": 31000
},
{
"epoch": 3.15,
"grad_norm": 1.7611359357833862,
"learning_rate": 3.42505e-05,
"loss": 0.0533,
"num_input_tokens_seen": 32255616,
"step": 31500
},
{
"epoch": 3.2,
"grad_norm": 1.1055493354797363,
"learning_rate": 3.40005e-05,
"loss": 0.0557,
"num_input_tokens_seen": 32767616,
"step": 32000
},
{
"epoch": 3.25,
"grad_norm": 1.6912920475006104,
"learning_rate": 3.37505e-05,
"loss": 0.0557,
"num_input_tokens_seen": 33279616,
"step": 32500
},
{
"epoch": 3.3,
"grad_norm": 2.5604867935180664,
"learning_rate": 3.35005e-05,
"loss": 0.0619,
"num_input_tokens_seen": 33791616,
"step": 33000
},
{
"epoch": 3.35,
"grad_norm": 1.7852438688278198,
"learning_rate": 3.32505e-05,
"loss": 0.0564,
"num_input_tokens_seen": 34303616,
"step": 33500
},
{
"epoch": 3.4,
"grad_norm": 2.84659481048584,
"learning_rate": 3.3000500000000004e-05,
"loss": 0.0563,
"num_input_tokens_seen": 34815616,
"step": 34000
},
{
"epoch": 3.45,
"grad_norm": 2.0315301418304443,
"learning_rate": 3.27505e-05,
"loss": 0.054,
"num_input_tokens_seen": 35327616,
"step": 34500
},
{
"epoch": 3.5,
"grad_norm": 1.9043070077896118,
"learning_rate": 3.25005e-05,
"loss": 0.0583,
"num_input_tokens_seen": 35839616,
"step": 35000
},
{
"epoch": 3.55,
"grad_norm": 1.7389405965805054,
"learning_rate": 3.22505e-05,
"loss": 0.0544,
"num_input_tokens_seen": 36351616,
"step": 35500
},
{
"epoch": 3.6,
"grad_norm": 0.8132746815681458,
"learning_rate": 3.20005e-05,
"loss": 0.055,
"num_input_tokens_seen": 36863616,
"step": 36000
},
{
"epoch": 3.65,
"grad_norm": 1.8100671768188477,
"learning_rate": 3.17505e-05,
"loss": 0.0558,
"num_input_tokens_seen": 37375616,
"step": 36500
},
{
"epoch": 3.7,
"grad_norm": 10.433902740478516,
"learning_rate": 3.15005e-05,
"loss": 0.0568,
"num_input_tokens_seen": 37887616,
"step": 37000
},
{
"epoch": 3.75,
"grad_norm": 0.7512624263763428,
"learning_rate": 3.12505e-05,
"loss": 0.0541,
"num_input_tokens_seen": 38399616,
"step": 37500
},
{
"epoch": 3.8,
"grad_norm": 1.3957535028457642,
"learning_rate": 3.1000499999999996e-05,
"loss": 0.0546,
"num_input_tokens_seen": 38911616,
"step": 38000
},
{
"epoch": 3.85,
"grad_norm": 1.069032073020935,
"learning_rate": 3.0750499999999996e-05,
"loss": 0.0547,
"num_input_tokens_seen": 39423616,
"step": 38500
},
{
"epoch": 3.9,
"grad_norm": 3.4046223163604736,
"learning_rate": 3.0500500000000004e-05,
"loss": 0.0567,
"num_input_tokens_seen": 39935616,
"step": 39000
},
{
"epoch": 3.95,
"grad_norm": 1.5711253881454468,
"learning_rate": 3.0250500000000005e-05,
"loss": 0.0571,
"num_input_tokens_seen": 40447616,
"step": 39500
},
{
"epoch": 4.0,
"grad_norm": 2.8568646907806396,
"learning_rate": 3.0000500000000005e-05,
"loss": 0.054,
"num_input_tokens_seen": 40959488,
"step": 40000
},
{
"epoch": 4.0,
"eval_combined_score": 0.1528494923779644,
"eval_loss": 0.152849480509758,
"eval_mse": 0.15284948934500966,
"eval_runtime": 29.495,
"eval_samples_per_second": 678.08,
"eval_steps_per_second": 84.76,
"num_input_tokens_seen": 40959488,
"step": 40000
},
{
"epoch": 4.05,
"grad_norm": 1.1214642524719238,
"learning_rate": 2.9750500000000003e-05,
"loss": 0.0365,
"num_input_tokens_seen": 41471488,
"step": 40500
},
{
"epoch": 4.1,
"grad_norm": 2.6408936977386475,
"learning_rate": 2.9500500000000003e-05,
"loss": 0.0361,
"num_input_tokens_seen": 41983488,
"step": 41000
},
{
"epoch": 4.15,
"grad_norm": 1.0093015432357788,
"learning_rate": 2.9250500000000004e-05,
"loss": 0.0361,
"num_input_tokens_seen": 42495488,
"step": 41500
},
{
"epoch": 4.2,
"grad_norm": 2.0412521362304688,
"learning_rate": 2.90005e-05,
"loss": 0.0377,
"num_input_tokens_seen": 43007488,
"step": 42000
},
{
"epoch": 4.25,
"grad_norm": 2.0059244632720947,
"learning_rate": 2.8750500000000002e-05,
"loss": 0.0354,
"num_input_tokens_seen": 43519488,
"step": 42500
},
{
"epoch": 4.3,
"grad_norm": 3.214423179626465,
"learning_rate": 2.8500500000000003e-05,
"loss": 0.0373,
"num_input_tokens_seen": 44031488,
"step": 43000
},
{
"epoch": 4.35,
"grad_norm": 2.101541519165039,
"learning_rate": 2.8250500000000003e-05,
"loss": 0.0381,
"num_input_tokens_seen": 44543488,
"step": 43500
},
{
"epoch": 4.4,
"grad_norm": 0.8797721862792969,
"learning_rate": 2.80005e-05,
"loss": 0.0381,
"num_input_tokens_seen": 45055488,
"step": 44000
},
{
"epoch": 4.45,
"grad_norm": 2.0589728355407715,
"learning_rate": 2.77505e-05,
"loss": 0.036,
"num_input_tokens_seen": 45567488,
"step": 44500
},
{
"epoch": 4.5,
"grad_norm": 2.5758140087127686,
"learning_rate": 2.7500500000000002e-05,
"loss": 0.0372,
"num_input_tokens_seen": 46079488,
"step": 45000
},
{
"epoch": 4.55,
"grad_norm": 1.531252145767212,
"learning_rate": 2.72505e-05,
"loss": 0.0381,
"num_input_tokens_seen": 46591488,
"step": 45500
},
{
"epoch": 4.6,
"grad_norm": 1.053691029548645,
"learning_rate": 2.70005e-05,
"loss": 0.0396,
"num_input_tokens_seen": 47103488,
"step": 46000
},
{
"epoch": 4.65,
"grad_norm": 1.031100869178772,
"learning_rate": 2.67505e-05,
"loss": 0.0376,
"num_input_tokens_seen": 47615488,
"step": 46500
},
{
"epoch": 4.7,
"grad_norm": 0.8592771887779236,
"learning_rate": 2.65005e-05,
"loss": 0.0381,
"num_input_tokens_seen": 48127488,
"step": 47000
},
{
"epoch": 4.75,
"grad_norm": 3.529454231262207,
"learning_rate": 2.62505e-05,
"loss": 0.0406,
"num_input_tokens_seen": 48639488,
"step": 47500
},
{
"epoch": 4.8,
"grad_norm": 1.2595094442367554,
"learning_rate": 2.60005e-05,
"loss": 0.044,
"num_input_tokens_seen": 49151488,
"step": 48000
},
{
"epoch": 4.85,
"grad_norm": 1.0460163354873657,
"learning_rate": 2.57505e-05,
"loss": 0.0411,
"num_input_tokens_seen": 49663488,
"step": 48500
},
{
"epoch": 4.9,
"grad_norm": 0.7415432333946228,
"learning_rate": 2.55005e-05,
"loss": 0.0376,
"num_input_tokens_seen": 50175488,
"step": 49000
},
{
"epoch": 4.95,
"grad_norm": 0.9863350987434387,
"learning_rate": 2.5250499999999998e-05,
"loss": 0.039,
"num_input_tokens_seen": 50687488,
"step": 49500
},
{
"epoch": 5.0,
"grad_norm": 2.2840659618377686,
"learning_rate": 2.50005e-05,
"loss": 0.0372,
"num_input_tokens_seen": 51199360,
"step": 50000
},
{
"epoch": 5.0,
"eval_combined_score": 0.14802570643204935,
"eval_loss": 0.14802570641040802,
"eval_mse": 0.14802570645369068,
"eval_runtime": 29.5199,
"eval_samples_per_second": 677.508,
"eval_steps_per_second": 84.689,
"num_input_tokens_seen": 51199360,
"step": 50000
},
{
"epoch": 5.05,
"grad_norm": 1.0202912092208862,
"learning_rate": 2.4750500000000003e-05,
"loss": 0.0253,
"num_input_tokens_seen": 51711360,
"step": 50500
},
{
"epoch": 5.1,
"grad_norm": 1.1298741102218628,
"learning_rate": 2.45005e-05,
"loss": 0.0269,
"num_input_tokens_seen": 52223360,
"step": 51000
},
{
"epoch": 5.15,
"grad_norm": 1.2378206253051758,
"learning_rate": 2.42505e-05,
"loss": 0.0258,
"num_input_tokens_seen": 52735360,
"step": 51500
},
{
"epoch": 5.2,
"grad_norm": 1.6293431520462036,
"learning_rate": 2.4000500000000002e-05,
"loss": 0.0272,
"num_input_tokens_seen": 53247360,
"step": 52000
},
{
"epoch": 5.25,
"grad_norm": 3.9734299182891846,
"learning_rate": 2.37505e-05,
"loss": 0.0272,
"num_input_tokens_seen": 53759360,
"step": 52500
},
{
"epoch": 5.3,
"grad_norm": 0.6598159074783325,
"learning_rate": 2.35005e-05,
"loss": 0.0262,
"num_input_tokens_seen": 54271360,
"step": 53000
},
{
"epoch": 5.35,
"grad_norm": 0.6012576818466187,
"learning_rate": 2.32505e-05,
"loss": 0.027,
"num_input_tokens_seen": 54783360,
"step": 53500
},
{
"epoch": 5.4,
"grad_norm": 2.462887763977051,
"learning_rate": 2.30005e-05,
"loss": 0.0268,
"num_input_tokens_seen": 55295360,
"step": 54000
},
{
"epoch": 5.45,
"grad_norm": 2.0268304347991943,
"learning_rate": 2.2750500000000002e-05,
"loss": 0.0263,
"num_input_tokens_seen": 55807360,
"step": 54500
},
{
"epoch": 5.5,
"grad_norm": 0.8000567555427551,
"learning_rate": 2.2500500000000003e-05,
"loss": 0.0282,
"num_input_tokens_seen": 56319360,
"step": 55000
},
{
"epoch": 5.55,
"grad_norm": 1.5781893730163574,
"learning_rate": 2.2250500000000003e-05,
"loss": 0.0265,
"num_input_tokens_seen": 56831360,
"step": 55500
},
{
"epoch": 5.6,
"grad_norm": 1.2630614042282104,
"learning_rate": 2.20005e-05,
"loss": 0.0257,
"num_input_tokens_seen": 57343360,
"step": 56000
},
{
"epoch": 5.65,
"grad_norm": 1.3778091669082642,
"learning_rate": 2.17505e-05,
"loss": 0.0271,
"num_input_tokens_seen": 57855360,
"step": 56500
},
{
"epoch": 5.7,
"grad_norm": 1.0909324884414673,
"learning_rate": 2.1500500000000002e-05,
"loss": 0.026,
"num_input_tokens_seen": 58367360,
"step": 57000
},
{
"epoch": 5.75,
"grad_norm": 3.5209500789642334,
"learning_rate": 2.1250500000000003e-05,
"loss": 0.0264,
"num_input_tokens_seen": 58879360,
"step": 57500
},
{
"epoch": 5.8,
"grad_norm": 1.4671865701675415,
"learning_rate": 2.10005e-05,
"loss": 0.0265,
"num_input_tokens_seen": 59391360,
"step": 58000
},
{
"epoch": 5.85,
"grad_norm": 1.6409125328063965,
"learning_rate": 2.07505e-05,
"loss": 0.0268,
"num_input_tokens_seen": 59903360,
"step": 58500
},
{
"epoch": 5.9,
"grad_norm": 1.4418998956680298,
"learning_rate": 2.05005e-05,
"loss": 0.0262,
"num_input_tokens_seen": 60415360,
"step": 59000
},
{
"epoch": 5.95,
"grad_norm": 1.3441293239593506,
"learning_rate": 2.02505e-05,
"loss": 0.0257,
"num_input_tokens_seen": 60927360,
"step": 59500
},
{
"epoch": 6.0,
"grad_norm": 0.8778462409973145,
"learning_rate": 2.00005e-05,
"loss": 0.0263,
"num_input_tokens_seen": 61439232,
"step": 60000
},
{
"epoch": 6.0,
"eval_combined_score": 0.15236617343673117,
"eval_loss": 0.15236616134643555,
"eval_mse": 0.1523661706258656,
"eval_runtime": 29.4149,
"eval_samples_per_second": 679.926,
"eval_steps_per_second": 84.991,
"num_input_tokens_seen": 61439232,
"step": 60000
},
{
"epoch": 6.05,
"grad_norm": 1.4335697889328003,
"learning_rate": 1.97505e-05,
"loss": 0.0181,
"num_input_tokens_seen": 61951232,
"step": 60500
},
{
"epoch": 6.1,
"grad_norm": 1.1381551027297974,
"learning_rate": 1.95005e-05,
"loss": 0.0197,
"num_input_tokens_seen": 62463232,
"step": 61000
},
{
"epoch": 6.15,
"grad_norm": 0.7046132683753967,
"learning_rate": 1.92505e-05,
"loss": 0.0186,
"num_input_tokens_seen": 62975232,
"step": 61500
},
{
"epoch": 6.2,
"grad_norm": 1.008306860923767,
"learning_rate": 1.9000500000000002e-05,
"loss": 0.0192,
"num_input_tokens_seen": 63487232,
"step": 62000
},
{
"epoch": 6.25,
"grad_norm": 2.0765221118927,
"learning_rate": 1.8750500000000003e-05,
"loss": 0.0185,
"num_input_tokens_seen": 63999232,
"step": 62500
},
{
"epoch": 6.3,
"grad_norm": 1.2361551523208618,
"learning_rate": 1.85005e-05,
"loss": 0.0179,
"num_input_tokens_seen": 64511232,
"step": 63000
},
{
"epoch": 6.35,
"grad_norm": 0.7231354117393494,
"learning_rate": 1.82505e-05,
"loss": 0.0194,
"num_input_tokens_seen": 65023232,
"step": 63500
},
{
"epoch": 6.4,
"grad_norm": 0.779230535030365,
"learning_rate": 1.80005e-05,
"loss": 0.0198,
"num_input_tokens_seen": 65535232,
"step": 64000
},
{
"epoch": 6.45,
"grad_norm": 0.7320069074630737,
"learning_rate": 1.77505e-05,
"loss": 0.0187,
"num_input_tokens_seen": 66047232,
"step": 64500
},
{
"epoch": 6.5,
"grad_norm": 0.8597579598426819,
"learning_rate": 1.75005e-05,
"loss": 0.0191,
"num_input_tokens_seen": 66559232,
"step": 65000
},
{
"epoch": 6.55,
"grad_norm": 1.4109529256820679,
"learning_rate": 1.72505e-05,
"loss": 0.0192,
"num_input_tokens_seen": 67071232,
"step": 65500
},
{
"epoch": 6.6,
"grad_norm": 1.4900848865509033,
"learning_rate": 1.70005e-05,
"loss": 0.0173,
"num_input_tokens_seen": 67583232,
"step": 66000
},
{
"epoch": 6.65,
"grad_norm": 1.3828743696212769,
"learning_rate": 1.6750499999999998e-05,
"loss": 0.0176,
"num_input_tokens_seen": 68095232,
"step": 66500
},
{
"epoch": 6.7,
"grad_norm": 0.6733376383781433,
"learning_rate": 1.6500500000000002e-05,
"loss": 0.019,
"num_input_tokens_seen": 68607232,
"step": 67000
},
{
"epoch": 6.75,
"grad_norm": 0.4570697546005249,
"learning_rate": 1.6250500000000003e-05,
"loss": 0.0181,
"num_input_tokens_seen": 69119232,
"step": 67500
},
{
"epoch": 6.8,
"grad_norm": 0.9463149309158325,
"learning_rate": 1.60005e-05,
"loss": 0.0174,
"num_input_tokens_seen": 69631232,
"step": 68000
},
{
"epoch": 6.85,
"grad_norm": 0.9304377436637878,
"learning_rate": 1.57505e-05,
"loss": 0.0185,
"num_input_tokens_seen": 70143232,
"step": 68500
},
{
"epoch": 6.9,
"grad_norm": 0.8526313304901123,
"learning_rate": 1.5500500000000002e-05,
"loss": 0.0185,
"num_input_tokens_seen": 70655232,
"step": 69000
},
{
"epoch": 6.95,
"grad_norm": 1.6793274879455566,
"learning_rate": 1.52505e-05,
"loss": 0.0194,
"num_input_tokens_seen": 71167232,
"step": 69500
},
{
"epoch": 7.0,
"grad_norm": 1.2873644828796387,
"learning_rate": 1.5000500000000001e-05,
"loss": 0.0203,
"num_input_tokens_seen": 71679104,
"step": 70000
},
{
"epoch": 7.0,
"eval_combined_score": 0.1494929350818927,
"eval_loss": 0.14949294924736023,
"eval_mse": 0.14949293581758635,
"eval_runtime": 29.5374,
"eval_samples_per_second": 677.107,
"eval_steps_per_second": 84.638,
"num_input_tokens_seen": 71679104,
"step": 70000
},
{
"epoch": 7.05,
"grad_norm": 0.8493014574050903,
"learning_rate": 1.47505e-05,
"loss": 0.014,
"num_input_tokens_seen": 72191104,
"step": 70500
},
{
"epoch": 7.1,
"grad_norm": 0.6162556409835815,
"learning_rate": 1.45005e-05,
"loss": 0.0145,
"num_input_tokens_seen": 72703104,
"step": 71000
},
{
"epoch": 7.15,
"grad_norm": 0.6198768019676208,
"learning_rate": 1.42505e-05,
"loss": 0.0135,
"num_input_tokens_seen": 73215104,
"step": 71500
},
{
"epoch": 7.2,
"grad_norm": 0.6122292876243591,
"learning_rate": 1.40005e-05,
"loss": 0.0138,
"num_input_tokens_seen": 73727104,
"step": 72000
},
{
"epoch": 7.25,
"grad_norm": 0.8132468461990356,
"learning_rate": 1.37505e-05,
"loss": 0.0136,
"num_input_tokens_seen": 74239104,
"step": 72500
},
{
"epoch": 7.3,
"grad_norm": 0.791746973991394,
"learning_rate": 1.3500499999999999e-05,
"loss": 0.0136,
"num_input_tokens_seen": 74751104,
"step": 73000
},
{
"epoch": 7.35,
"grad_norm": 1.6126739978790283,
"learning_rate": 1.3250500000000001e-05,
"loss": 0.0139,
"num_input_tokens_seen": 75263104,
"step": 73500
},
{
"epoch": 7.4,
"grad_norm": 1.348046898841858,
"learning_rate": 1.3000500000000002e-05,
"loss": 0.0149,
"num_input_tokens_seen": 75775104,
"step": 74000
},
{
"epoch": 7.45,
"grad_norm": 1.5154032707214355,
"learning_rate": 1.2750500000000001e-05,
"loss": 0.0133,
"num_input_tokens_seen": 76287104,
"step": 74500
},
{
"epoch": 7.5,
"grad_norm": 1.3086836338043213,
"learning_rate": 1.2500500000000002e-05,
"loss": 0.0134,
"num_input_tokens_seen": 76799104,
"step": 75000
},
{
"epoch": 7.55,
"grad_norm": 1.3077424764633179,
"learning_rate": 1.22505e-05,
"loss": 0.0134,
"num_input_tokens_seen": 77311104,
"step": 75500
},
{
"epoch": 7.6,
"grad_norm": 1.377185344696045,
"learning_rate": 1.2000500000000001e-05,
"loss": 0.0128,
"num_input_tokens_seen": 77823104,
"step": 76000
},
{
"epoch": 7.65,
"grad_norm": 1.2250688076019287,
"learning_rate": 1.17505e-05,
"loss": 0.0146,
"num_input_tokens_seen": 78335104,
"step": 76500
},
{
"epoch": 7.7,
"grad_norm": 0.8044687509536743,
"learning_rate": 1.15005e-05,
"loss": 0.0132,
"num_input_tokens_seen": 78847104,
"step": 77000
},
{
"epoch": 7.75,
"grad_norm": 0.8126741647720337,
"learning_rate": 1.12505e-05,
"loss": 0.0134,
"num_input_tokens_seen": 79359104,
"step": 77500
},
{
"epoch": 7.8,
"grad_norm": 0.6075248122215271,
"learning_rate": 1.10005e-05,
"loss": 0.0131,
"num_input_tokens_seen": 79871104,
"step": 78000
},
{
"epoch": 7.85,
"grad_norm": 1.874189853668213,
"learning_rate": 1.0750500000000002e-05,
"loss": 0.0134,
"num_input_tokens_seen": 80383104,
"step": 78500
},
{
"epoch": 7.9,
"grad_norm": 0.5488854646682739,
"learning_rate": 1.05005e-05,
"loss": 0.0137,
"num_input_tokens_seen": 80895104,
"step": 79000
},
{
"epoch": 7.95,
"grad_norm": 1.5739060640335083,
"learning_rate": 1.0250500000000001e-05,
"loss": 0.0131,
"num_input_tokens_seen": 81407104,
"step": 79500
},
{
"epoch": 8.0,
"grad_norm": 1.897755742073059,
"learning_rate": 1.00005e-05,
"loss": 0.0135,
"num_input_tokens_seen": 81918976,
"step": 80000
},
{
"epoch": 8.0,
"eval_combined_score": 0.1482119562218898,
"eval_loss": 0.14821195602416992,
"eval_mse": 0.14821195641960966,
"eval_runtime": 29.5069,
"eval_samples_per_second": 677.807,
"eval_steps_per_second": 84.726,
"num_input_tokens_seen": 81918976,
"step": 80000
},
{
"epoch": 8.05,
"grad_norm": 0.39859962463378906,
"learning_rate": 9.7505e-06,
"loss": 0.0107,
"num_input_tokens_seen": 82430976,
"step": 80500
},
{
"epoch": 8.1,
"grad_norm": 1.8892147541046143,
"learning_rate": 9.500500000000002e-06,
"loss": 0.01,
"num_input_tokens_seen": 82942976,
"step": 81000
},
{
"epoch": 8.15,
"grad_norm": 0.7789964079856873,
"learning_rate": 9.2505e-06,
"loss": 0.0111,
"num_input_tokens_seen": 83454976,
"step": 81500
},
{
"epoch": 8.2,
"grad_norm": 0.6423227787017822,
"learning_rate": 9.000500000000001e-06,
"loss": 0.011,
"num_input_tokens_seen": 83966976,
"step": 82000
},
{
"epoch": 8.25,
"grad_norm": 0.6862022876739502,
"learning_rate": 8.7505e-06,
"loss": 0.0105,
"num_input_tokens_seen": 84478976,
"step": 82500
},
{
"epoch": 8.3,
"grad_norm": 0.6521459817886353,
"learning_rate": 8.5005e-06,
"loss": 0.011,
"num_input_tokens_seen": 84990976,
"step": 83000
},
{
"epoch": 8.35,
"grad_norm": 1.0782101154327393,
"learning_rate": 8.2505e-06,
"loss": 0.01,
"num_input_tokens_seen": 85502976,
"step": 83500
},
{
"epoch": 8.4,
"grad_norm": 0.32573211193084717,
"learning_rate": 8.0005e-06,
"loss": 0.0102,
"num_input_tokens_seen": 86014976,
"step": 84000
},
{
"epoch": 8.45,
"grad_norm": 0.4790741205215454,
"learning_rate": 7.750500000000001e-06,
"loss": 0.0097,
"num_input_tokens_seen": 86526976,
"step": 84500
},
{
"epoch": 8.5,
"grad_norm": 5.938267230987549,
"learning_rate": 7.5005000000000004e-06,
"loss": 0.0099,
"num_input_tokens_seen": 87038976,
"step": 85000
},
{
"epoch": 8.55,
"grad_norm": 0.3625955283641815,
"learning_rate": 7.2505e-06,
"loss": 0.0101,
"num_input_tokens_seen": 87550976,
"step": 85500
},
{
"epoch": 8.6,
"grad_norm": 1.664149522781372,
"learning_rate": 7.0005e-06,
"loss": 0.0103,
"num_input_tokens_seen": 88062976,
"step": 86000
},
{
"epoch": 8.65,
"grad_norm": 0.35580164194107056,
"learning_rate": 6.7505e-06,
"loss": 0.0097,
"num_input_tokens_seen": 88574976,
"step": 86500
},
{
"epoch": 8.7,
"grad_norm": 0.814786434173584,
"learning_rate": 6.5005e-06,
"loss": 0.0099,
"num_input_tokens_seen": 89086976,
"step": 87000
},
{
"epoch": 8.75,
"grad_norm": 0.479640930891037,
"learning_rate": 6.2505000000000005e-06,
"loss": 0.0101,
"num_input_tokens_seen": 89598976,
"step": 87500
},
{
"epoch": 8.8,
"grad_norm": 0.4606671929359436,
"learning_rate": 6.0005e-06,
"loss": 0.0094,
"num_input_tokens_seen": 90110976,
"step": 88000
},
{
"epoch": 8.85,
"grad_norm": 2.0643467903137207,
"learning_rate": 5.7505e-06,
"loss": 0.0099,
"num_input_tokens_seen": 90622976,
"step": 88500
},
{
"epoch": 8.9,
"grad_norm": 0.6785427331924438,
"learning_rate": 5.5005e-06,
"loss": 0.0103,
"num_input_tokens_seen": 91134976,
"step": 89000
},
{
"epoch": 8.95,
"grad_norm": 0.6333959102630615,
"learning_rate": 5.250500000000001e-06,
"loss": 0.01,
"num_input_tokens_seen": 91646976,
"step": 89500
},
{
"epoch": 9.0,
"grad_norm": 0.8463544249534607,
"learning_rate": 5.000500000000001e-06,
"loss": 0.0098,
"num_input_tokens_seen": 92158848,
"step": 90000
},
{
"epoch": 9.0,
"eval_combined_score": 0.14495953552467267,
"eval_loss": 0.14495953917503357,
"eval_mse": 0.1449595318743118,
"eval_runtime": 29.5073,
"eval_samples_per_second": 677.799,
"eval_steps_per_second": 84.725,
"num_input_tokens_seen": 92158848,
"step": 90000
},
{
"epoch": 9.05,
"grad_norm": 0.39637425541877747,
"learning_rate": 4.7505000000000005e-06,
"loss": 0.0082,
"num_input_tokens_seen": 92670848,
"step": 90500
},
{
"epoch": 9.1,
"grad_norm": 0.7424957752227783,
"learning_rate": 4.5005e-06,
"loss": 0.0085,
"num_input_tokens_seen": 93182848,
"step": 91000
},
{
"epoch": 9.15,
"grad_norm": 0.8151483535766602,
"learning_rate": 4.2505e-06,
"loss": 0.008,
"num_input_tokens_seen": 93694848,
"step": 91500
},
{
"epoch": 9.2,
"grad_norm": 1.604078769683838,
"learning_rate": 4.0005e-06,
"loss": 0.0086,
"num_input_tokens_seen": 94206848,
"step": 92000
},
{
"epoch": 9.25,
"grad_norm": 0.42909368872642517,
"learning_rate": 3.7505e-06,
"loss": 0.0084,
"num_input_tokens_seen": 94718848,
"step": 92500
},
{
"epoch": 9.3,
"grad_norm": 0.6759423017501831,
"learning_rate": 3.5005e-06,
"loss": 0.0077,
"num_input_tokens_seen": 95230848,
"step": 93000
},
{
"epoch": 9.35,
"grad_norm": 0.5954917669296265,
"learning_rate": 3.2505e-06,
"loss": 0.0081,
"num_input_tokens_seen": 95742848,
"step": 93500
},
{
"epoch": 9.4,
"grad_norm": 0.6435306072235107,
"learning_rate": 3.0005000000000003e-06,
"loss": 0.0079,
"num_input_tokens_seen": 96254848,
"step": 94000
},
{
"epoch": 9.45,
"grad_norm": 0.8906601071357727,
"learning_rate": 2.7505e-06,
"loss": 0.008,
"num_input_tokens_seen": 96766848,
"step": 94500
},
{
"epoch": 9.5,
"grad_norm": 1.4101794958114624,
"learning_rate": 2.5005e-06,
"loss": 0.0075,
"num_input_tokens_seen": 97278848,
"step": 95000
},
{
"epoch": 9.55,
"grad_norm": 0.7406792044639587,
"learning_rate": 2.2505000000000003e-06,
"loss": 0.0078,
"num_input_tokens_seen": 97790848,
"step": 95500
},
{
"epoch": 9.6,
"grad_norm": 1.437361240386963,
"learning_rate": 2.0004999999999997e-06,
"loss": 0.0077,
"num_input_tokens_seen": 98302848,
"step": 96000
},
{
"epoch": 9.65,
"grad_norm": 0.4781911373138428,
"learning_rate": 1.7505e-06,
"loss": 0.0078,
"num_input_tokens_seen": 98814848,
"step": 96500
},
{
"epoch": 9.7,
"grad_norm": 0.5876700282096863,
"learning_rate": 1.5005e-06,
"loss": 0.0075,
"num_input_tokens_seen": 99326848,
"step": 97000
},
{
"epoch": 9.75,
"grad_norm": 0.933368980884552,
"learning_rate": 1.2505000000000001e-06,
"loss": 0.008,
"num_input_tokens_seen": 99838848,
"step": 97500
},
{
"epoch": 9.8,
"grad_norm": 0.7791544198989868,
"learning_rate": 1.0005e-06,
"loss": 0.0075,
"num_input_tokens_seen": 100350848,
"step": 98000
},
{
"epoch": 9.85,
"grad_norm": 0.45317134261131287,
"learning_rate": 7.505000000000001e-07,
"loss": 0.0078,
"num_input_tokens_seen": 100862848,
"step": 98500
},
{
"epoch": 9.9,
"grad_norm": 1.5439448356628418,
"learning_rate": 5.005e-07,
"loss": 0.0074,
"num_input_tokens_seen": 101374848,
"step": 99000
},
{
"epoch": 9.95,
"grad_norm": 0.5587248206138611,
"learning_rate": 2.5049999999999997e-07,
"loss": 0.0079,
"num_input_tokens_seen": 101886848,
"step": 99500
},
{
"epoch": 10.0,
"grad_norm": 0.6633381247520447,
"learning_rate": 5e-10,
"loss": 0.0073,
"num_input_tokens_seen": 102398720,
"step": 100000
},
{
"epoch": 10.0,
"eval_combined_score": 0.14527450438803524,
"eval_loss": 0.14527450501918793,
"eval_mse": 0.14527450375688256,
"eval_runtime": 29.5752,
"eval_samples_per_second": 676.241,
"eval_steps_per_second": 84.53,
"num_input_tokens_seen": 102398720,
"step": 100000
},
{
"epoch": 10.0,
"num_input_tokens_seen": 102398720,
"step": 100000,
"total_flos": 5.262202453327104e+16,
"train_loss": 0.056572345192432406,
"train_runtime": 7202.8043,
"train_samples_per_second": 111.066,
"train_steps_per_second": 13.883,
"train_tokens_per_second": 14216.507
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 102398720,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.262202453327104e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}