xlm-r_ben-beng / trainer_state.json
DGurgurov's picture
Uploading checkpoint-98500 for xlm-r - ben-beng
ac948a7 verified
{
"best_metric": 1.022267460823059,
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/ben-Beng/checkpoint-98500",
"epoch": 29.60625187856928,
"eval_steps": 500,
"global_step": 98500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.15028554253080853,
"grad_norm": 4.279157638549805,
"learning_rate": 9.95e-05,
"loss": 1.5206,
"step": 500
},
{
"epoch": 0.15028554253080853,
"eval_accuracy": 0.7123080245508447,
"eval_loss": 1.6415441036224365,
"eval_runtime": 108.0973,
"eval_samples_per_second": 220.635,
"eval_steps_per_second": 6.901,
"step": 500
},
{
"epoch": 0.30057108506161706,
"grad_norm": 3.948293685913086,
"learning_rate": 9.900000000000001e-05,
"loss": 1.4505,
"step": 1000
},
{
"epoch": 0.30057108506161706,
"eval_accuracy": 0.7195765918792126,
"eval_loss": 1.6040955781936646,
"eval_runtime": 108.1918,
"eval_samples_per_second": 220.442,
"eval_steps_per_second": 6.895,
"step": 1000
},
{
"epoch": 0.4508566275924256,
"grad_norm": 3.838204860687256,
"learning_rate": 9.850000000000001e-05,
"loss": 1.4092,
"step": 1500
},
{
"epoch": 0.4508566275924256,
"eval_accuracy": 0.7255862618332372,
"eval_loss": 1.5711835622787476,
"eval_runtime": 105.314,
"eval_samples_per_second": 226.466,
"eval_steps_per_second": 7.084,
"step": 1500
},
{
"epoch": 0.6011421701232341,
"grad_norm": 3.7270383834838867,
"learning_rate": 9.8e-05,
"loss": 1.3738,
"step": 2000
},
{
"epoch": 0.6011421701232341,
"eval_accuracy": 0.7298732586939131,
"eval_loss": 1.562820315361023,
"eval_runtime": 104.3524,
"eval_samples_per_second": 228.552,
"eval_steps_per_second": 7.149,
"step": 2000
},
{
"epoch": 0.7514277126540427,
"grad_norm": 3.720855474472046,
"learning_rate": 9.75e-05,
"loss": 1.3447,
"step": 2500
},
{
"epoch": 0.7514277126540427,
"eval_accuracy": 0.7334076213509759,
"eval_loss": 1.533949613571167,
"eval_runtime": 105.6055,
"eval_samples_per_second": 225.84,
"eval_steps_per_second": 7.064,
"step": 2500
},
{
"epoch": 0.9017132551848512,
"grad_norm": 3.9192264080047607,
"learning_rate": 9.7e-05,
"loss": 1.3215,
"step": 3000
},
{
"epoch": 0.9017132551848512,
"eval_accuracy": 0.737443643353561,
"eval_loss": 1.5036447048187256,
"eval_runtime": 103.4988,
"eval_samples_per_second": 230.437,
"eval_steps_per_second": 7.208,
"step": 3000
},
{
"epoch": 1.0519987977156597,
"grad_norm": 3.8544235229492188,
"learning_rate": 9.65e-05,
"loss": 1.3074,
"step": 3500
},
{
"epoch": 1.0519987977156597,
"eval_accuracy": 0.7401312504576956,
"eval_loss": 1.4803341627120972,
"eval_runtime": 104.2256,
"eval_samples_per_second": 228.831,
"eval_steps_per_second": 7.158,
"step": 3500
},
{
"epoch": 1.2022843402464682,
"grad_norm": 3.2664737701416016,
"learning_rate": 9.6e-05,
"loss": 1.2826,
"step": 4000
},
{
"epoch": 1.2022843402464682,
"eval_accuracy": 0.7433592201310665,
"eval_loss": 1.475807785987854,
"eval_runtime": 104.5255,
"eval_samples_per_second": 228.174,
"eval_steps_per_second": 7.137,
"step": 4000
},
{
"epoch": 1.3525698827772767,
"grad_norm": 3.448021650314331,
"learning_rate": 9.55e-05,
"loss": 1.2676,
"step": 4500
},
{
"epoch": 1.3525698827772767,
"eval_accuracy": 0.7454010087654052,
"eval_loss": 1.467736005783081,
"eval_runtime": 106.1685,
"eval_samples_per_second": 224.643,
"eval_steps_per_second": 7.027,
"step": 4500
},
{
"epoch": 1.5028554253080855,
"grad_norm": 3.6344072818756104,
"learning_rate": 9.5e-05,
"loss": 1.2462,
"step": 5000
},
{
"epoch": 1.5028554253080855,
"eval_accuracy": 0.7480509601003421,
"eval_loss": 1.454475998878479,
"eval_runtime": 107.2463,
"eval_samples_per_second": 222.385,
"eval_steps_per_second": 6.956,
"step": 5000
},
{
"epoch": 1.653140967838894,
"grad_norm": 3.191260576248169,
"learning_rate": 9.449999999999999e-05,
"loss": 1.2353,
"step": 5500
},
{
"epoch": 1.653140967838894,
"eval_accuracy": 0.7502682619699697,
"eval_loss": 1.4226497411727905,
"eval_runtime": 107.1587,
"eval_samples_per_second": 222.567,
"eval_steps_per_second": 6.962,
"step": 5500
},
{
"epoch": 1.8034265103697025,
"grad_norm": 3.074147939682007,
"learning_rate": 9.4e-05,
"loss": 1.2227,
"step": 6000
},
{
"epoch": 1.8034265103697025,
"eval_accuracy": 0.7523797476360646,
"eval_loss": 1.4185426235198975,
"eval_runtime": 107.8236,
"eval_samples_per_second": 221.195,
"eval_steps_per_second": 6.919,
"step": 6000
},
{
"epoch": 1.953712052900511,
"grad_norm": 3.0745174884796143,
"learning_rate": 9.350000000000001e-05,
"loss": 1.2179,
"step": 6500
},
{
"epoch": 1.953712052900511,
"eval_accuracy": 0.7525899025066154,
"eval_loss": 1.4250199794769287,
"eval_runtime": 103.869,
"eval_samples_per_second": 229.616,
"eval_steps_per_second": 7.182,
"step": 6500
},
{
"epoch": 2.1039975954313195,
"grad_norm": 3.809814453125,
"learning_rate": 9.300000000000001e-05,
"loss": 1.1973,
"step": 7000
},
{
"epoch": 2.1039975954313195,
"eval_accuracy": 0.7544079430376059,
"eval_loss": 1.4045050144195557,
"eval_runtime": 103.9514,
"eval_samples_per_second": 229.434,
"eval_steps_per_second": 7.176,
"step": 7000
},
{
"epoch": 2.254283137962128,
"grad_norm": 3.262021541595459,
"learning_rate": 9.250000000000001e-05,
"loss": 1.1854,
"step": 7500
},
{
"epoch": 2.254283137962128,
"eval_accuracy": 0.7566596229680038,
"eval_loss": 1.4005974531173706,
"eval_runtime": 103.4306,
"eval_samples_per_second": 230.59,
"eval_steps_per_second": 7.213,
"step": 7500
},
{
"epoch": 2.4045686804929365,
"grad_norm": 3.562636613845825,
"learning_rate": 9.200000000000001e-05,
"loss": 1.1894,
"step": 8000
},
{
"epoch": 2.4045686804929365,
"eval_accuracy": 0.7518658362592376,
"eval_loss": 1.4261118173599243,
"eval_runtime": 107.7368,
"eval_samples_per_second": 221.373,
"eval_steps_per_second": 6.924,
"step": 8000
},
{
"epoch": 2.554854223023745,
"grad_norm": 3.0819168090820312,
"learning_rate": 9.15e-05,
"loss": 1.1821,
"step": 8500
},
{
"epoch": 2.554854223023745,
"eval_accuracy": 0.759439375239073,
"eval_loss": 1.3829759359359741,
"eval_runtime": 108.9119,
"eval_samples_per_second": 218.984,
"eval_steps_per_second": 6.85,
"step": 8500
},
{
"epoch": 2.7051397655545535,
"grad_norm": 3.441218852996826,
"learning_rate": 9.1e-05,
"loss": 1.1617,
"step": 9000
},
{
"epoch": 2.7051397655545535,
"eval_accuracy": 0.7595643512285075,
"eval_loss": 1.38172447681427,
"eval_runtime": 107.6971,
"eval_samples_per_second": 221.454,
"eval_steps_per_second": 6.927,
"step": 9000
},
{
"epoch": 2.855425308085362,
"grad_norm": 2.9258110523223877,
"learning_rate": 9.05e-05,
"loss": 1.1647,
"step": 9500
},
{
"epoch": 2.855425308085362,
"eval_accuracy": 0.7622641760596893,
"eval_loss": 1.3564621210098267,
"eval_runtime": 108.8704,
"eval_samples_per_second": 219.068,
"eval_steps_per_second": 6.852,
"step": 9500
},
{
"epoch": 3.005710850616171,
"grad_norm": 3.097913980484009,
"learning_rate": 9e-05,
"loss": 1.1543,
"step": 10000
},
{
"epoch": 3.005710850616171,
"eval_accuracy": 0.7637979723391941,
"eval_loss": 1.3542959690093994,
"eval_runtime": 108.5802,
"eval_samples_per_second": 219.653,
"eval_steps_per_second": 6.87,
"step": 10000
},
{
"epoch": 3.1559963931469794,
"grad_norm": 3.168121099472046,
"learning_rate": 8.950000000000001e-05,
"loss": 1.1379,
"step": 10500
},
{
"epoch": 3.1559963931469794,
"eval_accuracy": 0.76408780640708,
"eval_loss": 1.3504241704940796,
"eval_runtime": 108.2586,
"eval_samples_per_second": 220.306,
"eval_steps_per_second": 6.891,
"step": 10500
},
{
"epoch": 3.306281935677788,
"grad_norm": 3.410958766937256,
"learning_rate": 8.900000000000001e-05,
"loss": 1.1254,
"step": 11000
},
{
"epoch": 3.306281935677788,
"eval_accuracy": 0.7652592815735387,
"eval_loss": 1.346474528312683,
"eval_runtime": 107.3669,
"eval_samples_per_second": 222.136,
"eval_steps_per_second": 6.948,
"step": 11000
},
{
"epoch": 3.4565674782085964,
"grad_norm": 2.775892734527588,
"learning_rate": 8.850000000000001e-05,
"loss": 1.1269,
"step": 11500
},
{
"epoch": 3.4565674782085964,
"eval_accuracy": 0.7657911530106338,
"eval_loss": 1.3591572046279907,
"eval_runtime": 106.844,
"eval_samples_per_second": 223.223,
"eval_steps_per_second": 6.982,
"step": 11500
},
{
"epoch": 3.606853020739405,
"grad_norm": 3.029595375061035,
"learning_rate": 8.800000000000001e-05,
"loss": 1.1247,
"step": 12000
},
{
"epoch": 3.606853020739405,
"eval_accuracy": 0.767305872889515,
"eval_loss": 1.3377037048339844,
"eval_runtime": 108.9561,
"eval_samples_per_second": 218.896,
"eval_steps_per_second": 6.847,
"step": 12000
},
{
"epoch": 3.7571385632702134,
"grad_norm": 3.1967430114746094,
"learning_rate": 8.75e-05,
"loss": 1.1123,
"step": 12500
},
{
"epoch": 3.7571385632702134,
"eval_accuracy": 0.7687083850392965,
"eval_loss": 1.340959906578064,
"eval_runtime": 107.8508,
"eval_samples_per_second": 221.139,
"eval_steps_per_second": 6.917,
"step": 12500
},
{
"epoch": 3.907424105801022,
"grad_norm": 2.859966993331909,
"learning_rate": 8.7e-05,
"loss": 1.112,
"step": 13000
},
{
"epoch": 3.907424105801022,
"eval_accuracy": 0.7696794726604136,
"eval_loss": 1.3237755298614502,
"eval_runtime": 107.6845,
"eval_samples_per_second": 221.48,
"eval_steps_per_second": 6.928,
"step": 13000
},
{
"epoch": 4.057709648331831,
"grad_norm": 2.8421764373779297,
"learning_rate": 8.65e-05,
"loss": 1.0983,
"step": 13500
},
{
"epoch": 4.057709648331831,
"eval_accuracy": 0.7714031663702743,
"eval_loss": 1.3144177198410034,
"eval_runtime": 108.7428,
"eval_samples_per_second": 219.325,
"eval_steps_per_second": 6.86,
"step": 13500
},
{
"epoch": 4.207995190862639,
"grad_norm": 3.1582884788513184,
"learning_rate": 8.6e-05,
"loss": 1.0918,
"step": 14000
},
{
"epoch": 4.207995190862639,
"eval_accuracy": 0.7718766206792639,
"eval_loss": 1.3221428394317627,
"eval_runtime": 108.1645,
"eval_samples_per_second": 220.498,
"eval_steps_per_second": 6.897,
"step": 14000
},
{
"epoch": 4.358280733393448,
"grad_norm": 2.631855010986328,
"learning_rate": 8.55e-05,
"loss": 1.0884,
"step": 14500
},
{
"epoch": 4.358280733393448,
"eval_accuracy": 0.772324790492508,
"eval_loss": 1.3152235746383667,
"eval_runtime": 107.5113,
"eval_samples_per_second": 221.837,
"eval_steps_per_second": 6.939,
"step": 14500
},
{
"epoch": 4.508566275924256,
"grad_norm": 3.242208480834961,
"learning_rate": 8.5e-05,
"loss": 1.0837,
"step": 15000
},
{
"epoch": 4.508566275924256,
"eval_accuracy": 0.7737920955341676,
"eval_loss": 1.2969176769256592,
"eval_runtime": 107.6972,
"eval_samples_per_second": 221.454,
"eval_steps_per_second": 6.927,
"step": 15000
},
{
"epoch": 4.658851818455065,
"grad_norm": 3.0691699981689453,
"learning_rate": 8.450000000000001e-05,
"loss": 1.0796,
"step": 15500
},
{
"epoch": 4.658851818455065,
"eval_accuracy": 0.7738074322270785,
"eval_loss": 1.3085857629776,
"eval_runtime": 106.2258,
"eval_samples_per_second": 224.522,
"eval_steps_per_second": 7.023,
"step": 15500
},
{
"epoch": 4.809137360985873,
"grad_norm": 3.258615732192993,
"learning_rate": 8.4e-05,
"loss": 1.0714,
"step": 16000
},
{
"epoch": 4.809137360985873,
"eval_accuracy": 0.775101051869418,
"eval_loss": 1.2939372062683105,
"eval_runtime": 106.8715,
"eval_samples_per_second": 223.165,
"eval_steps_per_second": 6.98,
"step": 16000
},
{
"epoch": 4.959422903516682,
"grad_norm": 3.1920101642608643,
"learning_rate": 8.35e-05,
"loss": 1.071,
"step": 16500
},
{
"epoch": 4.959422903516682,
"eval_accuracy": 0.7753841079350378,
"eval_loss": 1.2854809761047363,
"eval_runtime": 106.6266,
"eval_samples_per_second": 223.678,
"eval_steps_per_second": 6.996,
"step": 16500
},
{
"epoch": 5.10970844604749,
"grad_norm": 3.2814955711364746,
"learning_rate": 8.3e-05,
"loss": 1.0568,
"step": 17000
},
{
"epoch": 5.10970844604749,
"eval_accuracy": 0.7761810007493132,
"eval_loss": 1.2916713953018188,
"eval_runtime": 107.4768,
"eval_samples_per_second": 221.908,
"eval_steps_per_second": 6.941,
"step": 17000
},
{
"epoch": 5.259993988578299,
"grad_norm": 3.2327558994293213,
"learning_rate": 8.25e-05,
"loss": 1.0549,
"step": 17500
},
{
"epoch": 5.259993988578299,
"eval_accuracy": 0.7780586564617779,
"eval_loss": 1.2857751846313477,
"eval_runtime": 107.761,
"eval_samples_per_second": 221.323,
"eval_steps_per_second": 6.923,
"step": 17500
},
{
"epoch": 5.410279531109107,
"grad_norm": 3.0951426029205322,
"learning_rate": 8.2e-05,
"loss": 1.0511,
"step": 18000
},
{
"epoch": 5.410279531109107,
"eval_accuracy": 0.7783227014884583,
"eval_loss": 1.2948368787765503,
"eval_runtime": 108.2092,
"eval_samples_per_second": 220.406,
"eval_steps_per_second": 6.894,
"step": 18000
},
{
"epoch": 5.560565073639916,
"grad_norm": 2.639129638671875,
"learning_rate": 8.15e-05,
"loss": 1.0492,
"step": 18500
},
{
"epoch": 5.560565073639916,
"eval_accuracy": 0.7788534810933934,
"eval_loss": 1.2669661045074463,
"eval_runtime": 108.4058,
"eval_samples_per_second": 220.007,
"eval_steps_per_second": 6.882,
"step": 18500
},
{
"epoch": 5.710850616170724,
"grad_norm": 2.4974186420440674,
"learning_rate": 8.1e-05,
"loss": 1.0476,
"step": 19000
},
{
"epoch": 5.710850616170724,
"eval_accuracy": 0.7792994699668719,
"eval_loss": 1.2653881311416626,
"eval_runtime": 108.006,
"eval_samples_per_second": 220.821,
"eval_steps_per_second": 6.907,
"step": 19000
},
{
"epoch": 5.861136158701533,
"grad_norm": 2.6549036502838135,
"learning_rate": 8.05e-05,
"loss": 1.0537,
"step": 19500
},
{
"epoch": 5.861136158701533,
"eval_accuracy": 0.781079231841748,
"eval_loss": 1.2602005004882812,
"eval_runtime": 107.8534,
"eval_samples_per_second": 221.133,
"eval_steps_per_second": 6.917,
"step": 19500
},
{
"epoch": 6.011421701232342,
"grad_norm": 3.047539234161377,
"learning_rate": 8e-05,
"loss": 1.038,
"step": 20000
},
{
"epoch": 6.011421701232342,
"eval_accuracy": 0.7802534672325914,
"eval_loss": 1.2705827951431274,
"eval_runtime": 106.7758,
"eval_samples_per_second": 223.365,
"eval_steps_per_second": 6.987,
"step": 20000
},
{
"epoch": 6.16170724376315,
"grad_norm": 2.7509360313415527,
"learning_rate": 7.950000000000001e-05,
"loss": 1.0268,
"step": 20500
},
{
"epoch": 6.16170724376315,
"eval_accuracy": 0.7803480613075008,
"eval_loss": 1.2649264335632324,
"eval_runtime": 107.1559,
"eval_samples_per_second": 222.573,
"eval_steps_per_second": 6.962,
"step": 20500
},
{
"epoch": 6.311992786293959,
"grad_norm": 2.5355842113494873,
"learning_rate": 7.900000000000001e-05,
"loss": 1.0294,
"step": 21000
},
{
"epoch": 6.311992786293959,
"eval_accuracy": 0.7818066507198538,
"eval_loss": 1.250462532043457,
"eval_runtime": 115.6436,
"eval_samples_per_second": 206.237,
"eval_steps_per_second": 6.451,
"step": 21000
},
{
"epoch": 6.462278328824767,
"grad_norm": 2.9398176670074463,
"learning_rate": 7.850000000000001e-05,
"loss": 1.0209,
"step": 21500
},
{
"epoch": 6.462278328824767,
"eval_accuracy": 0.7832408080027191,
"eval_loss": 1.2534795999526978,
"eval_runtime": 107.8449,
"eval_samples_per_second": 221.151,
"eval_steps_per_second": 6.917,
"step": 21500
},
{
"epoch": 6.612563871355576,
"grad_norm": 2.7950327396392822,
"learning_rate": 7.800000000000001e-05,
"loss": 1.0268,
"step": 22000
},
{
"epoch": 6.612563871355576,
"eval_accuracy": 0.7836778557957977,
"eval_loss": 1.2419434785842896,
"eval_runtime": 107.3738,
"eval_samples_per_second": 222.121,
"eval_steps_per_second": 6.948,
"step": 22000
},
{
"epoch": 6.762849413886384,
"grad_norm": 3.0478243827819824,
"learning_rate": 7.75e-05,
"loss": 1.0233,
"step": 22500
},
{
"epoch": 6.762849413886384,
"eval_accuracy": 0.7839343784481503,
"eval_loss": 1.244408369064331,
"eval_runtime": 107.5196,
"eval_samples_per_second": 221.82,
"eval_steps_per_second": 6.938,
"step": 22500
},
{
"epoch": 6.913134956417193,
"grad_norm": 3.049609661102295,
"learning_rate": 7.7e-05,
"loss": 1.016,
"step": 23000
},
{
"epoch": 6.913134956417193,
"eval_accuracy": 0.7844505307770736,
"eval_loss": 1.2420412302017212,
"eval_runtime": 108.256,
"eval_samples_per_second": 220.311,
"eval_steps_per_second": 6.891,
"step": 23000
},
{
"epoch": 7.063420498948001,
"grad_norm": 3.2929279804229736,
"learning_rate": 7.65e-05,
"loss": 1.0113,
"step": 23500
},
{
"epoch": 7.063420498948001,
"eval_accuracy": 0.7841336670175184,
"eval_loss": 1.2468925714492798,
"eval_runtime": 108.0777,
"eval_samples_per_second": 220.675,
"eval_steps_per_second": 6.902,
"step": 23500
},
{
"epoch": 7.21370604147881,
"grad_norm": 2.5201022624969482,
"learning_rate": 7.6e-05,
"loss": 1.01,
"step": 24000
},
{
"epoch": 7.21370604147881,
"eval_accuracy": 0.7857022835486672,
"eval_loss": 1.2313475608825684,
"eval_runtime": 108.2646,
"eval_samples_per_second": 220.294,
"eval_steps_per_second": 6.891,
"step": 24000
},
{
"epoch": 7.363991584009618,
"grad_norm": 2.926717519760132,
"learning_rate": 7.55e-05,
"loss": 1.0017,
"step": 24500
},
{
"epoch": 7.363991584009618,
"eval_accuracy": 0.7857564674613335,
"eval_loss": 1.2349497079849243,
"eval_runtime": 107.7822,
"eval_samples_per_second": 221.28,
"eval_steps_per_second": 6.921,
"step": 24500
},
{
"epoch": 7.514277126540427,
"grad_norm": 2.6643712520599365,
"learning_rate": 7.500000000000001e-05,
"loss": 0.9939,
"step": 25000
},
{
"epoch": 7.514277126540427,
"eval_accuracy": 0.7868022880070933,
"eval_loss": 1.2337555885314941,
"eval_runtime": 106.7322,
"eval_samples_per_second": 223.456,
"eval_steps_per_second": 6.989,
"step": 25000
},
{
"epoch": 7.664562669071236,
"grad_norm": 2.679358720779419,
"learning_rate": 7.450000000000001e-05,
"loss": 0.9921,
"step": 25500
},
{
"epoch": 7.664562669071236,
"eval_accuracy": 0.7861627827490367,
"eval_loss": 1.2326431274414062,
"eval_runtime": 107.5896,
"eval_samples_per_second": 221.676,
"eval_steps_per_second": 6.934,
"step": 25500
},
{
"epoch": 7.814848211602044,
"grad_norm": 2.5837836265563965,
"learning_rate": 7.4e-05,
"loss": 0.9973,
"step": 26000
},
{
"epoch": 7.814848211602044,
"eval_accuracy": 0.7871569919088683,
"eval_loss": 1.2331918478012085,
"eval_runtime": 107.9543,
"eval_samples_per_second": 220.927,
"eval_steps_per_second": 6.91,
"step": 26000
},
{
"epoch": 7.965133754132852,
"grad_norm": 2.536043882369995,
"learning_rate": 7.35e-05,
"loss": 0.9953,
"step": 26500
},
{
"epoch": 7.965133754132852,
"eval_accuracy": 0.7891570577004067,
"eval_loss": 1.2163290977478027,
"eval_runtime": 113.9576,
"eval_samples_per_second": 209.288,
"eval_steps_per_second": 6.546,
"step": 26500
},
{
"epoch": 8.115419296663662,
"grad_norm": 2.690735340118408,
"learning_rate": 7.3e-05,
"loss": 0.9847,
"step": 27000
},
{
"epoch": 8.115419296663662,
"eval_accuracy": 0.7888144688528846,
"eval_loss": 1.209494709968567,
"eval_runtime": 116.0276,
"eval_samples_per_second": 205.555,
"eval_steps_per_second": 6.43,
"step": 27000
},
{
"epoch": 8.26570483919447,
"grad_norm": 2.4185330867767334,
"learning_rate": 7.25e-05,
"loss": 0.9797,
"step": 27500
},
{
"epoch": 8.26570483919447,
"eval_accuracy": 0.7891209106681478,
"eval_loss": 1.218719482421875,
"eval_runtime": 114.5016,
"eval_samples_per_second": 208.294,
"eval_steps_per_second": 6.515,
"step": 27500
},
{
"epoch": 8.415990381725278,
"grad_norm": 2.7199230194091797,
"learning_rate": 7.2e-05,
"loss": 0.9755,
"step": 28000
},
{
"epoch": 8.415990381725278,
"eval_accuracy": 0.7887393254933777,
"eval_loss": 1.2271952629089355,
"eval_runtime": 111.4173,
"eval_samples_per_second": 214.06,
"eval_steps_per_second": 6.696,
"step": 28000
},
{
"epoch": 8.566275924256086,
"grad_norm": 2.6658599376678467,
"learning_rate": 7.15e-05,
"loss": 0.9749,
"step": 28500
},
{
"epoch": 8.566275924256086,
"eval_accuracy": 0.7904619110106559,
"eval_loss": 1.217193603515625,
"eval_runtime": 107.3348,
"eval_samples_per_second": 222.202,
"eval_steps_per_second": 6.95,
"step": 28500
},
{
"epoch": 8.716561466786896,
"grad_norm": 2.9954679012298584,
"learning_rate": 7.1e-05,
"loss": 0.9747,
"step": 29000
},
{
"epoch": 8.716561466786896,
"eval_accuracy": 0.7904568069124281,
"eval_loss": 1.2054858207702637,
"eval_runtime": 111.9259,
"eval_samples_per_second": 213.087,
"eval_steps_per_second": 6.665,
"step": 29000
},
{
"epoch": 8.866847009317704,
"grad_norm": 2.506471872329712,
"learning_rate": 7.05e-05,
"loss": 0.9715,
"step": 29500
},
{
"epoch": 8.866847009317704,
"eval_accuracy": 0.790933514493206,
"eval_loss": 1.203903317451477,
"eval_runtime": 106.5362,
"eval_samples_per_second": 223.868,
"eval_steps_per_second": 7.002,
"step": 29500
},
{
"epoch": 9.017132551848512,
"grad_norm": 2.67307710647583,
"learning_rate": 7e-05,
"loss": 0.9716,
"step": 30000
},
{
"epoch": 9.017132551848512,
"eval_accuracy": 0.7914492771721259,
"eval_loss": 1.2048789262771606,
"eval_runtime": 121.4256,
"eval_samples_per_second": 196.417,
"eval_steps_per_second": 6.144,
"step": 30000
},
{
"epoch": 9.16741809437932,
"grad_norm": 2.9317779541015625,
"learning_rate": 6.95e-05,
"loss": 0.962,
"step": 30500
},
{
"epoch": 9.16741809437932,
"eval_accuracy": 0.7917621280752731,
"eval_loss": 1.199426293373108,
"eval_runtime": 121.1706,
"eval_samples_per_second": 196.83,
"eval_steps_per_second": 6.157,
"step": 30500
},
{
"epoch": 9.31770363691013,
"grad_norm": 2.679703712463379,
"learning_rate": 6.9e-05,
"loss": 0.9642,
"step": 31000
},
{
"epoch": 9.31770363691013,
"eval_accuracy": 0.7925397227321643,
"eval_loss": 1.193407654762268,
"eval_runtime": 123.2276,
"eval_samples_per_second": 193.544,
"eval_steps_per_second": 6.054,
"step": 31000
},
{
"epoch": 9.467989179440938,
"grad_norm": 2.4788730144500732,
"learning_rate": 6.850000000000001e-05,
"loss": 0.9601,
"step": 31500
},
{
"epoch": 9.467989179440938,
"eval_accuracy": 0.792334985338317,
"eval_loss": 1.1934560537338257,
"eval_runtime": 119.6088,
"eval_samples_per_second": 199.4,
"eval_steps_per_second": 6.237,
"step": 31500
},
{
"epoch": 9.618274721971746,
"grad_norm": 2.8737170696258545,
"learning_rate": 6.800000000000001e-05,
"loss": 0.9541,
"step": 32000
},
{
"epoch": 9.618274721971746,
"eval_accuracy": 0.7933121889138387,
"eval_loss": 1.1921508312225342,
"eval_runtime": 124.8462,
"eval_samples_per_second": 191.035,
"eval_steps_per_second": 5.975,
"step": 32000
},
{
"epoch": 9.768560264502554,
"grad_norm": 2.7593533992767334,
"learning_rate": 6.750000000000001e-05,
"loss": 0.9599,
"step": 32500
},
{
"epoch": 9.768560264502554,
"eval_accuracy": 0.7930929148774838,
"eval_loss": 1.2042547464370728,
"eval_runtime": 124.9577,
"eval_samples_per_second": 190.865,
"eval_steps_per_second": 5.97,
"step": 32500
},
{
"epoch": 9.918845807033364,
"grad_norm": 2.389718532562256,
"learning_rate": 6.7e-05,
"loss": 0.9649,
"step": 33000
},
{
"epoch": 9.918845807033364,
"eval_accuracy": 0.7939532145922783,
"eval_loss": 1.1914194822311401,
"eval_runtime": 120.967,
"eval_samples_per_second": 197.161,
"eval_steps_per_second": 6.167,
"step": 33000
},
{
"epoch": 10.069131349564172,
"grad_norm": 2.846874475479126,
"learning_rate": 6.65e-05,
"loss": 0.9467,
"step": 33500
},
{
"epoch": 10.069131349564172,
"eval_accuracy": 0.7947739980138828,
"eval_loss": 1.1852329969406128,
"eval_runtime": 120.8513,
"eval_samples_per_second": 197.35,
"eval_steps_per_second": 6.173,
"step": 33500
},
{
"epoch": 10.21941689209498,
"grad_norm": 2.58475399017334,
"learning_rate": 6.6e-05,
"loss": 0.9406,
"step": 34000
},
{
"epoch": 10.21941689209498,
"eval_accuracy": 0.7943515220267853,
"eval_loss": 1.1843186616897583,
"eval_runtime": 121.8305,
"eval_samples_per_second": 195.764,
"eval_steps_per_second": 6.123,
"step": 34000
},
{
"epoch": 10.36970243462579,
"grad_norm": 3.003615140914917,
"learning_rate": 6.55e-05,
"loss": 0.9395,
"step": 34500
},
{
"epoch": 10.36970243462579,
"eval_accuracy": 0.7948140182397172,
"eval_loss": 1.1831063032150269,
"eval_runtime": 120.2856,
"eval_samples_per_second": 198.278,
"eval_steps_per_second": 6.202,
"step": 34500
},
{
"epoch": 10.519987977156598,
"grad_norm": 2.6997032165527344,
"learning_rate": 6.500000000000001e-05,
"loss": 0.9338,
"step": 35000
},
{
"epoch": 10.519987977156598,
"eval_accuracy": 0.7955270740039722,
"eval_loss": 1.1888868808746338,
"eval_runtime": 121.3291,
"eval_samples_per_second": 196.573,
"eval_steps_per_second": 6.149,
"step": 35000
},
{
"epoch": 10.670273519687406,
"grad_norm": 2.5620720386505127,
"learning_rate": 6.450000000000001e-05,
"loss": 0.9413,
"step": 35500
},
{
"epoch": 10.670273519687406,
"eval_accuracy": 0.7960009228127132,
"eval_loss": 1.1771271228790283,
"eval_runtime": 116.3789,
"eval_samples_per_second": 204.934,
"eval_steps_per_second": 6.41,
"step": 35500
},
{
"epoch": 10.820559062218214,
"grad_norm": 2.4902310371398926,
"learning_rate": 6.400000000000001e-05,
"loss": 0.941,
"step": 36000
},
{
"epoch": 10.820559062218214,
"eval_accuracy": 0.7952424860773146,
"eval_loss": 1.176774263381958,
"eval_runtime": 116.6512,
"eval_samples_per_second": 204.456,
"eval_steps_per_second": 6.395,
"step": 36000
},
{
"epoch": 10.970844604749024,
"grad_norm": 2.6512043476104736,
"learning_rate": 6.35e-05,
"loss": 0.9371,
"step": 36500
},
{
"epoch": 10.970844604749024,
"eval_accuracy": 0.796440942176153,
"eval_loss": 1.1723679304122925,
"eval_runtime": 119.8979,
"eval_samples_per_second": 198.919,
"eval_steps_per_second": 6.222,
"step": 36500
},
{
"epoch": 11.121130147279832,
"grad_norm": 2.6038336753845215,
"learning_rate": 6.3e-05,
"loss": 0.9293,
"step": 37000
},
{
"epoch": 11.121130147279832,
"eval_accuracy": 0.7967239525153044,
"eval_loss": 1.1750506162643433,
"eval_runtime": 120.4764,
"eval_samples_per_second": 197.964,
"eval_steps_per_second": 6.192,
"step": 37000
},
{
"epoch": 11.27141568981064,
"grad_norm": 2.5120317935943604,
"learning_rate": 6.25e-05,
"loss": 0.9279,
"step": 37500
},
{
"epoch": 11.27141568981064,
"eval_accuracy": 0.7974463986990397,
"eval_loss": 1.1721601486206055,
"eval_runtime": 115.5123,
"eval_samples_per_second": 206.471,
"eval_steps_per_second": 6.458,
"step": 37500
},
{
"epoch": 11.421701232341448,
"grad_norm": 2.6776065826416016,
"learning_rate": 6.2e-05,
"loss": 0.9273,
"step": 38000
},
{
"epoch": 11.421701232341448,
"eval_accuracy": 0.7977723054491466,
"eval_loss": 1.1790024042129517,
"eval_runtime": 118.0413,
"eval_samples_per_second": 202.048,
"eval_steps_per_second": 6.32,
"step": 38000
},
{
"epoch": 11.571986774872258,
"grad_norm": 2.473292589187622,
"learning_rate": 6.15e-05,
"loss": 0.9176,
"step": 38500
},
{
"epoch": 11.571986774872258,
"eval_accuracy": 0.7978240654330204,
"eval_loss": 1.1769903898239136,
"eval_runtime": 119.5414,
"eval_samples_per_second": 199.512,
"eval_steps_per_second": 6.241,
"step": 38500
},
{
"epoch": 11.722272317403066,
"grad_norm": 2.573493242263794,
"learning_rate": 6.1e-05,
"loss": 0.918,
"step": 39000
},
{
"epoch": 11.722272317403066,
"eval_accuracy": 0.7978464067774307,
"eval_loss": 1.1745543479919434,
"eval_runtime": 118.8031,
"eval_samples_per_second": 200.752,
"eval_steps_per_second": 6.279,
"step": 39000
},
{
"epoch": 11.872557859933874,
"grad_norm": 2.496293067932129,
"learning_rate": 6.05e-05,
"loss": 0.9209,
"step": 39500
},
{
"epoch": 11.872557859933874,
"eval_accuracy": 0.7988222222591843,
"eval_loss": 1.1572139263153076,
"eval_runtime": 119.3596,
"eval_samples_per_second": 199.816,
"eval_steps_per_second": 6.25,
"step": 39500
},
{
"epoch": 12.022843402464684,
"grad_norm": 2.8805453777313232,
"learning_rate": 6e-05,
"loss": 0.9204,
"step": 40000
},
{
"epoch": 12.022843402464684,
"eval_accuracy": 0.799307208811464,
"eval_loss": 1.1740858554840088,
"eval_runtime": 118.6109,
"eval_samples_per_second": 201.078,
"eval_steps_per_second": 6.289,
"step": 40000
},
{
"epoch": 12.173128944995492,
"grad_norm": 2.6088273525238037,
"learning_rate": 5.95e-05,
"loss": 0.9149,
"step": 40500
},
{
"epoch": 12.173128944995492,
"eval_accuracy": 0.7991013549531508,
"eval_loss": 1.1727755069732666,
"eval_runtime": 117.7954,
"eval_samples_per_second": 202.47,
"eval_steps_per_second": 6.333,
"step": 40500
},
{
"epoch": 12.3234144875263,
"grad_norm": 2.426567316055298,
"learning_rate": 5.9e-05,
"loss": 0.9009,
"step": 41000
},
{
"epoch": 12.3234144875263,
"eval_accuracy": 0.798481914968918,
"eval_loss": 1.1672251224517822,
"eval_runtime": 119.0757,
"eval_samples_per_second": 200.293,
"eval_steps_per_second": 6.265,
"step": 41000
},
{
"epoch": 12.473700030057108,
"grad_norm": 2.687640428543091,
"learning_rate": 5.85e-05,
"loss": 0.9094,
"step": 41500
},
{
"epoch": 12.473700030057108,
"eval_accuracy": 0.7998230800562746,
"eval_loss": 1.1598495244979858,
"eval_runtime": 118.9102,
"eval_samples_per_second": 200.572,
"eval_steps_per_second": 6.274,
"step": 41500
},
{
"epoch": 12.623985572587918,
"grad_norm": 3.0333776473999023,
"learning_rate": 5.8e-05,
"loss": 0.9149,
"step": 42000
},
{
"epoch": 12.623985572587918,
"eval_accuracy": 0.7999625316919439,
"eval_loss": 1.171325445175171,
"eval_runtime": 115.3742,
"eval_samples_per_second": 206.719,
"eval_steps_per_second": 6.466,
"step": 42000
},
{
"epoch": 12.774271115118726,
"grad_norm": 2.299436569213867,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.9068,
"step": 42500
},
{
"epoch": 12.774271115118726,
"eval_accuracy": 0.8010648350504735,
"eval_loss": 1.1537537574768066,
"eval_runtime": 118.7443,
"eval_samples_per_second": 200.852,
"eval_steps_per_second": 6.282,
"step": 42500
},
{
"epoch": 12.924556657649534,
"grad_norm": 2.7340447902679443,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.9099,
"step": 43000
},
{
"epoch": 12.924556657649534,
"eval_accuracy": 0.8012578176921267,
"eval_loss": 1.1426852941513062,
"eval_runtime": 120.2664,
"eval_samples_per_second": 198.31,
"eval_steps_per_second": 6.203,
"step": 43000
},
{
"epoch": 13.074842200180342,
"grad_norm": 2.6585094928741455,
"learning_rate": 5.65e-05,
"loss": 0.8951,
"step": 43500
},
{
"epoch": 13.074842200180342,
"eval_accuracy": 0.8005615361847466,
"eval_loss": 1.1578710079193115,
"eval_runtime": 115.0048,
"eval_samples_per_second": 207.383,
"eval_steps_per_second": 6.487,
"step": 43500
},
{
"epoch": 13.225127742711152,
"grad_norm": 2.6981582641601562,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.8886,
"step": 44000
},
{
"epoch": 13.225127742711152,
"eval_accuracy": 0.8010340860790154,
"eval_loss": 1.1602416038513184,
"eval_runtime": 113.2435,
"eval_samples_per_second": 210.608,
"eval_steps_per_second": 6.588,
"step": 44000
},
{
"epoch": 13.37541328524196,
"grad_norm": 2.6016407012939453,
"learning_rate": 5.550000000000001e-05,
"loss": 0.9057,
"step": 44500
},
{
"epoch": 13.37541328524196,
"eval_accuracy": 0.8021396084503188,
"eval_loss": 1.1406522989273071,
"eval_runtime": 115.7578,
"eval_samples_per_second": 206.034,
"eval_steps_per_second": 6.444,
"step": 44500
},
{
"epoch": 13.525698827772768,
"grad_norm": 2.693239688873291,
"learning_rate": 5.500000000000001e-05,
"loss": 0.8921,
"step": 45000
},
{
"epoch": 13.525698827772768,
"eval_accuracy": 0.801952397809383,
"eval_loss": 1.1361913681030273,
"eval_runtime": 112.7804,
"eval_samples_per_second": 211.473,
"eval_steps_per_second": 6.615,
"step": 45000
},
{
"epoch": 13.675984370303578,
"grad_norm": 2.386279582977295,
"learning_rate": 5.45e-05,
"loss": 0.8942,
"step": 45500
},
{
"epoch": 13.675984370303578,
"eval_accuracy": 0.8024716769653207,
"eval_loss": 1.1441001892089844,
"eval_runtime": 114.7396,
"eval_samples_per_second": 207.862,
"eval_steps_per_second": 6.502,
"step": 45500
},
{
"epoch": 13.826269912834386,
"grad_norm": 2.3237531185150146,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.8939,
"step": 46000
},
{
"epoch": 13.826269912834386,
"eval_accuracy": 0.8025017531571386,
"eval_loss": 1.141733169555664,
"eval_runtime": 112.3505,
"eval_samples_per_second": 212.282,
"eval_steps_per_second": 6.64,
"step": 46000
},
{
"epoch": 13.976555455365194,
"grad_norm": 2.463498592376709,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.891,
"step": 46500
},
{
"epoch": 13.976555455365194,
"eval_accuracy": 0.803545043091428,
"eval_loss": 1.144685983657837,
"eval_runtime": 114.632,
"eval_samples_per_second": 208.057,
"eval_steps_per_second": 6.508,
"step": 46500
},
{
"epoch": 14.126840997896002,
"grad_norm": 2.7393481731414795,
"learning_rate": 5.300000000000001e-05,
"loss": 0.8846,
"step": 47000
},
{
"epoch": 14.126840997896002,
"eval_accuracy": 0.8038034340555349,
"eval_loss": 1.1320561170578003,
"eval_runtime": 112.5915,
"eval_samples_per_second": 211.828,
"eval_steps_per_second": 6.626,
"step": 47000
},
{
"epoch": 14.277126540426812,
"grad_norm": 2.776505708694458,
"learning_rate": 5.25e-05,
"loss": 0.8841,
"step": 47500
},
{
"epoch": 14.277126540426812,
"eval_accuracy": 0.8034705833910766,
"eval_loss": 1.1436700820922852,
"eval_runtime": 114.9721,
"eval_samples_per_second": 207.442,
"eval_steps_per_second": 6.489,
"step": 47500
},
{
"epoch": 14.42741208295762,
"grad_norm": 2.536817789077759,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.8811,
"step": 48000
},
{
"epoch": 14.42741208295762,
"eval_accuracy": 0.8044537392561116,
"eval_loss": 1.133318305015564,
"eval_runtime": 113.5045,
"eval_samples_per_second": 210.124,
"eval_steps_per_second": 6.572,
"step": 48000
},
{
"epoch": 14.577697625488428,
"grad_norm": 2.524203062057495,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.8778,
"step": 48500
},
{
"epoch": 14.577697625488428,
"eval_accuracy": 0.8039248049976617,
"eval_loss": 1.1258032321929932,
"eval_runtime": 114.9378,
"eval_samples_per_second": 207.504,
"eval_steps_per_second": 6.49,
"step": 48500
},
{
"epoch": 14.727983168019236,
"grad_norm": 2.6129727363586426,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.8765,
"step": 49000
},
{
"epoch": 14.727983168019236,
"eval_accuracy": 0.8044657041630938,
"eval_loss": 1.1352417469024658,
"eval_runtime": 114.1357,
"eval_samples_per_second": 208.962,
"eval_steps_per_second": 6.536,
"step": 49000
},
{
"epoch": 14.878268710550046,
"grad_norm": 2.6218881607055664,
"learning_rate": 5.05e-05,
"loss": 0.876,
"step": 49500
},
{
"epoch": 14.878268710550046,
"eval_accuracy": 0.8049223412984352,
"eval_loss": 1.131935954093933,
"eval_runtime": 114.4278,
"eval_samples_per_second": 208.428,
"eval_steps_per_second": 6.519,
"step": 49500
},
{
"epoch": 15.028554253080854,
"grad_norm": 2.654724597930908,
"learning_rate": 5e-05,
"loss": 0.879,
"step": 50000
},
{
"epoch": 15.028554253080854,
"eval_accuracy": 0.8056327032697777,
"eval_loss": 1.1161094903945923,
"eval_runtime": 112.2273,
"eval_samples_per_second": 212.515,
"eval_steps_per_second": 6.647,
"step": 50000
},
{
"epoch": 15.178839795611662,
"grad_norm": 2.569153308868408,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.8674,
"step": 50500
},
{
"epoch": 15.178839795611662,
"eval_accuracy": 0.8048139780991767,
"eval_loss": 1.1278635263442993,
"eval_runtime": 114.7277,
"eval_samples_per_second": 207.884,
"eval_steps_per_second": 6.502,
"step": 50500
},
{
"epoch": 15.32912533814247,
"grad_norm": 2.5382237434387207,
"learning_rate": 4.9e-05,
"loss": 0.8676,
"step": 51000
},
{
"epoch": 15.32912533814247,
"eval_accuracy": 0.8064227363767525,
"eval_loss": 1.132450819015503,
"eval_runtime": 107.2478,
"eval_samples_per_second": 222.382,
"eval_steps_per_second": 6.956,
"step": 51000
},
{
"epoch": 15.47941088067328,
"grad_norm": 2.480746030807495,
"learning_rate": 4.85e-05,
"loss": 0.867,
"step": 51500
},
{
"epoch": 15.47941088067328,
"eval_accuracy": 0.806509341887431,
"eval_loss": 1.1208624839782715,
"eval_runtime": 107.5739,
"eval_samples_per_second": 221.708,
"eval_steps_per_second": 6.935,
"step": 51500
},
{
"epoch": 15.629696423204088,
"grad_norm": 2.4417829513549805,
"learning_rate": 4.8e-05,
"loss": 0.8648,
"step": 52000
},
{
"epoch": 15.629696423204088,
"eval_accuracy": 0.80664691560334,
"eval_loss": 1.123421549797058,
"eval_runtime": 107.0763,
"eval_samples_per_second": 222.738,
"eval_steps_per_second": 6.967,
"step": 52000
},
{
"epoch": 15.779981965734896,
"grad_norm": 2.3048479557037354,
"learning_rate": 4.75e-05,
"loss": 0.87,
"step": 52500
},
{
"epoch": 15.779981965734896,
"eval_accuracy": 0.8067313801594327,
"eval_loss": 1.1155991554260254,
"eval_runtime": 108.1545,
"eval_samples_per_second": 220.518,
"eval_steps_per_second": 6.898,
"step": 52500
},
{
"epoch": 15.930267508265704,
"grad_norm": 2.6421682834625244,
"learning_rate": 4.7e-05,
"loss": 0.8694,
"step": 53000
},
{
"epoch": 15.930267508265704,
"eval_accuracy": 0.8066291950596075,
"eval_loss": 1.1215757131576538,
"eval_runtime": 109.1629,
"eval_samples_per_second": 218.481,
"eval_steps_per_second": 6.834,
"step": 53000
},
{
"epoch": 16.080553050796514,
"grad_norm": 2.4313910007476807,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.8668,
"step": 53500
},
{
"epoch": 16.080553050796514,
"eval_accuracy": 0.8079721270128156,
"eval_loss": 1.1246066093444824,
"eval_runtime": 108.472,
"eval_samples_per_second": 219.872,
"eval_steps_per_second": 6.877,
"step": 53500
},
{
"epoch": 16.230838593327324,
"grad_norm": 2.2622108459472656,
"learning_rate": 4.600000000000001e-05,
"loss": 0.8603,
"step": 54000
},
{
"epoch": 16.230838593327324,
"eval_accuracy": 0.8070964068446762,
"eval_loss": 1.1118344068527222,
"eval_runtime": 107.6522,
"eval_samples_per_second": 221.547,
"eval_steps_per_second": 6.93,
"step": 54000
},
{
"epoch": 16.38112413585813,
"grad_norm": 2.355970621109009,
"learning_rate": 4.55e-05,
"loss": 0.8557,
"step": 54500
},
{
"epoch": 16.38112413585813,
"eval_accuracy": 0.8079940925101587,
"eval_loss": 1.1261143684387207,
"eval_runtime": 107.1531,
"eval_samples_per_second": 222.579,
"eval_steps_per_second": 6.962,
"step": 54500
},
{
"epoch": 16.53140967838894,
"grad_norm": 2.4477593898773193,
"learning_rate": 4.5e-05,
"loss": 0.8521,
"step": 55000
},
{
"epoch": 16.53140967838894,
"eval_accuracy": 0.8084604086659483,
"eval_loss": 1.1086950302124023,
"eval_runtime": 106.6462,
"eval_samples_per_second": 223.637,
"eval_steps_per_second": 6.995,
"step": 55000
},
{
"epoch": 16.681695220919746,
"grad_norm": 3.1719090938568115,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.8583,
"step": 55500
},
{
"epoch": 16.681695220919746,
"eval_accuracy": 0.8085530062732024,
"eval_loss": 1.1111265420913696,
"eval_runtime": 107.453,
"eval_samples_per_second": 221.958,
"eval_steps_per_second": 6.943,
"step": 55500
},
{
"epoch": 16.831980763450556,
"grad_norm": 2.867218494415283,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.8509,
"step": 56000
},
{
"epoch": 16.831980763450556,
"eval_accuracy": 0.8083466491211581,
"eval_loss": 1.1217560768127441,
"eval_runtime": 107.5379,
"eval_samples_per_second": 221.782,
"eval_steps_per_second": 6.937,
"step": 56000
},
{
"epoch": 16.982266305981366,
"grad_norm": 2.421694040298462,
"learning_rate": 4.35e-05,
"loss": 0.8591,
"step": 56500
},
{
"epoch": 16.982266305981366,
"eval_accuracy": 0.808431427546503,
"eval_loss": 1.1109532117843628,
"eval_runtime": 108.5465,
"eval_samples_per_second": 219.721,
"eval_steps_per_second": 6.873,
"step": 56500
},
{
"epoch": 17.132551848512172,
"grad_norm": 2.6324167251586914,
"learning_rate": 4.3e-05,
"loss": 0.8417,
"step": 57000
},
{
"epoch": 17.132551848512172,
"eval_accuracy": 0.8085933337773042,
"eval_loss": 1.1125506162643433,
"eval_runtime": 108.0877,
"eval_samples_per_second": 220.654,
"eval_steps_per_second": 6.902,
"step": 57000
},
{
"epoch": 17.282837391042982,
"grad_norm": 2.448883056640625,
"learning_rate": 4.25e-05,
"loss": 0.8511,
"step": 57500
},
{
"epoch": 17.282837391042982,
"eval_accuracy": 0.8093454793356915,
"eval_loss": 1.1108651161193848,
"eval_runtime": 108.0036,
"eval_samples_per_second": 220.826,
"eval_steps_per_second": 6.907,
"step": 57500
},
{
"epoch": 17.43312293357379,
"grad_norm": 2.647814989089966,
"learning_rate": 4.2e-05,
"loss": 0.8472,
"step": 58000
},
{
"epoch": 17.43312293357379,
"eval_accuracy": 0.8097202877059791,
"eval_loss": 1.1124111413955688,
"eval_runtime": 107.9075,
"eval_samples_per_second": 221.023,
"eval_steps_per_second": 6.913,
"step": 58000
},
{
"epoch": 17.583408476104598,
"grad_norm": 2.3744354248046875,
"learning_rate": 4.15e-05,
"loss": 0.8381,
"step": 58500
},
{
"epoch": 17.583408476104598,
"eval_accuracy": 0.8095948479389424,
"eval_loss": 1.1056084632873535,
"eval_runtime": 107.6903,
"eval_samples_per_second": 221.468,
"eval_steps_per_second": 6.927,
"step": 58500
},
{
"epoch": 17.733694018635408,
"grad_norm": 2.3030834197998047,
"learning_rate": 4.1e-05,
"loss": 0.8474,
"step": 59000
},
{
"epoch": 17.733694018635408,
"eval_accuracy": 0.8102527131048097,
"eval_loss": 1.1011704206466675,
"eval_runtime": 107.5969,
"eval_samples_per_second": 221.661,
"eval_steps_per_second": 6.933,
"step": 59000
},
{
"epoch": 17.883979561166214,
"grad_norm": 2.610208749771118,
"learning_rate": 4.05e-05,
"loss": 0.8456,
"step": 59500
},
{
"epoch": 17.883979561166214,
"eval_accuracy": 0.8108135572608609,
"eval_loss": 1.097122073173523,
"eval_runtime": 106.6046,
"eval_samples_per_second": 223.724,
"eval_steps_per_second": 6.998,
"step": 59500
},
{
"epoch": 18.034265103697024,
"grad_norm": 2.491633176803589,
"learning_rate": 4e-05,
"loss": 0.8367,
"step": 60000
},
{
"epoch": 18.034265103697024,
"eval_accuracy": 0.8112650064622723,
"eval_loss": 1.0913532972335815,
"eval_runtime": 107.2029,
"eval_samples_per_second": 222.475,
"eval_steps_per_second": 6.959,
"step": 60000
},
{
"epoch": 18.184550646227834,
"grad_norm": 2.270582437515259,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.8336,
"step": 60500
},
{
"epoch": 18.184550646227834,
"eval_accuracy": 0.8104727212172935,
"eval_loss": 1.110021710395813,
"eval_runtime": 106.0993,
"eval_samples_per_second": 224.789,
"eval_steps_per_second": 7.031,
"step": 60500
},
{
"epoch": 18.33483618875864,
"grad_norm": 2.4716830253601074,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.8404,
"step": 61000
},
{
"epoch": 18.33483618875864,
"eval_accuracy": 0.8114022782578146,
"eval_loss": 1.0959724187850952,
"eval_runtime": 107.1337,
"eval_samples_per_second": 222.619,
"eval_steps_per_second": 6.963,
"step": 61000
},
{
"epoch": 18.48512173128945,
"grad_norm": 2.679825782775879,
"learning_rate": 3.85e-05,
"loss": 0.8365,
"step": 61500
},
{
"epoch": 18.48512173128945,
"eval_accuracy": 0.8112698414020383,
"eval_loss": 1.0895620584487915,
"eval_runtime": 107.9425,
"eval_samples_per_second": 220.951,
"eval_steps_per_second": 6.911,
"step": 61500
},
{
"epoch": 18.63540727382026,
"grad_norm": 2.5028414726257324,
"learning_rate": 3.8e-05,
"loss": 0.8308,
"step": 62000
},
{
"epoch": 18.63540727382026,
"eval_accuracy": 0.8112445219475855,
"eval_loss": 1.098541498184204,
"eval_runtime": 108.1696,
"eval_samples_per_second": 220.487,
"eval_steps_per_second": 6.897,
"step": 62000
},
{
"epoch": 18.785692816351066,
"grad_norm": 2.4303503036499023,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.8308,
"step": 62500
},
{
"epoch": 18.785692816351066,
"eval_accuracy": 0.8120511999201301,
"eval_loss": 1.0968284606933594,
"eval_runtime": 107.5288,
"eval_samples_per_second": 221.801,
"eval_steps_per_second": 6.938,
"step": 62500
},
{
"epoch": 18.935978358881876,
"grad_norm": 2.9847395420074463,
"learning_rate": 3.7e-05,
"loss": 0.8312,
"step": 63000
},
{
"epoch": 18.935978358881876,
"eval_accuracy": 0.8121808312724796,
"eval_loss": 1.0884159803390503,
"eval_runtime": 107.9223,
"eval_samples_per_second": 220.992,
"eval_steps_per_second": 6.912,
"step": 63000
},
{
"epoch": 19.086263901412686,
"grad_norm": 2.302724838256836,
"learning_rate": 3.65e-05,
"loss": 0.8279,
"step": 63500
},
{
"epoch": 19.086263901412686,
"eval_accuracy": 0.8127240285256392,
"eval_loss": 1.0900928974151611,
"eval_runtime": 106.9119,
"eval_samples_per_second": 223.081,
"eval_steps_per_second": 6.978,
"step": 63500
},
{
"epoch": 19.236549443943492,
"grad_norm": 2.4187684059143066,
"learning_rate": 3.6e-05,
"loss": 0.8219,
"step": 64000
},
{
"epoch": 19.236549443943492,
"eval_accuracy": 0.8129403972187366,
"eval_loss": 1.0872172117233276,
"eval_runtime": 109.8336,
"eval_samples_per_second": 217.147,
"eval_steps_per_second": 6.792,
"step": 64000
},
{
"epoch": 19.3868349864743,
"grad_norm": 2.740501880645752,
"learning_rate": 3.55e-05,
"loss": 0.8266,
"step": 64500
},
{
"epoch": 19.3868349864743,
"eval_accuracy": 0.8114303723800741,
"eval_loss": 1.0987831354141235,
"eval_runtime": 109.7168,
"eval_samples_per_second": 217.378,
"eval_steps_per_second": 6.799,
"step": 64500
},
{
"epoch": 19.537120529005108,
"grad_norm": 2.326366424560547,
"learning_rate": 3.5e-05,
"loss": 0.8255,
"step": 65000
},
{
"epoch": 19.537120529005108,
"eval_accuracy": 0.8124286882301512,
"eval_loss": 1.0885217189788818,
"eval_runtime": 114.778,
"eval_samples_per_second": 207.792,
"eval_steps_per_second": 6.5,
"step": 65000
},
{
"epoch": 19.687406071535918,
"grad_norm": 3.1694321632385254,
"learning_rate": 3.45e-05,
"loss": 0.8185,
"step": 65500
},
{
"epoch": 19.687406071535918,
"eval_accuracy": 0.8129749253547676,
"eval_loss": 1.0836535692214966,
"eval_runtime": 114.5951,
"eval_samples_per_second": 208.124,
"eval_steps_per_second": 6.51,
"step": 65500
},
{
"epoch": 19.837691614066728,
"grad_norm": 2.6415817737579346,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.8219,
"step": 66000
},
{
"epoch": 19.837691614066728,
"eval_accuracy": 0.8122496169754481,
"eval_loss": 1.0868114233016968,
"eval_runtime": 113.7304,
"eval_samples_per_second": 209.707,
"eval_steps_per_second": 6.559,
"step": 66000
},
{
"epoch": 19.987977156597534,
"grad_norm": 2.567044496536255,
"learning_rate": 3.35e-05,
"loss": 0.8214,
"step": 66500
},
{
"epoch": 19.987977156597534,
"eval_accuracy": 0.8135328500351103,
"eval_loss": 1.0849970579147339,
"eval_runtime": 114.2096,
"eval_samples_per_second": 208.827,
"eval_steps_per_second": 6.532,
"step": 66500
},
{
"epoch": 20.138262699128344,
"grad_norm": 2.475660562515259,
"learning_rate": 3.3e-05,
"loss": 0.8123,
"step": 67000
},
{
"epoch": 20.138262699128344,
"eval_accuracy": 0.8127171094527156,
"eval_loss": 1.0919703245162964,
"eval_runtime": 110.4581,
"eval_samples_per_second": 215.919,
"eval_steps_per_second": 6.754,
"step": 67000
},
{
"epoch": 20.288548241659154,
"grad_norm": 2.9205057621002197,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.8146,
"step": 67500
},
{
"epoch": 20.288548241659154,
"eval_accuracy": 0.8139352319239987,
"eval_loss": 1.0828359127044678,
"eval_runtime": 110.5729,
"eval_samples_per_second": 215.695,
"eval_steps_per_second": 6.747,
"step": 67500
},
{
"epoch": 20.43883378418996,
"grad_norm": 2.775470018386841,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.8117,
"step": 68000
},
{
"epoch": 20.43883378418996,
"eval_accuracy": 0.8144743916913453,
"eval_loss": 1.0843496322631836,
"eval_runtime": 110.9504,
"eval_samples_per_second": 214.961,
"eval_steps_per_second": 6.724,
"step": 68000
},
{
"epoch": 20.58911932672077,
"grad_norm": 2.8596460819244385,
"learning_rate": 3.15e-05,
"loss": 0.8142,
"step": 68500
},
{
"epoch": 20.58911932672077,
"eval_accuracy": 0.8145867898001746,
"eval_loss": 1.0775424242019653,
"eval_runtime": 110.2042,
"eval_samples_per_second": 216.416,
"eval_steps_per_second": 6.769,
"step": 68500
},
{
"epoch": 20.73940486925158,
"grad_norm": 2.5671284198760986,
"learning_rate": 3.1e-05,
"loss": 0.8176,
"step": 69000
},
{
"epoch": 20.73940486925158,
"eval_accuracy": 0.8140961234786387,
"eval_loss": 1.0756142139434814,
"eval_runtime": 110.7907,
"eval_samples_per_second": 215.271,
"eval_steps_per_second": 6.733,
"step": 69000
},
{
"epoch": 20.889690411782386,
"grad_norm": 2.549713373184204,
"learning_rate": 3.05e-05,
"loss": 0.813,
"step": 69500
},
{
"epoch": 20.889690411782386,
"eval_accuracy": 0.8145543791131244,
"eval_loss": 1.0741270780563354,
"eval_runtime": 113.2044,
"eval_samples_per_second": 210.681,
"eval_steps_per_second": 6.59,
"step": 69500
},
{
"epoch": 21.039975954313196,
"grad_norm": 2.433366060256958,
"learning_rate": 3e-05,
"loss": 0.8044,
"step": 70000
},
{
"epoch": 21.039975954313196,
"eval_accuracy": 0.8138238331972251,
"eval_loss": 1.0890129804611206,
"eval_runtime": 113.5064,
"eval_samples_per_second": 210.12,
"eval_steps_per_second": 6.572,
"step": 70000
},
{
"epoch": 21.190261496844002,
"grad_norm": 2.672717571258545,
"learning_rate": 2.95e-05,
"loss": 0.8034,
"step": 70500
},
{
"epoch": 21.190261496844002,
"eval_accuracy": 0.8146000723609511,
"eval_loss": 1.0757637023925781,
"eval_runtime": 109.4526,
"eval_samples_per_second": 217.902,
"eval_steps_per_second": 6.816,
"step": 70500
},
{
"epoch": 21.340547039374812,
"grad_norm": 2.7107882499694824,
"learning_rate": 2.9e-05,
"loss": 0.8007,
"step": 71000
},
{
"epoch": 21.340547039374812,
"eval_accuracy": 0.815981095746543,
"eval_loss": 1.074793815612793,
"eval_runtime": 113.0561,
"eval_samples_per_second": 210.957,
"eval_steps_per_second": 6.598,
"step": 71000
},
{
"epoch": 21.49083258190562,
"grad_norm": 2.419224262237549,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.8009,
"step": 71500
},
{
"epoch": 21.49083258190562,
"eval_accuracy": 0.8157551774698867,
"eval_loss": 1.0713990926742554,
"eval_runtime": 114.1603,
"eval_samples_per_second": 208.917,
"eval_steps_per_second": 6.535,
"step": 71500
},
{
"epoch": 21.641118124436428,
"grad_norm": 2.68849778175354,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.8042,
"step": 72000
},
{
"epoch": 21.641118124436428,
"eval_accuracy": 0.816240857712766,
"eval_loss": 1.0804829597473145,
"eval_runtime": 113.3941,
"eval_samples_per_second": 210.328,
"eval_steps_per_second": 6.579,
"step": 72000
},
{
"epoch": 21.791403666967238,
"grad_norm": 2.3715298175811768,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.7971,
"step": 72500
},
{
"epoch": 21.791403666967238,
"eval_accuracy": 0.8154816358162141,
"eval_loss": 1.0725679397583008,
"eval_runtime": 114.0456,
"eval_samples_per_second": 209.127,
"eval_steps_per_second": 6.541,
"step": 72500
},
{
"epoch": 21.941689209498048,
"grad_norm": 2.6499125957489014,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.807,
"step": 73000
},
{
"epoch": 21.941689209498048,
"eval_accuracy": 0.8168061183380335,
"eval_loss": 1.0588874816894531,
"eval_runtime": 112.5964,
"eval_samples_per_second": 211.819,
"eval_steps_per_second": 6.625,
"step": 73000
},
{
"epoch": 22.091974752028854,
"grad_norm": 2.766451358795166,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.7973,
"step": 73500
},
{
"epoch": 22.091974752028854,
"eval_accuracy": 0.8168862140868284,
"eval_loss": 1.063796877861023,
"eval_runtime": 113.9911,
"eval_samples_per_second": 209.227,
"eval_steps_per_second": 6.544,
"step": 73500
},
{
"epoch": 22.242260294559664,
"grad_norm": 2.5453474521636963,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.7915,
"step": 74000
},
{
"epoch": 22.242260294559664,
"eval_accuracy": 0.8162707303528417,
"eval_loss": 1.068402886390686,
"eval_runtime": 113.1917,
"eval_samples_per_second": 210.704,
"eval_steps_per_second": 6.591,
"step": 74000
},
{
"epoch": 22.392545837090474,
"grad_norm": 2.449525833129883,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.8019,
"step": 74500
},
{
"epoch": 22.392545837090474,
"eval_accuracy": 0.8168465033060288,
"eval_loss": 1.071266531944275,
"eval_runtime": 114.1644,
"eval_samples_per_second": 208.909,
"eval_steps_per_second": 6.534,
"step": 74500
},
{
"epoch": 22.54283137962128,
"grad_norm": 2.782717704772949,
"learning_rate": 2.5e-05,
"loss": 0.7959,
"step": 75000
},
{
"epoch": 22.54283137962128,
"eval_accuracy": 0.8176195224595183,
"eval_loss": 1.0642682313919067,
"eval_runtime": 112.938,
"eval_samples_per_second": 211.178,
"eval_steps_per_second": 6.605,
"step": 75000
},
{
"epoch": 22.69311692215209,
"grad_norm": 2.754309892654419,
"learning_rate": 2.45e-05,
"loss": 0.7905,
"step": 75500
},
{
"epoch": 22.69311692215209,
"eval_accuracy": 0.8177573368082611,
"eval_loss": 1.0715863704681396,
"eval_runtime": 113.2687,
"eval_samples_per_second": 210.561,
"eval_steps_per_second": 6.586,
"step": 75500
},
{
"epoch": 22.843402464682896,
"grad_norm": 2.665132999420166,
"learning_rate": 2.4e-05,
"loss": 0.7894,
"step": 76000
},
{
"epoch": 22.843402464682896,
"eval_accuracy": 0.8184662117931626,
"eval_loss": 1.0566191673278809,
"eval_runtime": 114.3215,
"eval_samples_per_second": 208.622,
"eval_steps_per_second": 6.525,
"step": 76000
},
{
"epoch": 22.993688007213706,
"grad_norm": 2.2912895679473877,
"learning_rate": 2.35e-05,
"loss": 0.789,
"step": 76500
},
{
"epoch": 22.993688007213706,
"eval_accuracy": 0.8173126447012466,
"eval_loss": 1.0590641498565674,
"eval_runtime": 114.0949,
"eval_samples_per_second": 209.037,
"eval_steps_per_second": 6.538,
"step": 76500
},
{
"epoch": 23.143973549744516,
"grad_norm": 2.7320480346679688,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.7859,
"step": 77000
},
{
"epoch": 23.143973549744516,
"eval_accuracy": 0.8181692377590023,
"eval_loss": 1.0568209886550903,
"eval_runtime": 114.0712,
"eval_samples_per_second": 209.08,
"eval_steps_per_second": 6.54,
"step": 77000
},
{
"epoch": 23.294259092275322,
"grad_norm": 2.5960936546325684,
"learning_rate": 2.25e-05,
"loss": 0.7894,
"step": 77500
},
{
"epoch": 23.294259092275322,
"eval_accuracy": 0.8178769275591534,
"eval_loss": 1.061540961265564,
"eval_runtime": 113.9113,
"eval_samples_per_second": 209.373,
"eval_steps_per_second": 6.549,
"step": 77500
},
{
"epoch": 23.44454463480613,
"grad_norm": 2.435558557510376,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.7887,
"step": 78000
},
{
"epoch": 23.44454463480613,
"eval_accuracy": 0.8177420807085761,
"eval_loss": 1.0617201328277588,
"eval_runtime": 113.4776,
"eval_samples_per_second": 210.174,
"eval_steps_per_second": 6.574,
"step": 78000
},
{
"epoch": 23.59483017733694,
"grad_norm": 2.3692948818206787,
"learning_rate": 2.15e-05,
"loss": 0.7826,
"step": 78500
},
{
"epoch": 23.59483017733694,
"eval_accuracy": 0.8180053577064287,
"eval_loss": 1.0567620992660522,
"eval_runtime": 113.7057,
"eval_samples_per_second": 209.752,
"eval_steps_per_second": 6.561,
"step": 78500
},
{
"epoch": 23.745115719867748,
"grad_norm": 2.5609283447265625,
"learning_rate": 2.1e-05,
"loss": 0.7885,
"step": 79000
},
{
"epoch": 23.745115719867748,
"eval_accuracy": 0.8188002648627605,
"eval_loss": 1.0632256269454956,
"eval_runtime": 113.6739,
"eval_samples_per_second": 209.811,
"eval_steps_per_second": 6.563,
"step": 79000
},
{
"epoch": 23.895401262398558,
"grad_norm": 2.3372645378112793,
"learning_rate": 2.05e-05,
"loss": 0.7883,
"step": 79500
},
{
"epoch": 23.895401262398558,
"eval_accuracy": 0.8187176321623402,
"eval_loss": 1.066706657409668,
"eval_runtime": 114.0886,
"eval_samples_per_second": 209.048,
"eval_steps_per_second": 6.539,
"step": 79500
},
{
"epoch": 24.045686804929367,
"grad_norm": 2.3112826347351074,
"learning_rate": 2e-05,
"loss": 0.783,
"step": 80000
},
{
"epoch": 24.045686804929367,
"eval_accuracy": 0.818846919349616,
"eval_loss": 1.0611591339111328,
"eval_runtime": 113.1339,
"eval_samples_per_second": 210.812,
"eval_steps_per_second": 6.594,
"step": 80000
},
{
"epoch": 24.195972347460174,
"grad_norm": 2.514390230178833,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.7804,
"step": 80500
},
{
"epoch": 24.195972347460174,
"eval_accuracy": 0.818812757678464,
"eval_loss": 1.0573077201843262,
"eval_runtime": 114.2005,
"eval_samples_per_second": 208.843,
"eval_steps_per_second": 6.532,
"step": 80500
},
{
"epoch": 24.346257889990984,
"grad_norm": 2.4737348556518555,
"learning_rate": 1.9e-05,
"loss": 0.7811,
"step": 81000
},
{
"epoch": 24.346257889990984,
"eval_accuracy": 0.8192913294066332,
"eval_loss": 1.0586949586868286,
"eval_runtime": 113.7459,
"eval_samples_per_second": 209.678,
"eval_steps_per_second": 6.558,
"step": 81000
},
{
"epoch": 24.49654343252179,
"grad_norm": 2.288757562637329,
"learning_rate": 1.85e-05,
"loss": 0.7767,
"step": 81500
},
{
"epoch": 24.49654343252179,
"eval_accuracy": 0.8197264062916556,
"eval_loss": 1.0525128841400146,
"eval_runtime": 113.6132,
"eval_samples_per_second": 209.923,
"eval_steps_per_second": 6.566,
"step": 81500
},
{
"epoch": 24.6468289750526,
"grad_norm": 2.4246654510498047,
"learning_rate": 1.8e-05,
"loss": 0.7803,
"step": 82000
},
{
"epoch": 24.6468289750526,
"eval_accuracy": 0.8195798531764643,
"eval_loss": 1.0466234683990479,
"eval_runtime": 114.7779,
"eval_samples_per_second": 207.793,
"eval_steps_per_second": 6.5,
"step": 82000
},
{
"epoch": 24.79711451758341,
"grad_norm": 3.0004007816314697,
"learning_rate": 1.75e-05,
"loss": 0.7688,
"step": 82500
},
{
"epoch": 24.79711451758341,
"eval_accuracy": 0.8197989075740256,
"eval_loss": 1.0529950857162476,
"eval_runtime": 113.2072,
"eval_samples_per_second": 210.676,
"eval_steps_per_second": 6.59,
"step": 82500
},
{
"epoch": 24.947400060114216,
"grad_norm": 2.476900577545166,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.7734,
"step": 83000
},
{
"epoch": 24.947400060114216,
"eval_accuracy": 0.8198411078292097,
"eval_loss": 1.0492240190505981,
"eval_runtime": 114.2362,
"eval_samples_per_second": 208.778,
"eval_steps_per_second": 6.53,
"step": 83000
},
{
"epoch": 25.097685602645026,
"grad_norm": 2.6050217151641846,
"learning_rate": 1.65e-05,
"loss": 0.7741,
"step": 83500
},
{
"epoch": 25.097685602645026,
"eval_accuracy": 0.8199209388676126,
"eval_loss": 1.0443811416625977,
"eval_runtime": 115.0076,
"eval_samples_per_second": 207.378,
"eval_steps_per_second": 6.487,
"step": 83500
},
{
"epoch": 25.247971145175836,
"grad_norm": 2.845093011856079,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.768,
"step": 84000
},
{
"epoch": 25.247971145175836,
"eval_accuracy": 0.8203943019525041,
"eval_loss": 1.0556350946426392,
"eval_runtime": 115.581,
"eval_samples_per_second": 206.349,
"eval_steps_per_second": 6.454,
"step": 84000
},
{
"epoch": 25.398256687706642,
"grad_norm": 2.8715054988861084,
"learning_rate": 1.55e-05,
"loss": 0.7731,
"step": 84500
},
{
"epoch": 25.398256687706642,
"eval_accuracy": 0.8204570484351956,
"eval_loss": 1.0443503856658936,
"eval_runtime": 114.2662,
"eval_samples_per_second": 208.723,
"eval_steps_per_second": 6.529,
"step": 84500
},
{
"epoch": 25.54854223023745,
"grad_norm": 2.2550415992736816,
"learning_rate": 1.5e-05,
"loss": 0.7675,
"step": 85000
},
{
"epoch": 25.54854223023745,
"eval_accuracy": 0.8200469636032075,
"eval_loss": 1.0414886474609375,
"eval_runtime": 114.9948,
"eval_samples_per_second": 207.401,
"eval_steps_per_second": 6.487,
"step": 85000
},
{
"epoch": 25.69882777276826,
"grad_norm": 2.238607168197632,
"learning_rate": 1.45e-05,
"loss": 0.7675,
"step": 85500
},
{
"epoch": 25.69882777276826,
"eval_accuracy": 0.8207652275282068,
"eval_loss": 1.0444408655166626,
"eval_runtime": 115.1961,
"eval_samples_per_second": 207.038,
"eval_steps_per_second": 6.476,
"step": 85500
},
{
"epoch": 25.849113315299068,
"grad_norm": 2.242630958557129,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.7655,
"step": 86000
},
{
"epoch": 25.849113315299068,
"eval_accuracy": 0.8202594142538011,
"eval_loss": 1.0470616817474365,
"eval_runtime": 116.2229,
"eval_samples_per_second": 205.209,
"eval_steps_per_second": 6.419,
"step": 86000
},
{
"epoch": 25.999398857829878,
"grad_norm": 2.5453295707702637,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.7678,
"step": 86500
},
{
"epoch": 25.999398857829878,
"eval_accuracy": 0.8200539868874701,
"eval_loss": 1.0508933067321777,
"eval_runtime": 114.7052,
"eval_samples_per_second": 207.924,
"eval_steps_per_second": 6.504,
"step": 86500
},
{
"epoch": 26.149684400360684,
"grad_norm": 2.391352415084839,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.7653,
"step": 87000
},
{
"epoch": 26.149684400360684,
"eval_accuracy": 0.8205590231388606,
"eval_loss": 1.0484099388122559,
"eval_runtime": 115.0871,
"eval_samples_per_second": 207.234,
"eval_steps_per_second": 6.482,
"step": 87000
},
{
"epoch": 26.299969942891494,
"grad_norm": 2.5071208477020264,
"learning_rate": 1.25e-05,
"loss": 0.7679,
"step": 87500
},
{
"epoch": 26.299969942891494,
"eval_accuracy": 0.8217573625905127,
"eval_loss": 1.0400173664093018,
"eval_runtime": 113.1376,
"eval_samples_per_second": 210.805,
"eval_steps_per_second": 6.594,
"step": 87500
},
{
"epoch": 26.450255485422304,
"grad_norm": 2.5453133583068848,
"learning_rate": 1.2e-05,
"loss": 0.7656,
"step": 88000
},
{
"epoch": 26.450255485422304,
"eval_accuracy": 0.8213452575807276,
"eval_loss": 1.0516611337661743,
"eval_runtime": 113.3943,
"eval_samples_per_second": 210.328,
"eval_steps_per_second": 6.579,
"step": 88000
},
{
"epoch": 26.60054102795311,
"grad_norm": 2.509098529815674,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.7593,
"step": 88500
},
{
"epoch": 26.60054102795311,
"eval_accuracy": 0.8213436048487595,
"eval_loss": 1.0389631986618042,
"eval_runtime": 116.4627,
"eval_samples_per_second": 204.787,
"eval_steps_per_second": 6.405,
"step": 88500
},
{
"epoch": 26.75082657048392,
"grad_norm": 2.835665464401245,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.7606,
"step": 89000
},
{
"epoch": 26.75082657048392,
"eval_accuracy": 0.8212840552795833,
"eval_loss": 1.0403192043304443,
"eval_runtime": 114.9351,
"eval_samples_per_second": 207.508,
"eval_steps_per_second": 6.491,
"step": 89000
},
{
"epoch": 26.90111211301473,
"grad_norm": 2.5455708503723145,
"learning_rate": 1.05e-05,
"loss": 0.7578,
"step": 89500
},
{
"epoch": 26.90111211301473,
"eval_accuracy": 0.821311120726958,
"eval_loss": 1.036601185798645,
"eval_runtime": 114.3227,
"eval_samples_per_second": 208.62,
"eval_steps_per_second": 6.525,
"step": 89500
},
{
"epoch": 27.051397655545536,
"grad_norm": 2.821378231048584,
"learning_rate": 1e-05,
"loss": 0.765,
"step": 90000
},
{
"epoch": 27.051397655545536,
"eval_accuracy": 0.8221304808698725,
"eval_loss": 1.0333795547485352,
"eval_runtime": 115.4024,
"eval_samples_per_second": 206.668,
"eval_steps_per_second": 6.464,
"step": 90000
},
{
"epoch": 27.201683198076346,
"grad_norm": 2.59919810295105,
"learning_rate": 9.5e-06,
"loss": 0.7589,
"step": 90500
},
{
"epoch": 27.201683198076346,
"eval_accuracy": 0.8222628454875403,
"eval_loss": 1.033319354057312,
"eval_runtime": 115.8055,
"eval_samples_per_second": 205.949,
"eval_steps_per_second": 6.442,
"step": 90500
},
{
"epoch": 27.351968740607152,
"grad_norm": 2.601203203201294,
"learning_rate": 9e-06,
"loss": 0.7563,
"step": 91000
},
{
"epoch": 27.351968740607152,
"eval_accuracy": 0.8218045068983223,
"eval_loss": 1.046338677406311,
"eval_runtime": 116.2837,
"eval_samples_per_second": 205.102,
"eval_steps_per_second": 6.415,
"step": 91000
},
{
"epoch": 27.50225428313796,
"grad_norm": 2.8450610637664795,
"learning_rate": 8.500000000000002e-06,
"loss": 0.7559,
"step": 91500
},
{
"epoch": 27.50225428313796,
"eval_accuracy": 0.8218716317162797,
"eval_loss": 1.044384241104126,
"eval_runtime": 115.6484,
"eval_samples_per_second": 206.229,
"eval_steps_per_second": 6.451,
"step": 91500
},
{
"epoch": 27.65253982566877,
"grad_norm": 2.7283740043640137,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7586,
"step": 92000
},
{
"epoch": 27.65253982566877,
"eval_accuracy": 0.8221257250457026,
"eval_loss": 1.0296134948730469,
"eval_runtime": 115.214,
"eval_samples_per_second": 207.006,
"eval_steps_per_second": 6.475,
"step": 92000
},
{
"epoch": 27.802825368199578,
"grad_norm": 2.795022964477539,
"learning_rate": 7.5e-06,
"loss": 0.7585,
"step": 92500
},
{
"epoch": 27.802825368199578,
"eval_accuracy": 0.82162242799983,
"eval_loss": 1.045117735862732,
"eval_runtime": 113.2778,
"eval_samples_per_second": 210.544,
"eval_steps_per_second": 6.586,
"step": 92500
},
{
"epoch": 27.953110910730388,
"grad_norm": 2.52536678314209,
"learning_rate": 7.000000000000001e-06,
"loss": 0.7548,
"step": 93000
},
{
"epoch": 27.953110910730388,
"eval_accuracy": 0.8225299354468154,
"eval_loss": 1.0380265712738037,
"eval_runtime": 113.5711,
"eval_samples_per_second": 210.001,
"eval_steps_per_second": 6.569,
"step": 93000
},
{
"epoch": 28.103396453261198,
"grad_norm": 2.3631057739257812,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.7542,
"step": 93500
},
{
"epoch": 28.103396453261198,
"eval_accuracy": 0.8229969642015583,
"eval_loss": 1.0490919351577759,
"eval_runtime": 112.4838,
"eval_samples_per_second": 212.031,
"eval_steps_per_second": 6.632,
"step": 93500
},
{
"epoch": 28.253681995792004,
"grad_norm": 2.2699172496795654,
"learning_rate": 6e-06,
"loss": 0.7522,
"step": 94000
},
{
"epoch": 28.253681995792004,
"eval_accuracy": 0.8230864296098176,
"eval_loss": 1.0274651050567627,
"eval_runtime": 111.8716,
"eval_samples_per_second": 213.191,
"eval_steps_per_second": 6.668,
"step": 94000
},
{
"epoch": 28.403967538322814,
"grad_norm": 2.44769287109375,
"learning_rate": 5.500000000000001e-06,
"loss": 0.7569,
"step": 94500
},
{
"epoch": 28.403967538322814,
"eval_accuracy": 0.8234413637256175,
"eval_loss": 1.0298686027526855,
"eval_runtime": 113.5377,
"eval_samples_per_second": 210.062,
"eval_steps_per_second": 6.571,
"step": 94500
},
{
"epoch": 28.554253080853623,
"grad_norm": 2.7103466987609863,
"learning_rate": 5e-06,
"loss": 0.7536,
"step": 95000
},
{
"epoch": 28.554253080853623,
"eval_accuracy": 0.8224712064035926,
"eval_loss": 1.034175992012024,
"eval_runtime": 113.1535,
"eval_samples_per_second": 210.776,
"eval_steps_per_second": 6.593,
"step": 95000
},
{
"epoch": 28.70453862338443,
"grad_norm": 2.594381809234619,
"learning_rate": 4.5e-06,
"loss": 0.7547,
"step": 95500
},
{
"epoch": 28.70453862338443,
"eval_accuracy": 0.8231576514254546,
"eval_loss": 1.0324558019638062,
"eval_runtime": 111.2443,
"eval_samples_per_second": 214.393,
"eval_steps_per_second": 6.706,
"step": 95500
},
{
"epoch": 28.85482416591524,
"grad_norm": 2.623845100402832,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7529,
"step": 96000
},
{
"epoch": 28.85482416591524,
"eval_accuracy": 0.8241724336625226,
"eval_loss": 1.0312702655792236,
"eval_runtime": 115.266,
"eval_samples_per_second": 206.913,
"eval_steps_per_second": 6.472,
"step": 96000
},
{
"epoch": 29.005109708446046,
"grad_norm": 2.706256628036499,
"learning_rate": 3.5000000000000004e-06,
"loss": 0.7461,
"step": 96500
},
{
"epoch": 29.005109708446046,
"eval_accuracy": 0.8230462104005479,
"eval_loss": 1.0389125347137451,
"eval_runtime": 115.814,
"eval_samples_per_second": 205.934,
"eval_steps_per_second": 6.441,
"step": 96500
},
{
"epoch": 29.155395250976856,
"grad_norm": 2.697993278503418,
"learning_rate": 3e-06,
"loss": 0.7513,
"step": 97000
},
{
"epoch": 29.155395250976856,
"eval_accuracy": 0.8236184494498082,
"eval_loss": 1.036423921585083,
"eval_runtime": 115.3283,
"eval_samples_per_second": 206.801,
"eval_steps_per_second": 6.468,
"step": 97000
},
{
"epoch": 29.305680793507666,
"grad_norm": 2.68867826461792,
"learning_rate": 2.5e-06,
"loss": 0.7494,
"step": 97500
},
{
"epoch": 29.305680793507666,
"eval_accuracy": 0.82393844898435,
"eval_loss": 1.0242578983306885,
"eval_runtime": 116.3698,
"eval_samples_per_second": 204.95,
"eval_steps_per_second": 6.411,
"step": 97500
},
{
"epoch": 29.455966336038472,
"grad_norm": 2.259347915649414,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7438,
"step": 98000
},
{
"epoch": 29.455966336038472,
"eval_accuracy": 0.8236859969210806,
"eval_loss": 1.0227982997894287,
"eval_runtime": 115.3714,
"eval_samples_per_second": 206.724,
"eval_steps_per_second": 6.466,
"step": 98000
},
{
"epoch": 29.60625187856928,
"grad_norm": 2.8278329372406006,
"learning_rate": 1.5e-06,
"loss": 0.7499,
"step": 98500
},
{
"epoch": 29.60625187856928,
"eval_accuracy": 0.8236034478162941,
"eval_loss": 1.022267460823059,
"eval_runtime": 114.5023,
"eval_samples_per_second": 208.293,
"eval_steps_per_second": 6.515,
"step": 98500
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 31,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.31746763541971e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}