diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,83 +1,3602 @@ -{ - "best_metric": 0.5882352941176471, - "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-papsmear/checkpoint-19", - "epoch": 2.769230769230769, - "eval_steps": 500, - "global_step": 27, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.9230769230769231, - "eval_accuracy": 0.35294117647058826, - "eval_loss": 1.424035668373108, - "eval_runtime": 28.6569, - "eval_samples_per_second": 4.746, - "eval_steps_per_second": 0.174, - "step": 9 - }, - { - "epoch": 1.0256410256410255, - "grad_norm": 14.609389305114746, - "learning_rate": 3.541666666666667e-05, - "loss": 1.5837, - "step": 10 - }, - { - "epoch": 1.9487179487179487, - "eval_accuracy": 0.5882352941176471, - "eval_loss": 1.1057605743408203, - "eval_runtime": 29.2045, - "eval_samples_per_second": 4.657, - "eval_steps_per_second": 0.171, - "step": 19 - }, - { - "epoch": 2.051282051282051, - "grad_norm": 20.33160400390625, - "learning_rate": 1.4583333333333335e-05, - "loss": 1.245, - "step": 20 - }, - { - "epoch": 2.769230769230769, - "eval_accuracy": 0.5882352941176471, - "eval_loss": 1.0397424697875977, - "eval_runtime": 28.5486, - "eval_samples_per_second": 4.764, - "eval_steps_per_second": 0.175, - "step": 27 - }, - { - "epoch": 2.769230769230769, - "step": 27, - "total_flos": 8.47186783811666e+16, - "train_loss": 1.3284762523792408, - "train_runtime": 800.6992, - "train_samples_per_second": 4.586, - "train_steps_per_second": 0.034 - } - ], - "logging_steps": 10, - "max_steps": 27, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 8.47186783811666e+16, - "train_batch_size": 32, - "trial_name": null, - "trial_params": null -} +{ + "best_metric": 0.9779411764705882, + "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-papsmear\\checkpoint-2448", + "epoch": 99.34640522875817, + "eval_steps": 500, + "global_step": 3800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.26143790849673204, + "grad_norm": 19.404264450073242, + "learning_rate": 1.3157894736842106e-06, + "loss": 1.8243, + "step": 10 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 9.874568939208984, + "learning_rate": 2.631578947368421e-06, + "loss": 1.7542, + "step": 20 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 13.61699390411377, + "learning_rate": 3.9473684210526315e-06, + "loss": 1.7081, + "step": 30 + }, + { + "epoch": 0.9934640522875817, + "eval_accuracy": 0.2867647058823529, + "eval_loss": 1.6642274856567383, + "eval_runtime": 19.1091, + "eval_samples_per_second": 7.117, + "eval_steps_per_second": 0.89, + "step": 38 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 17.95810317993164, + "learning_rate": 5.263157894736842e-06, + "loss": 1.6316, + "step": 40 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 11.760519027709961, + "learning_rate": 6.578947368421053e-06, + "loss": 1.6191, + "step": 50 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 12.139671325683594, + "learning_rate": 7.894736842105263e-06, + "loss": 1.514, + "step": 60 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 11.897443771362305, + "learning_rate": 9.210526315789474e-06, + "loss": 1.4025, + "step": 70 + }, + { + "epoch": 1.9869281045751634, + "eval_accuracy": 0.4632352941176471, + "eval_loss": 1.3760590553283691, + "eval_runtime": 16.8545, + "eval_samples_per_second": 8.069, + "eval_steps_per_second": 1.009, + "step": 76 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 14.211647987365723, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.341, + "step": 80 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 21.328588485717773, + "learning_rate": 1.1842105263157895e-05, + "loss": 1.2617, + "step": 90 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 24.131996154785156, + "learning_rate": 1.3157894736842106e-05, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 23.461227416992188, + "learning_rate": 1.4473684210526317e-05, + "loss": 1.0918, + "step": 110 + }, + { + "epoch": 2.980392156862745, + "eval_accuracy": 0.5514705882352942, + "eval_loss": 1.0276451110839844, + "eval_runtime": 17.5433, + "eval_samples_per_second": 7.752, + "eval_steps_per_second": 0.969, + "step": 114 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 44.0300407409668, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.9044, + "step": 120 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 23.61319923400879, + "learning_rate": 1.7105263157894737e-05, + "loss": 0.9409, + "step": 130 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 27.572128295898438, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.9152, + "step": 140 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 20.785051345825195, + "learning_rate": 1.9736842105263158e-05, + "loss": 0.8051, + "step": 150 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.6691176470588235, + "eval_loss": 0.7678546905517578, + "eval_runtime": 17.2269, + "eval_samples_per_second": 7.895, + "eval_steps_per_second": 0.987, + "step": 153 + }, + { + "epoch": 4.183006535947713, + "grad_norm": 32.00216293334961, + "learning_rate": 2.105263157894737e-05, + "loss": 0.7821, + "step": 160 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 23.564285278320312, + "learning_rate": 2.236842105263158e-05, + "loss": 0.8036, + "step": 170 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 21.403562545776367, + "learning_rate": 2.368421052631579e-05, + "loss": 0.7355, + "step": 180 + }, + { + "epoch": 4.967320261437909, + "grad_norm": 31.243640899658203, + "learning_rate": 2.5e-05, + "loss": 0.635, + "step": 190 + }, + { + "epoch": 4.993464052287582, + "eval_accuracy": 0.7867647058823529, + "eval_loss": 0.5927847623825073, + "eval_runtime": 17.4003, + "eval_samples_per_second": 7.816, + "eval_steps_per_second": 0.977, + "step": 191 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 23.90205192565918, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.6363, + "step": 200 + }, + { + "epoch": 5.490196078431373, + "grad_norm": 23.38309669494629, + "learning_rate": 2.7631578947368426e-05, + "loss": 0.6285, + "step": 210 + }, + { + "epoch": 5.751633986928105, + "grad_norm": 41.387149810791016, + "learning_rate": 2.8947368421052634e-05, + "loss": 0.6051, + "step": 220 + }, + { + "epoch": 5.9869281045751634, + "eval_accuracy": 0.75, + "eval_loss": 0.695731520652771, + "eval_runtime": 17.5363, + "eval_samples_per_second": 7.755, + "eval_steps_per_second": 0.969, + "step": 229 + }, + { + "epoch": 6.0130718954248366, + "grad_norm": 33.84821319580078, + "learning_rate": 3.0263157894736844e-05, + "loss": 0.6503, + "step": 230 + }, + { + "epoch": 6.2745098039215685, + "grad_norm": 18.2890682220459, + "learning_rate": 3.157894736842105e-05, + "loss": 0.4905, + "step": 240 + }, + { + "epoch": 6.5359477124183005, + "grad_norm": 25.626060485839844, + "learning_rate": 3.289473684210527e-05, + "loss": 0.5262, + "step": 250 + }, + { + "epoch": 6.7973856209150325, + "grad_norm": 28.431270599365234, + "learning_rate": 3.421052631578947e-05, + "loss": 0.5539, + "step": 260 + }, + { + "epoch": 6.980392156862745, + "eval_accuracy": 0.7941176470588235, + "eval_loss": 0.5016477108001709, + "eval_runtime": 17.3512, + "eval_samples_per_second": 7.838, + "eval_steps_per_second": 0.98, + "step": 267 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 21.074764251708984, + "learning_rate": 3.5526315789473684e-05, + "loss": 0.4807, + "step": 270 + }, + { + "epoch": 7.3202614379084965, + "grad_norm": 21.632251739501953, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.4704, + "step": 280 + }, + { + "epoch": 7.5816993464052285, + "grad_norm": 41.86575698852539, + "learning_rate": 3.815789473684211e-05, + "loss": 0.5141, + "step": 290 + }, + { + "epoch": 7.8431372549019605, + "grad_norm": 20.23293685913086, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.4683, + "step": 300 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.8235294117647058, + "eval_loss": 0.4732811748981476, + "eval_runtime": 17.0473, + "eval_samples_per_second": 7.978, + "eval_steps_per_second": 0.997, + "step": 306 + }, + { + "epoch": 8.104575163398692, + "grad_norm": 67.42210388183594, + "learning_rate": 4.078947368421053e-05, + "loss": 0.451, + "step": 310 + }, + { + "epoch": 8.366013071895425, + "grad_norm": 22.807098388671875, + "learning_rate": 4.210526315789474e-05, + "loss": 0.4019, + "step": 320 + }, + { + "epoch": 8.627450980392156, + "grad_norm": 31.961091995239258, + "learning_rate": 4.342105263157895e-05, + "loss": 0.4663, + "step": 330 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 26.965513229370117, + "learning_rate": 4.473684210526316e-05, + "loss": 0.4153, + "step": 340 + }, + { + "epoch": 8.993464052287582, + "eval_accuracy": 0.8529411764705882, + "eval_loss": 0.4834950268268585, + "eval_runtime": 16.944, + "eval_samples_per_second": 8.026, + "eval_steps_per_second": 1.003, + "step": 344 + }, + { + "epoch": 9.15032679738562, + "grad_norm": 21.733226776123047, + "learning_rate": 4.605263157894737e-05, + "loss": 0.473, + "step": 350 + }, + { + "epoch": 9.411764705882353, + "grad_norm": 17.1552734375, + "learning_rate": 4.736842105263158e-05, + "loss": 0.3912, + "step": 360 + }, + { + "epoch": 9.673202614379084, + "grad_norm": 39.66945266723633, + "learning_rate": 4.868421052631579e-05, + "loss": 0.465, + "step": 370 + }, + { + "epoch": 9.934640522875817, + "grad_norm": 24.060779571533203, + "learning_rate": 5e-05, + "loss": 0.3954, + "step": 380 + }, + { + "epoch": 9.986928104575163, + "eval_accuracy": 0.8308823529411765, + "eval_loss": 0.5431119203567505, + "eval_runtime": 16.9702, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 1.002, + "step": 382 + }, + { + "epoch": 10.196078431372548, + "grad_norm": 22.754186630249023, + "learning_rate": 4.985380116959065e-05, + "loss": 0.309, + "step": 390 + }, + { + "epoch": 10.457516339869281, + "grad_norm": 25.09243392944336, + "learning_rate": 4.970760233918128e-05, + "loss": 0.2985, + "step": 400 + }, + { + "epoch": 10.718954248366012, + "grad_norm": 32.95780563354492, + "learning_rate": 4.956140350877193e-05, + "loss": 0.3551, + "step": 410 + }, + { + "epoch": 10.980392156862745, + "grad_norm": 24.594146728515625, + "learning_rate": 4.941520467836258e-05, + "loss": 0.3524, + "step": 420 + }, + { + "epoch": 10.980392156862745, + "eval_accuracy": 0.8235294117647058, + "eval_loss": 0.4060741364955902, + "eval_runtime": 16.9787, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 1.001, + "step": 420 + }, + { + "epoch": 11.241830065359476, + "grad_norm": 34.58118438720703, + "learning_rate": 4.926900584795322e-05, + "loss": 0.3015, + "step": 430 + }, + { + "epoch": 11.50326797385621, + "grad_norm": 17.467493057250977, + "learning_rate": 4.912280701754386e-05, + "loss": 0.332, + "step": 440 + }, + { + "epoch": 11.764705882352942, + "grad_norm": 11.450825691223145, + "learning_rate": 4.8976608187134504e-05, + "loss": 0.3546, + "step": 450 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.8382352941176471, + "eval_loss": 0.4924784302711487, + "eval_runtime": 17.0509, + "eval_samples_per_second": 7.976, + "eval_steps_per_second": 0.997, + "step": 459 + }, + { + "epoch": 12.026143790849673, + "grad_norm": 22.95159912109375, + "learning_rate": 4.883040935672515e-05, + "loss": 0.3362, + "step": 460 + }, + { + "epoch": 12.287581699346406, + "grad_norm": 15.78369140625, + "learning_rate": 4.868421052631579e-05, + "loss": 0.2589, + "step": 470 + }, + { + "epoch": 12.549019607843137, + "grad_norm": 18.571977615356445, + "learning_rate": 4.853801169590643e-05, + "loss": 0.2588, + "step": 480 + }, + { + "epoch": 12.81045751633987, + "grad_norm": 10.237850189208984, + "learning_rate": 4.839181286549708e-05, + "loss": 0.2922, + "step": 490 + }, + { + "epoch": 12.993464052287582, + "eval_accuracy": 0.875, + "eval_loss": 0.36371880769729614, + "eval_runtime": 16.7827, + "eval_samples_per_second": 8.104, + "eval_steps_per_second": 1.013, + "step": 497 + }, + { + "epoch": 13.071895424836601, + "grad_norm": 14.183631896972656, + "learning_rate": 4.824561403508772e-05, + "loss": 0.2683, + "step": 500 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 15.362314224243164, + "learning_rate": 4.8099415204678366e-05, + "loss": 0.2178, + "step": 510 + }, + { + "epoch": 13.594771241830065, + "grad_norm": 31.49340057373047, + "learning_rate": 4.7953216374269006e-05, + "loss": 0.2095, + "step": 520 + }, + { + "epoch": 13.856209150326798, + "grad_norm": 39.85598373413086, + "learning_rate": 4.780701754385965e-05, + "loss": 0.2342, + "step": 530 + }, + { + "epoch": 13.986928104575163, + "eval_accuracy": 0.8970588235294118, + "eval_loss": 0.32859814167022705, + "eval_runtime": 16.8467, + "eval_samples_per_second": 8.073, + "eval_steps_per_second": 1.009, + "step": 535 + }, + { + "epoch": 14.117647058823529, + "grad_norm": 22.395517349243164, + "learning_rate": 4.7660818713450294e-05, + "loss": 0.2927, + "step": 540 + }, + { + "epoch": 14.379084967320262, + "grad_norm": 15.716471672058105, + "learning_rate": 4.751461988304094e-05, + "loss": 0.2419, + "step": 550 + }, + { + "epoch": 14.640522875816993, + "grad_norm": 13.827138900756836, + "learning_rate": 4.736842105263158e-05, + "loss": 0.2215, + "step": 560 + }, + { + "epoch": 14.901960784313726, + "grad_norm": 8.343385696411133, + "learning_rate": 4.722222222222222e-05, + "loss": 0.2083, + "step": 570 + }, + { + "epoch": 14.980392156862745, + "eval_accuracy": 0.8823529411764706, + "eval_loss": 0.327125608921051, + "eval_runtime": 17.1905, + "eval_samples_per_second": 7.911, + "eval_steps_per_second": 0.989, + "step": 573 + }, + { + "epoch": 15.163398692810457, + "grad_norm": 27.369592666625977, + "learning_rate": 4.707602339181287e-05, + "loss": 0.1837, + "step": 580 + }, + { + "epoch": 15.42483660130719, + "grad_norm": 4.707042217254639, + "learning_rate": 4.6929824561403515e-05, + "loss": 0.1872, + "step": 590 + }, + { + "epoch": 15.686274509803921, + "grad_norm": 19.026412963867188, + "learning_rate": 4.678362573099415e-05, + "loss": 0.2063, + "step": 600 + }, + { + "epoch": 15.947712418300654, + "grad_norm": 39.22539138793945, + "learning_rate": 4.6637426900584796e-05, + "loss": 0.2704, + "step": 610 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.8823529411764706, + "eval_loss": 0.3700261414051056, + "eval_runtime": 17.2498, + "eval_samples_per_second": 7.884, + "eval_steps_per_second": 0.986, + "step": 612 + }, + { + "epoch": 16.209150326797385, + "grad_norm": 4.610194683074951, + "learning_rate": 4.649122807017544e-05, + "loss": 0.1895, + "step": 620 + }, + { + "epoch": 16.470588235294116, + "grad_norm": 27.570838928222656, + "learning_rate": 4.634502923976608e-05, + "loss": 0.1492, + "step": 630 + }, + { + "epoch": 16.73202614379085, + "grad_norm": 13.742429733276367, + "learning_rate": 4.619883040935672e-05, + "loss": 0.1698, + "step": 640 + }, + { + "epoch": 16.99346405228758, + "grad_norm": 16.786169052124023, + "learning_rate": 4.605263157894737e-05, + "loss": 0.1871, + "step": 650 + }, + { + "epoch": 16.99346405228758, + "eval_accuracy": 0.8970588235294118, + "eval_loss": 0.34471678733825684, + "eval_runtime": 16.7473, + "eval_samples_per_second": 8.121, + "eval_steps_per_second": 1.015, + "step": 650 + }, + { + "epoch": 17.254901960784313, + "grad_norm": 15.884855270385742, + "learning_rate": 4.590643274853802e-05, + "loss": 0.1335, + "step": 660 + }, + { + "epoch": 17.516339869281047, + "grad_norm": 17.3248348236084, + "learning_rate": 4.576023391812866e-05, + "loss": 0.1399, + "step": 670 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 16.090543746948242, + "learning_rate": 4.56140350877193e-05, + "loss": 0.226, + "step": 680 + }, + { + "epoch": 17.986928104575163, + "eval_accuracy": 0.8602941176470589, + "eval_loss": 0.4279506206512451, + "eval_runtime": 16.8179, + "eval_samples_per_second": 8.087, + "eval_steps_per_second": 1.011, + "step": 688 + }, + { + "epoch": 18.03921568627451, + "grad_norm": 17.314950942993164, + "learning_rate": 4.5467836257309945e-05, + "loss": 0.2657, + "step": 690 + }, + { + "epoch": 18.30065359477124, + "grad_norm": 26.111413955688477, + "learning_rate": 4.5321637426900585e-05, + "loss": 0.1238, + "step": 700 + }, + { + "epoch": 18.562091503267975, + "grad_norm": 34.5568962097168, + "learning_rate": 4.517543859649123e-05, + "loss": 0.3426, + "step": 710 + }, + { + "epoch": 18.823529411764707, + "grad_norm": 27.506118774414062, + "learning_rate": 4.502923976608187e-05, + "loss": 0.245, + "step": 720 + }, + { + "epoch": 18.980392156862745, + "eval_accuracy": 0.8088235294117647, + "eval_loss": 0.6445416212081909, + "eval_runtime": 16.6042, + "eval_samples_per_second": 8.191, + "eval_steps_per_second": 1.024, + "step": 726 + }, + { + "epoch": 19.084967320261438, + "grad_norm": 8.742308616638184, + "learning_rate": 4.488304093567251e-05, + "loss": 0.1876, + "step": 730 + }, + { + "epoch": 19.34640522875817, + "grad_norm": 37.74170684814453, + "learning_rate": 4.473684210526316e-05, + "loss": 0.1044, + "step": 740 + }, + { + "epoch": 19.607843137254903, + "grad_norm": 17.85502815246582, + "learning_rate": 4.4590643274853806e-05, + "loss": 0.1637, + "step": 750 + }, + { + "epoch": 19.869281045751634, + "grad_norm": 13.413275718688965, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.1545, + "step": 760 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.8602941176470589, + "eval_loss": 0.41802164912223816, + "eval_runtime": 16.9375, + "eval_samples_per_second": 8.03, + "eval_steps_per_second": 1.004, + "step": 765 + }, + { + "epoch": 20.130718954248366, + "grad_norm": 24.223968505859375, + "learning_rate": 4.429824561403509e-05, + "loss": 0.1333, + "step": 770 + }, + { + "epoch": 20.392156862745097, + "grad_norm": 22.863794326782227, + "learning_rate": 4.4152046783625734e-05, + "loss": 0.1223, + "step": 780 + }, + { + "epoch": 20.65359477124183, + "grad_norm": 20.22460174560547, + "learning_rate": 4.400584795321638e-05, + "loss": 0.1906, + "step": 790 + }, + { + "epoch": 20.915032679738562, + "grad_norm": 6.557627201080322, + "learning_rate": 4.3859649122807014e-05, + "loss": 0.0981, + "step": 800 + }, + { + "epoch": 20.99346405228758, + "eval_accuracy": 0.9044117647058824, + "eval_loss": 0.32080766558647156, + "eval_runtime": 17.4044, + "eval_samples_per_second": 7.814, + "eval_steps_per_second": 0.977, + "step": 803 + }, + { + "epoch": 21.176470588235293, + "grad_norm": 11.885444641113281, + "learning_rate": 4.371345029239766e-05, + "loss": 0.1654, + "step": 810 + }, + { + "epoch": 21.437908496732025, + "grad_norm": 16.748071670532227, + "learning_rate": 4.356725146198831e-05, + "loss": 0.1706, + "step": 820 + }, + { + "epoch": 21.69934640522876, + "grad_norm": 25.410442352294922, + "learning_rate": 4.342105263157895e-05, + "loss": 0.1121, + "step": 830 + }, + { + "epoch": 21.96078431372549, + "grad_norm": 24.631742477416992, + "learning_rate": 4.327485380116959e-05, + "loss": 0.1455, + "step": 840 + }, + { + "epoch": 21.986928104575163, + "eval_accuracy": 0.8602941176470589, + "eval_loss": 0.425643652677536, + "eval_runtime": 20.0595, + "eval_samples_per_second": 6.78, + "eval_steps_per_second": 0.847, + "step": 841 + }, + { + "epoch": 22.22222222222222, + "grad_norm": 9.926827430725098, + "learning_rate": 4.3128654970760236e-05, + "loss": 0.144, + "step": 850 + }, + { + "epoch": 22.483660130718953, + "grad_norm": 32.22057342529297, + "learning_rate": 4.298245614035088e-05, + "loss": 0.1328, + "step": 860 + }, + { + "epoch": 22.745098039215687, + "grad_norm": 6.770218849182129, + "learning_rate": 4.283625730994152e-05, + "loss": 0.2405, + "step": 870 + }, + { + "epoch": 22.980392156862745, + "eval_accuracy": 0.8970588235294118, + "eval_loss": 0.34735360741615295, + "eval_runtime": 36.4621, + "eval_samples_per_second": 3.73, + "eval_steps_per_second": 0.466, + "step": 879 + }, + { + "epoch": 23.00653594771242, + "grad_norm": 18.301342010498047, + "learning_rate": 4.269005847953216e-05, + "loss": 0.1407, + "step": 880 + }, + { + "epoch": 23.26797385620915, + "grad_norm": 25.70302963256836, + "learning_rate": 4.254385964912281e-05, + "loss": 0.1403, + "step": 890 + }, + { + "epoch": 23.529411764705884, + "grad_norm": 6.829775333404541, + "learning_rate": 4.239766081871345e-05, + "loss": 0.1278, + "step": 900 + }, + { + "epoch": 23.790849673202615, + "grad_norm": 15.183685302734375, + "learning_rate": 4.22514619883041e-05, + "loss": 0.1549, + "step": 910 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.9044117647058824, + "eval_loss": 0.39403286576271057, + "eval_runtime": 30.2513, + "eval_samples_per_second": 4.496, + "eval_steps_per_second": 0.562, + "step": 918 + }, + { + "epoch": 24.052287581699346, + "grad_norm": 76.56197357177734, + "learning_rate": 4.210526315789474e-05, + "loss": 0.2019, + "step": 920 + }, + { + "epoch": 24.313725490196077, + "grad_norm": 10.338065147399902, + "learning_rate": 4.195906432748538e-05, + "loss": 0.1341, + "step": 930 + }, + { + "epoch": 24.575163398692812, + "grad_norm": 10.710972785949707, + "learning_rate": 4.1812865497076025e-05, + "loss": 0.1207, + "step": 940 + }, + { + "epoch": 24.836601307189543, + "grad_norm": 19.086135864257812, + "learning_rate": 4.166666666666667e-05, + "loss": 0.1721, + "step": 950 + }, + { + "epoch": 24.99346405228758, + "eval_accuracy": 0.8823529411764706, + "eval_loss": 0.4279385805130005, + "eval_runtime": 29.9969, + "eval_samples_per_second": 4.534, + "eval_steps_per_second": 0.567, + "step": 956 + }, + { + "epoch": 25.098039215686274, + "grad_norm": 6.991425514221191, + "learning_rate": 4.152046783625731e-05, + "loss": 0.0729, + "step": 960 + }, + { + "epoch": 25.359477124183005, + "grad_norm": 8.979483604431152, + "learning_rate": 4.137426900584795e-05, + "loss": 0.1826, + "step": 970 + }, + { + "epoch": 25.62091503267974, + "grad_norm": 11.570904731750488, + "learning_rate": 4.12280701754386e-05, + "loss": 0.1492, + "step": 980 + }, + { + "epoch": 25.88235294117647, + "grad_norm": 14.8778076171875, + "learning_rate": 4.1081871345029247e-05, + "loss": 0.1378, + "step": 990 + }, + { + "epoch": 25.986928104575163, + "eval_accuracy": 0.9044117647058824, + "eval_loss": 0.387086421251297, + "eval_runtime": 29.0075, + "eval_samples_per_second": 4.688, + "eval_steps_per_second": 0.586, + "step": 994 + }, + { + "epoch": 26.143790849673202, + "grad_norm": 11.985469818115234, + "learning_rate": 4.093567251461988e-05, + "loss": 0.1122, + "step": 1000 + }, + { + "epoch": 26.405228758169933, + "grad_norm": 22.02225685119629, + "learning_rate": 4.078947368421053e-05, + "loss": 0.1172, + "step": 1010 + }, + { + "epoch": 26.666666666666668, + "grad_norm": 1.2671743631362915, + "learning_rate": 4.0643274853801174e-05, + "loss": 0.0891, + "step": 1020 + }, + { + "epoch": 26.9281045751634, + "grad_norm": 10.896835327148438, + "learning_rate": 4.0497076023391814e-05, + "loss": 0.0924, + "step": 1030 + }, + { + "epoch": 26.980392156862745, + "eval_accuracy": 0.8455882352941176, + "eval_loss": 0.7301138639450073, + "eval_runtime": 28.9067, + "eval_samples_per_second": 4.705, + "eval_steps_per_second": 0.588, + "step": 1032 + }, + { + "epoch": 27.18954248366013, + "grad_norm": 7.8527960777282715, + "learning_rate": 4.0350877192982455e-05, + "loss": 0.1348, + "step": 1040 + }, + { + "epoch": 27.45098039215686, + "grad_norm": 2.1555140018463135, + "learning_rate": 4.02046783625731e-05, + "loss": 0.0675, + "step": 1050 + }, + { + "epoch": 27.712418300653596, + "grad_norm": 7.751283645629883, + "learning_rate": 4.005847953216375e-05, + "loss": 0.0916, + "step": 1060 + }, + { + "epoch": 27.973856209150327, + "grad_norm": 33.804786682128906, + "learning_rate": 3.991228070175439e-05, + "loss": 0.1325, + "step": 1070 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.9044117647058824, + "eval_loss": 0.3712061643600464, + "eval_runtime": 28.0451, + "eval_samples_per_second": 4.849, + "eval_steps_per_second": 0.606, + "step": 1071 + }, + { + "epoch": 28.235294117647058, + "grad_norm": 7.706085205078125, + "learning_rate": 3.976608187134503e-05, + "loss": 0.0879, + "step": 1080 + }, + { + "epoch": 28.49673202614379, + "grad_norm": 4.338534355163574, + "learning_rate": 3.9619883040935676e-05, + "loss": 0.1017, + "step": 1090 + }, + { + "epoch": 28.758169934640524, + "grad_norm": 9.544697761535645, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.1426, + "step": 1100 + }, + { + "epoch": 28.99346405228758, + "eval_accuracy": 0.8602941176470589, + "eval_loss": 0.440034419298172, + "eval_runtime": 30.1321, + "eval_samples_per_second": 4.513, + "eval_steps_per_second": 0.564, + "step": 1109 + }, + { + "epoch": 29.019607843137255, + "grad_norm": 0.3841346502304077, + "learning_rate": 3.932748538011696e-05, + "loss": 0.0981, + "step": 1110 + }, + { + "epoch": 29.281045751633986, + "grad_norm": 9.533553123474121, + "learning_rate": 3.9181286549707604e-05, + "loss": 0.0926, + "step": 1120 + }, + { + "epoch": 29.54248366013072, + "grad_norm": 26.160850524902344, + "learning_rate": 3.9035087719298244e-05, + "loss": 0.083, + "step": 1130 + }, + { + "epoch": 29.80392156862745, + "grad_norm": 18.309621810913086, + "learning_rate": 3.888888888888889e-05, + "loss": 0.0866, + "step": 1140 + }, + { + "epoch": 29.986928104575163, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.27793076634407043, + "eval_runtime": 29.3246, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 0.58, + "step": 1147 + }, + { + "epoch": 30.065359477124183, + "grad_norm": 24.974849700927734, + "learning_rate": 3.874269005847954e-05, + "loss": 0.11, + "step": 1150 + }, + { + "epoch": 30.326797385620914, + "grad_norm": 3.7421281337738037, + "learning_rate": 3.859649122807018e-05, + "loss": 0.0712, + "step": 1160 + }, + { + "epoch": 30.58823529411765, + "grad_norm": 10.041555404663086, + "learning_rate": 3.845029239766082e-05, + "loss": 0.0702, + "step": 1170 + }, + { + "epoch": 30.84967320261438, + "grad_norm": 37.238948822021484, + "learning_rate": 3.8304093567251465e-05, + "loss": 0.0659, + "step": 1180 + }, + { + "epoch": 30.980392156862745, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.3207360804080963, + "eval_runtime": 34.3274, + "eval_samples_per_second": 3.962, + "eval_steps_per_second": 0.495, + "step": 1185 + }, + { + "epoch": 31.11111111111111, + "grad_norm": 13.073234558105469, + "learning_rate": 3.815789473684211e-05, + "loss": 0.0547, + "step": 1190 + }, + { + "epoch": 31.372549019607842, + "grad_norm": 3.1763381958007812, + "learning_rate": 3.8011695906432746e-05, + "loss": 0.0727, + "step": 1200 + }, + { + "epoch": 31.633986928104576, + "grad_norm": 1.5747133493423462, + "learning_rate": 3.786549707602339e-05, + "loss": 0.1023, + "step": 1210 + }, + { + "epoch": 31.895424836601308, + "grad_norm": 12.335155487060547, + "learning_rate": 3.771929824561404e-05, + "loss": 0.1175, + "step": 1220 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.9044117647058824, + "eval_loss": 0.43389689922332764, + "eval_runtime": 32.183, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.528, + "step": 1224 + }, + { + "epoch": 32.15686274509804, + "grad_norm": 2.676323413848877, + "learning_rate": 3.757309941520468e-05, + "loss": 0.129, + "step": 1230 + }, + { + "epoch": 32.41830065359477, + "grad_norm": 0.5916957259178162, + "learning_rate": 3.742690058479532e-05, + "loss": 0.0585, + "step": 1240 + }, + { + "epoch": 32.6797385620915, + "grad_norm": 11.02872085571289, + "learning_rate": 3.728070175438597e-05, + "loss": 0.045, + "step": 1250 + }, + { + "epoch": 32.94117647058823, + "grad_norm": 44.40802001953125, + "learning_rate": 3.713450292397661e-05, + "loss": 0.0455, + "step": 1260 + }, + { + "epoch": 32.99346405228758, + "eval_accuracy": 0.9264705882352942, + "eval_loss": 0.4536753296852112, + "eval_runtime": 32.0477, + "eval_samples_per_second": 4.244, + "eval_steps_per_second": 0.53, + "step": 1262 + }, + { + "epoch": 33.20261437908497, + "grad_norm": 0.4168817400932312, + "learning_rate": 3.6988304093567254e-05, + "loss": 0.0625, + "step": 1270 + }, + { + "epoch": 33.4640522875817, + "grad_norm": 7.689728260040283, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.1613, + "step": 1280 + }, + { + "epoch": 33.72549019607843, + "grad_norm": 9.364749908447266, + "learning_rate": 3.669590643274854e-05, + "loss": 0.1001, + "step": 1290 + }, + { + "epoch": 33.98692810457516, + "grad_norm": 14.09304428100586, + "learning_rate": 3.654970760233918e-05, + "loss": 0.1006, + "step": 1300 + }, + { + "epoch": 33.98692810457516, + "eval_accuracy": 0.875, + "eval_loss": 0.6521199345588684, + "eval_runtime": 33.7228, + "eval_samples_per_second": 4.033, + "eval_steps_per_second": 0.504, + "step": 1300 + }, + { + "epoch": 34.248366013071895, + "grad_norm": 14.115684509277344, + "learning_rate": 3.640350877192983e-05, + "loss": 0.1592, + "step": 1310 + }, + { + "epoch": 34.509803921568626, + "grad_norm": 2.2361948490142822, + "learning_rate": 3.625730994152047e-05, + "loss": 0.0785, + "step": 1320 + }, + { + "epoch": 34.77124183006536, + "grad_norm": 15.101175308227539, + "learning_rate": 3.611111111111111e-05, + "loss": 0.033, + "step": 1330 + }, + { + "epoch": 34.98039215686274, + "eval_accuracy": 0.9044117647058824, + "eval_loss": 0.5615760087966919, + "eval_runtime": 20.5904, + "eval_samples_per_second": 6.605, + "eval_steps_per_second": 0.826, + "step": 1338 + }, + { + "epoch": 35.032679738562095, + "grad_norm": 74.07561492919922, + "learning_rate": 3.5964912280701756e-05, + "loss": 0.1336, + "step": 1340 + }, + { + "epoch": 35.294117647058826, + "grad_norm": 40.868961334228516, + "learning_rate": 3.5818713450292403e-05, + "loss": 0.1209, + "step": 1350 + }, + { + "epoch": 35.55555555555556, + "grad_norm": 11.251754760742188, + "learning_rate": 3.5672514619883044e-05, + "loss": 0.0658, + "step": 1360 + }, + { + "epoch": 35.81699346405229, + "grad_norm": 20.791095733642578, + "learning_rate": 3.5526315789473684e-05, + "loss": 0.0979, + "step": 1370 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.9191176470588235, + "eval_loss": 0.3717995882034302, + "eval_runtime": 21.531, + "eval_samples_per_second": 6.316, + "eval_steps_per_second": 0.79, + "step": 1377 + }, + { + "epoch": 36.07843137254902, + "grad_norm": 13.336127281188965, + "learning_rate": 3.538011695906433e-05, + "loss": 0.0712, + "step": 1380 + }, + { + "epoch": 36.33986928104575, + "grad_norm": 7.379011154174805, + "learning_rate": 3.523391812865498e-05, + "loss": 0.0826, + "step": 1390 + }, + { + "epoch": 36.60130718954248, + "grad_norm": 1.9048967361450195, + "learning_rate": 3.508771929824561e-05, + "loss": 0.0791, + "step": 1400 + }, + { + "epoch": 36.86274509803921, + "grad_norm": 32.38518142700195, + "learning_rate": 3.494152046783626e-05, + "loss": 0.1045, + "step": 1410 + }, + { + "epoch": 36.99346405228758, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.25290319323539734, + "eval_runtime": 22.9294, + "eval_samples_per_second": 5.931, + "eval_steps_per_second": 0.741, + "step": 1415 + }, + { + "epoch": 37.12418300653595, + "grad_norm": 14.719789505004883, + "learning_rate": 3.4795321637426905e-05, + "loss": 0.0977, + "step": 1420 + }, + { + "epoch": 37.38562091503268, + "grad_norm": 21.388763427734375, + "learning_rate": 3.4649122807017546e-05, + "loss": 0.0374, + "step": 1430 + }, + { + "epoch": 37.64705882352941, + "grad_norm": 7.066629886627197, + "learning_rate": 3.4502923976608186e-05, + "loss": 0.0819, + "step": 1440 + }, + { + "epoch": 37.908496732026144, + "grad_norm": 4.583933353424072, + "learning_rate": 3.435672514619883e-05, + "loss": 0.0815, + "step": 1450 + }, + { + "epoch": 37.98692810457516, + "eval_accuracy": 0.9338235294117647, + "eval_loss": 0.3510648012161255, + "eval_runtime": 21.3875, + "eval_samples_per_second": 6.359, + "eval_steps_per_second": 0.795, + "step": 1453 + }, + { + "epoch": 38.169934640522875, + "grad_norm": 14.378546714782715, + "learning_rate": 3.421052631578947e-05, + "loss": 0.1109, + "step": 1460 + }, + { + "epoch": 38.431372549019606, + "grad_norm": 4.1210408210754395, + "learning_rate": 3.406432748538012e-05, + "loss": 0.052, + "step": 1470 + }, + { + "epoch": 38.69281045751634, + "grad_norm": 18.48431396484375, + "learning_rate": 3.391812865497076e-05, + "loss": 0.0932, + "step": 1480 + }, + { + "epoch": 38.95424836601307, + "grad_norm": 30.51089859008789, + "learning_rate": 3.377192982456141e-05, + "loss": 0.0761, + "step": 1490 + }, + { + "epoch": 38.98039215686274, + "eval_accuracy": 0.9338235294117647, + "eval_loss": 0.31144019961357117, + "eval_runtime": 32.6124, + "eval_samples_per_second": 4.17, + "eval_steps_per_second": 0.521, + "step": 1491 + }, + { + "epoch": 39.21568627450981, + "grad_norm": 29.487356185913086, + "learning_rate": 3.362573099415205e-05, + "loss": 0.0995, + "step": 1500 + }, + { + "epoch": 39.47712418300654, + "grad_norm": 4.752898216247559, + "learning_rate": 3.3479532163742695e-05, + "loss": 0.0986, + "step": 1510 + }, + { + "epoch": 39.73856209150327, + "grad_norm": 23.433902740478516, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0908, + "step": 1520 + }, + { + "epoch": 40.0, + "grad_norm": 8.154867172241211, + "learning_rate": 3.3187134502923975e-05, + "loss": 0.0747, + "step": 1530 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.9338235294117647, + "eval_loss": 0.2836870849132538, + "eval_runtime": 33.717, + "eval_samples_per_second": 4.034, + "eval_steps_per_second": 0.504, + "step": 1530 + }, + { + "epoch": 40.26143790849673, + "grad_norm": 66.09915924072266, + "learning_rate": 3.304093567251462e-05, + "loss": 0.0746, + "step": 1540 + }, + { + "epoch": 40.52287581699346, + "grad_norm": 8.447415351867676, + "learning_rate": 3.289473684210527e-05, + "loss": 0.0809, + "step": 1550 + }, + { + "epoch": 40.78431372549019, + "grad_norm": 11.7717866897583, + "learning_rate": 3.274853801169591e-05, + "loss": 0.0545, + "step": 1560 + }, + { + "epoch": 40.99346405228758, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.42687493562698364, + "eval_runtime": 30.8285, + "eval_samples_per_second": 4.412, + "eval_steps_per_second": 0.551, + "step": 1568 + }, + { + "epoch": 41.04575163398693, + "grad_norm": 2.3586502075195312, + "learning_rate": 3.260233918128655e-05, + "loss": 0.058, + "step": 1570 + }, + { + "epoch": 41.30718954248366, + "grad_norm": 31.519433975219727, + "learning_rate": 3.24561403508772e-05, + "loss": 0.0838, + "step": 1580 + }, + { + "epoch": 41.568627450980394, + "grad_norm": 0.15550392866134644, + "learning_rate": 3.230994152046784e-05, + "loss": 0.0853, + "step": 1590 + }, + { + "epoch": 41.830065359477125, + "grad_norm": 6.823671340942383, + "learning_rate": 3.216374269005848e-05, + "loss": 0.0796, + "step": 1600 + }, + { + "epoch": 41.98692810457516, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.23307542502880096, + "eval_runtime": 33.1415, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 0.513, + "step": 1606 + }, + { + "epoch": 42.091503267973856, + "grad_norm": 11.52629566192627, + "learning_rate": 3.2017543859649124e-05, + "loss": 0.0903, + "step": 1610 + }, + { + "epoch": 42.35294117647059, + "grad_norm": 11.996484756469727, + "learning_rate": 3.187134502923977e-05, + "loss": 0.0595, + "step": 1620 + }, + { + "epoch": 42.61437908496732, + "grad_norm": 1.5475754737854004, + "learning_rate": 3.172514619883041e-05, + "loss": 0.0993, + "step": 1630 + }, + { + "epoch": 42.87581699346405, + "grad_norm": 18.27874755859375, + "learning_rate": 3.157894736842105e-05, + "loss": 0.055, + "step": 1640 + }, + { + "epoch": 42.98039215686274, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.28995171189308167, + "eval_runtime": 31.1656, + "eval_samples_per_second": 4.364, + "eval_steps_per_second": 0.545, + "step": 1644 + }, + { + "epoch": 43.13725490196079, + "grad_norm": 1.7079222202301025, + "learning_rate": 3.14327485380117e-05, + "loss": 0.0851, + "step": 1650 + }, + { + "epoch": 43.39869281045752, + "grad_norm": 0.0829237625002861, + "learning_rate": 3.128654970760234e-05, + "loss": 0.061, + "step": 1660 + }, + { + "epoch": 43.66013071895425, + "grad_norm": 2.6961874961853027, + "learning_rate": 3.1140350877192986e-05, + "loss": 0.0205, + "step": 1670 + }, + { + "epoch": 43.92156862745098, + "grad_norm": 3.1870129108428955, + "learning_rate": 3.0994152046783626e-05, + "loss": 0.0706, + "step": 1680 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.3367806077003479, + "eval_runtime": 25.249, + "eval_samples_per_second": 5.386, + "eval_steps_per_second": 0.673, + "step": 1683 + }, + { + "epoch": 44.18300653594771, + "grad_norm": 10.678839683532715, + "learning_rate": 3.084795321637427e-05, + "loss": 0.0555, + "step": 1690 + }, + { + "epoch": 44.44444444444444, + "grad_norm": 0.1511285901069641, + "learning_rate": 3.0701754385964913e-05, + "loss": 0.0463, + "step": 1700 + }, + { + "epoch": 44.705882352941174, + "grad_norm": 19.222854614257812, + "learning_rate": 3.055555555555556e-05, + "loss": 0.0783, + "step": 1710 + }, + { + "epoch": 44.967320261437905, + "grad_norm": 12.824193954467773, + "learning_rate": 3.0409356725146197e-05, + "loss": 0.0505, + "step": 1720 + }, + { + "epoch": 44.99346405228758, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.3779818117618561, + "eval_runtime": 19.0793, + "eval_samples_per_second": 7.128, + "eval_steps_per_second": 0.891, + "step": 1721 + }, + { + "epoch": 45.22875816993464, + "grad_norm": 18.495044708251953, + "learning_rate": 3.0263157894736844e-05, + "loss": 0.0679, + "step": 1730 + }, + { + "epoch": 45.490196078431374, + "grad_norm": 22.039566040039062, + "learning_rate": 3.0116959064327488e-05, + "loss": 0.0618, + "step": 1740 + }, + { + "epoch": 45.751633986928105, + "grad_norm": 0.6790270209312439, + "learning_rate": 2.997076023391813e-05, + "loss": 0.0698, + "step": 1750 + }, + { + "epoch": 45.98692810457516, + "eval_accuracy": 0.9191176470588235, + "eval_loss": 0.48222464323043823, + "eval_runtime": 33.9657, + "eval_samples_per_second": 4.004, + "eval_steps_per_second": 0.501, + "step": 1759 + }, + { + "epoch": 46.01307189542484, + "grad_norm": 48.15066909790039, + "learning_rate": 2.9824561403508772e-05, + "loss": 0.0745, + "step": 1760 + }, + { + "epoch": 46.27450980392157, + "grad_norm": 48.96921920776367, + "learning_rate": 2.9678362573099415e-05, + "loss": 0.11, + "step": 1770 + }, + { + "epoch": 46.5359477124183, + "grad_norm": 16.973966598510742, + "learning_rate": 2.9532163742690062e-05, + "loss": 0.0183, + "step": 1780 + }, + { + "epoch": 46.79738562091503, + "grad_norm": 11.563841819763184, + "learning_rate": 2.9385964912280706e-05, + "loss": 0.0275, + "step": 1790 + }, + { + "epoch": 46.98039215686274, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.34339553117752075, + "eval_runtime": 33.4784, + "eval_samples_per_second": 4.062, + "eval_steps_per_second": 0.508, + "step": 1797 + }, + { + "epoch": 47.05882352941177, + "grad_norm": 18.660812377929688, + "learning_rate": 2.9239766081871346e-05, + "loss": 0.0307, + "step": 1800 + }, + { + "epoch": 47.3202614379085, + "grad_norm": 19.048458099365234, + "learning_rate": 2.909356725146199e-05, + "loss": 0.036, + "step": 1810 + }, + { + "epoch": 47.58169934640523, + "grad_norm": 0.8519901037216187, + "learning_rate": 2.8947368421052634e-05, + "loss": 0.0491, + "step": 1820 + }, + { + "epoch": 47.84313725490196, + "grad_norm": 0.9929773211479187, + "learning_rate": 2.8801169590643277e-05, + "loss": 0.0641, + "step": 1830 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.3386637568473816, + "eval_runtime": 33.9575, + "eval_samples_per_second": 4.005, + "eval_steps_per_second": 0.501, + "step": 1836 + }, + { + "epoch": 48.10457516339869, + "grad_norm": 27.548429489135742, + "learning_rate": 2.8654970760233917e-05, + "loss": 0.0634, + "step": 1840 + }, + { + "epoch": 48.36601307189542, + "grad_norm": 0.4367322027683258, + "learning_rate": 2.850877192982456e-05, + "loss": 0.0756, + "step": 1850 + }, + { + "epoch": 48.627450980392155, + "grad_norm": 18.30873680114746, + "learning_rate": 2.8362573099415208e-05, + "loss": 0.0134, + "step": 1860 + }, + { + "epoch": 48.888888888888886, + "grad_norm": 0.011559017933905125, + "learning_rate": 2.821637426900585e-05, + "loss": 0.0484, + "step": 1870 + }, + { + "epoch": 48.99346405228758, + "eval_accuracy": 0.9191176470588235, + "eval_loss": 0.5349822640419006, + "eval_runtime": 38.4788, + "eval_samples_per_second": 3.534, + "eval_steps_per_second": 0.442, + "step": 1874 + }, + { + "epoch": 49.150326797385624, + "grad_norm": 2.1214957237243652, + "learning_rate": 2.8070175438596492e-05, + "loss": 0.088, + "step": 1880 + }, + { + "epoch": 49.411764705882355, + "grad_norm": 27.645193099975586, + "learning_rate": 2.7923976608187135e-05, + "loss": 0.0621, + "step": 1890 + }, + { + "epoch": 49.673202614379086, + "grad_norm": 1.3699434995651245, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0528, + "step": 1900 + }, + { + "epoch": 49.93464052287582, + "grad_norm": 8.130342483520508, + "learning_rate": 2.7631578947368426e-05, + "loss": 0.0388, + "step": 1910 + }, + { + "epoch": 49.98692810457516, + "eval_accuracy": 0.9117647058823529, + "eval_loss": 0.382554292678833, + "eval_runtime": 33.8716, + "eval_samples_per_second": 4.015, + "eval_steps_per_second": 0.502, + "step": 1912 + }, + { + "epoch": 50.19607843137255, + "grad_norm": 47.961002349853516, + "learning_rate": 2.7485380116959063e-05, + "loss": 0.0941, + "step": 1920 + }, + { + "epoch": 50.45751633986928, + "grad_norm": 36.82217025756836, + "learning_rate": 2.733918128654971e-05, + "loss": 0.0863, + "step": 1930 + }, + { + "epoch": 50.71895424836601, + "grad_norm": 5.911373615264893, + "learning_rate": 2.7192982456140354e-05, + "loss": 0.0324, + "step": 1940 + }, + { + "epoch": 50.98039215686274, + "grad_norm": 24.99283790588379, + "learning_rate": 2.7046783625730997e-05, + "loss": 0.0347, + "step": 1950 + }, + { + "epoch": 50.98039215686274, + "eval_accuracy": 0.9558823529411765, + "eval_loss": 0.3738501965999603, + "eval_runtime": 30.759, + "eval_samples_per_second": 4.421, + "eval_steps_per_second": 0.553, + "step": 1950 + }, + { + "epoch": 51.24183006535948, + "grad_norm": 70.3333969116211, + "learning_rate": 2.6900584795321637e-05, + "loss": 0.0428, + "step": 1960 + }, + { + "epoch": 51.50326797385621, + "grad_norm": 13.072953224182129, + "learning_rate": 2.675438596491228e-05, + "loss": 0.0505, + "step": 1970 + }, + { + "epoch": 51.76470588235294, + "grad_norm": 39.30720520019531, + "learning_rate": 2.6608187134502928e-05, + "loss": 0.1046, + "step": 1980 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.9117647058823529, + "eval_loss": 0.3074805736541748, + "eval_runtime": 33.894, + "eval_samples_per_second": 4.013, + "eval_steps_per_second": 0.502, + "step": 1989 + }, + { + "epoch": 52.02614379084967, + "grad_norm": 23.061525344848633, + "learning_rate": 2.6461988304093572e-05, + "loss": 0.0566, + "step": 1990 + }, + { + "epoch": 52.287581699346404, + "grad_norm": 2.5243396759033203, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.0605, + "step": 2000 + }, + { + "epoch": 52.549019607843135, + "grad_norm": 11.470220565795898, + "learning_rate": 2.6169590643274856e-05, + "loss": 0.0767, + "step": 2010 + }, + { + "epoch": 52.810457516339866, + "grad_norm": 0.23322105407714844, + "learning_rate": 2.60233918128655e-05, + "loss": 0.0298, + "step": 2020 + }, + { + "epoch": 52.99346405228758, + "eval_accuracy": 0.9558823529411765, + "eval_loss": 0.3557595908641815, + "eval_runtime": 25.1218, + "eval_samples_per_second": 5.414, + "eval_steps_per_second": 0.677, + "step": 2027 + }, + { + "epoch": 53.071895424836605, + "grad_norm": 4.624847412109375, + "learning_rate": 2.5877192982456143e-05, + "loss": 0.0563, + "step": 2030 + }, + { + "epoch": 53.333333333333336, + "grad_norm": 0.25727781653404236, + "learning_rate": 2.5730994152046783e-05, + "loss": 0.0977, + "step": 2040 + }, + { + "epoch": 53.59477124183007, + "grad_norm": 0.22140049934387207, + "learning_rate": 2.5584795321637427e-05, + "loss": 0.0199, + "step": 2050 + }, + { + "epoch": 53.8562091503268, + "grad_norm": 0.9178116321563721, + "learning_rate": 2.5438596491228074e-05, + "loss": 0.0478, + "step": 2060 + }, + { + "epoch": 53.98692810457516, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.30555427074432373, + "eval_runtime": 37.1043, + "eval_samples_per_second": 3.665, + "eval_steps_per_second": 0.458, + "step": 2065 + }, + { + "epoch": 54.11764705882353, + "grad_norm": 19.221540451049805, + "learning_rate": 2.5292397660818717e-05, + "loss": 0.0289, + "step": 2070 + }, + { + "epoch": 54.37908496732026, + "grad_norm": 1.848120093345642, + "learning_rate": 2.5146198830409358e-05, + "loss": 0.095, + "step": 2080 + }, + { + "epoch": 54.64052287581699, + "grad_norm": 10.04775619506836, + "learning_rate": 2.5e-05, + "loss": 0.0218, + "step": 2090 + }, + { + "epoch": 54.90196078431372, + "grad_norm": 0.047169651836156845, + "learning_rate": 2.485380116959064e-05, + "loss": 0.0285, + "step": 2100 + }, + { + "epoch": 54.98039215686274, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.28512153029441833, + "eval_runtime": 32.4012, + "eval_samples_per_second": 4.197, + "eval_steps_per_second": 0.525, + "step": 2103 + }, + { + "epoch": 55.16339869281046, + "grad_norm": 2.4437642097473145, + "learning_rate": 2.470760233918129e-05, + "loss": 0.0029, + "step": 2110 + }, + { + "epoch": 55.42483660130719, + "grad_norm": 14.518400192260742, + "learning_rate": 2.456140350877193e-05, + "loss": 0.0621, + "step": 2120 + }, + { + "epoch": 55.68627450980392, + "grad_norm": 2.9272749423980713, + "learning_rate": 2.4415204678362576e-05, + "loss": 0.0129, + "step": 2130 + }, + { + "epoch": 55.947712418300654, + "grad_norm": 19.935407638549805, + "learning_rate": 2.4269005847953216e-05, + "loss": 0.0407, + "step": 2140 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.9558823529411765, + "eval_loss": 0.32225164771080017, + "eval_runtime": 33.148, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 0.513, + "step": 2142 + }, + { + "epoch": 56.209150326797385, + "grad_norm": 32.69438934326172, + "learning_rate": 2.412280701754386e-05, + "loss": 0.0161, + "step": 2150 + }, + { + "epoch": 56.470588235294116, + "grad_norm": 0.04998353496193886, + "learning_rate": 2.3976608187134503e-05, + "loss": 0.0446, + "step": 2160 + }, + { + "epoch": 56.73202614379085, + "grad_norm": 0.830470085144043, + "learning_rate": 2.3830409356725147e-05, + "loss": 0.1066, + "step": 2170 + }, + { + "epoch": 56.99346405228758, + "grad_norm": 21.04816436767578, + "learning_rate": 2.368421052631579e-05, + "loss": 0.0459, + "step": 2180 + }, + { + "epoch": 56.99346405228758, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.45745787024497986, + "eval_runtime": 31.4986, + "eval_samples_per_second": 4.318, + "eval_steps_per_second": 0.54, + "step": 2180 + }, + { + "epoch": 57.254901960784316, + "grad_norm": 6.693302631378174, + "learning_rate": 2.3538011695906434e-05, + "loss": 0.0569, + "step": 2190 + }, + { + "epoch": 57.51633986928105, + "grad_norm": 12.218875885009766, + "learning_rate": 2.3391812865497074e-05, + "loss": 0.0455, + "step": 2200 + }, + { + "epoch": 57.77777777777778, + "grad_norm": 56.21259689331055, + "learning_rate": 2.324561403508772e-05, + "loss": 0.0409, + "step": 2210 + }, + { + "epoch": 57.98692810457516, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.29300644993782043, + "eval_runtime": 31.4287, + "eval_samples_per_second": 4.327, + "eval_steps_per_second": 0.541, + "step": 2218 + }, + { + "epoch": 58.03921568627451, + "grad_norm": 0.48025286197662354, + "learning_rate": 2.309941520467836e-05, + "loss": 0.0526, + "step": 2220 + }, + { + "epoch": 58.30065359477124, + "grad_norm": 6.530683994293213, + "learning_rate": 2.295321637426901e-05, + "loss": 0.0791, + "step": 2230 + }, + { + "epoch": 58.56209150326797, + "grad_norm": 35.76517105102539, + "learning_rate": 2.280701754385965e-05, + "loss": 0.033, + "step": 2240 + }, + { + "epoch": 58.8235294117647, + "grad_norm": 4.9538679122924805, + "learning_rate": 2.2660818713450292e-05, + "loss": 0.0743, + "step": 2250 + }, + { + "epoch": 58.98039215686274, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.4032076299190521, + "eval_runtime": 34.2283, + "eval_samples_per_second": 3.973, + "eval_steps_per_second": 0.497, + "step": 2256 + }, + { + "epoch": 59.08496732026144, + "grad_norm": 8.96496868133545, + "learning_rate": 2.2514619883040936e-05, + "loss": 0.0358, + "step": 2260 + }, + { + "epoch": 59.34640522875817, + "grad_norm": 10.487314224243164, + "learning_rate": 2.236842105263158e-05, + "loss": 0.0805, + "step": 2270 + }, + { + "epoch": 59.6078431372549, + "grad_norm": 3.922236442565918, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0096, + "step": 2280 + }, + { + "epoch": 59.869281045751634, + "grad_norm": 5.181495666503906, + "learning_rate": 2.2076023391812867e-05, + "loss": 0.0346, + "step": 2290 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.37382781505584717, + "eval_runtime": 37.1282, + "eval_samples_per_second": 3.663, + "eval_steps_per_second": 0.458, + "step": 2295 + }, + { + "epoch": 60.130718954248366, + "grad_norm": 0.059666648507118225, + "learning_rate": 2.1929824561403507e-05, + "loss": 0.0551, + "step": 2300 + }, + { + "epoch": 60.3921568627451, + "grad_norm": 0.5856298804283142, + "learning_rate": 2.1783625730994154e-05, + "loss": 0.0331, + "step": 2310 + }, + { + "epoch": 60.65359477124183, + "grad_norm": 5.777927875518799, + "learning_rate": 2.1637426900584794e-05, + "loss": 0.0112, + "step": 2320 + }, + { + "epoch": 60.91503267973856, + "grad_norm": 13.134035110473633, + "learning_rate": 2.149122807017544e-05, + "loss": 0.0302, + "step": 2330 + }, + { + "epoch": 60.99346405228758, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.3597317337989807, + "eval_runtime": 31.126, + "eval_samples_per_second": 4.369, + "eval_steps_per_second": 0.546, + "step": 2333 + }, + { + "epoch": 61.1764705882353, + "grad_norm": 28.286643981933594, + "learning_rate": 2.134502923976608e-05, + "loss": 0.0311, + "step": 2340 + }, + { + "epoch": 61.43790849673203, + "grad_norm": 6.936996936798096, + "learning_rate": 2.1198830409356725e-05, + "loss": 0.139, + "step": 2350 + }, + { + "epoch": 61.69934640522876, + "grad_norm": 1.0503500699996948, + "learning_rate": 2.105263157894737e-05, + "loss": 0.0666, + "step": 2360 + }, + { + "epoch": 61.96078431372549, + "grad_norm": 5.756121635437012, + "learning_rate": 2.0906432748538013e-05, + "loss": 0.0488, + "step": 2370 + }, + { + "epoch": 61.98692810457516, + "eval_accuracy": 0.9558823529411765, + "eval_loss": 0.2594568133354187, + "eval_runtime": 34.9133, + "eval_samples_per_second": 3.895, + "eval_steps_per_second": 0.487, + "step": 2371 + }, + { + "epoch": 62.22222222222222, + "grad_norm": 17.791810989379883, + "learning_rate": 2.0760233918128656e-05, + "loss": 0.0294, + "step": 2380 + }, + { + "epoch": 62.48366013071895, + "grad_norm": 0.014880876056849957, + "learning_rate": 2.06140350877193e-05, + "loss": 0.0516, + "step": 2390 + }, + { + "epoch": 62.745098039215684, + "grad_norm": 33.730533599853516, + "learning_rate": 2.046783625730994e-05, + "loss": 0.0562, + "step": 2400 + }, + { + "epoch": 62.98039215686274, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.3763536512851715, + "eval_runtime": 35.0422, + "eval_samples_per_second": 3.881, + "eval_steps_per_second": 0.485, + "step": 2409 + }, + { + "epoch": 63.00653594771242, + "grad_norm": 58.39078903198242, + "learning_rate": 2.0321637426900587e-05, + "loss": 0.0751, + "step": 2410 + }, + { + "epoch": 63.26797385620915, + "grad_norm": 0.0864597037434578, + "learning_rate": 2.0175438596491227e-05, + "loss": 0.0393, + "step": 2420 + }, + { + "epoch": 63.529411764705884, + "grad_norm": 18.966829299926758, + "learning_rate": 2.0029239766081874e-05, + "loss": 0.0251, + "step": 2430 + }, + { + "epoch": 63.790849673202615, + "grad_norm": 25.66364288330078, + "learning_rate": 1.9883040935672515e-05, + "loss": 0.0216, + "step": 2440 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.2643776834011078, + "eval_runtime": 17.3782, + "eval_samples_per_second": 7.826, + "eval_steps_per_second": 0.978, + "step": 2448 + }, + { + "epoch": 64.05228758169935, + "grad_norm": 1.6527997255325317, + "learning_rate": 1.9736842105263158e-05, + "loss": 0.054, + "step": 2450 + }, + { + "epoch": 64.31372549019608, + "grad_norm": 0.06280579417943954, + "learning_rate": 1.9590643274853802e-05, + "loss": 0.0287, + "step": 2460 + }, + { + "epoch": 64.57516339869281, + "grad_norm": 1.6318433284759521, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.0399, + "step": 2470 + }, + { + "epoch": 64.83660130718954, + "grad_norm": 1.7933380603790283, + "learning_rate": 1.929824561403509e-05, + "loss": 0.0219, + "step": 2480 + }, + { + "epoch": 64.99346405228758, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.30917930603027344, + "eval_runtime": 17.1251, + "eval_samples_per_second": 7.942, + "eval_steps_per_second": 0.993, + "step": 2486 + }, + { + "epoch": 65.09803921568627, + "grad_norm": 10.366903305053711, + "learning_rate": 1.9152046783625733e-05, + "loss": 0.0539, + "step": 2490 + }, + { + "epoch": 65.359477124183, + "grad_norm": 0.2696276307106018, + "learning_rate": 1.9005847953216373e-05, + "loss": 0.0123, + "step": 2500 + }, + { + "epoch": 65.62091503267973, + "grad_norm": 2.0707309246063232, + "learning_rate": 1.885964912280702e-05, + "loss": 0.0209, + "step": 2510 + }, + { + "epoch": 65.88235294117646, + "grad_norm": 0.026714438572525978, + "learning_rate": 1.871345029239766e-05, + "loss": 0.0272, + "step": 2520 + }, + { + "epoch": 65.98692810457516, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.2898404896259308, + "eval_runtime": 17.5281, + "eval_samples_per_second": 7.759, + "eval_steps_per_second": 0.97, + "step": 2524 + }, + { + "epoch": 66.14379084967321, + "grad_norm": 0.15798357129096985, + "learning_rate": 1.8567251461988304e-05, + "loss": 0.0091, + "step": 2530 + }, + { + "epoch": 66.40522875816994, + "grad_norm": 85.56695556640625, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.0221, + "step": 2540 + }, + { + "epoch": 66.66666666666667, + "grad_norm": 25.615230560302734, + "learning_rate": 1.827485380116959e-05, + "loss": 0.0645, + "step": 2550 + }, + { + "epoch": 66.9281045751634, + "grad_norm": 22.72310447692871, + "learning_rate": 1.8128654970760235e-05, + "loss": 0.027, + "step": 2560 + }, + { + "epoch": 66.98039215686275, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.2693423628807068, + "eval_runtime": 23.0579, + "eval_samples_per_second": 5.898, + "eval_steps_per_second": 0.737, + "step": 2562 + }, + { + "epoch": 67.18954248366013, + "grad_norm": 24.883161544799805, + "learning_rate": 1.7982456140350878e-05, + "loss": 0.0293, + "step": 2570 + }, + { + "epoch": 67.45098039215686, + "grad_norm": 6.90622615814209, + "learning_rate": 1.7836257309941522e-05, + "loss": 0.022, + "step": 2580 + }, + { + "epoch": 67.7124183006536, + "grad_norm": 48.23540115356445, + "learning_rate": 1.7690058479532165e-05, + "loss": 0.0509, + "step": 2590 + }, + { + "epoch": 67.97385620915033, + "grad_norm": 0.07863592356443405, + "learning_rate": 1.7543859649122806e-05, + "loss": 0.0397, + "step": 2600 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.38426852226257324, + "eval_runtime": 23.971, + "eval_samples_per_second": 5.674, + "eval_steps_per_second": 0.709, + "step": 2601 + }, + { + "epoch": 68.23529411764706, + "grad_norm": 4.26972770690918, + "learning_rate": 1.7397660818713453e-05, + "loss": 0.0409, + "step": 2610 + }, + { + "epoch": 68.49673202614379, + "grad_norm": 1.8150982856750488, + "learning_rate": 1.7251461988304093e-05, + "loss": 0.0315, + "step": 2620 + }, + { + "epoch": 68.75816993464052, + "grad_norm": 13.07569694519043, + "learning_rate": 1.7105263157894737e-05, + "loss": 0.0154, + "step": 2630 + }, + { + "epoch": 68.99346405228758, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.30511775612831116, + "eval_runtime": 23.3134, + "eval_samples_per_second": 5.834, + "eval_steps_per_second": 0.729, + "step": 2639 + }, + { + "epoch": 69.01960784313725, + "grad_norm": 0.576351523399353, + "learning_rate": 1.695906432748538e-05, + "loss": 0.0387, + "step": 2640 + }, + { + "epoch": 69.28104575163398, + "grad_norm": 0.867915153503418, + "learning_rate": 1.6812865497076024e-05, + "loss": 0.0178, + "step": 2650 + }, + { + "epoch": 69.54248366013071, + "grad_norm": 20.2279052734375, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0392, + "step": 2660 + }, + { + "epoch": 69.80392156862744, + "grad_norm": 0.04353189095854759, + "learning_rate": 1.652046783625731e-05, + "loss": 0.0004, + "step": 2670 + }, + { + "epoch": 69.98692810457516, + "eval_accuracy": 0.9411764705882353, + "eval_loss": 0.39089399576187134, + "eval_runtime": 23.3469, + "eval_samples_per_second": 5.825, + "eval_steps_per_second": 0.728, + "step": 2677 + }, + { + "epoch": 70.06535947712419, + "grad_norm": 77.49730682373047, + "learning_rate": 1.6374269005847955e-05, + "loss": 0.0467, + "step": 2680 + }, + { + "epoch": 70.32679738562092, + "grad_norm": 49.50137710571289, + "learning_rate": 1.62280701754386e-05, + "loss": 0.0228, + "step": 2690 + }, + { + "epoch": 70.58823529411765, + "grad_norm": 0.5024857521057129, + "learning_rate": 1.608187134502924e-05, + "loss": 0.0045, + "step": 2700 + }, + { + "epoch": 70.84967320261438, + "grad_norm": 3.8934128284454346, + "learning_rate": 1.5935672514619886e-05, + "loss": 0.0651, + "step": 2710 + }, + { + "epoch": 70.98039215686275, + "eval_accuracy": 0.9485294117647058, + "eval_loss": 0.29772186279296875, + "eval_runtime": 25.8712, + "eval_samples_per_second": 5.257, + "eval_steps_per_second": 0.657, + "step": 2715 + }, + { + "epoch": 71.11111111111111, + "grad_norm": 7.867006778717041, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.008, + "step": 2720 + }, + { + "epoch": 71.37254901960785, + "grad_norm": 13.64209270477295, + "learning_rate": 1.564327485380117e-05, + "loss": 0.0757, + "step": 2730 + }, + { + "epoch": 71.63398692810458, + "grad_norm": 6.453034400939941, + "learning_rate": 1.5497076023391813e-05, + "loss": 0.0214, + "step": 2740 + }, + { + "epoch": 71.89542483660131, + "grad_norm": 0.1501288115978241, + "learning_rate": 1.5350877192982457e-05, + "loss": 0.016, + "step": 2750 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.2694728374481201, + "eval_runtime": 20.9056, + "eval_samples_per_second": 6.505, + "eval_steps_per_second": 0.813, + "step": 2754 + }, + { + "epoch": 72.15686274509804, + "grad_norm": 0.034015778452157974, + "learning_rate": 1.5204678362573099e-05, + "loss": 0.012, + "step": 2760 + }, + { + "epoch": 72.41830065359477, + "grad_norm": 11.159213066101074, + "learning_rate": 1.5058479532163744e-05, + "loss": 0.0444, + "step": 2770 + }, + { + "epoch": 72.6797385620915, + "grad_norm": 2.5402066707611084, + "learning_rate": 1.4912280701754386e-05, + "loss": 0.0359, + "step": 2780 + }, + { + "epoch": 72.94117647058823, + "grad_norm": 0.016565600410103798, + "learning_rate": 1.4766081871345031e-05, + "loss": 0.0351, + "step": 2790 + }, + { + "epoch": 72.99346405228758, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.2720423936843872, + "eval_runtime": 22.3116, + "eval_samples_per_second": 6.095, + "eval_steps_per_second": 0.762, + "step": 2792 + }, + { + "epoch": 73.20261437908496, + "grad_norm": 79.11601257324219, + "learning_rate": 1.4619883040935673e-05, + "loss": 0.044, + "step": 2800 + }, + { + "epoch": 73.4640522875817, + "grad_norm": 5.53911018371582, + "learning_rate": 1.4473684210526317e-05, + "loss": 0.0298, + "step": 2810 + }, + { + "epoch": 73.72549019607843, + "grad_norm": 0.40750911831855774, + "learning_rate": 1.4327485380116959e-05, + "loss": 0.011, + "step": 2820 + }, + { + "epoch": 73.98692810457516, + "grad_norm": 0.9360626339912415, + "learning_rate": 1.4181286549707604e-05, + "loss": 0.0206, + "step": 2830 + }, + { + "epoch": 73.98692810457516, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.25490206480026245, + "eval_runtime": 22.7726, + "eval_samples_per_second": 5.972, + "eval_steps_per_second": 0.747, + "step": 2830 + }, + { + "epoch": 74.2483660130719, + "grad_norm": 6.835451602935791, + "learning_rate": 1.4035087719298246e-05, + "loss": 0.0109, + "step": 2840 + }, + { + "epoch": 74.50980392156863, + "grad_norm": 0.1265513300895691, + "learning_rate": 1.388888888888889e-05, + "loss": 0.0436, + "step": 2850 + }, + { + "epoch": 74.77124183006536, + "grad_norm": 0.20871244370937347, + "learning_rate": 1.3742690058479531e-05, + "loss": 0.0109, + "step": 2860 + }, + { + "epoch": 74.98039215686275, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.24122387170791626, + "eval_runtime": 19.4498, + "eval_samples_per_second": 6.992, + "eval_steps_per_second": 0.874, + "step": 2868 + }, + { + "epoch": 75.0326797385621, + "grad_norm": 24.267925262451172, + "learning_rate": 1.3596491228070177e-05, + "loss": 0.0207, + "step": 2870 + }, + { + "epoch": 75.29411764705883, + "grad_norm": 9.061148643493652, + "learning_rate": 1.3450292397660819e-05, + "loss": 0.0105, + "step": 2880 + }, + { + "epoch": 75.55555555555556, + "grad_norm": 1.2824314832687378, + "learning_rate": 1.3304093567251464e-05, + "loss": 0.0182, + "step": 2890 + }, + { + "epoch": 75.81699346405229, + "grad_norm": 0.003347081132233143, + "learning_rate": 1.3157894736842106e-05, + "loss": 0.0012, + "step": 2900 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.34939995408058167, + "eval_runtime": 20.8219, + "eval_samples_per_second": 6.532, + "eval_steps_per_second": 0.816, + "step": 2907 + }, + { + "epoch": 76.07843137254902, + "grad_norm": 5.410060882568359, + "learning_rate": 1.301169590643275e-05, + "loss": 0.0214, + "step": 2910 + }, + { + "epoch": 76.33986928104575, + "grad_norm": 0.6613653898239136, + "learning_rate": 1.2865497076023392e-05, + "loss": 0.0261, + "step": 2920 + }, + { + "epoch": 76.60130718954248, + "grad_norm": 1.0403037071228027, + "learning_rate": 1.2719298245614037e-05, + "loss": 0.0555, + "step": 2930 + }, + { + "epoch": 76.86274509803921, + "grad_norm": 15.238615036010742, + "learning_rate": 1.2573099415204679e-05, + "loss": 0.0418, + "step": 2940 + }, + { + "epoch": 76.99346405228758, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.37292152643203735, + "eval_runtime": 20.8077, + "eval_samples_per_second": 6.536, + "eval_steps_per_second": 0.817, + "step": 2945 + }, + { + "epoch": 77.12418300653594, + "grad_norm": 31.79336166381836, + "learning_rate": 1.242690058479532e-05, + "loss": 0.0302, + "step": 2950 + }, + { + "epoch": 77.38562091503267, + "grad_norm": 0.0776483416557312, + "learning_rate": 1.2280701754385964e-05, + "loss": 0.0094, + "step": 2960 + }, + { + "epoch": 77.6470588235294, + "grad_norm": 63.487571716308594, + "learning_rate": 1.2134502923976608e-05, + "loss": 0.0473, + "step": 2970 + }, + { + "epoch": 77.90849673202614, + "grad_norm": 0.09107412397861481, + "learning_rate": 1.1988304093567252e-05, + "loss": 0.0165, + "step": 2980 + }, + { + "epoch": 77.98692810457516, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.347072571516037, + "eval_runtime": 17.8737, + "eval_samples_per_second": 7.609, + "eval_steps_per_second": 0.951, + "step": 2983 + }, + { + "epoch": 78.16993464052288, + "grad_norm": 36.47078323364258, + "learning_rate": 1.1842105263157895e-05, + "loss": 0.0176, + "step": 2990 + }, + { + "epoch": 78.43137254901961, + "grad_norm": 0.0024324676487594843, + "learning_rate": 1.1695906432748537e-05, + "loss": 0.0317, + "step": 3000 + }, + { + "epoch": 78.69281045751634, + "grad_norm": 26.059871673583984, + "learning_rate": 1.154970760233918e-05, + "loss": 0.0699, + "step": 3010 + }, + { + "epoch": 78.95424836601308, + "grad_norm": 38.14042282104492, + "learning_rate": 1.1403508771929824e-05, + "loss": 0.0163, + "step": 3020 + }, + { + "epoch": 78.98039215686275, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.29730716347694397, + "eval_runtime": 18.5858, + "eval_samples_per_second": 7.317, + "eval_steps_per_second": 0.915, + "step": 3021 + }, + { + "epoch": 79.2156862745098, + "grad_norm": 87.14070129394531, + "learning_rate": 1.1257309941520468e-05, + "loss": 0.0556, + "step": 3030 + }, + { + "epoch": 79.47712418300654, + "grad_norm": 3.418160915374756, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0073, + "step": 3040 + }, + { + "epoch": 79.73856209150327, + "grad_norm": 22.285499572753906, + "learning_rate": 1.0964912280701754e-05, + "loss": 0.0249, + "step": 3050 + }, + { + "epoch": 80.0, + "grad_norm": 35.9242057800293, + "learning_rate": 1.0818713450292397e-05, + "loss": 0.0202, + "step": 3060 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.9558823529411765, + "eval_loss": 0.3729775846004486, + "eval_runtime": 19.8789, + "eval_samples_per_second": 6.841, + "eval_steps_per_second": 0.855, + "step": 3060 + }, + { + "epoch": 80.26143790849673, + "grad_norm": 15.128210067749023, + "learning_rate": 1.067251461988304e-05, + "loss": 0.0628, + "step": 3070 + }, + { + "epoch": 80.52287581699346, + "grad_norm": 29.2634220123291, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.0244, + "step": 3080 + }, + { + "epoch": 80.7843137254902, + "grad_norm": 79.84837341308594, + "learning_rate": 1.0380116959064328e-05, + "loss": 0.0368, + "step": 3090 + }, + { + "epoch": 80.99346405228758, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.2876713275909424, + "eval_runtime": 19.4821, + "eval_samples_per_second": 6.981, + "eval_steps_per_second": 0.873, + "step": 3098 + }, + { + "epoch": 81.04575163398692, + "grad_norm": 2.7281501293182373, + "learning_rate": 1.023391812865497e-05, + "loss": 0.0238, + "step": 3100 + }, + { + "epoch": 81.30718954248366, + "grad_norm": 0.0004346697241999209, + "learning_rate": 1.0087719298245614e-05, + "loss": 0.0305, + "step": 3110 + }, + { + "epoch": 81.56862745098039, + "grad_norm": 0.03860533982515335, + "learning_rate": 9.941520467836257e-06, + "loss": 0.0136, + "step": 3120 + }, + { + "epoch": 81.83006535947712, + "grad_norm": 0.4280990958213806, + "learning_rate": 9.795321637426901e-06, + "loss": 0.0374, + "step": 3130 + }, + { + "epoch": 81.98692810457516, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.41433659195899963, + "eval_runtime": 19.9936, + "eval_samples_per_second": 6.802, + "eval_steps_per_second": 0.85, + "step": 3136 + }, + { + "epoch": 82.09150326797386, + "grad_norm": 31.7745418548584, + "learning_rate": 9.649122807017545e-06, + "loss": 0.0105, + "step": 3140 + }, + { + "epoch": 82.3529411764706, + "grad_norm": 2.9742166996002197, + "learning_rate": 9.502923976608186e-06, + "loss": 0.0361, + "step": 3150 + }, + { + "epoch": 82.61437908496733, + "grad_norm": 3.588392734527588, + "learning_rate": 9.35672514619883e-06, + "loss": 0.0648, + "step": 3160 + }, + { + "epoch": 82.87581699346406, + "grad_norm": 0.4829164147377014, + "learning_rate": 9.210526315789474e-06, + "loss": 0.0296, + "step": 3170 + }, + { + "epoch": 82.98039215686275, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.2895439565181732, + "eval_runtime": 17.9847, + "eval_samples_per_second": 7.562, + "eval_steps_per_second": 0.945, + "step": 3174 + }, + { + "epoch": 83.13725490196079, + "grad_norm": 22.893632888793945, + "learning_rate": 9.064327485380117e-06, + "loss": 0.0115, + "step": 3180 + }, + { + "epoch": 83.39869281045752, + "grad_norm": 0.021368976682424545, + "learning_rate": 8.918128654970761e-06, + "loss": 0.0269, + "step": 3190 + }, + { + "epoch": 83.66013071895425, + "grad_norm": 0.06225317716598511, + "learning_rate": 8.771929824561403e-06, + "loss": 0.0024, + "step": 3200 + }, + { + "epoch": 83.92156862745098, + "grad_norm": 0.05705859139561653, + "learning_rate": 8.625730994152046e-06, + "loss": 0.0405, + "step": 3210 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.9558823529411765, + "eval_loss": 0.29270094633102417, + "eval_runtime": 19.1133, + "eval_samples_per_second": 7.115, + "eval_steps_per_second": 0.889, + "step": 3213 + }, + { + "epoch": 84.18300653594771, + "grad_norm": 24.514904022216797, + "learning_rate": 8.47953216374269e-06, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 84.44444444444444, + "grad_norm": 0.596236526966095, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0035, + "step": 3230 + }, + { + "epoch": 84.70588235294117, + "grad_norm": 0.050445396453142166, + "learning_rate": 8.187134502923977e-06, + "loss": 0.005, + "step": 3240 + }, + { + "epoch": 84.9673202614379, + "grad_norm": 0.07400578260421753, + "learning_rate": 8.04093567251462e-06, + "loss": 0.0097, + "step": 3250 + }, + { + "epoch": 84.99346405228758, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.317930668592453, + "eval_runtime": 18.575, + "eval_samples_per_second": 7.322, + "eval_steps_per_second": 0.915, + "step": 3251 + }, + { + "epoch": 85.22875816993464, + "grad_norm": 12.950275421142578, + "learning_rate": 7.894736842105263e-06, + "loss": 0.0026, + "step": 3260 + }, + { + "epoch": 85.49019607843137, + "grad_norm": 16.546571731567383, + "learning_rate": 7.748538011695907e-06, + "loss": 0.0257, + "step": 3270 + }, + { + "epoch": 85.7516339869281, + "grad_norm": 0.6142169237136841, + "learning_rate": 7.602339181286549e-06, + "loss": 0.0182, + "step": 3280 + }, + { + "epoch": 85.98692810457516, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.30465030670166016, + "eval_runtime": 18.7827, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.905, + "step": 3289 + }, + { + "epoch": 86.01307189542484, + "grad_norm": 0.09201680123806, + "learning_rate": 7.456140350877193e-06, + "loss": 0.0086, + "step": 3290 + }, + { + "epoch": 86.27450980392157, + "grad_norm": 0.6810176372528076, + "learning_rate": 7.3099415204678366e-06, + "loss": 0.0033, + "step": 3300 + }, + { + "epoch": 86.5359477124183, + "grad_norm": 7.0328474044799805, + "learning_rate": 7.163742690058479e-06, + "loss": 0.023, + "step": 3310 + }, + { + "epoch": 86.79738562091504, + "grad_norm": 0.5138120055198669, + "learning_rate": 7.017543859649123e-06, + "loss": 0.0207, + "step": 3320 + }, + { + "epoch": 86.98039215686275, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.3018016815185547, + "eval_runtime": 17.5979, + "eval_samples_per_second": 7.728, + "eval_steps_per_second": 0.966, + "step": 3327 + }, + { + "epoch": 87.05882352941177, + "grad_norm": 0.11021004617214203, + "learning_rate": 6.871345029239766e-06, + "loss": 0.0711, + "step": 3330 + }, + { + "epoch": 87.3202614379085, + "grad_norm": 0.03013734146952629, + "learning_rate": 6.725146198830409e-06, + "loss": 0.0424, + "step": 3340 + }, + { + "epoch": 87.58169934640523, + "grad_norm": 69.32197570800781, + "learning_rate": 6.578947368421053e-06, + "loss": 0.0269, + "step": 3350 + }, + { + "epoch": 87.84313725490196, + "grad_norm": 0.45887792110443115, + "learning_rate": 6.432748538011696e-06, + "loss": 0.0207, + "step": 3360 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.332051545381546, + "eval_runtime": 17.8575, + "eval_samples_per_second": 7.616, + "eval_steps_per_second": 0.952, + "step": 3366 + }, + { + "epoch": 88.10457516339869, + "grad_norm": 0.007120809052139521, + "learning_rate": 6.286549707602339e-06, + "loss": 0.0047, + "step": 3370 + }, + { + "epoch": 88.36601307189542, + "grad_norm": 0.051657985895872116, + "learning_rate": 6.140350877192982e-06, + "loss": 0.0224, + "step": 3380 + }, + { + "epoch": 88.62745098039215, + "grad_norm": 0.6093434691429138, + "learning_rate": 5.994152046783626e-06, + "loss": 0.0052, + "step": 3390 + }, + { + "epoch": 88.88888888888889, + "grad_norm": 25.99680519104004, + "learning_rate": 5.8479532163742686e-06, + "loss": 0.003, + "step": 3400 + }, + { + "epoch": 88.99346405228758, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.30860844254493713, + "eval_runtime": 18.245, + "eval_samples_per_second": 7.454, + "eval_steps_per_second": 0.932, + "step": 3404 + }, + { + "epoch": 89.15032679738562, + "grad_norm": 31.555145263671875, + "learning_rate": 5.701754385964912e-06, + "loss": 0.0329, + "step": 3410 + }, + { + "epoch": 89.41176470588235, + "grad_norm": 18.486536026000977, + "learning_rate": 5.555555555555556e-06, + "loss": 0.029, + "step": 3420 + }, + { + "epoch": 89.67320261437908, + "grad_norm": 0.33306655287742615, + "learning_rate": 5.409356725146199e-06, + "loss": 0.0098, + "step": 3430 + }, + { + "epoch": 89.93464052287581, + "grad_norm": 2.643474578857422, + "learning_rate": 5.263157894736842e-06, + "loss": 0.0157, + "step": 3440 + }, + { + "epoch": 89.98692810457516, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.2947893440723419, + "eval_runtime": 18.1316, + "eval_samples_per_second": 7.501, + "eval_steps_per_second": 0.938, + "step": 3442 + }, + { + "epoch": 90.19607843137256, + "grad_norm": 6.317154407501221, + "learning_rate": 5.116959064327485e-06, + "loss": 0.008, + "step": 3450 + }, + { + "epoch": 90.45751633986929, + "grad_norm": 1.63987398147583, + "learning_rate": 4.970760233918129e-06, + "loss": 0.0219, + "step": 3460 + }, + { + "epoch": 90.71895424836602, + "grad_norm": 8.074739456176758, + "learning_rate": 4.824561403508772e-06, + "loss": 0.0188, + "step": 3470 + }, + { + "epoch": 90.98039215686275, + "grad_norm": 0.2915269136428833, + "learning_rate": 4.678362573099415e-06, + "loss": 0.0428, + "step": 3480 + }, + { + "epoch": 90.98039215686275, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.3174949586391449, + "eval_runtime": 17.8483, + "eval_samples_per_second": 7.62, + "eval_steps_per_second": 0.952, + "step": 3480 + }, + { + "epoch": 91.24183006535948, + "grad_norm": 0.3356679677963257, + "learning_rate": 4.532163742690059e-06, + "loss": 0.0161, + "step": 3490 + }, + { + "epoch": 91.50326797385621, + "grad_norm": 1.1951477527618408, + "learning_rate": 4.3859649122807014e-06, + "loss": 0.0205, + "step": 3500 + }, + { + "epoch": 91.76470588235294, + "grad_norm": 0.05076509341597557, + "learning_rate": 4.239766081871345e-06, + "loss": 0.0189, + "step": 3510 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.3239772915840149, + "eval_runtime": 17.301, + "eval_samples_per_second": 7.861, + "eval_steps_per_second": 0.983, + "step": 3519 + }, + { + "epoch": 92.02614379084967, + "grad_norm": 1.3812580108642578, + "learning_rate": 4.093567251461989e-06, + "loss": 0.0212, + "step": 3520 + }, + { + "epoch": 92.2875816993464, + "grad_norm": 0.3320296108722687, + "learning_rate": 3.9473684210526315e-06, + "loss": 0.0073, + "step": 3530 + }, + { + "epoch": 92.54901960784314, + "grad_norm": 0.009532331489026546, + "learning_rate": 3.8011695906432747e-06, + "loss": 0.0053, + "step": 3540 + }, + { + "epoch": 92.81045751633987, + "grad_norm": 0.5157586932182312, + "learning_rate": 3.6549707602339183e-06, + "loss": 0.0046, + "step": 3550 + }, + { + "epoch": 92.99346405228758, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.341442346572876, + "eval_runtime": 18.8672, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.901, + "step": 3557 + }, + { + "epoch": 93.0718954248366, + "grad_norm": 61.38653564453125, + "learning_rate": 3.5087719298245615e-06, + "loss": 0.0246, + "step": 3560 + }, + { + "epoch": 93.33333333333333, + "grad_norm": 0.477070152759552, + "learning_rate": 3.3625730994152047e-06, + "loss": 0.0639, + "step": 3570 + }, + { + "epoch": 93.59477124183006, + "grad_norm": 68.3900375366211, + "learning_rate": 3.216374269005848e-06, + "loss": 0.0255, + "step": 3580 + }, + { + "epoch": 93.85620915032679, + "grad_norm": 0.3444403111934662, + "learning_rate": 3.070175438596491e-06, + "loss": 0.0057, + "step": 3590 + }, + { + "epoch": 93.98692810457516, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.33292174339294434, + "eval_runtime": 17.7377, + "eval_samples_per_second": 7.667, + "eval_steps_per_second": 0.958, + "step": 3595 + }, + { + "epoch": 94.11764705882354, + "grad_norm": 0.04389649257063866, + "learning_rate": 2.9239766081871343e-06, + "loss": 0.0058, + "step": 3600 + }, + { + "epoch": 94.37908496732027, + "grad_norm": 0.5849317908287048, + "learning_rate": 2.777777777777778e-06, + "loss": 0.0586, + "step": 3610 + }, + { + "epoch": 94.640522875817, + "grad_norm": 0.019542796537280083, + "learning_rate": 2.631578947368421e-06, + "loss": 0.001, + "step": 3620 + }, + { + "epoch": 94.90196078431373, + "grad_norm": 0.002426290884613991, + "learning_rate": 2.4853801169590643e-06, + "loss": 0.0165, + "step": 3630 + }, + { + "epoch": 94.98039215686275, + "eval_accuracy": 0.9632352941176471, + "eval_loss": 0.32402223348617554, + "eval_runtime": 17.5747, + "eval_samples_per_second": 7.738, + "eval_steps_per_second": 0.967, + "step": 3633 + }, + { + "epoch": 95.16339869281046, + "grad_norm": 2.353595495223999, + "learning_rate": 2.3391812865497075e-06, + "loss": 0.0009, + "step": 3640 + }, + { + "epoch": 95.42483660130719, + "grad_norm": 0.7732095718383789, + "learning_rate": 2.1929824561403507e-06, + "loss": 0.0273, + "step": 3650 + }, + { + "epoch": 95.68627450980392, + "grad_norm": 0.006318532861769199, + "learning_rate": 2.0467836257309943e-06, + "loss": 0.0219, + "step": 3660 + }, + { + "epoch": 95.94771241830065, + "grad_norm": 0.12237526476383209, + "learning_rate": 1.9005847953216373e-06, + "loss": 0.006, + "step": 3670 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.9705882352941176, + "eval_loss": 0.3180083632469177, + "eval_runtime": 18.1825, + "eval_samples_per_second": 7.48, + "eval_steps_per_second": 0.935, + "step": 3672 + }, + { + "epoch": 96.20915032679738, + "grad_norm": 4.133842468261719, + "learning_rate": 1.7543859649122807e-06, + "loss": 0.0876, + "step": 3680 + }, + { + "epoch": 96.47058823529412, + "grad_norm": 14.3917236328125, + "learning_rate": 1.608187134502924e-06, + "loss": 0.0033, + "step": 3690 + }, + { + "epoch": 96.73202614379085, + "grad_norm": 0.6327334642410278, + "learning_rate": 1.4619883040935671e-06, + "loss": 0.0045, + "step": 3700 + }, + { + "epoch": 96.99346405228758, + "grad_norm": 0.47620221972465515, + "learning_rate": 1.3157894736842106e-06, + "loss": 0.0172, + "step": 3710 + }, + { + "epoch": 96.99346405228758, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.3103199303150177, + "eval_runtime": 17.4264, + "eval_samples_per_second": 7.804, + "eval_steps_per_second": 0.976, + "step": 3710 + }, + { + "epoch": 97.25490196078431, + "grad_norm": 43.838233947753906, + "learning_rate": 1.1695906432748538e-06, + "loss": 0.0047, + "step": 3720 + }, + { + "epoch": 97.51633986928104, + "grad_norm": 0.001560373231768608, + "learning_rate": 1.0233918128654972e-06, + "loss": 0.0032, + "step": 3730 + }, + { + "epoch": 97.77777777777777, + "grad_norm": 0.00045679722097702324, + "learning_rate": 8.771929824561404e-07, + "loss": 0.0109, + "step": 3740 + }, + { + "epoch": 97.98692810457516, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.3034810721874237, + "eval_runtime": 18.06, + "eval_samples_per_second": 7.53, + "eval_steps_per_second": 0.941, + "step": 3748 + }, + { + "epoch": 98.03921568627452, + "grad_norm": 0.0029410182032734156, + "learning_rate": 7.309941520467836e-07, + "loss": 0.0093, + "step": 3750 + }, + { + "epoch": 98.30065359477125, + "grad_norm": 0.060371335595846176, + "learning_rate": 5.847953216374269e-07, + "loss": 0.0147, + "step": 3760 + }, + { + "epoch": 98.56209150326798, + "grad_norm": 0.0018022909061983228, + "learning_rate": 4.385964912280702e-07, + "loss": 0.0325, + "step": 3770 + }, + { + "epoch": 98.82352941176471, + "grad_norm": 0.866423487663269, + "learning_rate": 2.9239766081871344e-07, + "loss": 0.0172, + "step": 3780 + }, + { + "epoch": 98.98039215686275, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.3034467101097107, + "eval_runtime": 20.5056, + "eval_samples_per_second": 6.632, + "eval_steps_per_second": 0.829, + "step": 3786 + }, + { + "epoch": 99.08496732026144, + "grad_norm": 0.015289215371012688, + "learning_rate": 1.4619883040935672e-07, + "loss": 0.0003, + "step": 3790 + }, + { + "epoch": 99.34640522875817, + "grad_norm": 0.3536844849586487, + "learning_rate": 0.0, + "loss": 0.0219, + "step": 3800 + }, + { + "epoch": 99.34640522875817, + "eval_accuracy": 0.9779411764705882, + "eval_loss": 0.3036399185657501, + "eval_runtime": 18.1299, + "eval_samples_per_second": 7.501, + "eval_steps_per_second": 0.938, + "step": 3800 + }, + { + "epoch": 99.34640522875817, + "step": 3800, + "total_flos": 3.0228260830838784e+18, + "train_loss": 0.1524556069365874, + "train_runtime": 23400.6351, + "train_samples_per_second": 5.231, + "train_steps_per_second": 0.162 + } + ], + "logging_steps": 10, + "max_steps": 3800, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.0228260830838784e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}