{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.654545454545454, "eval_steps": 500, "global_step": 270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03636363636363636, "grad_norm": 3.691458225250244, "learning_rate": 7.4074074074074075e-06, "loss": 2.6618, "step": 1 }, { "epoch": 0.18181818181818182, "grad_norm": 3.446274518966675, "learning_rate": 3.7037037037037037e-05, "loss": 2.6408, "step": 5 }, { "epoch": 0.36363636363636365, "grad_norm": 2.032472848892212, "learning_rate": 7.407407407407407e-05, "loss": 2.5317, "step": 10 }, { "epoch": 0.5454545454545454, "grad_norm": 0.9463324546813965, "learning_rate": 0.00011111111111111112, "loss": 2.36, "step": 15 }, { "epoch": 0.7272727272727273, "grad_norm": 0.7080094218254089, "learning_rate": 0.00014814814814814815, "loss": 2.2174, "step": 20 }, { "epoch": 0.9090909090909091, "grad_norm": 0.5037975311279297, "learning_rate": 0.0001851851851851852, "loss": 2.1182, "step": 25 }, { "epoch": 1.0, "eval_loss": 2.782132625579834, "eval_runtime": 0.8292, "eval_samples_per_second": 12.06, "eval_steps_per_second": 1.206, "step": 28 }, { "epoch": 1.0727272727272728, "grad_norm": 0.41888633370399475, "learning_rate": 0.00019992479525042303, "loss": 2.0277, "step": 30 }, { "epoch": 1.2545454545454544, "grad_norm": 0.27915704250335693, "learning_rate": 0.00019946562024066014, "loss": 1.9587, "step": 35 }, { "epoch": 1.4363636363636363, "grad_norm": 0.20056034624576569, "learning_rate": 0.00019859096633447965, "loss": 1.9087, "step": 40 }, { "epoch": 1.6181818181818182, "grad_norm": 0.16737522184848785, "learning_rate": 0.00019730448705798239, "loss": 1.8766, "step": 45 }, { "epoch": 1.8, "grad_norm": 0.15048423409461975, "learning_rate": 0.00019561155617738797, "loss": 1.8481, "step": 50 }, { "epoch": 1.981818181818182, "grad_norm": 0.1224176436662674, "learning_rate": 0.000193519245252219, "loss": 1.8354, "step": 55 }, { "epoch": 2.0, "eval_loss": 2.737755537033081, "eval_runtime": 0.829, "eval_samples_per_second": 12.063, "eval_steps_per_second": 1.206, "step": 56 }, { "epoch": 2.1454545454545455, "grad_norm": 0.1324545294046402, "learning_rate": 0.0001910362940966147, "loss": 1.8118, "step": 60 }, { "epoch": 2.327272727272727, "grad_norm": 0.11611360311508179, "learning_rate": 0.0001881730742721608, "loss": 1.7937, "step": 65 }, { "epoch": 2.509090909090909, "grad_norm": 0.1148991584777832, "learning_rate": 0.00018494154576472976, "loss": 1.7791, "step": 70 }, { "epoch": 2.690909090909091, "grad_norm": 0.11438702791929245, "learning_rate": 0.00018135520702629675, "loss": 1.7654, "step": 75 }, { "epoch": 2.8727272727272726, "grad_norm": 0.11716635525226593, "learning_rate": 0.00017742903859041325, "loss": 1.7604, "step": 80 }, { "epoch": 3.0, "eval_loss": 2.7259373664855957, "eval_runtime": 0.8303, "eval_samples_per_second": 12.044, "eval_steps_per_second": 1.204, "step": 84 }, { "epoch": 3.036363636363636, "grad_norm": 0.1302882581949234, "learning_rate": 0.00017317944049686124, "loss": 1.7453, "step": 85 }, { "epoch": 3.2181818181818183, "grad_norm": 0.12489154189825058, "learning_rate": 0.0001686241637868734, "loss": 1.7396, "step": 90 }, { "epoch": 3.4, "grad_norm": 0.10804688185453415, "learning_rate": 0.0001637822363550706, "loss": 1.7272, "step": 95 }, { "epoch": 3.581818181818182, "grad_norm": 0.1448238343000412, "learning_rate": 0.0001586738834678418, "loss": 1.7231, "step": 100 }, { "epoch": 3.7636363636363637, "grad_norm": 0.12403673678636551, "learning_rate": 0.00015332044328016914, "loss": 1.7101, "step": 105 }, { "epoch": 3.9454545454545453, "grad_norm": 0.11520184576511383, "learning_rate": 0.0001477442777037949, "loss": 1.7035, "step": 110 }, { "epoch": 4.0, "eval_loss": 2.724990129470825, "eval_runtime": 0.8296, "eval_samples_per_second": 12.053, "eval_steps_per_second": 1.205, "step": 112 }, { "epoch": 4.109090909090909, "grad_norm": 0.11850611865520477, "learning_rate": 0.0001419686789990429, "loss": 1.6998, "step": 115 }, { "epoch": 4.290909090909091, "grad_norm": 0.141310453414917, "learning_rate": 0.00013601777248047105, "loss": 1.6942, "step": 120 }, { "epoch": 4.472727272727273, "grad_norm": 0.14388997852802277, "learning_rate": 0.00012991641574276418, "loss": 1.6887, "step": 125 }, { "epoch": 4.654545454545454, "grad_norm": 0.11356977373361588, "learning_rate": 0.00012369009482781192, "loss": 1.6845, "step": 130 }, { "epoch": 4.836363636363636, "grad_norm": 0.13505423069000244, "learning_rate": 0.00011736481776669306, "loss": 1.6801, "step": 135 }, { "epoch": 5.0, "grad_norm": 0.18071481585502625, "learning_rate": 0.00011096700594125318, "loss": 1.6822, "step": 140 }, { "epoch": 5.0, "eval_loss": 2.7262730598449707, "eval_runtime": 0.8327, "eval_samples_per_second": 12.009, "eval_steps_per_second": 1.201, "step": 140 }, { "epoch": 5.181818181818182, "grad_norm": 0.12405228614807129, "learning_rate": 0.00010452338371907064, "loss": 1.671, "step": 145 }, { "epoch": 5.363636363636363, "grad_norm": 0.15709254145622253, "learning_rate": 9.806086682281758e-05, "loss": 1.6697, "step": 150 }, { "epoch": 5.545454545454545, "grad_norm": 0.1405353993177414, "learning_rate": 9.160644990030931e-05, "loss": 1.6707, "step": 155 }, { "epoch": 5.7272727272727275, "grad_norm": 0.13487176597118378, "learning_rate": 8.518709376487515e-05, "loss": 1.6619, "step": 160 }, { "epoch": 5.909090909090909, "grad_norm": 0.12394227087497711, "learning_rate": 7.882961277705895e-05, "loss": 1.6619, "step": 165 }, { "epoch": 6.0, "eval_loss": 2.7253997325897217, "eval_runtime": 0.8321, "eval_samples_per_second": 12.017, "eval_steps_per_second": 1.202, "step": 168 }, { "epoch": 6.072727272727272, "grad_norm": 0.11816684156656265, "learning_rate": 7.256056283806986e-05, "loss": 1.6573, "step": 170 }, { "epoch": 6.254545454545455, "grad_norm": 0.14117498695850372, "learning_rate": 6.640613046284581e-05, "loss": 1.6622, "step": 175 }, { "epoch": 6.4363636363636365, "grad_norm": 0.1342514008283615, "learning_rate": 6.039202339608432e-05, "loss": 1.6535, "step": 180 }, { "epoch": 6.618181818181818, "grad_norm": 0.13483189046382904, "learning_rate": 5.4543363228149946e-05, "loss": 1.6532, "step": 185 }, { "epoch": 6.8, "grad_norm": 0.1636153757572174, "learning_rate": 4.888458045941269e-05, "loss": 1.6482, "step": 190 }, { "epoch": 6.9818181818181815, "grad_norm": 0.1563912183046341, "learning_rate": 4.343931245134616e-05, "loss": 1.6471, "step": 195 }, { "epoch": 7.0, "eval_loss": 2.7240517139434814, "eval_runtime": 0.8312, "eval_samples_per_second": 12.031, "eval_steps_per_second": 1.203, "step": 196 }, { "epoch": 7.1454545454545455, "grad_norm": 0.11320989578962326, "learning_rate": 3.8230304690654304e-05, "loss": 1.6472, "step": 200 }, { "epoch": 7.327272727272727, "grad_norm": 0.111383818089962, "learning_rate": 3.3279315778858036e-05, "loss": 1.6488, "step": 205 }, { "epoch": 7.509090909090909, "grad_norm": 0.10844731330871582, "learning_rate": 2.8607026544210114e-05, "loss": 1.6458, "step": 210 }, { "epoch": 7.690909090909091, "grad_norm": 0.10823339223861694, "learning_rate": 2.423295365558821e-05, "loss": 1.6456, "step": 215 }, { "epoch": 7.872727272727273, "grad_norm": 0.10790830850601196, "learning_rate": 2.01753680992107e-05, "loss": 1.6458, "step": 220 }, { "epoch": 8.0, "eval_loss": 2.7252650260925293, "eval_runtime": 0.8302, "eval_samples_per_second": 12.045, "eval_steps_per_second": 1.204, "step": 224 }, { "epoch": 8.036363636363637, "grad_norm": 0.11462420970201492, "learning_rate": 1.6451218858706374e-05, "loss": 1.643, "step": 225 }, { "epoch": 8.218181818181819, "grad_norm": 0.10164881497621536, "learning_rate": 1.307606211733522e-05, "loss": 1.6435, "step": 230 }, { "epoch": 8.4, "grad_norm": 0.11715802550315857, "learning_rate": 1.0063996278090704e-05, "loss": 1.6436, "step": 235 }, { "epoch": 8.581818181818182, "grad_norm": 0.1077931597828865, "learning_rate": 7.427603073110967e-06, "loss": 1.6437, "step": 240 }, { "epoch": 8.763636363636364, "grad_norm": 0.09881118685007095, "learning_rate": 5.177895008392353e-06, "loss": 1.6415, "step": 245 }, { "epoch": 8.945454545454545, "grad_norm": 0.0973580852150917, "learning_rate": 3.3242693633337983e-06, "loss": 1.641, "step": 250 }, { "epoch": 9.0, "eval_loss": 2.725569009780884, "eval_runtime": 0.8306, "eval_samples_per_second": 12.039, "eval_steps_per_second": 1.204, "step": 252 }, { "epoch": 9.10909090909091, "grad_norm": 0.10264136642217636, "learning_rate": 1.874468937261531e-06, "loss": 1.6464, "step": 255 }, { "epoch": 9.290909090909091, "grad_norm": 0.1021399274468422, "learning_rate": 8.345497068998897e-07, "loss": 1.6443, "step": 260 }, { "epoch": 9.472727272727273, "grad_norm": 0.10423731058835983, "learning_rate": 2.088555298867978e-07, "loss": 1.6436, "step": 265 }, { "epoch": 9.654545454545454, "grad_norm": 0.09860274940729141, "learning_rate": 0.0, "loss": 1.6383, "step": 270 }, { "epoch": 9.654545454545454, "eval_loss": 2.725593328475952, "eval_runtime": 0.8317, "eval_samples_per_second": 12.024, "eval_steps_per_second": 1.202, "step": 270 }, { "epoch": 9.654545454545454, "step": 270, "total_flos": 8.156088875152835e+17, "train_loss": 1.7710220513520418, "train_runtime": 1245.0854, "train_samples_per_second": 112.233, "train_steps_per_second": 0.217 } ], "logging_steps": 5, "max_steps": 270, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.156088875152835e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }