{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 20, "global_step": 108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 2e-05, "loss": 0.9768, "step": 1 }, { "epoch": 0.07, "learning_rate": 4e-05, "loss": 1.0553, "step": 2 }, { "epoch": 0.11, "learning_rate": 6e-05, "loss": 0.9074, "step": 3 }, { "epoch": 0.15, "learning_rate": 8e-05, "loss": 1.0351, "step": 4 }, { "epoch": 0.19, "learning_rate": 0.0001, "loss": 0.9918, "step": 5 }, { "epoch": 0.22, "learning_rate": 0.00012, "loss": 0.9872, "step": 6 }, { "epoch": 0.26, "learning_rate": 0.00014, "loss": 0.9573, "step": 7 }, { "epoch": 0.3, "learning_rate": 0.00016, "loss": 1.0466, "step": 8 }, { "epoch": 0.33, "learning_rate": 0.00018, "loss": 0.8995, "step": 9 }, { "epoch": 0.37, "learning_rate": 0.0002, "loss": 0.9041, "step": 10 }, { "epoch": 0.41, "learning_rate": 0.00019996841892833, "loss": 0.936, "step": 11 }, { "epoch": 0.44, "learning_rate": 0.00019987369566060176, "loss": 0.8254, "step": 12 }, { "epoch": 0.48, "learning_rate": 0.0001997158900260614, "loss": 0.9508, "step": 13 }, { "epoch": 0.52, "learning_rate": 0.00019949510169813003, "loss": 0.929, "step": 14 }, { "epoch": 0.56, "learning_rate": 0.0001992114701314478, "loss": 0.9618, "step": 15 }, { "epoch": 0.59, "learning_rate": 0.0001988651744737914, "loss": 0.9317, "step": 16 }, { "epoch": 0.63, "learning_rate": 0.00019845643345292054, "loss": 0.9399, "step": 17 }, { "epoch": 0.67, "learning_rate": 0.0001979855052384247, "loss": 0.9377, "step": 18 }, { "epoch": 0.7, "learning_rate": 0.00019745268727865774, "loss": 0.9048, "step": 19 }, { "epoch": 0.74, "learning_rate": 0.0001968583161128631, "loss": 0.9311, "step": 20 }, { "epoch": 0.74, "eval_loss": 0.8045752644538879, "eval_runtime": 2.684, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 20 }, { "epoch": 0.78, "learning_rate": 0.0001962027671586086, "loss": 0.9376, "step": 21 }, { "epoch": 0.81, "learning_rate": 0.00019548645447466431, "loss": 0.8598, "step": 22 }, { "epoch": 0.85, "learning_rate": 0.00019470983049947444, "loss": 0.991, "step": 23 }, { "epoch": 0.89, "learning_rate": 0.00019387338576538744, "loss": 0.8472, "step": 24 }, { "epoch": 0.93, "learning_rate": 0.00019297764858882514, "loss": 0.8818, "step": 25 }, { "epoch": 0.96, "learning_rate": 0.00019202318473658705, "loss": 0.8879, "step": 26 }, { "epoch": 1.0, "learning_rate": 0.00019101059706849957, "loss": 0.8483, "step": 27 }, { "epoch": 1.04, "learning_rate": 0.0001899405251566371, "loss": 0.9505, "step": 28 }, { "epoch": 1.07, "learning_rate": 0.00018881364488135448, "loss": 0.9116, "step": 29 }, { "epoch": 1.11, "learning_rate": 0.00018763066800438636, "loss": 0.8575, "step": 30 }, { "epoch": 1.15, "learning_rate": 0.00018639234171928353, "loss": 0.8093, "step": 31 }, { "epoch": 1.19, "learning_rate": 0.00018509944817946922, "loss": 0.7966, "step": 32 }, { "epoch": 1.22, "learning_rate": 0.0001837528040042142, "loss": 0.8263, "step": 33 }, { "epoch": 1.26, "learning_rate": 0.00018235325976284275, "loss": 0.7951, "step": 34 }, { "epoch": 1.3, "learning_rate": 0.00018090169943749476, "loss": 0.849, "step": 35 }, { "epoch": 1.33, "learning_rate": 0.00017939903986478355, "loss": 0.863, "step": 36 }, { "epoch": 1.37, "learning_rate": 0.00017784623015670238, "loss": 0.8144, "step": 37 }, { "epoch": 1.41, "learning_rate": 0.0001762442511011448, "loss": 0.8078, "step": 38 }, { "epoch": 1.44, "learning_rate": 0.00017459411454241822, "loss": 0.7997, "step": 39 }, { "epoch": 1.48, "learning_rate": 0.00017289686274214118, "loss": 0.9322, "step": 40 }, { "epoch": 1.48, "eval_loss": 0.7793169617652893, "eval_runtime": 2.6811, "eval_samples_per_second": 1.119, "eval_steps_per_second": 0.746, "step": 40 }, { "epoch": 1.52, "learning_rate": 0.00017115356772092857, "loss": 0.8279, "step": 41 }, { "epoch": 1.56, "learning_rate": 0.0001693653305812805, "loss": 0.8759, "step": 42 }, { "epoch": 1.59, "learning_rate": 0.00016753328081210245, "loss": 0.8748, "step": 43 }, { "epoch": 1.63, "learning_rate": 0.00016565857557529566, "loss": 0.7638, "step": 44 }, { "epoch": 1.67, "learning_rate": 0.000163742398974869, "loss": 0.7941, "step": 45 }, { "epoch": 1.7, "learning_rate": 0.00016178596130903344, "loss": 0.8321, "step": 46 }, { "epoch": 1.74, "learning_rate": 0.0001597904983057519, "loss": 0.894, "step": 47 }, { "epoch": 1.78, "learning_rate": 0.00015775727034222675, "loss": 0.9176, "step": 48 }, { "epoch": 1.81, "learning_rate": 0.00015568756164881882, "loss": 0.8286, "step": 49 }, { "epoch": 1.85, "learning_rate": 0.00015358267949789966, "loss": 0.9328, "step": 50 }, { "epoch": 1.89, "learning_rate": 0.00015144395337815064, "loss": 0.8644, "step": 51 }, { "epoch": 1.93, "learning_rate": 0.00014927273415482915, "loss": 0.7769, "step": 52 }, { "epoch": 1.96, "learning_rate": 0.0001470703932165333, "loss": 0.8, "step": 53 }, { "epoch": 2.0, "learning_rate": 0.00014483832160900326, "loss": 0.7781, "step": 54 }, { "epoch": 2.04, "learning_rate": 0.00014257792915650728, "loss": 0.7852, "step": 55 }, { "epoch": 2.07, "learning_rate": 0.00014029064357136628, "loss": 0.7796, "step": 56 }, { "epoch": 2.11, "learning_rate": 0.00013797790955218014, "loss": 0.8287, "step": 57 }, { "epoch": 2.15, "learning_rate": 0.00013564118787132506, "loss": 0.6845, "step": 58 }, { "epoch": 2.19, "learning_rate": 0.00013328195445229868, "loss": 0.7821, "step": 59 }, { "epoch": 2.22, "learning_rate": 0.00013090169943749476, "loss": 0.708, "step": 60 }, { "epoch": 2.22, "eval_loss": 0.7880761027336121, "eval_runtime": 2.6843, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 60 }, { "epoch": 2.26, "learning_rate": 0.0001285019262469976, "loss": 0.8098, "step": 61 }, { "epoch": 2.3, "learning_rate": 0.00012608415062898972, "loss": 0.82, "step": 62 }, { "epoch": 2.33, "learning_rate": 0.00012364989970237248, "loss": 0.7187, "step": 63 }, { "epoch": 2.37, "learning_rate": 0.00012120071099220549, "loss": 0.7802, "step": 64 }, { "epoch": 2.41, "learning_rate": 0.00011873813145857249, "loss": 0.6834, "step": 65 }, { "epoch": 2.44, "learning_rate": 0.00011626371651948838, "loss": 0.6808, "step": 66 }, { "epoch": 2.48, "learning_rate": 0.0001137790290684638, "loss": 0.7881, "step": 67 }, { "epoch": 2.52, "learning_rate": 0.00011128563848734816, "loss": 0.7281, "step": 68 }, { "epoch": 2.56, "learning_rate": 0.00010878511965507434, "loss": 0.7231, "step": 69 }, { "epoch": 2.59, "learning_rate": 0.00010627905195293135, "loss": 0.6938, "step": 70 }, { "epoch": 2.63, "learning_rate": 0.00010376901826699348, "loss": 0.7633, "step": 71 }, { "epoch": 2.67, "learning_rate": 0.00010125660398833528, "loss": 0.8253, "step": 72 }, { "epoch": 2.7, "learning_rate": 9.874339601166473e-05, "loss": 0.8197, "step": 73 }, { "epoch": 2.74, "learning_rate": 9.623098173300654e-05, "loss": 0.7403, "step": 74 }, { "epoch": 2.78, "learning_rate": 9.372094804706867e-05, "loss": 0.8175, "step": 75 }, { "epoch": 2.81, "learning_rate": 9.121488034492569e-05, "loss": 0.7249, "step": 76 }, { "epoch": 2.85, "learning_rate": 8.871436151265184e-05, "loss": 0.7029, "step": 77 }, { "epoch": 2.89, "learning_rate": 8.62209709315362e-05, "loss": 0.8081, "step": 78 }, { "epoch": 2.93, "learning_rate": 8.373628348051165e-05, "loss": 0.7087, "step": 79 }, { "epoch": 2.96, "learning_rate": 8.126186854142752e-05, "loss": 0.762, "step": 80 }, { "epoch": 2.96, "eval_loss": 0.7806326746940613, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 80 }, { "epoch": 3.0, "learning_rate": 7.879928900779456e-05, "loss": 0.6724, "step": 81 }, { "epoch": 3.04, "learning_rate": 7.635010029762756e-05, "loss": 0.578, "step": 82 }, { "epoch": 3.07, "learning_rate": 7.391584937101033e-05, "loss": 0.6599, "step": 83 }, { "epoch": 3.11, "learning_rate": 7.149807375300239e-05, "loss": 0.732, "step": 84 }, { "epoch": 3.15, "learning_rate": 6.909830056250527e-05, "loss": 0.6144, "step": 85 }, { "epoch": 3.19, "learning_rate": 6.671804554770135e-05, "loss": 0.6812, "step": 86 }, { "epoch": 3.22, "learning_rate": 6.435881212867493e-05, "loss": 0.6753, "step": 87 }, { "epoch": 3.26, "learning_rate": 6.20220904478199e-05, "loss": 0.6341, "step": 88 }, { "epoch": 3.3, "learning_rate": 5.9709356428633746e-05, "loss": 0.6752, "step": 89 }, { "epoch": 3.33, "learning_rate": 5.7422070843492734e-05, "loss": 0.6995, "step": 90 }, { "epoch": 3.37, "learning_rate": 5.5161678390996796e-05, "loss": 0.6411, "step": 91 }, { "epoch": 3.41, "learning_rate": 5.292960678346675e-05, "loss": 0.6527, "step": 92 }, { "epoch": 3.44, "learning_rate": 5.072726584517086e-05, "loss": 0.7026, "step": 93 }, { "epoch": 3.48, "learning_rate": 4.8556046621849346e-05, "loss": 0.6603, "step": 94 }, { "epoch": 3.52, "learning_rate": 4.6417320502100316e-05, "loss": 0.6798, "step": 95 }, { "epoch": 3.56, "learning_rate": 4.431243835118124e-05, "loss": 0.623, "step": 96 }, { "epoch": 3.59, "learning_rate": 4.224272965777326e-05, "loss": 0.685, "step": 97 }, { "epoch": 3.63, "learning_rate": 4.020950169424815e-05, "loss": 0.7674, "step": 98 }, { "epoch": 3.67, "learning_rate": 3.821403869096658e-05, "loss": 0.7068, "step": 99 }, { "epoch": 3.7, "learning_rate": 3.6257601025131026e-05, "loss": 0.6724, "step": 100 }, { "epoch": 3.7, "eval_loss": 0.811485767364502, "eval_runtime": 2.6837, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 100 }, { "epoch": 3.74, "learning_rate": 3.4341424424704375e-05, "loss": 0.7169, "step": 101 }, { "epoch": 3.78, "learning_rate": 3.246671918789755e-05, "loss": 0.6499, "step": 102 }, { "epoch": 3.81, "learning_rate": 3.063466941871952e-05, "loss": 0.7342, "step": 103 }, { "epoch": 3.85, "learning_rate": 2.8846432279071467e-05, "loss": 0.6587, "step": 104 }, { "epoch": 3.89, "learning_rate": 2.7103137257858868e-05, "loss": 0.6042, "step": 105 }, { "epoch": 3.93, "learning_rate": 2.540588545758179e-05, "loss": 0.6507, "step": 106 }, { "epoch": 3.96, "learning_rate": 2.37557488988552e-05, "loss": 0.6646, "step": 107 }, { "epoch": 4.0, "learning_rate": 2.2153769843297667e-05, "loss": 0.6783, "step": 108 } ], "logging_steps": 1, "max_steps": 135, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.310693833285632e+17, "trial_name": null, "trial_params": null }