{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.42533081285444235, "eval_steps": 25, "global_step": 1125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00945179584120983, "grad_norm": 0.48126843571662903, "learning_rate": 0.0002, "loss": 1.4186, "step": 25 }, { "epoch": 0.00945179584120983, "eval_loss": 1.2822998762130737, "eval_runtime": 1560.6226, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 25 }, { "epoch": 0.01890359168241966, "grad_norm": 0.8693311810493469, "learning_rate": 0.0002, "loss": 1.2478, "step": 50 }, { "epoch": 0.01890359168241966, "eval_loss": 1.261049747467041, "eval_runtime": 1561.6033, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 50 }, { "epoch": 0.02835538752362949, "grad_norm": 0.4594016969203949, "learning_rate": 0.0002, "loss": 1.1961, "step": 75 }, { "epoch": 0.02835538752362949, "eval_loss": 1.2359907627105713, "eval_runtime": 1561.4875, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 75 }, { "epoch": 0.03780718336483932, "grad_norm": 0.7460442185401917, "learning_rate": 0.0002, "loss": 1.245, "step": 100 }, { "epoch": 0.03780718336483932, "eval_loss": 1.2357805967330933, "eval_runtime": 1561.6317, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 100 }, { "epoch": 0.04725897920604915, "grad_norm": 0.37976986169815063, "learning_rate": 0.0002, "loss": 1.2213, "step": 125 }, { "epoch": 0.04725897920604915, "eval_loss": 1.2154258489608765, "eval_runtime": 1561.4032, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 125 }, { "epoch": 0.05671077504725898, "grad_norm": 0.6762637495994568, "learning_rate": 0.0002, "loss": 1.199, "step": 150 }, { "epoch": 0.05671077504725898, "eval_loss": 1.2192034721374512, "eval_runtime": 1561.5162, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 150 }, { "epoch": 0.0661625708884688, "grad_norm": 0.3414202034473419, "learning_rate": 0.0002, "loss": 1.1825, "step": 175 }, { "epoch": 0.0661625708884688, "eval_loss": 1.199916124343872, "eval_runtime": 1561.7104, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 175 }, { "epoch": 0.07561436672967864, "grad_norm": 0.8801635503768921, "learning_rate": 0.0002, "loss": 1.1358, "step": 200 }, { "epoch": 0.07561436672967864, "eval_loss": 1.201659083366394, "eval_runtime": 1561.6394, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 200 }, { "epoch": 0.08506616257088846, "grad_norm": 0.31596821546554565, "learning_rate": 0.0002, "loss": 1.2173, "step": 225 }, { "epoch": 0.08506616257088846, "eval_loss": 1.18569016456604, "eval_runtime": 1561.8408, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 225 }, { "epoch": 0.0945179584120983, "grad_norm": 0.9426243305206299, "learning_rate": 0.0002, "loss": 1.1652, "step": 250 }, { "epoch": 0.0945179584120983, "eval_loss": 1.1847585439682007, "eval_runtime": 1561.46, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 250 }, { "epoch": 0.10396975425330812, "grad_norm": 0.3340831398963928, "learning_rate": 0.0002, "loss": 1.1563, "step": 275 }, { "epoch": 0.10396975425330812, "eval_loss": 1.1764663457870483, "eval_runtime": 1563.551, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 275 }, { "epoch": 0.11342155009451796, "grad_norm": 1.1844408512115479, "learning_rate": 0.0002, "loss": 1.1976, "step": 300 }, { "epoch": 0.11342155009451796, "eval_loss": 1.182220697402954, "eval_runtime": 1562.1264, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 300 }, { "epoch": 0.12287334593572778, "grad_norm": 0.35529959201812744, "learning_rate": 0.0002, "loss": 1.197, "step": 325 }, { "epoch": 0.12287334593572778, "eval_loss": 1.170316219329834, "eval_runtime": 1561.289, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 325 }, { "epoch": 0.1323251417769376, "grad_norm": 0.644234836101532, "learning_rate": 0.0002, "loss": 1.1317, "step": 350 }, { "epoch": 0.1323251417769376, "eval_loss": 1.173732876777649, "eval_runtime": 1561.2179, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 350 }, { "epoch": 0.14177693761814744, "grad_norm": 0.38344722986221313, "learning_rate": 0.0002, "loss": 1.2229, "step": 375 }, { "epoch": 0.14177693761814744, "eval_loss": 1.1632750034332275, "eval_runtime": 1562.0523, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 375 }, { "epoch": 0.15122873345935728, "grad_norm": 0.709377646446228, "learning_rate": 0.0002, "loss": 1.1853, "step": 400 }, { "epoch": 0.15122873345935728, "eval_loss": 1.1692676544189453, "eval_runtime": 1561.2568, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 400 }, { "epoch": 0.16068052930056712, "grad_norm": 0.34974658489227295, "learning_rate": 0.0002, "loss": 1.1479, "step": 425 }, { "epoch": 0.16068052930056712, "eval_loss": 1.1600748300552368, "eval_runtime": 1562.1908, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 425 }, { "epoch": 0.17013232514177692, "grad_norm": 0.8809393644332886, "learning_rate": 0.0002, "loss": 1.1047, "step": 450 }, { "epoch": 0.17013232514177692, "eval_loss": 1.1649720668792725, "eval_runtime": 1563.0297, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 450 }, { "epoch": 0.17958412098298676, "grad_norm": 0.319968581199646, "learning_rate": 0.0002, "loss": 1.1477, "step": 475 }, { "epoch": 0.17958412098298676, "eval_loss": 1.1558725833892822, "eval_runtime": 1561.2187, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 475 }, { "epoch": 0.1890359168241966, "grad_norm": 0.7769630551338196, "learning_rate": 0.0002, "loss": 1.1831, "step": 500 }, { "epoch": 0.1890359168241966, "eval_loss": 1.162941336631775, "eval_runtime": 1561.4384, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 500 }, { "epoch": 0.19848771266540643, "grad_norm": 0.3040992319583893, "learning_rate": 0.0002, "loss": 1.134, "step": 525 }, { "epoch": 0.19848771266540643, "eval_loss": 1.153849720954895, "eval_runtime": 1561.176, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 525 }, { "epoch": 0.20793950850661624, "grad_norm": 0.656995415687561, "learning_rate": 0.0002, "loss": 1.1366, "step": 550 }, { "epoch": 0.20793950850661624, "eval_loss": 1.156500220298767, "eval_runtime": 1561.228, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 550 }, { "epoch": 0.21739130434782608, "grad_norm": 0.32160601019859314, "learning_rate": 0.0002, "loss": 1.1581, "step": 575 }, { "epoch": 0.21739130434782608, "eval_loss": 1.1488285064697266, "eval_runtime": 1561.286, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 575 }, { "epoch": 0.22684310018903592, "grad_norm": 0.5169605016708374, "learning_rate": 0.0002, "loss": 1.1179, "step": 600 }, { "epoch": 0.22684310018903592, "eval_loss": 1.1587059497833252, "eval_runtime": 1561.443, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 600 }, { "epoch": 0.23629489603024575, "grad_norm": 0.3807673156261444, "learning_rate": 0.0002, "loss": 1.1654, "step": 625 }, { "epoch": 0.23629489603024575, "eval_loss": 1.146795630455017, "eval_runtime": 1561.4729, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 625 }, { "epoch": 0.24574669187145556, "grad_norm": 1.206275224685669, "learning_rate": 0.0002, "loss": 1.1549, "step": 650 }, { "epoch": 0.24574669187145556, "eval_loss": 1.149159550666809, "eval_runtime": 1561.5158, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 650 }, { "epoch": 0.2551984877126654, "grad_norm": 0.3218563497066498, "learning_rate": 0.0002, "loss": 1.147, "step": 675 }, { "epoch": 0.2551984877126654, "eval_loss": 1.1431602239608765, "eval_runtime": 1561.424, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 675 }, { "epoch": 0.2646502835538752, "grad_norm": 0.7758462429046631, "learning_rate": 0.0002, "loss": 1.1113, "step": 700 }, { "epoch": 0.2646502835538752, "eval_loss": 1.1470929384231567, "eval_runtime": 1561.3413, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 700 }, { "epoch": 0.2741020793950851, "grad_norm": 0.3400532901287079, "learning_rate": 0.0002, "loss": 1.1684, "step": 725 }, { "epoch": 0.2741020793950851, "eval_loss": 1.1409646272659302, "eval_runtime": 1561.1615, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 725 }, { "epoch": 0.2835538752362949, "grad_norm": 0.48636239767074585, "learning_rate": 0.0002, "loss": 1.1016, "step": 750 }, { "epoch": 0.2835538752362949, "eval_loss": 1.1419570446014404, "eval_runtime": 1561.247, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 750 }, { "epoch": 0.29300567107750475, "grad_norm": 0.3466539978981018, "learning_rate": 0.0002, "loss": 1.1589, "step": 775 }, { "epoch": 0.29300567107750475, "eval_loss": 1.137436032295227, "eval_runtime": 1561.3303, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 775 }, { "epoch": 0.30245746691871456, "grad_norm": 1.0184762477874756, "learning_rate": 0.0002, "loss": 1.1275, "step": 800 }, { "epoch": 0.30245746691871456, "eval_loss": 1.1429524421691895, "eval_runtime": 1561.4223, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 800 }, { "epoch": 0.31190926275992437, "grad_norm": 0.3569687306880951, "learning_rate": 0.0002, "loss": 1.2014, "step": 825 }, { "epoch": 0.31190926275992437, "eval_loss": 1.134521722793579, "eval_runtime": 1561.5607, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 825 }, { "epoch": 0.32136105860113423, "grad_norm": 0.503614068031311, "learning_rate": 0.0002, "loss": 1.0947, "step": 850 }, { "epoch": 0.32136105860113423, "eval_loss": 1.1380345821380615, "eval_runtime": 1561.4636, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 850 }, { "epoch": 0.33081285444234404, "grad_norm": 0.4224971532821655, "learning_rate": 0.0002, "loss": 1.1505, "step": 875 }, { "epoch": 0.33081285444234404, "eval_loss": 1.1311566829681396, "eval_runtime": 1561.5445, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 875 }, { "epoch": 0.34026465028355385, "grad_norm": 0.6001178026199341, "learning_rate": 0.0002, "loss": 1.1121, "step": 900 }, { "epoch": 0.34026465028355385, "eval_loss": 1.1359593868255615, "eval_runtime": 1561.55, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 900 }, { "epoch": 0.3497164461247637, "grad_norm": 0.3645350933074951, "learning_rate": 0.0002, "loss": 1.1452, "step": 925 }, { "epoch": 0.3497164461247637, "eval_loss": 1.1279844045639038, "eval_runtime": 1561.6948, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 925 }, { "epoch": 0.3591682419659735, "grad_norm": 0.6315143704414368, "learning_rate": 0.0002, "loss": 1.0865, "step": 950 }, { "epoch": 0.3591682419659735, "eval_loss": 1.1323318481445312, "eval_runtime": 1561.5263, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 950 }, { "epoch": 0.3686200378071834, "grad_norm": 0.3632996380329132, "learning_rate": 0.0002, "loss": 1.1383, "step": 975 }, { "epoch": 0.3686200378071834, "eval_loss": 1.1256133317947388, "eval_runtime": 1561.413, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 975 }, { "epoch": 0.3780718336483932, "grad_norm": 0.8775736689567566, "learning_rate": 0.0002, "loss": 1.1071, "step": 1000 }, { "epoch": 0.3780718336483932, "eval_loss": 1.130606770515442, "eval_runtime": 1561.5903, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 1000 }, { "epoch": 0.387523629489603, "grad_norm": 0.32248276472091675, "learning_rate": 0.0002, "loss": 1.1603, "step": 1025 }, { "epoch": 0.387523629489603, "eval_loss": 1.122152328491211, "eval_runtime": 1561.5582, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 1025 }, { "epoch": 0.39697542533081287, "grad_norm": 1.2496217489242554, "learning_rate": 0.0002, "loss": 1.0542, "step": 1050 }, { "epoch": 0.39697542533081287, "eval_loss": 1.129094123840332, "eval_runtime": 1561.5299, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 1050 }, { "epoch": 0.4064272211720227, "grad_norm": 0.31586310267448425, "learning_rate": 0.0002, "loss": 1.1224, "step": 1075 }, { "epoch": 0.4064272211720227, "eval_loss": 1.1187065839767456, "eval_runtime": 1561.5901, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 1075 }, { "epoch": 0.4158790170132325, "grad_norm": 0.944985032081604, "learning_rate": 0.0002, "loss": 1.133, "step": 1100 }, { "epoch": 0.4158790170132325, "eval_loss": 1.122226595878601, "eval_runtime": 1561.6201, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 1100 }, { "epoch": 0.42533081285444235, "grad_norm": 0.3063657879829407, "learning_rate": 0.0002, "loss": 1.1122, "step": 1125 }, { "epoch": 0.42533081285444235, "eval_loss": 1.1147044897079468, "eval_runtime": 1561.5596, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 1125 } ], "logging_steps": 25, "max_steps": 2645, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "total_flos": 8.654158217045606e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }