{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9985228951255539, "eval_steps": 500, "global_step": 338, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014771048744460856, "grad_norm": 2.5938093662261963, "learning_rate": 0.0002, "loss": 0.9809, "step": 5 }, { "epoch": 0.029542097488921712, "grad_norm": 0.9637813568115234, "learning_rate": 0.0002, "loss": 0.4769, "step": 10 }, { "epoch": 0.04431314623338257, "grad_norm": 1.666123628616333, "learning_rate": 0.0002, "loss": 0.4077, "step": 15 }, { "epoch": 0.059084194977843424, "grad_norm": 0.5881379246711731, "learning_rate": 0.0002, "loss": 0.4035, "step": 20 }, { "epoch": 0.07385524372230429, "grad_norm": 0.5366687774658203, "learning_rate": 0.0002, "loss": 0.3695, "step": 25 }, { "epoch": 0.08862629246676514, "grad_norm": 0.5429549217224121, "learning_rate": 0.0002, "loss": 0.3579, "step": 30 }, { "epoch": 0.103397341211226, "grad_norm": 0.5302016735076904, "learning_rate": 0.0002, "loss": 0.3623, "step": 35 }, { "epoch": 0.11816838995568685, "grad_norm": 0.4706576466560364, "learning_rate": 0.0002, "loss": 0.3423, "step": 40 }, { "epoch": 0.1329394387001477, "grad_norm": 0.47507619857788086, "learning_rate": 0.0002, "loss": 0.3301, "step": 45 }, { "epoch": 0.14771048744460857, "grad_norm": 0.487821102142334, "learning_rate": 0.0002, "loss": 0.3262, "step": 50 }, { "epoch": 0.16248153618906944, "grad_norm": 0.46188947558403015, "learning_rate": 0.0002, "loss": 0.3381, "step": 55 }, { "epoch": 0.17725258493353027, "grad_norm": 0.49672871828079224, "learning_rate": 0.0002, "loss": 0.3474, "step": 60 }, { "epoch": 0.19202363367799113, "grad_norm": 0.45688968896865845, "learning_rate": 0.0002, "loss": 0.3356, "step": 65 }, { "epoch": 0.206794682422452, "grad_norm": 0.5083580017089844, "learning_rate": 0.0002, "loss": 0.317, "step": 70 }, { "epoch": 0.22156573116691286, "grad_norm": 0.4326242506504059, "learning_rate": 0.0002, "loss": 0.3107, "step": 75 }, { "epoch": 0.2363367799113737, "grad_norm": 0.7657620906829834, "learning_rate": 0.0002, "loss": 0.3055, "step": 80 }, { "epoch": 0.2511078286558346, "grad_norm": 0.4073372483253479, "learning_rate": 0.0002, "loss": 0.3041, "step": 85 }, { "epoch": 0.2658788774002954, "grad_norm": 0.4194050431251526, "learning_rate": 0.0002, "loss": 0.3121, "step": 90 }, { "epoch": 0.28064992614475626, "grad_norm": 0.4937780499458313, "learning_rate": 0.0002, "loss": 0.3065, "step": 95 }, { "epoch": 0.29542097488921715, "grad_norm": 0.39246585965156555, "learning_rate": 0.0002, "loss": 0.3081, "step": 100 }, { "epoch": 0.310192023633678, "grad_norm": 0.4153652787208557, "learning_rate": 0.0002, "loss": 0.3074, "step": 105 }, { "epoch": 0.3249630723781389, "grad_norm": 0.39885184168815613, "learning_rate": 0.0002, "loss": 0.3016, "step": 110 }, { "epoch": 0.3397341211225997, "grad_norm": 0.3999512195587158, "learning_rate": 0.0002, "loss": 0.302, "step": 115 }, { "epoch": 0.35450516986706054, "grad_norm": 0.40937578678131104, "learning_rate": 0.0002, "loss": 0.2964, "step": 120 }, { "epoch": 0.36927621861152143, "grad_norm": 1.0849940776824951, "learning_rate": 0.0002, "loss": 0.3098, "step": 125 }, { "epoch": 0.38404726735598227, "grad_norm": 0.36466699838638306, "learning_rate": 0.0002, "loss": 0.2964, "step": 130 }, { "epoch": 0.3988183161004431, "grad_norm": 0.32518795132637024, "learning_rate": 0.0002, "loss": 0.2788, "step": 135 }, { "epoch": 0.413589364844904, "grad_norm": 0.3508060872554779, "learning_rate": 0.0002, "loss": 0.2758, "step": 140 }, { "epoch": 0.42836041358936483, "grad_norm": 0.34023162722587585, "learning_rate": 0.0002, "loss": 0.2955, "step": 145 }, { "epoch": 0.4431314623338257, "grad_norm": 0.3429297208786011, "learning_rate": 0.0002, "loss": 0.2812, "step": 150 }, { "epoch": 0.45790251107828656, "grad_norm": 0.3394342064857483, "learning_rate": 0.0002, "loss": 0.2751, "step": 155 }, { "epoch": 0.4726735598227474, "grad_norm": 0.3172396421432495, "learning_rate": 0.0002, "loss": 0.2813, "step": 160 }, { "epoch": 0.4874446085672083, "grad_norm": 0.5636305809020996, "learning_rate": 0.0002, "loss": 0.2714, "step": 165 }, { "epoch": 0.5022156573116692, "grad_norm": 0.33329370617866516, "learning_rate": 0.0002, "loss": 0.2759, "step": 170 }, { "epoch": 0.51698670605613, "grad_norm": 0.34862470626831055, "learning_rate": 0.0002, "loss": 0.2875, "step": 175 }, { "epoch": 0.5317577548005908, "grad_norm": 0.41521379351615906, "learning_rate": 0.0002, "loss": 0.2744, "step": 180 }, { "epoch": 0.5465288035450517, "grad_norm": 0.3359523117542267, "learning_rate": 0.0002, "loss": 0.282, "step": 185 }, { "epoch": 0.5612998522895125, "grad_norm": 0.3089170455932617, "learning_rate": 0.0002, "loss": 0.2628, "step": 190 }, { "epoch": 0.5760709010339734, "grad_norm": 0.36551329493522644, "learning_rate": 0.0002, "loss": 0.2776, "step": 195 }, { "epoch": 0.5908419497784343, "grad_norm": 0.32992231845855713, "learning_rate": 0.0002, "loss": 0.2599, "step": 200 }, { "epoch": 0.6056129985228951, "grad_norm": 0.3119284510612488, "learning_rate": 0.0002, "loss": 0.2699, "step": 205 }, { "epoch": 0.620384047267356, "grad_norm": 0.2953311800956726, "learning_rate": 0.0002, "loss": 0.2705, "step": 210 }, { "epoch": 0.6351550960118169, "grad_norm": 0.3757329285144806, "learning_rate": 0.0002, "loss": 0.2918, "step": 215 }, { "epoch": 0.6499261447562777, "grad_norm": 0.36705055832862854, "learning_rate": 0.0002, "loss": 0.2545, "step": 220 }, { "epoch": 0.6646971935007385, "grad_norm": 0.3092058002948761, "learning_rate": 0.0002, "loss": 0.2624, "step": 225 }, { "epoch": 0.6794682422451994, "grad_norm": 0.31742286682128906, "learning_rate": 0.0002, "loss": 0.2602, "step": 230 }, { "epoch": 0.6942392909896603, "grad_norm": 0.2955617308616638, "learning_rate": 0.0002, "loss": 0.256, "step": 235 }, { "epoch": 0.7090103397341211, "grad_norm": 0.3345969617366791, "learning_rate": 0.0002, "loss": 0.2687, "step": 240 }, { "epoch": 0.723781388478582, "grad_norm": 0.2796613276004791, "learning_rate": 0.0002, "loss": 0.2526, "step": 245 }, { "epoch": 0.7385524372230429, "grad_norm": 0.5415365695953369, "learning_rate": 0.0002, "loss": 0.2545, "step": 250 }, { "epoch": 0.7533234859675036, "grad_norm": 0.3844436705112457, "learning_rate": 0.0002, "loss": 0.2599, "step": 255 }, { "epoch": 0.7680945347119645, "grad_norm": 0.3186696171760559, "learning_rate": 0.0002, "loss": 0.2477, "step": 260 }, { "epoch": 0.7828655834564254, "grad_norm": 0.38170936703681946, "learning_rate": 0.0002, "loss": 0.2582, "step": 265 }, { "epoch": 0.7976366322008862, "grad_norm": 0.29369300603866577, "learning_rate": 0.0002, "loss": 0.2505, "step": 270 }, { "epoch": 0.8124076809453471, "grad_norm": 0.29856300354003906, "learning_rate": 0.0002, "loss": 0.2675, "step": 275 }, { "epoch": 0.827178729689808, "grad_norm": 0.2721855342388153, "learning_rate": 0.0002, "loss": 0.2489, "step": 280 }, { "epoch": 0.8419497784342689, "grad_norm": 0.3029973804950714, "learning_rate": 0.0002, "loss": 0.2575, "step": 285 }, { "epoch": 0.8567208271787297, "grad_norm": 0.2983309030532837, "learning_rate": 0.0002, "loss": 0.2628, "step": 290 }, { "epoch": 0.8714918759231906, "grad_norm": 0.5093730092048645, "learning_rate": 0.0002, "loss": 0.2552, "step": 295 }, { "epoch": 0.8862629246676514, "grad_norm": 0.28230157494544983, "learning_rate": 0.0002, "loss": 0.2592, "step": 300 }, { "epoch": 0.9010339734121122, "grad_norm": 0.371902197599411, "learning_rate": 0.0002, "loss": 0.2596, "step": 305 }, { "epoch": 0.9158050221565731, "grad_norm": 0.3786104619503021, "learning_rate": 0.0002, "loss": 0.25, "step": 310 }, { "epoch": 0.930576070901034, "grad_norm": 0.4518865942955017, "learning_rate": 0.0002, "loss": 0.2546, "step": 315 }, { "epoch": 0.9453471196454948, "grad_norm": 0.29951682686805725, "learning_rate": 0.0002, "loss": 0.2433, "step": 320 }, { "epoch": 0.9601181683899557, "grad_norm": 0.2999703884124756, "learning_rate": 0.0002, "loss": 0.2419, "step": 325 }, { "epoch": 0.9748892171344166, "grad_norm": 0.2904799282550812, "learning_rate": 0.0002, "loss": 0.2474, "step": 330 }, { "epoch": 0.9896602658788775, "grad_norm": 0.28127652406692505, "learning_rate": 0.0002, "loss": 0.2458, "step": 335 }, { "epoch": 0.9985228951255539, "step": 338, "total_flos": 2.6821312997071258e+17, "train_loss": 0.3007896387365443, "train_runtime": 3313.8561, "train_samples_per_second": 1.632, "train_steps_per_second": 0.102 } ], "logging_steps": 5, "max_steps": 338, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6821312997071258e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }