|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9985228951255539, |
|
"eval_steps": 500, |
|
"global_step": 338, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014771048744460856, |
|
"grad_norm": 2.5938093662261963, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9809, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.029542097488921712, |
|
"grad_norm": 0.9637813568115234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4769, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04431314623338257, |
|
"grad_norm": 1.666123628616333, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4077, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.059084194977843424, |
|
"grad_norm": 0.5881379246711731, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4035, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07385524372230429, |
|
"grad_norm": 0.5366687774658203, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3695, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08862629246676514, |
|
"grad_norm": 0.5429549217224121, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3579, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.103397341211226, |
|
"grad_norm": 0.5302016735076904, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3623, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11816838995568685, |
|
"grad_norm": 0.4706576466560364, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3423, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1329394387001477, |
|
"grad_norm": 0.47507619857788086, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3301, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.14771048744460857, |
|
"grad_norm": 0.487821102142334, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3262, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16248153618906944, |
|
"grad_norm": 0.46188947558403015, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3381, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.17725258493353027, |
|
"grad_norm": 0.49672871828079224, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3474, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.19202363367799113, |
|
"grad_norm": 0.45688968896865845, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3356, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.206794682422452, |
|
"grad_norm": 0.5083580017089844, |
|
"learning_rate": 0.0002, |
|
"loss": 0.317, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.22156573116691286, |
|
"grad_norm": 0.4326242506504059, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3107, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2363367799113737, |
|
"grad_norm": 0.7657620906829834, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3055, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2511078286558346, |
|
"grad_norm": 0.4073372483253479, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3041, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2658788774002954, |
|
"grad_norm": 0.4194050431251526, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3121, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.28064992614475626, |
|
"grad_norm": 0.4937780499458313, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3065, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.29542097488921715, |
|
"grad_norm": 0.39246585965156555, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3081, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.310192023633678, |
|
"grad_norm": 0.4153652787208557, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3074, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3249630723781389, |
|
"grad_norm": 0.39885184168815613, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3016, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3397341211225997, |
|
"grad_norm": 0.3999512195587158, |
|
"learning_rate": 0.0002, |
|
"loss": 0.302, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.35450516986706054, |
|
"grad_norm": 0.40937578678131104, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2964, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.36927621861152143, |
|
"grad_norm": 1.0849940776824951, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3098, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.38404726735598227, |
|
"grad_norm": 0.36466699838638306, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2964, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3988183161004431, |
|
"grad_norm": 0.32518795132637024, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2788, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.413589364844904, |
|
"grad_norm": 0.3508060872554779, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2758, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.42836041358936483, |
|
"grad_norm": 0.34023162722587585, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2955, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4431314623338257, |
|
"grad_norm": 0.3429297208786011, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2812, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.45790251107828656, |
|
"grad_norm": 0.3394342064857483, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2751, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.4726735598227474, |
|
"grad_norm": 0.3172396421432495, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2813, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4874446085672083, |
|
"grad_norm": 0.5636305809020996, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2714, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5022156573116692, |
|
"grad_norm": 0.33329370617866516, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2759, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.51698670605613, |
|
"grad_norm": 0.34862470626831055, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2875, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5317577548005908, |
|
"grad_norm": 0.41521379351615906, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2744, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5465288035450517, |
|
"grad_norm": 0.3359523117542267, |
|
"learning_rate": 0.0002, |
|
"loss": 0.282, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5612998522895125, |
|
"grad_norm": 0.3089170455932617, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2628, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5760709010339734, |
|
"grad_norm": 0.36551329493522644, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2776, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5908419497784343, |
|
"grad_norm": 0.32992231845855713, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2599, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6056129985228951, |
|
"grad_norm": 0.3119284510612488, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2699, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.620384047267356, |
|
"grad_norm": 0.2953311800956726, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2705, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6351550960118169, |
|
"grad_norm": 0.3757329285144806, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2918, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6499261447562777, |
|
"grad_norm": 0.36705055832862854, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2545, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6646971935007385, |
|
"grad_norm": 0.3092058002948761, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2624, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6794682422451994, |
|
"grad_norm": 0.31742286682128906, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2602, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6942392909896603, |
|
"grad_norm": 0.2955617308616638, |
|
"learning_rate": 0.0002, |
|
"loss": 0.256, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7090103397341211, |
|
"grad_norm": 0.3345969617366791, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2687, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.723781388478582, |
|
"grad_norm": 0.2796613276004791, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2526, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7385524372230429, |
|
"grad_norm": 0.5415365695953369, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2545, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7533234859675036, |
|
"grad_norm": 0.3844436705112457, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2599, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7680945347119645, |
|
"grad_norm": 0.3186696171760559, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2477, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7828655834564254, |
|
"grad_norm": 0.38170936703681946, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2582, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7976366322008862, |
|
"grad_norm": 0.29369300603866577, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2505, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8124076809453471, |
|
"grad_norm": 0.29856300354003906, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2675, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.827178729689808, |
|
"grad_norm": 0.2721855342388153, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2489, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8419497784342689, |
|
"grad_norm": 0.3029973804950714, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2575, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8567208271787297, |
|
"grad_norm": 0.2983309030532837, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2628, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8714918759231906, |
|
"grad_norm": 0.5093730092048645, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2552, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8862629246676514, |
|
"grad_norm": 0.28230157494544983, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2592, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9010339734121122, |
|
"grad_norm": 0.371902197599411, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2596, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.9158050221565731, |
|
"grad_norm": 0.3786104619503021, |
|
"learning_rate": 0.0002, |
|
"loss": 0.25, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.930576070901034, |
|
"grad_norm": 0.4518865942955017, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2546, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9453471196454948, |
|
"grad_norm": 0.29951682686805725, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2433, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9601181683899557, |
|
"grad_norm": 0.2999703884124756, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2419, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9748892171344166, |
|
"grad_norm": 0.2904799282550812, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2474, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9896602658788775, |
|
"grad_norm": 0.28127652406692505, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2458, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9985228951255539, |
|
"step": 338, |
|
"total_flos": 2.6821312997071258e+17, |
|
"train_loss": 0.3007896387365443, |
|
"train_runtime": 3313.8561, |
|
"train_samples_per_second": 1.632, |
|
"train_steps_per_second": 0.102 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 338, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6821312997071258e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|