{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.790419161676647, "eval_steps": 500, "global_step": 369, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23952095808383234, "grad_norm": 0.7001179456710815, "learning_rate": 7.500000000000001e-05, "loss": 1.4338, "step": 10 }, { "epoch": 0.47904191616766467, "grad_norm": 0.27172571420669556, "learning_rate": 9.990516643685222e-05, "loss": 0.8267, "step": 20 }, { "epoch": 0.718562874251497, "grad_norm": 0.19638711214065552, "learning_rate": 9.944154131125642e-05, "loss": 0.4881, "step": 30 }, { "epoch": 0.9580838323353293, "grad_norm": 0.20754235982894897, "learning_rate": 9.859528969650738e-05, "loss": 0.3591, "step": 40 }, { "epoch": 1.0, "eval_accuracy": 0.9034878200745683, "eval_loss": 0.39884674549102783, "eval_runtime": 30.4717, "eval_samples_per_second": 12.635, "eval_steps_per_second": 2.133, "step": 42 }, { "epoch": 1.1916167664670658, "grad_norm": 0.16468322277069092, "learning_rate": 9.737296070648186e-05, "loss": 0.4172, "step": 50 }, { "epoch": 1.4311377245508983, "grad_norm": 0.1612997055053711, "learning_rate": 9.57840139057007e-05, "loss": 0.3893, "step": 60 }, { "epoch": 1.6706586826347305, "grad_norm": 0.16499397158622742, "learning_rate": 9.384074610206495e-05, "loss": 0.3658, "step": 70 }, { "epoch": 1.910179640718563, "grad_norm": 0.14762192964553833, "learning_rate": 9.155819618225708e-05, "loss": 0.2673, "step": 80 }, { "epoch": 2.0, "eval_accuracy": 0.9196909824354998, "eval_loss": 0.325968861579895, "eval_runtime": 30.3583, "eval_samples_per_second": 12.682, "eval_steps_per_second": 2.141, "step": 84 }, { "epoch": 2.143712574850299, "grad_norm": 0.1777992993593216, "learning_rate": 8.895402872628352e-05, "loss": 0.3165, "step": 90 }, { "epoch": 2.3832335329341316, "grad_norm": 0.2070295810699463, "learning_rate": 8.604839730186125e-05, "loss": 0.336, "step": 100 }, { "epoch": 2.622754491017964, "grad_norm": 0.1750418245792389, "learning_rate": 8.286378849660896e-05, "loss": 0.288, "step": 110 }, { "epoch": 2.8622754491017965, "grad_norm": 0.20214907824993134, "learning_rate": 7.942484789507283e-05, "loss": 0.2425, "step": 120 }, { "epoch": 3.0, "eval_accuracy": 0.9289047847079928, "eval_loss": 0.289765864610672, "eval_runtime": 30.3541, "eval_samples_per_second": 12.684, "eval_steps_per_second": 2.141, "step": 126 }, { "epoch": 3.095808383233533, "grad_norm": 0.2640642523765564, "learning_rate": 7.57581893473448e-05, "loss": 0.2304, "step": 130 }, { "epoch": 3.3353293413173652, "grad_norm": 0.26811081171035767, "learning_rate": 7.18921890053375e-05, "loss": 0.2905, "step": 140 }, { "epoch": 3.5748502994011977, "grad_norm": 0.27033841609954834, "learning_rate": 6.785676572066225e-05, "loss": 0.2336, "step": 150 }, { "epoch": 3.81437125748503, "grad_norm": 0.22121788561344147, "learning_rate": 6.368314950360415e-05, "loss": 0.2069, "step": 160 }, { "epoch": 4.0, "eval_accuracy": 0.9345709163732674, "eval_loss": 0.2659221291542053, "eval_runtime": 30.3929, "eval_samples_per_second": 12.667, "eval_steps_per_second": 2.139, "step": 168 }, { "epoch": 4.047904191616767, "grad_norm": 0.2626728117465973, "learning_rate": 5.940363983508257e-05, "loss": 0.1815, "step": 170 }, { "epoch": 4.287425149700598, "grad_norm": 0.2664365768432617, "learning_rate": 5.5051355702012893e-05, "loss": 0.2426, "step": 180 }, { "epoch": 4.526946107784431, "grad_norm": 0.2793976962566376, "learning_rate": 5.0659979290537954e-05, "loss": 0.1998, "step": 190 }, { "epoch": 4.766467065868263, "grad_norm": 0.2911984622478485, "learning_rate": 4.626349532067879e-05, "loss": 0.1713, "step": 200 }, { "epoch": 5.0, "grad_norm": 0.2870716452598572, "learning_rate": 4.189592803968563e-05, "loss": 0.138, "step": 210 }, { "epoch": 5.0, "eval_accuracy": 0.9391251537470097, "eval_loss": 0.2524815797805786, "eval_runtime": 30.4065, "eval_samples_per_second": 12.662, "eval_steps_per_second": 2.138, "step": 210 }, { "epoch": 5.2395209580838324, "grad_norm": 0.3472362756729126, "learning_rate": 3.759107790948882e-05, "loss": 0.199, "step": 220 }, { "epoch": 5.479041916167665, "grad_norm": 0.33083346486091614, "learning_rate": 3.338226002601703e-05, "loss": 0.1654, "step": 230 }, { "epoch": 5.718562874251497, "grad_norm": 0.3220432996749878, "learning_rate": 2.9302046294747497e-05, "loss": 0.1459, "step": 240 }, { "epoch": 5.95808383233533, "grad_norm": 0.35546302795410156, "learning_rate": 2.5382013357782893e-05, "loss": 0.1251, "step": 250 }, { "epoch": 6.0, "eval_accuracy": 0.9430640763868036, "eval_loss": 0.24969537556171417, "eval_runtime": 30.3846, "eval_samples_per_second": 12.671, "eval_steps_per_second": 2.139, "step": 252 }, { "epoch": 6.191616766467066, "grad_norm": 0.35284000635147095, "learning_rate": 2.1652498223239427e-05, "loss": 0.1486, "step": 260 }, { "epoch": 6.431137724550898, "grad_norm": 0.31098905205726624, "learning_rate": 1.814236348812211e-05, "loss": 0.1527, "step": 270 }, { "epoch": 6.6706586826347305, "grad_norm": 0.34438222646713257, "learning_rate": 1.4878773971620074e-05, "loss": 0.1344, "step": 280 }, { "epoch": 6.910179640718563, "grad_norm": 0.2907264232635498, "learning_rate": 1.1886986487449475e-05, "loss": 0.1072, "step": 290 }, { "epoch": 7.0, "eval_accuracy": 0.9438642485491895, "eval_loss": 0.24749523401260376, "eval_runtime": 30.43, "eval_samples_per_second": 12.652, "eval_steps_per_second": 2.136, "step": 294 }, { "epoch": 7.1437125748503, "grad_norm": 0.3019481301307678, "learning_rate": 9.190154382188921e-06, "loss": 0.1224, "step": 300 }, { "epoch": 7.383233532934132, "grad_norm": 0.29105937480926514, "learning_rate": 6.809148352279182e-06, "loss": 0.1493, "step": 310 }, { "epoch": 7.6227544910179645, "grad_norm": 0.2779920697212219, "learning_rate": 4.762394926378477e-06, "loss": 0.1126, "step": 320 }, { "epoch": 7.862275449101796, "grad_norm": 0.3359215557575226, "learning_rate": 3.065733863053072e-06, "loss": 0.1059, "step": 330 }, { "epoch": 8.0, "eval_accuracy": 0.944612993482476, "eval_loss": 0.2483372986316681, "eval_runtime": 30.4006, "eval_samples_per_second": 12.664, "eval_steps_per_second": 2.138, "step": 336 }, { "epoch": 8.095808383233534, "grad_norm": 0.26059141755104065, "learning_rate": 1.7322955673980678e-06, "loss": 0.0986, "step": 340 }, { "epoch": 8.335329341317365, "grad_norm": 0.26576822996139526, "learning_rate": 7.723994752570462e-07, "loss": 0.1465, "step": 350 }, { "epoch": 8.574850299401197, "grad_norm": 0.2771126329898834, "learning_rate": 1.9347419144180035e-07, "loss": 0.1073, "step": 360 }, { "epoch": 8.790419161676647, "eval_accuracy": 0.9448452683547727, "eval_loss": 0.24962204694747925, "eval_runtime": 30.226, "eval_samples_per_second": 12.737, "eval_steps_per_second": 2.15, "step": 369 }, { "epoch": 8.790419161676647, "step": 369, "total_flos": 3.210182982232965e+17, "train_loss": 0.2637010880602085, "train_runtime": 2751.4148, "train_samples_per_second": 3.271, "train_steps_per_second": 0.134 } ], "logging_steps": 10, "max_steps": 369, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.210182982232965e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }