{ "best_metric": 1.6794742345809937, "best_model_checkpoint": "models/dehanalkautsar/mbert-uncased-modified_embedding_table-en/checkpoint-70000", "epoch": 0.624464048155099, "eval_steps": 2000, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017841829947288543, "grad_norm": 31.71474266052246, "learning_rate": 4.9977697687696264e-05, "loss": 5.6683, "step": 2000 }, { "epoch": 0.017841829947288543, "eval_loss": 3.4221630096435547, "eval_runtime": 73.4656, "eval_samples_per_second": 136.118, "eval_steps_per_second": 2.137, "step": 2000 }, { "epoch": 0.035683659894577086, "grad_norm": 20.9453182220459, "learning_rate": 4.995539537539252e-05, "loss": 3.1985, "step": 4000 }, { "epoch": 0.035683659894577086, "eval_loss": 2.7761876583099365, "eval_runtime": 73.4155, "eval_samples_per_second": 136.211, "eval_steps_per_second": 2.139, "step": 4000 }, { "epoch": 0.05352548984186563, "grad_norm": 18.989194869995117, "learning_rate": 4.993309306308878e-05, "loss": 2.7816, "step": 6000 }, { "epoch": 0.05352548984186563, "eval_loss": 2.5367112159729004, "eval_runtime": 73.3987, "eval_samples_per_second": 136.242, "eval_steps_per_second": 2.139, "step": 6000 }, { "epoch": 0.07136731978915417, "grad_norm": 20.059371948242188, "learning_rate": 4.991079075078504e-05, "loss": 2.5767, "step": 8000 }, { "epoch": 0.07136731978915417, "eval_loss": 2.3765993118286133, "eval_runtime": 73.3358, "eval_samples_per_second": 136.359, "eval_steps_per_second": 2.141, "step": 8000 }, { "epoch": 0.08920914973644271, "grad_norm": 18.088693618774414, "learning_rate": 4.9888488438481305e-05, "loss": 2.4472, "step": 10000 }, { "epoch": 0.08920914973644271, "eval_loss": 2.261122226715088, "eval_runtime": 73.3532, "eval_samples_per_second": 136.327, "eval_steps_per_second": 2.14, "step": 10000 }, { "epoch": 0.10705097968373126, "grad_norm": 18.355438232421875, "learning_rate": 4.986618612617757e-05, "loss": 2.3517, "step": 12000 }, { "epoch": 0.10705097968373126, "eval_loss": 2.197890520095825, "eval_runtime": 73.2175, "eval_samples_per_second": 136.579, "eval_steps_per_second": 2.144, "step": 12000 }, { "epoch": 0.1248928096310198, "grad_norm": 19.445158004760742, "learning_rate": 4.984388381387383e-05, "loss": 2.2819, "step": 14000 }, { "epoch": 0.1248928096310198, "eval_loss": 2.135493040084839, "eval_runtime": 73.523, "eval_samples_per_second": 136.012, "eval_steps_per_second": 2.135, "step": 14000 }, { "epoch": 0.14273463957830834, "grad_norm": 16.74393081665039, "learning_rate": 4.9821581501570084e-05, "loss": 2.2253, "step": 16000 }, { "epoch": 0.14273463957830834, "eval_loss": 2.0676965713500977, "eval_runtime": 73.4191, "eval_samples_per_second": 136.204, "eval_steps_per_second": 2.138, "step": 16000 }, { "epoch": 0.1605764695255969, "grad_norm": 18.755170822143555, "learning_rate": 4.9799279189266346e-05, "loss": 2.1737, "step": 18000 }, { "epoch": 0.1605764695255969, "eval_loss": 2.0221915245056152, "eval_runtime": 73.3608, "eval_samples_per_second": 136.313, "eval_steps_per_second": 2.14, "step": 18000 }, { "epoch": 0.17841829947288543, "grad_norm": 18.238853454589844, "learning_rate": 4.977697687696261e-05, "loss": 2.1339, "step": 20000 }, { "epoch": 0.17841829947288543, "eval_loss": 1.9968066215515137, "eval_runtime": 73.376, "eval_samples_per_second": 136.284, "eval_steps_per_second": 2.14, "step": 20000 }, { "epoch": 0.19626012942017398, "grad_norm": 16.624298095703125, "learning_rate": 4.975467456465886e-05, "loss": 2.097, "step": 22000 }, { "epoch": 0.19626012942017398, "eval_loss": 1.973600149154663, "eval_runtime": 73.4156, "eval_samples_per_second": 136.211, "eval_steps_per_second": 2.139, "step": 22000 }, { "epoch": 0.2141019593674625, "grad_norm": 21.57083511352539, "learning_rate": 4.9732372252355125e-05, "loss": 2.0679, "step": 24000 }, { "epoch": 0.2141019593674625, "eval_loss": 1.9505703449249268, "eval_runtime": 73.2384, "eval_samples_per_second": 136.54, "eval_steps_per_second": 2.144, "step": 24000 }, { "epoch": 0.23194378931475107, "grad_norm": 18.031625747680664, "learning_rate": 4.971006994005139e-05, "loss": 2.0474, "step": 26000 }, { "epoch": 0.23194378931475107, "eval_loss": 1.9203472137451172, "eval_runtime": 73.2703, "eval_samples_per_second": 136.481, "eval_steps_per_second": 2.143, "step": 26000 }, { "epoch": 0.2497856192620396, "grad_norm": 18.387907028198242, "learning_rate": 4.968776762774764e-05, "loss": 2.0235, "step": 28000 }, { "epoch": 0.2497856192620396, "eval_loss": 1.9067487716674805, "eval_runtime": 73.2711, "eval_samples_per_second": 136.479, "eval_steps_per_second": 2.143, "step": 28000 }, { "epoch": 0.2676274492093281, "grad_norm": 16.74208641052246, "learning_rate": 4.966546531544391e-05, "loss": 2.0007, "step": 30000 }, { "epoch": 0.2676274492093281, "eval_loss": 1.8875941038131714, "eval_runtime": 73.3071, "eval_samples_per_second": 136.412, "eval_steps_per_second": 2.142, "step": 30000 }, { "epoch": 0.2854692791566167, "grad_norm": 17.28813934326172, "learning_rate": 4.964316300314017e-05, "loss": 1.9809, "step": 32000 }, { "epoch": 0.2854692791566167, "eval_loss": 1.8658957481384277, "eval_runtime": 73.2999, "eval_samples_per_second": 136.426, "eval_steps_per_second": 2.142, "step": 32000 }, { "epoch": 0.30331110910390524, "grad_norm": 17.0612735748291, "learning_rate": 4.962086069083643e-05, "loss": 1.9672, "step": 34000 }, { "epoch": 0.30331110910390524, "eval_loss": 1.8565300703048706, "eval_runtime": 73.3279, "eval_samples_per_second": 136.374, "eval_steps_per_second": 2.141, "step": 34000 }, { "epoch": 0.3211529390511938, "grad_norm": 17.805253982543945, "learning_rate": 4.959855837853269e-05, "loss": 1.9517, "step": 36000 }, { "epoch": 0.3211529390511938, "eval_loss": 1.8271287679672241, "eval_runtime": 73.2647, "eval_samples_per_second": 136.491, "eval_steps_per_second": 2.143, "step": 36000 }, { "epoch": 0.33899476899848235, "grad_norm": 16.978797912597656, "learning_rate": 4.957625606622895e-05, "loss": 1.9358, "step": 38000 }, { "epoch": 0.33899476899848235, "eval_loss": 1.8138540983200073, "eval_runtime": 73.3003, "eval_samples_per_second": 136.425, "eval_steps_per_second": 2.142, "step": 38000 }, { "epoch": 0.35683659894577086, "grad_norm": 18.134506225585938, "learning_rate": 4.955395375392521e-05, "loss": 1.9204, "step": 40000 }, { "epoch": 0.35683659894577086, "eval_loss": 1.8061386346817017, "eval_runtime": 73.3032, "eval_samples_per_second": 136.42, "eval_steps_per_second": 2.142, "step": 40000 }, { "epoch": 0.3746784288930594, "grad_norm": 16.018447875976562, "learning_rate": 4.953165144162147e-05, "loss": 1.9103, "step": 42000 }, { "epoch": 0.3746784288930594, "eval_loss": 1.790651559829712, "eval_runtime": 73.3055, "eval_samples_per_second": 136.415, "eval_steps_per_second": 2.142, "step": 42000 }, { "epoch": 0.39252025884034797, "grad_norm": 18.30422592163086, "learning_rate": 4.950934912931773e-05, "loss": 1.8984, "step": 44000 }, { "epoch": 0.39252025884034797, "eval_loss": 1.787701964378357, "eval_runtime": 73.3134, "eval_samples_per_second": 136.401, "eval_steps_per_second": 2.141, "step": 44000 }, { "epoch": 0.4103620887876365, "grad_norm": 16.60624122619629, "learning_rate": 4.9487046817013986e-05, "loss": 1.89, "step": 46000 }, { "epoch": 0.4103620887876365, "eval_loss": 1.7718769311904907, "eval_runtime": 73.3399, "eval_samples_per_second": 136.351, "eval_steps_per_second": 2.141, "step": 46000 }, { "epoch": 0.428203918734925, "grad_norm": 15.059417724609375, "learning_rate": 4.964316300314017e-05, "loss": 1.8775, "step": 48000 }, { "epoch": 0.428203918734925, "eval_loss": 1.763095736503601, "eval_runtime": 73.3521, "eval_samples_per_second": 136.329, "eval_steps_per_second": 2.14, "step": 48000 }, { "epoch": 0.4460457486822136, "grad_norm": 17.129064559936523, "learning_rate": 4.9628294794937674e-05, "loss": 1.8687, "step": 50000 }, { "epoch": 0.4460457486822136, "eval_loss": 1.743654727935791, "eval_runtime": 73.2498, "eval_samples_per_second": 136.519, "eval_steps_per_second": 2.143, "step": 50000 }, { "epoch": 0.46388757862950214, "grad_norm": 16.349536895751953, "learning_rate": 4.947285443645707e-05, "loss": 1.8632, "step": 52000 }, { "epoch": 0.46388757862950214, "eval_loss": 1.743268609046936, "eval_runtime": 73.3921, "eval_samples_per_second": 136.254, "eval_steps_per_second": 2.139, "step": 52000 }, { "epoch": 0.4817294085767907, "grad_norm": 16.42721939086914, "learning_rate": 4.9452579607090025e-05, "loss": 1.8494, "step": 54000 }, { "epoch": 0.4817294085767907, "eval_loss": 1.7285025119781494, "eval_runtime": 73.2209, "eval_samples_per_second": 136.573, "eval_steps_per_second": 2.144, "step": 54000 }, { "epoch": 0.4995712385240792, "grad_norm": 17.674468994140625, "learning_rate": 4.9432304777722994e-05, "loss": 1.8404, "step": 56000 }, { "epoch": 0.4995712385240792, "eval_loss": 1.7261757850646973, "eval_runtime": 73.1858, "eval_samples_per_second": 136.639, "eval_steps_per_second": 2.145, "step": 56000 }, { "epoch": 0.5174130684713678, "grad_norm": 16.304468154907227, "learning_rate": 4.941202994835596e-05, "loss": 1.8308, "step": 58000 }, { "epoch": 0.5174130684713678, "eval_loss": 1.7157503366470337, "eval_runtime": 73.1684, "eval_samples_per_second": 136.671, "eval_steps_per_second": 2.146, "step": 58000 }, { "epoch": 0.5352548984186563, "grad_norm": 17.134702682495117, "learning_rate": 4.939175511898892e-05, "loss": 1.8245, "step": 60000 }, { "epoch": 0.5352548984186563, "eval_loss": 1.7094610929489136, "eval_runtime": 73.3218, "eval_samples_per_second": 136.385, "eval_steps_per_second": 2.141, "step": 60000 }, { "epoch": 0.5530967283659448, "grad_norm": 17.70859146118164, "learning_rate": 4.937148028962189e-05, "loss": 1.8201, "step": 62000 }, { "epoch": 0.5530967283659448, "eval_loss": 1.70658540725708, "eval_runtime": 73.3241, "eval_samples_per_second": 136.381, "eval_steps_per_second": 2.141, "step": 62000 }, { "epoch": 0.5709385583132334, "grad_norm": 16.962129592895508, "learning_rate": 4.935120546025485e-05, "loss": 1.8107, "step": 64000 }, { "epoch": 0.5709385583132334, "eval_loss": 1.6914931535720825, "eval_runtime": 73.2479, "eval_samples_per_second": 136.523, "eval_steps_per_second": 2.143, "step": 64000 }, { "epoch": 0.5887803882605219, "grad_norm": 16.842283248901367, "learning_rate": 4.933093063088781e-05, "loss": 1.8027, "step": 66000 }, { "epoch": 0.5887803882605219, "eval_loss": 1.683428168296814, "eval_runtime": 73.2102, "eval_samples_per_second": 136.593, "eval_steps_per_second": 2.145, "step": 66000 }, { "epoch": 0.6066222182078105, "grad_norm": 17.075162887573242, "learning_rate": 4.9310655801520775e-05, "loss": 1.7991, "step": 68000 }, { "epoch": 0.6066222182078105, "eval_loss": 1.688643217086792, "eval_runtime": 73.3407, "eval_samples_per_second": 136.35, "eval_steps_per_second": 2.141, "step": 68000 }, { "epoch": 0.624464048155099, "grad_norm": 17.02593231201172, "learning_rate": 4.929038097215374e-05, "loss": 1.7906, "step": 70000 }, { "epoch": 0.624464048155099, "eval_loss": 1.6794742345809937, "eval_runtime": 73.266, "eval_samples_per_second": 136.489, "eval_steps_per_second": 2.143, "step": 70000 } ], "logging_steps": 2000, "max_steps": 4932224, "num_input_tokens_seen": 0, "num_train_epochs": 44, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.716630245376e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }