{ "best_metric": 1.8970773220062256, "best_model_checkpoint": "models/dehanalkautsar/mbert-5-with-parallel-tokenizer-30k/checkpoint-70000", "epoch": 29.99623562926035, "eval_steps": 2000, "global_step": 73710, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.8139179977617255, "grad_norm": 12.944409370422363, "learning_rate": 9.185999185999187e-05, "loss": 6.4974, "step": 2000 }, { "epoch": 0.8139179977617255, "eval_loss": 5.766712665557861, "eval_runtime": 74.2223, "eval_samples_per_second": 134.73, "eval_steps_per_second": 2.115, "step": 2000 }, { "epoch": 1.6275307762742903, "grad_norm": 11.577170372009277, "learning_rate": 8.371998371998372e-05, "loss": 5.2766, "step": 4000 }, { "epoch": 1.6275307762742903, "eval_loss": 4.884174823760986, "eval_runtime": 74.2859, "eval_samples_per_second": 134.615, "eval_steps_per_second": 2.113, "step": 4000 }, { "epoch": 2.4411435547868554, "grad_norm": 18.84739875793457, "learning_rate": 7.557997557997558e-05, "loss": 4.4979, "step": 6000 }, { "epoch": 2.4411435547868554, "eval_loss": 3.7487125396728516, "eval_runtime": 74.2668, "eval_samples_per_second": 134.65, "eval_steps_per_second": 2.114, "step": 6000 }, { "epoch": 3.25475633329942, "grad_norm": 16.400541305541992, "learning_rate": 6.743996743996744e-05, "loss": 3.3089, "step": 8000 }, { "epoch": 3.25475633329942, "eval_loss": 2.872248649597168, "eval_runtime": 74.2418, "eval_samples_per_second": 134.695, "eval_steps_per_second": 2.115, "step": 8000 }, { "epoch": 4.068369111811985, "grad_norm": 15.022873878479004, "learning_rate": 5.929995929995931e-05, "loss": 2.8436, "step": 10000 }, { "epoch": 4.068369111811985, "eval_loss": 2.6301286220550537, "eval_runtime": 74.2596, "eval_samples_per_second": 134.663, "eval_steps_per_second": 2.114, "step": 10000 }, { "epoch": 4.882287109573711, "grad_norm": 14.731890678405762, "learning_rate": 5.115995115995116e-05, "loss": 2.6498, "step": 12000 }, { "epoch": 4.882287109573711, "eval_loss": 2.4912309646606445, "eval_runtime": 74.2946, "eval_samples_per_second": 134.599, "eval_steps_per_second": 2.113, "step": 12000 }, { "epoch": 5.6958998880862755, "grad_norm": 15.547078132629395, "learning_rate": 4.301994301994302e-05, "loss": 2.5281, "step": 14000 }, { "epoch": 5.6958998880862755, "eval_loss": 2.4065146446228027, "eval_runtime": 74.3674, "eval_samples_per_second": 134.468, "eval_steps_per_second": 2.111, "step": 14000 }, { "epoch": 6.50951266659884, "grad_norm": 14.562560081481934, "learning_rate": 3.487993487993488e-05, "loss": 2.4394, "step": 16000 }, { "epoch": 6.50951266659884, "eval_loss": 2.3277714252471924, "eval_runtime": 74.2604, "eval_samples_per_second": 134.661, "eval_steps_per_second": 2.114, "step": 16000 }, { "epoch": 7.323125445111405, "grad_norm": 14.990942001342773, "learning_rate": 2.673992673992674e-05, "loss": 2.3797, "step": 18000 }, { "epoch": 7.323125445111405, "eval_loss": 2.277052640914917, "eval_runtime": 74.2753, "eval_samples_per_second": 134.634, "eval_steps_per_second": 2.114, "step": 18000 }, { "epoch": 8.13673822362397, "grad_norm": 14.372356414794922, "learning_rate": 1.85999185999186e-05, "loss": 2.3303, "step": 20000 }, { "epoch": 8.13673822362397, "eval_loss": 2.2430338859558105, "eval_runtime": 74.2644, "eval_samples_per_second": 134.654, "eval_steps_per_second": 2.114, "step": 20000 }, { "epoch": 8.950656221385696, "grad_norm": 15.09987735748291, "learning_rate": 1.045991045991046e-05, "loss": 2.2974, "step": 22000 }, { "epoch": 8.950656221385696, "eval_loss": 2.2221477031707764, "eval_runtime": 74.2795, "eval_samples_per_second": 134.627, "eval_steps_per_second": 2.114, "step": 22000 }, { "epoch": 9.76426899989826, "grad_norm": 14.753724098205566, "learning_rate": 2.31990231990232e-06, "loss": 2.272, "step": 24000 }, { "epoch": 9.76426899989826, "eval_loss": 2.2013912200927734, "eval_runtime": 74.2825, "eval_samples_per_second": 134.621, "eval_steps_per_second": 2.114, "step": 24000 }, { "epoch": 10.581951368399634, "grad_norm": 14.659090995788574, "learning_rate": 4.708994708994709e-05, "loss": 2.2996, "step": 26000 }, { "epoch": 10.581951368399634, "eval_loss": 2.2351293563842773, "eval_runtime": 74.3521, "eval_samples_per_second": 134.495, "eval_steps_per_second": 2.112, "step": 26000 }, { "epoch": 11.395564146912198, "grad_norm": 14.084636688232422, "learning_rate": 4.301994301994302e-05, "loss": 2.2664, "step": 28000 }, { "epoch": 11.395564146912198, "eval_loss": 2.174744129180908, "eval_runtime": 74.3256, "eval_samples_per_second": 134.543, "eval_steps_per_second": 2.112, "step": 28000 }, { "epoch": 12.209176925424764, "grad_norm": 13.796431541442871, "learning_rate": 3.8949938949938955e-05, "loss": 2.2317, "step": 30000 }, { "epoch": 12.209176925424764, "eval_loss": 2.150289535522461, "eval_runtime": 74.3662, "eval_samples_per_second": 134.47, "eval_steps_per_second": 2.111, "step": 30000 }, { "epoch": 13.022789703937327, "grad_norm": 14.407515525817871, "learning_rate": 3.487993487993488e-05, "loss": 2.1954, "step": 32000 }, { "epoch": 13.022789703937327, "eval_loss": 2.1134209632873535, "eval_runtime": 74.3327, "eval_samples_per_second": 134.53, "eval_steps_per_second": 2.112, "step": 32000 }, { "epoch": 13.836707701699053, "grad_norm": 15.40833854675293, "learning_rate": 3.080993080993081e-05, "loss": 2.1617, "step": 34000 }, { "epoch": 13.836707701699053, "eval_loss": 2.069462299346924, "eval_runtime": 74.3637, "eval_samples_per_second": 134.474, "eval_steps_per_second": 2.111, "step": 34000 }, { "epoch": 14.650320480211619, "grad_norm": 15.9359130859375, "learning_rate": 2.673992673992674e-05, "loss": 2.1372, "step": 36000 }, { "epoch": 14.650320480211619, "eval_loss": 2.0720558166503906, "eval_runtime": 74.3125, "eval_samples_per_second": 134.567, "eval_steps_per_second": 2.113, "step": 36000 }, { "epoch": 15.463933258724184, "grad_norm": 14.391825675964355, "learning_rate": 2.2669922669922673e-05, "loss": 2.1101, "step": 38000 }, { "epoch": 15.463933258724184, "eval_loss": 2.0433037281036377, "eval_runtime": 74.3291, "eval_samples_per_second": 134.537, "eval_steps_per_second": 2.112, "step": 38000 }, { "epoch": 16.27754603723675, "grad_norm": 14.036117553710938, "learning_rate": 1.85999185999186e-05, "loss": 2.0897, "step": 40000 }, { "epoch": 16.27754603723675, "eval_loss": 2.0293655395507812, "eval_runtime": 74.3236, "eval_samples_per_second": 134.547, "eval_steps_per_second": 2.112, "step": 40000 }, { "epoch": 17.091158815749314, "grad_norm": 14.702261924743652, "learning_rate": 1.4529914529914531e-05, "loss": 2.074, "step": 42000 }, { "epoch": 17.091158815749314, "eval_loss": 2.007510185241699, "eval_runtime": 74.3171, "eval_samples_per_second": 134.559, "eval_steps_per_second": 2.113, "step": 42000 }, { "epoch": 17.905076813511037, "grad_norm": 22.03821563720703, "learning_rate": 1.045991045991046e-05, "loss": 2.0601, "step": 44000 }, { "epoch": 17.905076813511037, "eval_loss": 1.9990129470825195, "eval_runtime": 74.3189, "eval_samples_per_second": 134.555, "eval_steps_per_second": 2.113, "step": 44000 }, { "epoch": 18.718689592023605, "grad_norm": 14.990620613098145, "learning_rate": 6.38990638990639e-06, "loss": 2.0474, "step": 46000 }, { "epoch": 18.718689592023605, "eval_loss": 2.0009329319000244, "eval_runtime": 74.3384, "eval_samples_per_second": 134.52, "eval_steps_per_second": 2.112, "step": 46000 }, { "epoch": 19.53230237053617, "grad_norm": 14.060431480407715, "learning_rate": 2.31990231990232e-06, "loss": 2.0354, "step": 48000 }, { "epoch": 19.53230237053617, "eval_loss": 1.9961448907852173, "eval_runtime": 74.3131, "eval_samples_per_second": 134.566, "eval_steps_per_second": 2.113, "step": 48000 }, { "epoch": 20.34998473903754, "grad_norm": 14.80652904510498, "learning_rate": 3.2166598833265497e-05, "loss": 2.0557, "step": 50000 }, { "epoch": 20.34998473903754, "eval_loss": 2.0071523189544678, "eval_runtime": 74.3479, "eval_samples_per_second": 134.503, "eval_steps_per_second": 2.112, "step": 50000 }, { "epoch": 21.163597517550105, "grad_norm": 14.184722900390625, "learning_rate": 2.945326278659612e-05, "loss": 2.0577, "step": 52000 }, { "epoch": 21.163597517550105, "eval_loss": 1.9845408201217651, "eval_runtime": 74.3501, "eval_samples_per_second": 134.499, "eval_steps_per_second": 2.112, "step": 52000 }, { "epoch": 21.977515515311833, "grad_norm": 14.42458438873291, "learning_rate": 2.673992673992674e-05, "loss": 2.0387, "step": 54000 }, { "epoch": 21.977515515311833, "eval_loss": 1.9765958786010742, "eval_runtime": 74.3649, "eval_samples_per_second": 134.472, "eval_steps_per_second": 2.111, "step": 54000 }, { "epoch": 22.791128293824396, "grad_norm": 14.213285446166992, "learning_rate": 2.4026590693257362e-05, "loss": 2.0236, "step": 56000 }, { "epoch": 22.791128293824396, "eval_loss": 1.9648773670196533, "eval_runtime": 74.3689, "eval_samples_per_second": 134.465, "eval_steps_per_second": 2.111, "step": 56000 }, { "epoch": 23.604741072336964, "grad_norm": 14.83034610748291, "learning_rate": 2.1313254646587983e-05, "loss": 2.0031, "step": 58000 }, { "epoch": 23.604741072336964, "eval_loss": 1.9651334285736084, "eval_runtime": 74.407, "eval_samples_per_second": 134.396, "eval_steps_per_second": 2.11, "step": 58000 }, { "epoch": 24.418353850849527, "grad_norm": 14.208704948425293, "learning_rate": 1.85999185999186e-05, "loss": 1.9961, "step": 60000 }, { "epoch": 24.418353850849527, "eval_loss": 1.9572482109069824, "eval_runtime": 74.3794, "eval_samples_per_second": 134.446, "eval_steps_per_second": 2.111, "step": 60000 }, { "epoch": 25.23196662936209, "grad_norm": 14.465250968933105, "learning_rate": 1.588658255324922e-05, "loss": 1.9818, "step": 62000 }, { "epoch": 25.23196662936209, "eval_loss": 1.9356729984283447, "eval_runtime": 74.4073, "eval_samples_per_second": 134.395, "eval_steps_per_second": 2.11, "step": 62000 }, { "epoch": 26.045579407874655, "grad_norm": 14.194435119628906, "learning_rate": 1.317324650657984e-05, "loss": 1.9687, "step": 64000 }, { "epoch": 26.045579407874655, "eval_loss": 1.931473970413208, "eval_runtime": 74.3744, "eval_samples_per_second": 134.455, "eval_steps_per_second": 2.111, "step": 64000 }, { "epoch": 26.859497405636382, "grad_norm": 13.469378471374512, "learning_rate": 1.045991045991046e-05, "loss": 1.9594, "step": 66000 }, { "epoch": 26.859497405636382, "eval_loss": 1.9075134992599487, "eval_runtime": 74.359, "eval_samples_per_second": 134.483, "eval_steps_per_second": 2.111, "step": 66000 }, { "epoch": 27.673110184148946, "grad_norm": 15.649884223937988, "learning_rate": 7.74657441324108e-06, "loss": 1.9516, "step": 68000 }, { "epoch": 27.673110184148946, "eval_loss": 1.9032074213027954, "eval_runtime": 74.3639, "eval_samples_per_second": 134.474, "eval_steps_per_second": 2.111, "step": 68000 }, { "epoch": 28.486722962661513, "grad_norm": 14.68303394317627, "learning_rate": 5.0332383665717e-06, "loss": 1.9444, "step": 70000 }, { "epoch": 28.486722962661513, "eval_loss": 1.8970773220062256, "eval_runtime": 74.36, "eval_samples_per_second": 134.481, "eval_steps_per_second": 2.111, "step": 70000 }, { "epoch": 29.300335741174077, "grad_norm": 14.542703628540039, "learning_rate": 2.31990231990232e-06, "loss": 1.9393, "step": 72000 }, { "epoch": 29.300335741174077, "eval_loss": 1.906204104423523, "eval_runtime": 74.3531, "eval_samples_per_second": 134.493, "eval_steps_per_second": 2.112, "step": 72000 } ], "logging_steps": 2000, "max_steps": 73710, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.96509085097257e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }