{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 4.869012355804443, "learning_rate": 1.5873015873015872e-05, "loss": 1.1684, "step": 10 }, { "epoch": 0.16, "grad_norm": 5.065932750701904, "learning_rate": 3.1746031746031745e-05, "loss": 0.6254, "step": 20 }, { "epoch": 0.24, "grad_norm": 1.7544628381729126, "learning_rate": 4.761904761904762e-05, "loss": 0.3667, "step": 30 }, { "epoch": 0.32, "grad_norm": 1.402125597000122, "learning_rate": 6.349206349206349e-05, "loss": 0.2619, "step": 40 }, { "epoch": 0.4, "grad_norm": 1.5777990818023682, "learning_rate": 7.936507936507937e-05, "loss": 0.211, "step": 50 }, { "epoch": 0.48, "grad_norm": 1.0018945932388306, "learning_rate": 9.523809523809524e-05, "loss": 0.1758, "step": 60 }, { "epoch": 0.56, "grad_norm": 1.201993465423584, "learning_rate": 9.999141931470729e-05, "loss": 0.1447, "step": 70 }, { "epoch": 0.64, "grad_norm": 0.9454079866409302, "learning_rate": 9.994939855811362e-05, "loss": 0.1282, "step": 80 }, { "epoch": 0.72, "grad_norm": 1.1873888969421387, "learning_rate": 9.987239108233668e-05, "loss": 0.1169, "step": 90 }, { "epoch": 0.8, "grad_norm": 0.8586097359657288, "learning_rate": 9.976045082674319e-05, "loss": 0.105, "step": 100 }, { "epoch": 0.88, "grad_norm": 1.0581028461456299, "learning_rate": 9.961365619912989e-05, "loss": 0.1102, "step": 110 }, { "epoch": 0.96, "grad_norm": 0.5893928408622742, "learning_rate": 9.94321100208032e-05, "loss": 0.1056, "step": 120 }, { "epoch": 1.04, "grad_norm": 1.3466310501098633, "learning_rate": 9.921593945455869e-05, "loss": 0.0826, "step": 130 }, { "epoch": 1.12, "grad_norm": 1.1419179439544678, "learning_rate": 9.896529591561093e-05, "loss": 0.0899, "step": 140 }, { "epoch": 1.2, "grad_norm": 0.5349047183990479, "learning_rate": 9.868035496553546e-05, "loss": 0.0823, "step": 150 }, { "epoch": 1.28, "grad_norm": 0.6395344734191895, "learning_rate": 9.836131618929819e-05, "loss": 0.0816, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 0.7662268877029419, "learning_rate": 9.800840305545715e-05, "loss": 0.0773, "step": 170 }, { "epoch": 1.44, "grad_norm": 0.6182479858398438, "learning_rate": 9.762186275963563e-05, "loss": 0.0667, "step": 180 }, { "epoch": 1.52, "grad_norm": 0.28476619720458984, "learning_rate": 9.720196605137565e-05, "loss": 0.0662, "step": 190 }, { "epoch": 1.6, "grad_norm": 0.528924286365509, "learning_rate": 9.674900704449324e-05, "loss": 0.0699, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 0.420554518699646, "learning_rate": 9.626330301106837e-05, "loss": 0.0624, "step": 210 }, { "epoch": 1.76, "grad_norm": 0.40601664781570435, "learning_rate": 9.574519415921396e-05, "loss": 0.0631, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 0.4216195046901703, "learning_rate": 9.519504339477932e-05, "loss": 0.064, "step": 230 }, { "epoch": 1.92, "grad_norm": 0.35759010910987854, "learning_rate": 9.46132360671552e-05, "loss": 0.0645, "step": 240 }, { "epoch": 2.0, "grad_norm": 0.7622753977775574, "learning_rate": 9.400017969935848e-05, "loss": 0.0558, "step": 250 }, { "epoch": 2.08, "grad_norm": 0.6483420729637146, "learning_rate": 9.335630370258533e-05, "loss": 0.061, "step": 260 }, { "epoch": 2.16, "grad_norm": 0.6169353723526001, "learning_rate": 9.26820590754331e-05, "loss": 0.0553, "step": 270 }, { "epoch": 2.24, "grad_norm": 0.6126364469528198, "learning_rate": 9.197791808800135e-05, "loss": 0.0552, "step": 280 }, { "epoch": 2.32, "grad_norm": 0.45238834619522095, "learning_rate": 9.124437395109353e-05, "loss": 0.051, "step": 290 }, { "epoch": 2.4, "grad_norm": 0.4955156743526459, "learning_rate": 9.048194047075069e-05, "loss": 0.0556, "step": 300 }, { "epoch": 2.48, "grad_norm": 0.7795207500457764, "learning_rate": 8.969115168835954e-05, "loss": 0.0573, "step": 310 }, { "epoch": 2.56, "grad_norm": 0.4718877971172333, "learning_rate": 8.887256150658684e-05, "loss": 0.0473, "step": 320 }, { "epoch": 2.64, "grad_norm": 0.3217944800853729, "learning_rate": 8.802674330140192e-05, "loss": 0.0499, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 0.4460347592830658, "learning_rate": 8.715428952045936e-05, "loss": 0.0553, "step": 340 }, { "epoch": 2.8, "grad_norm": 0.3968408703804016, "learning_rate": 8.625581126812312e-05, "loss": 0.0512, "step": 350 }, { "epoch": 2.88, "grad_norm": 0.32385963201522827, "learning_rate": 8.533193787742251e-05, "loss": 0.0514, "step": 360 }, { "epoch": 2.96, "grad_norm": 0.5356223583221436, "learning_rate": 8.438331646924013e-05, "loss": 0.0463, "step": 370 }, { "epoch": 3.04, "grad_norm": 0.6326996088027954, "learning_rate": 8.341061149904045e-05, "loss": 0.0465, "step": 380 }, { "epoch": 3.12, "grad_norm": 0.547283411026001, "learning_rate": 8.24145042914565e-05, "loss": 0.0505, "step": 390 }, { "epoch": 3.2, "grad_norm": 0.4946414828300476, "learning_rate": 8.13956925630605e-05, "loss": 0.0416, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.24442224204540253, "learning_rate": 8.035488993365312e-05, "loss": 0.0464, "step": 410 }, { "epoch": 3.36, "grad_norm": 0.4730175733566284, "learning_rate": 7.929282542641325e-05, "loss": 0.0444, "step": 420 }, { "epoch": 3.44, "grad_norm": 0.2920738458633423, "learning_rate": 7.821024295725865e-05, "loss": 0.0467, "step": 430 }, { "epoch": 3.52, "grad_norm": 0.2938491106033325, "learning_rate": 7.710790081377502e-05, "loss": 0.0425, "step": 440 }, { "epoch": 3.6, "grad_norm": 0.41461989283561707, "learning_rate": 7.598657112407865e-05, "loss": 0.0413, "step": 450 }, { "epoch": 3.68, "grad_norm": 0.36295655369758606, "learning_rate": 7.484703931598445e-05, "loss": 0.0384, "step": 460 }, { "epoch": 3.76, "grad_norm": 0.38623902201652527, "learning_rate": 7.369010356685833e-05, "loss": 0.0418, "step": 470 }, { "epoch": 3.84, "grad_norm": 0.4889287054538727, "learning_rate": 7.251657424453928e-05, "loss": 0.044, "step": 480 }, { "epoch": 3.92, "grad_norm": 0.4019559919834137, "learning_rate": 7.132727333972265e-05, "loss": 0.0368, "step": 490 }, { "epoch": 4.0, "grad_norm": 1.0160191059112549, "learning_rate": 7.012303389020234e-05, "loss": 0.0495, "step": 500 }, { "epoch": 4.08, "grad_norm": 0.3743612766265869, "learning_rate": 6.890469939737506e-05, "loss": 0.04, "step": 510 }, { "epoch": 4.16, "grad_norm": 0.306122362613678, "learning_rate": 6.767312323541555e-05, "loss": 0.0393, "step": 520 }, { "epoch": 4.24, "grad_norm": 0.5479507446289062, "learning_rate": 6.64291680535363e-05, "loss": 0.0394, "step": 530 }, { "epoch": 4.32, "grad_norm": 0.3650203049182892, "learning_rate": 6.517370517175081e-05, "loss": 0.0456, "step": 540 }, { "epoch": 4.4, "grad_norm": 0.31947705149650574, "learning_rate": 6.390761397056328e-05, "loss": 0.0331, "step": 550 }, { "epoch": 4.48, "grad_norm": 0.8813490271568298, "learning_rate": 6.26317812750126e-05, "loss": 0.0395, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 0.21433396637439728, "learning_rate": 6.134710073350156e-05, "loss": 0.0402, "step": 570 }, { "epoch": 4.64, "grad_norm": 0.2874234616756439, "learning_rate": 6.005447219184702e-05, "loss": 0.038, "step": 580 }, { "epoch": 4.72, "grad_norm": 0.3092989921569824, "learning_rate": 5.87548010629889e-05, "loss": 0.0355, "step": 590 }, { "epoch": 4.8, "grad_norm": 0.3180033564567566, "learning_rate": 5.7448997692799764e-05, "loss": 0.0407, "step": 600 }, { "epoch": 4.88, "grad_norm": 0.5046788454055786, "learning_rate": 5.61379767224393e-05, "loss": 0.0384, "step": 610 }, { "epoch": 4.96, "grad_norm": 0.2650161385536194, "learning_rate": 5.482265644769998e-05, "loss": 0.0371, "step": 620 }, { "epoch": 5.04, "grad_norm": 0.264657586812973, "learning_rate": 5.3503958175793055e-05, "loss": 0.0399, "step": 630 }, { "epoch": 5.12, "grad_norm": 0.3759991526603699, "learning_rate": 5.218280558002506e-05, "loss": 0.0326, "step": 640 }, { "epoch": 5.2, "grad_norm": 0.48156824707984924, "learning_rate": 5.086012405281717e-05, "loss": 0.0384, "step": 650 }, { "epoch": 5.28, "grad_norm": 0.4213162064552307, "learning_rate": 4.9536840057520224e-05, "loss": 0.0384, "step": 660 }, { "epoch": 5.36, "grad_norm": 0.41504690051078796, "learning_rate": 4.821388047947979e-05, "loss": 0.0373, "step": 670 }, { "epoch": 5.44, "grad_norm": 0.24580952525138855, "learning_rate": 4.689217197680554e-05, "loss": 0.0339, "step": 680 }, { "epoch": 5.52, "grad_norm": 0.35898831486701965, "learning_rate": 4.5572640331299875e-05, "loss": 0.033, "step": 690 }, { "epoch": 5.6, "grad_norm": 0.39469045400619507, "learning_rate": 4.425620980000026e-05, "loss": 0.0327, "step": 700 }, { "epoch": 5.68, "grad_norm": 0.27831265330314636, "learning_rate": 4.294380246778966e-05, "loss": 0.037, "step": 710 }, { "epoch": 5.76, "grad_norm": 0.2990424633026123, "learning_rate": 4.163633760152834e-05, "loss": 0.0367, "step": 720 }, { "epoch": 5.84, "grad_norm": 0.34065642952919006, "learning_rate": 4.03347310061597e-05, "loss": 0.0354, "step": 730 }, { "epoch": 5.92, "grad_norm": 0.4730483591556549, "learning_rate": 3.903989438324077e-05, "loss": 0.0349, "step": 740 }, { "epoch": 6.0, "grad_norm": 0.24116595089435577, "learning_rate": 3.775273469234712e-05, "loss": 0.0322, "step": 750 }, { "epoch": 6.08, "grad_norm": 0.2788262963294983, "learning_rate": 3.6474153515799e-05, "loss": 0.0346, "step": 760 }, { "epoch": 6.16, "grad_norm": 0.28659942746162415, "learning_rate": 3.520504642715424e-05, "loss": 0.0367, "step": 770 }, { "epoch": 6.24, "grad_norm": 0.28894150257110596, "learning_rate": 3.39463023639097e-05, "loss": 0.0372, "step": 780 }, { "epoch": 6.32, "grad_norm": 0.42454323172569275, "learning_rate": 3.2698803004851026e-05, "loss": 0.0282, "step": 790 }, { "epoch": 6.4, "grad_norm": 0.34259018301963806, "learning_rate": 3.1463422152486674e-05, "loss": 0.0315, "step": 800 }, { "epoch": 6.48, "grad_norm": 0.3654833436012268, "learning_rate": 3.024102512099889e-05, "loss": 0.0294, "step": 810 }, { "epoch": 6.5600000000000005, "grad_norm": 0.26922348141670227, "learning_rate": 2.9032468130140168e-05, "loss": 0.0274, "step": 820 }, { "epoch": 6.64, "grad_norm": 0.2571790814399719, "learning_rate": 2.783859770549996e-05, "loss": 0.0313, "step": 830 }, { "epoch": 6.72, "grad_norm": 0.26845499873161316, "learning_rate": 2.6660250085561457e-05, "loss": 0.0277, "step": 840 }, { "epoch": 6.8, "grad_norm": 0.20030668377876282, "learning_rate": 2.54982506359641e-05, "loss": 0.0341, "step": 850 }, { "epoch": 6.88, "grad_norm": 0.3981160521507263, "learning_rate": 2.435341327138168e-05, "loss": 0.0293, "step": 860 }, { "epoch": 6.96, "grad_norm": 0.3079529106616974, "learning_rate": 2.3226539885421343e-05, "loss": 0.0311, "step": 870 }, { "epoch": 7.04, "grad_norm": 0.15538842976093292, "learning_rate": 2.2118419788942672e-05, "loss": 0.0297, "step": 880 }, { "epoch": 7.12, "grad_norm": 0.35271063446998596, "learning_rate": 2.1029829157190117e-05, "loss": 0.0311, "step": 890 }, { "epoch": 7.2, "grad_norm": 0.3626379668712616, "learning_rate": 1.9961530486126327e-05, "loss": 0.0318, "step": 900 }, { "epoch": 7.28, "grad_norm": 0.32586774230003357, "learning_rate": 1.8914272058347088e-05, "loss": 0.0285, "step": 910 }, { "epoch": 7.36, "grad_norm": 0.23219723999500275, "learning_rate": 1.7888787418951645e-05, "loss": 0.0276, "step": 920 }, { "epoch": 7.44, "grad_norm": 0.2271978110074997, "learning_rate": 1.6885794861736183e-05, "loss": 0.024, "step": 930 }, { "epoch": 7.52, "grad_norm": 0.5763266086578369, "learning_rate": 1.5905996926069628e-05, "loss": 0.032, "step": 940 }, { "epoch": 7.6, "grad_norm": 0.30359622836112976, "learning_rate": 1.4950079904804759e-05, "loss": 0.0244, "step": 950 }, { "epoch": 7.68, "grad_norm": 0.2251555472612381, "learning_rate": 1.4018713363569035e-05, "loss": 0.0306, "step": 960 }, { "epoch": 7.76, "grad_norm": 0.48927322030067444, "learning_rate": 1.3112549671771796e-05, "loss": 0.0303, "step": 970 }, { "epoch": 7.84, "grad_norm": 0.206648051738739, "learning_rate": 1.2232223545656552e-05, "loss": 0.0294, "step": 980 }, { "epoch": 7.92, "grad_norm": 0.3137107193470001, "learning_rate": 1.1378351603718312e-05, "loss": 0.0298, "step": 990 }, { "epoch": 8.0, "grad_norm": 0.47006338834762573, "learning_rate": 1.0551531934797243e-05, "loss": 0.0319, "step": 1000 }, { "epoch": 8.08, "grad_norm": 0.27968037128448486, "learning_rate": 9.752343679151399e-06, "loss": 0.0233, "step": 1010 }, { "epoch": 8.16, "grad_norm": 0.2268412709236145, "learning_rate": 8.981346622801905e-06, "loss": 0.0285, "step": 1020 }, { "epoch": 8.24, "grad_norm": 0.17193864285945892, "learning_rate": 8.239080805434513e-06, "loss": 0.0297, "step": 1030 }, { "epoch": 8.32, "grad_norm": 0.3541881740093231, "learning_rate": 7.526066142132521e-06, "loss": 0.0228, "step": 1040 }, { "epoch": 8.4, "grad_norm": 0.20205742120742798, "learning_rate": 6.842802059205727e-06, "loss": 0.0252, "step": 1050 }, { "epoch": 8.48, "grad_norm": 0.3247056007385254, "learning_rate": 6.189767144370645e-06, "loss": 0.0233, "step": 1060 }, { "epoch": 8.56, "grad_norm": 0.23709805309772491, "learning_rate": 5.567418811526981e-06, "loss": 0.0256, "step": 1070 }, { "epoch": 8.64, "grad_norm": 0.20851150155067444, "learning_rate": 4.976192980365124e-06, "loss": 0.0283, "step": 1080 }, { "epoch": 8.72, "grad_norm": 0.26959723234176636, "learning_rate": 4.416503771029201e-06, "loss": 0.0255, "step": 1090 }, { "epoch": 8.8, "grad_norm": 0.32582834362983704, "learning_rate": 3.888743214049346e-06, "loss": 0.0256, "step": 1100 }, { "epoch": 8.88, "grad_norm": 0.18712686002254486, "learning_rate": 3.393280975746588e-06, "loss": 0.0228, "step": 1110 }, { "epoch": 8.96, "grad_norm": 0.2023114413022995, "learning_rate": 2.9304640993025988e-06, "loss": 0.0229, "step": 1120 }, { "epoch": 9.04, "grad_norm": 0.177691251039505, "learning_rate": 2.500616761675578e-06, "loss": 0.0238, "step": 1130 }, { "epoch": 9.12, "grad_norm": 0.21255093812942505, "learning_rate": 2.104040046532768e-06, "loss": 0.0282, "step": 1140 }, { "epoch": 9.2, "grad_norm": 0.35373473167419434, "learning_rate": 1.7410117333583498e-06, "loss": 0.0243, "step": 1150 }, { "epoch": 9.28, "grad_norm": 0.2814777195453644, "learning_rate": 1.4117861028847267e-06, "loss": 0.0247, "step": 1160 }, { "epoch": 9.36, "grad_norm": 0.2685955762863159, "learning_rate": 1.1165937589833087e-06, "loss": 0.0262, "step": 1170 }, { "epoch": 9.44, "grad_norm": 0.1922096461057663, "learning_rate": 8.55641467139534e-07, "loss": 0.0266, "step": 1180 }, { "epoch": 9.52, "grad_norm": 0.2045939713716507, "learning_rate": 6.291120096254433e-07, "loss": 0.0234, "step": 1190 }, { "epoch": 9.6, "grad_norm": 0.22145062685012817, "learning_rate": 4.371640574710345e-07, "loss": 0.0232, "step": 1200 }, { "epoch": 9.68, "grad_norm": 0.2229902297258377, "learning_rate": 2.7993205932420053e-07, "loss": 0.0286, "step": 1210 }, { "epoch": 9.76, "grad_norm": 0.22045257687568665, "learning_rate": 1.5752614727712057e-07, "loss": 0.0214, "step": 1220 }, { "epoch": 9.84, "grad_norm": 0.2349853515625, "learning_rate": 7.003205972494486e-08, "loss": 0.024, "step": 1230 }, { "epoch": 9.92, "grad_norm": 0.32890427112579346, "learning_rate": 1.7511081310922495e-08, "loss": 0.0234, "step": 1240 }, { "epoch": 10.0, "grad_norm": 0.8977156281471252, "learning_rate": 0.0, "loss": 0.0235, "step": 1250 }, { "epoch": 10.0, "step": 1250, "total_flos": 1.7818885277831808e+17, "train_loss": 0.06373299721479415, "train_runtime": 1560.6933, "train_samples_per_second": 50.926, "train_steps_per_second": 0.801 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7818885277831808e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }