{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.662921348314606, "eval_steps": 500, "global_step": 593, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11235955056179775, "grad_norm": 3.5541882514953613, "learning_rate": 6.666666666666667e-05, "loss": 0.9261, "step": 10 }, { "epoch": 0.2247191011235955, "grad_norm": 1.3109055757522583, "learning_rate": 0.00013333333333333334, "loss": 0.3754, "step": 20 }, { "epoch": 0.33707865168539325, "grad_norm": 1.4723390340805054, "learning_rate": 0.0002, "loss": 0.2649, "step": 30 }, { "epoch": 0.449438202247191, "grad_norm": 0.7010077238082886, "learning_rate": 0.000199844353174683, "loss": 0.2166, "step": 40 }, { "epoch": 0.5617977528089888, "grad_norm": 1.1943029165267944, "learning_rate": 0.00019937789721741653, "loss": 0.1767, "step": 50 }, { "epoch": 0.6741573033707865, "grad_norm": 0.5831332206726074, "learning_rate": 0.00019860208417597864, "loss": 0.1481, "step": 60 }, { "epoch": 0.7865168539325843, "grad_norm": 1.139088749885559, "learning_rate": 0.00019751932910710805, "loss": 0.1318, "step": 70 }, { "epoch": 0.898876404494382, "grad_norm": 1.130523681640625, "learning_rate": 0.00019613300255858616, "loss": 0.1277, "step": 80 }, { "epoch": 1.0112359550561798, "grad_norm": 0.8881850242614746, "learning_rate": 0.0001944474200769355, "loss": 0.1202, "step": 90 }, { "epoch": 1.1235955056179776, "grad_norm": 0.6445964574813843, "learning_rate": 0.00019246782877339766, "loss": 0.101, "step": 100 }, { "epoch": 1.2359550561797752, "grad_norm": 0.45679864287376404, "learning_rate": 0.00019020039099000907, "loss": 0.1078, "step": 110 }, { "epoch": 1.348314606741573, "grad_norm": 0.5455164909362793, "learning_rate": 0.0001876521651166215, "loss": 0.0931, "step": 120 }, { "epoch": 1.4606741573033708, "grad_norm": 0.5435442924499512, "learning_rate": 0.00018483108361858262, "loss": 0.0866, "step": 130 }, { "epoch": 1.5730337078651684, "grad_norm": 0.4356015622615814, "learning_rate": 0.00018174592834347504, "loss": 0.0899, "step": 140 }, { "epoch": 1.6853932584269664, "grad_norm": 0.6584963798522949, "learning_rate": 0.00017840630318378232, "loss": 0.0905, "step": 150 }, { "epoch": 1.797752808988764, "grad_norm": 0.500027060508728, "learning_rate": 0.00017482260418058164, "loss": 0.091, "step": 160 }, { "epoch": 1.9101123595505618, "grad_norm": 0.720208466053009, "learning_rate": 0.00017100598716132773, "loss": 0.0805, "step": 170 }, { "epoch": 2.0224719101123596, "grad_norm": 0.7830264568328857, "learning_rate": 0.0001669683330124706, "loss": 0.0779, "step": 180 }, { "epoch": 2.134831460674157, "grad_norm": 0.6720086932182312, "learning_rate": 0.0001627222106950102, "loss": 0.0731, "step": 190 }, { "epoch": 2.247191011235955, "grad_norm": 0.4705169200897217, "learning_rate": 0.0001582808381181189, "loss": 0.0705, "step": 200 }, { "epoch": 2.359550561797753, "grad_norm": 0.40476611256599426, "learning_rate": 0.0001536580409926296, "loss": 0.0715, "step": 210 }, { "epoch": 2.4719101123595504, "grad_norm": 0.5198217034339905, "learning_rate": 0.0001488682097924756, "loss": 0.0658, "step": 220 }, { "epoch": 2.5842696629213484, "grad_norm": 0.3450973331928253, "learning_rate": 0.00014392625495805912, "loss": 0.0613, "step": 230 }, { "epoch": 2.696629213483146, "grad_norm": 0.4224022626876831, "learning_rate": 0.00013884756048099687, "loss": 0.0663, "step": 240 }, { "epoch": 2.808988764044944, "grad_norm": 0.3117457926273346, "learning_rate": 0.00013364793601473106, "loss": 0.0577, "step": 250 }, { "epoch": 2.9213483146067416, "grad_norm": 0.2872294783592224, "learning_rate": 0.00012834356766008197, "loss": 0.0626, "step": 260 }, { "epoch": 3.033707865168539, "grad_norm": 0.4661742150783539, "learning_rate": 0.0001229509675789439, "loss": 0.0619, "step": 270 }, { "epoch": 3.146067415730337, "grad_norm": 0.3543541729450226, "learning_rate": 0.00011748692259297347, "loss": 0.0494, "step": 280 }, { "epoch": 3.258426966292135, "grad_norm": 0.3537474572658539, "learning_rate": 0.00011196844192727984, "loss": 0.052, "step": 290 }, { "epoch": 3.370786516853933, "grad_norm": 0.3915044367313385, "learning_rate": 0.00010641270426178676, "loss": 0.0547, "step": 300 }, { "epoch": 3.4831460674157304, "grad_norm": 0.3162959814071655, "learning_rate": 0.00010083700425509279, "loss": 0.0524, "step": 310 }, { "epoch": 3.595505617977528, "grad_norm": 0.38653671741485596, "learning_rate": 9.52586987072972e-05, "loss": 0.0465, "step": 320 }, { "epoch": 3.7078651685393256, "grad_norm": 0.3417101800441742, "learning_rate": 8.969515252938322e-05, "loss": 0.0482, "step": 330 }, { "epoch": 3.8202247191011236, "grad_norm": 0.28841620683670044, "learning_rate": 8.41636846873528e-05, "loss": 0.0564, "step": 340 }, { "epoch": 3.932584269662921, "grad_norm": 0.3317897915840149, "learning_rate": 7.868151428938502e-05, "loss": 0.0458, "step": 350 }, { "epoch": 4.044943820224719, "grad_norm": 0.1963573545217514, "learning_rate": 7.326570698384568e-05, "loss": 0.0498, "step": 360 }, { "epoch": 4.157303370786517, "grad_norm": 0.2925940454006195, "learning_rate": 6.793312183500759e-05, "loss": 0.0411, "step": 370 }, { "epoch": 4.269662921348314, "grad_norm": 0.2857739329338074, "learning_rate": 6.270035884185367e-05, "loss": 0.0391, "step": 380 }, { "epoch": 4.382022471910112, "grad_norm": 0.3383257985115051, "learning_rate": 5.758370726333434e-05, "loss": 0.0448, "step": 390 }, { "epoch": 4.49438202247191, "grad_norm": 0.2550479471683502, "learning_rate": 5.2599094910938594e-05, "loss": 0.0382, "step": 400 }, { "epoch": 4.606741573033708, "grad_norm": 0.21575377881526947, "learning_rate": 4.7762038566428155e-05, "loss": 0.043, "step": 410 }, { "epoch": 4.719101123595506, "grad_norm": 0.24605822563171387, "learning_rate": 4.3087595679081096e-05, "loss": 0.0417, "step": 420 }, { "epoch": 4.831460674157303, "grad_norm": 0.2827017903327942, "learning_rate": 3.8590317492808236e-05, "loss": 0.0421, "step": 430 }, { "epoch": 4.943820224719101, "grad_norm": 0.2163945734500885, "learning_rate": 3.428420374905483e-05, "loss": 0.0412, "step": 440 }, { "epoch": 5.056179775280899, "grad_norm": 0.22983698546886444, "learning_rate": 3.0182659106494192e-05, "loss": 0.0393, "step": 450 }, { "epoch": 5.168539325842697, "grad_norm": 0.25033506751060486, "learning_rate": 2.629845141317656e-05, "loss": 0.0358, "step": 460 }, { "epoch": 5.280898876404494, "grad_norm": 0.21685783565044403, "learning_rate": 2.264367196102869e-05, "loss": 0.0332, "step": 470 }, { "epoch": 5.393258426966292, "grad_norm": 0.2170724868774414, "learning_rate": 1.9229697846429773e-05, "loss": 0.0379, "step": 480 }, { "epoch": 5.50561797752809, "grad_norm": 0.17588071525096893, "learning_rate": 1.606715655403289e-05, "loss": 0.033, "step": 490 }, { "epoch": 5.617977528089888, "grad_norm": 0.3224557936191559, "learning_rate": 1.3165892874079899e-05, "loss": 0.0379, "step": 500 }, { "epoch": 5.730337078651686, "grad_norm": 0.443591445684433, "learning_rate": 1.0534938256194671e-05, "loss": 0.0352, "step": 510 }, { "epoch": 5.842696629213483, "grad_norm": 0.23971615731716156, "learning_rate": 8.182482695053728e-06, "loss": 0.0343, "step": 520 }, { "epoch": 5.955056179775281, "grad_norm": 0.33371302485466003, "learning_rate": 6.1158492354529195e-06, "loss": 0.0346, "step": 530 }, { "epoch": 6.067415730337078, "grad_norm": 0.2301403433084488, "learning_rate": 4.3414711761338375e-06, "loss": 0.0343, "step": 540 }, { "epoch": 6.179775280898877, "grad_norm": 0.16674034297466278, "learning_rate": 2.8648720433333996e-06, "loss": 0.03, "step": 550 }, { "epoch": 6.292134831460674, "grad_norm": 0.2350272834300995, "learning_rate": 1.6906483963973207e-06, "loss": 0.0344, "step": 560 }, { "epoch": 6.404494382022472, "grad_norm": 0.1408814638853073, "learning_rate": 8.224555189827565e-07, "loss": 0.0387, "step": 570 }, { "epoch": 6.51685393258427, "grad_norm": 0.2586834132671356, "learning_rate": 2.629960403923715e-07, "loss": 0.0329, "step": 580 }, { "epoch": 6.629213483146067, "grad_norm": 0.21313035488128662, "learning_rate": 1.4011522460866122e-08, "loss": 0.0324, "step": 590 }, { "epoch": 6.662921348314606, "step": 593, "total_flos": 8.40154581763943e+16, "train_loss": 0.0879759074865666, "train_runtime": 704.4365, "train_samples_per_second": 53.876, "train_steps_per_second": 0.842 } ], "logging_steps": 10, "max_steps": 593, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.40154581763943e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }