{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 20958,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.047714476572192,
      "grad_norm": 0.023498278111219406,
      "learning_rate": 0.00019523809523809525,
      "loss": 0.4674,
      "mean_token_accuracy": 0.8906204112768173,
      "num_tokens": 360041.0,
      "step": 500
    },
    {
      "epoch": 0.095428953144384,
      "grad_norm": 3.823547601699829,
      "learning_rate": 0.00019046664758087605,
      "loss": 0.3194,
      "mean_token_accuracy": 0.9205255397558212,
      "num_tokens": 727772.0,
      "step": 1000
    },
    {
      "epoch": 0.143143429716576,
      "grad_norm": 0.0001839943724917248,
      "learning_rate": 0.00018569519992365686,
      "loss": 0.2674,
      "mean_token_accuracy": 0.9327976078987121,
      "num_tokens": 1097406.0,
      "step": 1500
    },
    {
      "epoch": 0.190857906288768,
      "grad_norm": 4.199777126312256,
      "learning_rate": 0.00018092375226643766,
      "loss": 0.326,
      "mean_token_accuracy": 0.920593403160572,
      "num_tokens": 1462220.0,
      "step": 2000
    },
    {
      "epoch": 0.23857238286096002,
      "grad_norm": 5.751025676727295,
      "learning_rate": 0.00017615230460921847,
      "loss": 0.2693,
      "mean_token_accuracy": 0.936355301618576,
      "num_tokens": 1828224.0,
      "step": 2500
    },
    {
      "epoch": 0.286286859433152,
      "grad_norm": 3.971045732498169,
      "learning_rate": 0.00017138085695199925,
      "loss": 0.2827,
      "mean_token_accuracy": 0.9261101527214051,
      "num_tokens": 2211342.0,
      "step": 3000
    },
    {
      "epoch": 0.33400133600534404,
      "grad_norm": 2.15079665184021,
      "learning_rate": 0.00016660940929478005,
      "loss": 0.2461,
      "mean_token_accuracy": 0.9350018633604049,
      "num_tokens": 2572498.0,
      "step": 3500
    },
    {
      "epoch": 0.381715812577536,
      "grad_norm": 0.5263189077377319,
      "learning_rate": 0.00016183796163756083,
      "loss": 0.2534,
      "mean_token_accuracy": 0.9374513441324234,
      "num_tokens": 2949312.0,
      "step": 4000
    },
    {
      "epoch": 0.42943028914972803,
      "grad_norm": 0.004941379185765982,
      "learning_rate": 0.00015706651398034164,
      "loss": 0.2395,
      "mean_token_accuracy": 0.9378743978738785,
      "num_tokens": 3314910.0,
      "step": 4500
    },
    {
      "epoch": 0.47714476572192005,
      "grad_norm": 0.007795912679284811,
      "learning_rate": 0.00015229506632312244,
      "loss": 0.2284,
      "mean_token_accuracy": 0.9409392136335373,
      "num_tokens": 3686624.0,
      "step": 5000
    },
    {
      "epoch": 0.5248592422941121,
      "grad_norm": 0.0033825524151325226,
      "learning_rate": 0.00014752361866590325,
      "loss": 0.2145,
      "mean_token_accuracy": 0.9464458491802216,
      "num_tokens": 4063459.0,
      "step": 5500
    },
    {
      "epoch": 0.572573718866304,
      "grad_norm": 3.8457958698272705,
      "learning_rate": 0.00014275217100868402,
      "loss": 0.2268,
      "mean_token_accuracy": 0.9410561621189117,
      "num_tokens": 4423993.0,
      "step": 6000
    },
    {
      "epoch": 0.620288195438496,
      "grad_norm": 0.020238121971488,
      "learning_rate": 0.00013798072335146483,
      "loss": 0.223,
      "mean_token_accuracy": 0.9393890690803528,
      "num_tokens": 4788203.0,
      "step": 6500
    },
    {
      "epoch": 0.6680026720106881,
      "grad_norm": 0.0006973391864448786,
      "learning_rate": 0.00013320927569424564,
      "loss": 0.2434,
      "mean_token_accuracy": 0.9404154337644577,
      "num_tokens": 5162496.0,
      "step": 7000
    },
    {
      "epoch": 0.71571714858288,
      "grad_norm": 0.0010198453674092889,
      "learning_rate": 0.00012843782803702644,
      "loss": 0.191,
      "mean_token_accuracy": 0.9535960764884949,
      "num_tokens": 5520427.0,
      "step": 7500
    },
    {
      "epoch": 0.763431625155072,
      "grad_norm": 0.001297333394177258,
      "learning_rate": 0.00012366638037980725,
      "loss": 0.2167,
      "mean_token_accuracy": 0.942195966720581,
      "num_tokens": 5901763.0,
      "step": 8000
    },
    {
      "epoch": 0.8111461017272641,
      "grad_norm": 6.195448398590088,
      "learning_rate": 0.00011889493272258805,
      "loss": 0.2305,
      "mean_token_accuracy": 0.9376264967918396,
      "num_tokens": 6272492.0,
      "step": 8500
    },
    {
      "epoch": 0.8588605782994561,
      "grad_norm": 0.0025545568205416203,
      "learning_rate": 0.00011412348506536883,
      "loss": 0.2303,
      "mean_token_accuracy": 0.9435879285335541,
      "num_tokens": 6657487.0,
      "step": 9000
    },
    {
      "epoch": 0.906575054871648,
      "grad_norm": 0.0006595577578991652,
      "learning_rate": 0.00010935203740814964,
      "loss": 0.179,
      "mean_token_accuracy": 0.9520990616083145,
      "num_tokens": 7022908.0,
      "step": 9500
    },
    {
      "epoch": 0.9542895314438401,
      "grad_norm": 3.1752443313598633,
      "learning_rate": 0.00010458058975093044,
      "loss": 0.1827,
      "mean_token_accuracy": 0.9488343714475632,
      "num_tokens": 7376243.0,
      "step": 10000
    },
    {
      "epoch": 1.002004008016032,
      "grad_norm": 0.001512572169303894,
      "learning_rate": 9.980914209371123e-05,
      "loss": 0.2101,
      "mean_token_accuracy": 0.9424606282711029,
      "num_tokens": 7749938.0,
      "step": 10500
    },
    {
      "epoch": 1.0497184845882241,
      "grad_norm": 3.992393732070923,
      "learning_rate": 9.503769443649203e-05,
      "loss": 0.1309,
      "mean_token_accuracy": 0.9618149808645249,
      "num_tokens": 8127846.0,
      "step": 11000
    },
    {
      "epoch": 1.097432961160416,
      "grad_norm": 0.00025509227998554707,
      "learning_rate": 9.026624677927283e-05,
      "loss": 0.1219,
      "mean_token_accuracy": 0.9635183781385421,
      "num_tokens": 8503956.0,
      "step": 11500
    },
    {
      "epoch": 1.145147437732608,
      "grad_norm": 0.0013997952919453382,
      "learning_rate": 8.549479912205364e-05,
      "loss": 0.1235,
      "mean_token_accuracy": 0.963010191321373,
      "num_tokens": 8862553.0,
      "step": 12000
    },
    {
      "epoch": 1.1928619143048,
      "grad_norm": 2.114091157913208,
      "learning_rate": 8.072335146483443e-05,
      "loss": 0.1271,
      "mean_token_accuracy": 0.9629592669010162,
      "num_tokens": 9233864.0,
      "step": 12500
    },
    {
      "epoch": 1.240576390876992,
      "grad_norm": 0.277444452047348,
      "learning_rate": 7.595190380761523e-05,
      "loss": 0.1187,
      "mean_token_accuracy": 0.9649668201208115,
      "num_tokens": 9596971.0,
      "step": 13000
    },
    {
      "epoch": 1.288290867449184,
      "grad_norm": 4.878781318664551,
      "learning_rate": 7.118045615039604e-05,
      "loss": 0.1226,
      "mean_token_accuracy": 0.9633177869319915,
      "num_tokens": 9966614.0,
      "step": 13500
    },
    {
      "epoch": 1.3360053440213762,
      "grad_norm": 5.269028186798096,
      "learning_rate": 6.640900849317683e-05,
      "loss": 0.1336,
      "mean_token_accuracy": 0.9619411797523498,
      "num_tokens": 10338731.0,
      "step": 14000
    },
    {
      "epoch": 1.3837198205935681,
      "grad_norm": 0.000423251127358526,
      "learning_rate": 6.163756083595764e-05,
      "loss": 0.1386,
      "mean_token_accuracy": 0.9611272529363633,
      "num_tokens": 10724381.0,
      "step": 14500
    },
    {
      "epoch": 1.43143429716576,
      "grad_norm": 0.754705548286438,
      "learning_rate": 5.6866113178738436e-05,
      "loss": 0.1182,
      "mean_token_accuracy": 0.9661157331466674,
      "num_tokens": 11094900.0,
      "step": 15000
    },
    {
      "epoch": 1.479148773737952,
      "grad_norm": 0.0017857268685474992,
      "learning_rate": 5.2094665521519235e-05,
      "loss": 0.1175,
      "mean_token_accuracy": 0.9679548003673554,
      "num_tokens": 11454717.0,
      "step": 15500
    },
    {
      "epoch": 1.5268632503101442,
      "grad_norm": 3.5908143520355225,
      "learning_rate": 4.732321786430003e-05,
      "loss": 0.11,
      "mean_token_accuracy": 0.9671754879951477,
      "num_tokens": 11832591.0,
      "step": 16000
    },
    {
      "epoch": 1.5745777268823362,
      "grad_norm": 0.010208655148744583,
      "learning_rate": 4.255177020708083e-05,
      "loss": 0.115,
      "mean_token_accuracy": 0.9662747744321823,
      "num_tokens": 12223203.0,
      "step": 16500
    },
    {
      "epoch": 1.6222922034545282,
      "grad_norm": 0.002450750907883048,
      "learning_rate": 3.778032254986163e-05,
      "loss": 0.1112,
      "mean_token_accuracy": 0.9668832242488861,
      "num_tokens": 12593144.0,
      "step": 17000
    },
    {
      "epoch": 1.6700066800267201,
      "grad_norm": 0.004513042513281107,
      "learning_rate": 3.300887489264243e-05,
      "loss": 0.1004,
      "mean_token_accuracy": 0.9708232057094573,
      "num_tokens": 12954505.0,
      "step": 17500
    },
    {
      "epoch": 1.7177211565989121,
      "grad_norm": 0.007037173956632614,
      "learning_rate": 2.8237427235423232e-05,
      "loss": 0.1114,
      "mean_token_accuracy": 0.9671108702421188,
      "num_tokens": 13307858.0,
      "step": 18000
    },
    {
      "epoch": 1.765435633171104,
      "grad_norm": 0.015288141556084156,
      "learning_rate": 2.3465979578204027e-05,
      "loss": 0.1034,
      "mean_token_accuracy": 0.9698293421268463,
      "num_tokens": 13673816.0,
      "step": 18500
    },
    {
      "epoch": 1.813150109743296,
      "grad_norm": 6.889008045196533,
      "learning_rate": 1.869453192098483e-05,
      "loss": 0.0977,
      "mean_token_accuracy": 0.9710470995903016,
      "num_tokens": 14038630.0,
      "step": 19000
    },
    {
      "epoch": 1.860864586315488,
      "grad_norm": 2.6273930072784424,
      "learning_rate": 1.3923084263765626e-05,
      "loss": 0.0936,
      "mean_token_accuracy": 0.9728796405792236,
      "num_tokens": 14401139.0,
      "step": 19500
    },
    {
      "epoch": 1.90857906288768,
      "grad_norm": 0.004859536420553923,
      "learning_rate": 9.151636606546427e-06,
      "loss": 0.0929,
      "mean_token_accuracy": 0.9708857105970383,
      "num_tokens": 14770982.0,
      "step": 20000
    },
    {
      "epoch": 1.9562935394598722,
      "grad_norm": 0.4923778176307678,
      "learning_rate": 4.380188949327226e-06,
      "loss": 0.0963,
      "mean_token_accuracy": 0.9713275592327117,
      "num_tokens": 15136348.0,
      "step": 20500
    }
  ],
  "logging_steps": 500,
  "max_steps": 20958,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.399363404153889e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}