|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 3800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02631578947368421, |
|
"grad_norm": 9.638947486877441, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 1.4471, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05263157894736842, |
|
"grad_norm": 9.498916625976562, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 1.2214, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07894736842105263, |
|
"grad_norm": 2.470106601715088, |
|
"learning_rate": 1.5789473684210526e-05, |
|
"loss": 0.6744, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 2.844433546066284, |
|
"learning_rate": 2.105263157894737e-05, |
|
"loss": 0.4362, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13157894736842105, |
|
"grad_norm": 1.874556303024292, |
|
"learning_rate": 2.6315789473684212e-05, |
|
"loss": 0.3136, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15789473684210525, |
|
"grad_norm": 1.6943750381469727, |
|
"learning_rate": 3.157894736842105e-05, |
|
"loss": 0.2589, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18421052631578946, |
|
"grad_norm": 2.1084232330322266, |
|
"learning_rate": 3.6842105263157895e-05, |
|
"loss": 0.2282, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 1.509325623512268, |
|
"learning_rate": 4.210526315789474e-05, |
|
"loss": 0.1952, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.23684210526315788, |
|
"grad_norm": 2.0447444915771484, |
|
"learning_rate": 4.736842105263158e-05, |
|
"loss": 0.1842, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 1.149940013885498, |
|
"learning_rate": 5.2631578947368424e-05, |
|
"loss": 0.1915, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2894736842105263, |
|
"grad_norm": 1.1014671325683594, |
|
"learning_rate": 5.789473684210527e-05, |
|
"loss": 0.1688, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 1.9964191913604736, |
|
"learning_rate": 6.31578947368421e-05, |
|
"loss": 0.1494, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.34210526315789475, |
|
"grad_norm": 0.7053777575492859, |
|
"learning_rate": 6.842105263157895e-05, |
|
"loss": 0.1418, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3684210526315789, |
|
"grad_norm": 1.077986240386963, |
|
"learning_rate": 7.368421052631579e-05, |
|
"loss": 0.1442, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39473684210526316, |
|
"grad_norm": 1.095324993133545, |
|
"learning_rate": 7.894736842105263e-05, |
|
"loss": 0.1332, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.9210699200630188, |
|
"learning_rate": 8.421052631578948e-05, |
|
"loss": 0.1322, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4473684210526316, |
|
"grad_norm": 1.3079745769500732, |
|
"learning_rate": 8.947368421052632e-05, |
|
"loss": 0.1204, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.47368421052631576, |
|
"grad_norm": 1.196655035018921, |
|
"learning_rate": 9.473684210526316e-05, |
|
"loss": 0.1227, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8218312859535217, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1215, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 0.9372950196266174, |
|
"learning_rate": 9.999810668616086e-05, |
|
"loss": 0.114, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5526315789473685, |
|
"grad_norm": 1.0510334968566895, |
|
"learning_rate": 9.999242688802886e-05, |
|
"loss": 0.1101, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5789473684210527, |
|
"grad_norm": 0.7145567536354065, |
|
"learning_rate": 9.998296103574967e-05, |
|
"loss": 0.0964, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6052631578947368, |
|
"grad_norm": 0.7895988821983337, |
|
"learning_rate": 9.996970984619641e-05, |
|
"loss": 0.0948, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 0.8074854612350464, |
|
"learning_rate": 9.995267432291555e-05, |
|
"loss": 0.1025, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6578947368421053, |
|
"grad_norm": 0.9896809458732605, |
|
"learning_rate": 9.993185575605073e-05, |
|
"loss": 0.0954, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6842105263157895, |
|
"grad_norm": 1.2382564544677734, |
|
"learning_rate": 9.990725572224521e-05, |
|
"loss": 0.0965, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7105263157894737, |
|
"grad_norm": 0.6677968502044678, |
|
"learning_rate": 9.987887608452235e-05, |
|
"loss": 0.104, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7368421052631579, |
|
"grad_norm": 0.8802872896194458, |
|
"learning_rate": 9.984671899214457e-05, |
|
"loss": 0.0936, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7631578947368421, |
|
"grad_norm": 0.5323192477226257, |
|
"learning_rate": 9.981078688045062e-05, |
|
"loss": 0.0937, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 0.43582749366760254, |
|
"learning_rate": 9.977108247067108e-05, |
|
"loss": 0.0913, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8157894736842105, |
|
"grad_norm": 0.5718427896499634, |
|
"learning_rate": 9.972760876972226e-05, |
|
"loss": 0.0913, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.5905811190605164, |
|
"learning_rate": 9.968036906997855e-05, |
|
"loss": 0.091, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.868421052631579, |
|
"grad_norm": 0.5551950931549072, |
|
"learning_rate": 9.962936694902307e-05, |
|
"loss": 0.0827, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8947368421052632, |
|
"grad_norm": 0.37185293436050415, |
|
"learning_rate": 9.957460626937664e-05, |
|
"loss": 0.0768, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9210526315789473, |
|
"grad_norm": 0.7125674486160278, |
|
"learning_rate": 9.951609117820538e-05, |
|
"loss": 0.0878, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9473684210526315, |
|
"grad_norm": 0.518326461315155, |
|
"learning_rate": 9.945382610700657e-05, |
|
"loss": 0.0841, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9736842105263158, |
|
"grad_norm": 0.40797701478004456, |
|
"learning_rate": 9.938781577127306e-05, |
|
"loss": 0.0899, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8398942351341248, |
|
"learning_rate": 9.931806517013612e-05, |
|
"loss": 0.082, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0263157894736843, |
|
"grad_norm": 0.5522546768188477, |
|
"learning_rate": 9.92445795859869e-05, |
|
"loss": 0.0833, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 0.7629371285438538, |
|
"learning_rate": 9.916736458407632e-05, |
|
"loss": 0.0782, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0789473684210527, |
|
"grad_norm": 0.6491877436637878, |
|
"learning_rate": 9.908642601209366e-05, |
|
"loss": 0.0795, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1052631578947367, |
|
"grad_norm": 0.5657724142074585, |
|
"learning_rate": 9.900176999972366e-05, |
|
"loss": 0.0763, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.131578947368421, |
|
"grad_norm": 0.9348977208137512, |
|
"learning_rate": 9.89134029581823e-05, |
|
"loss": 0.0789, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1578947368421053, |
|
"grad_norm": 0.9060670733451843, |
|
"learning_rate": 9.88213315797313e-05, |
|
"loss": 0.0897, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1842105263157894, |
|
"grad_norm": 0.5024181008338928, |
|
"learning_rate": 9.872556283717125e-05, |
|
"loss": 0.0781, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2105263157894737, |
|
"grad_norm": 0.6639284491539001, |
|
"learning_rate": 9.86261039833136e-05, |
|
"loss": 0.0871, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.236842105263158, |
|
"grad_norm": 0.572541356086731, |
|
"learning_rate": 9.852296255043129e-05, |
|
"loss": 0.0741, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.263157894736842, |
|
"grad_norm": 0.8119587898254395, |
|
"learning_rate": 9.841614634968843e-05, |
|
"loss": 0.0732, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2894736842105263, |
|
"grad_norm": 0.45451977849006653, |
|
"learning_rate": 9.830566347054868e-05, |
|
"loss": 0.0734, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3157894736842106, |
|
"grad_norm": 0.48096764087677, |
|
"learning_rate": 9.819152228016257e-05, |
|
"loss": 0.0729, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3421052631578947, |
|
"grad_norm": 0.4565185606479645, |
|
"learning_rate": 9.807373142273395e-05, |
|
"loss": 0.0673, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.368421052631579, |
|
"grad_norm": 0.7846236824989319, |
|
"learning_rate": 9.795229981886521e-05, |
|
"loss": 0.0687, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3947368421052633, |
|
"grad_norm": 0.46232283115386963, |
|
"learning_rate": 9.782723666488181e-05, |
|
"loss": 0.0718, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4210526315789473, |
|
"grad_norm": 0.5114668607711792, |
|
"learning_rate": 9.769855143213575e-05, |
|
"loss": 0.0739, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4473684210526316, |
|
"grad_norm": 0.7048826217651367, |
|
"learning_rate": 9.756625386628832e-05, |
|
"loss": 0.066, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4736842105263157, |
|
"grad_norm": 0.609493613243103, |
|
"learning_rate": 9.743035398657201e-05, |
|
"loss": 0.0744, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.6096906065940857, |
|
"learning_rate": 9.729086208503174e-05, |
|
"loss": 0.0678, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.526315789473684, |
|
"grad_norm": 0.8587449193000793, |
|
"learning_rate": 9.714778872574541e-05, |
|
"loss": 0.0656, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5526315789473686, |
|
"grad_norm": 0.576169490814209, |
|
"learning_rate": 9.700114474402387e-05, |
|
"loss": 0.0715, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 0.6562088131904602, |
|
"learning_rate": 9.685094124559034e-05, |
|
"loss": 0.0694, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6052631578947367, |
|
"grad_norm": 0.3795994818210602, |
|
"learning_rate": 9.669718960573927e-05, |
|
"loss": 0.0643, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.631578947368421, |
|
"grad_norm": 0.5144829154014587, |
|
"learning_rate": 9.653990146847499e-05, |
|
"loss": 0.0713, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.6578947368421053, |
|
"grad_norm": 0.34501057863235474, |
|
"learning_rate": 9.637908874562978e-05, |
|
"loss": 0.0655, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 0.3585371673107147, |
|
"learning_rate": 9.621476361596177e-05, |
|
"loss": 0.0634, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7105263157894737, |
|
"grad_norm": 0.4583841562271118, |
|
"learning_rate": 9.604693852423268e-05, |
|
"loss": 0.0603, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.736842105263158, |
|
"grad_norm": 0.372306764125824, |
|
"learning_rate": 9.58756261802652e-05, |
|
"loss": 0.0606, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.763157894736842, |
|
"grad_norm": 0.49868544936180115, |
|
"learning_rate": 9.570083955798065e-05, |
|
"loss": 0.0651, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.7894736842105263, |
|
"grad_norm": 0.531895101070404, |
|
"learning_rate": 9.552259189441626e-05, |
|
"loss": 0.0708, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8157894736842106, |
|
"grad_norm": 0.5189070701599121, |
|
"learning_rate": 9.534089668872274e-05, |
|
"loss": 0.0688, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.8421052631578947, |
|
"grad_norm": 0.36183297634124756, |
|
"learning_rate": 9.515576770114199e-05, |
|
"loss": 0.0684, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.868421052631579, |
|
"grad_norm": 0.2646963894367218, |
|
"learning_rate": 9.496721895196497e-05, |
|
"loss": 0.0604, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.8947368421052633, |
|
"grad_norm": 0.4395381510257721, |
|
"learning_rate": 9.477526472046995e-05, |
|
"loss": 0.0523, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9210526315789473, |
|
"grad_norm": 0.5579097867012024, |
|
"learning_rate": 9.457991954384105e-05, |
|
"loss": 0.0612, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.9473684210526314, |
|
"grad_norm": 0.4455315172672272, |
|
"learning_rate": 9.438119821606727e-05, |
|
"loss": 0.0625, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.973684210526316, |
|
"grad_norm": 0.5463271737098694, |
|
"learning_rate": 9.417911578682229e-05, |
|
"loss": 0.0605, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.7862467765808105, |
|
"learning_rate": 9.397368756032445e-05, |
|
"loss": 0.0606, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.026315789473684, |
|
"grad_norm": 0.48962733149528503, |
|
"learning_rate": 9.376492909417795e-05, |
|
"loss": 0.0565, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.0526315789473686, |
|
"grad_norm": 0.37852731347084045, |
|
"learning_rate": 9.35528561981945e-05, |
|
"loss": 0.0622, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.0789473684210527, |
|
"grad_norm": 0.37320488691329956, |
|
"learning_rate": 9.333748493319603e-05, |
|
"loss": 0.0602, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 0.5312328338623047, |
|
"learning_rate": 9.311883160979844e-05, |
|
"loss": 0.0522, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1315789473684212, |
|
"grad_norm": 0.5505270957946777, |
|
"learning_rate": 9.289691278717623e-05, |
|
"loss": 0.0544, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.1578947368421053, |
|
"grad_norm": 0.6213037371635437, |
|
"learning_rate": 9.267174527180853e-05, |
|
"loss": 0.0647, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.1842105263157894, |
|
"grad_norm": 0.5001884698867798, |
|
"learning_rate": 9.244334611620629e-05, |
|
"loss": 0.0573, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.2105263157894735, |
|
"grad_norm": 0.41479626297950745, |
|
"learning_rate": 9.221173261762073e-05, |
|
"loss": 0.0576, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.236842105263158, |
|
"grad_norm": 0.4636107385158539, |
|
"learning_rate": 9.197692231673361e-05, |
|
"loss": 0.0607, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.263157894736842, |
|
"grad_norm": 0.4859972298145294, |
|
"learning_rate": 9.173893299632856e-05, |
|
"loss": 0.0561, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.2894736842105265, |
|
"grad_norm": 0.3786041736602783, |
|
"learning_rate": 9.149778267994457e-05, |
|
"loss": 0.0594, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3157894736842106, |
|
"grad_norm": 0.5478907823562622, |
|
"learning_rate": 9.12534896305109e-05, |
|
"loss": 0.0547, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.3421052631578947, |
|
"grad_norm": 0.3971192240715027, |
|
"learning_rate": 9.100607234896397e-05, |
|
"loss": 0.058, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.3684210526315788, |
|
"grad_norm": 0.4420956075191498, |
|
"learning_rate": 9.075554957284633e-05, |
|
"loss": 0.0576, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.3947368421052633, |
|
"grad_norm": 0.5636497139930725, |
|
"learning_rate": 9.050194027488754e-05, |
|
"loss": 0.0513, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.4210526315789473, |
|
"grad_norm": 0.6197888255119324, |
|
"learning_rate": 9.024526366156732e-05, |
|
"loss": 0.0547, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.4473684210526314, |
|
"grad_norm": 0.5714887976646423, |
|
"learning_rate": 8.998553917166108e-05, |
|
"loss": 0.0605, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.473684210526316, |
|
"grad_norm": 0.3763100206851959, |
|
"learning_rate": 8.972278647476764e-05, |
|
"loss": 0.053, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.4168098270893097, |
|
"learning_rate": 8.945702546981969e-05, |
|
"loss": 0.0587, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.526315789473684, |
|
"grad_norm": 0.3482176959514618, |
|
"learning_rate": 8.918827628357677e-05, |
|
"loss": 0.053, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.5526315789473686, |
|
"grad_norm": 0.4938332736492157, |
|
"learning_rate": 8.891655926910103e-05, |
|
"loss": 0.06, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.5789473684210527, |
|
"grad_norm": 0.3451533317565918, |
|
"learning_rate": 8.864189500421582e-05, |
|
"loss": 0.0612, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.6052631578947367, |
|
"grad_norm": 0.4287240505218506, |
|
"learning_rate": 8.836430428994732e-05, |
|
"loss": 0.054, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 0.6272952556610107, |
|
"learning_rate": 8.808380814894912e-05, |
|
"loss": 0.0555, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.6578947368421053, |
|
"grad_norm": 0.46518048644065857, |
|
"learning_rate": 8.780042782391028e-05, |
|
"loss": 0.0608, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.6842105263157894, |
|
"grad_norm": 0.7546030282974243, |
|
"learning_rate": 8.751418477594645e-05, |
|
"loss": 0.0566, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.7105263157894735, |
|
"grad_norm": 0.6570419073104858, |
|
"learning_rate": 8.722510068297454e-05, |
|
"loss": 0.0565, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.736842105263158, |
|
"grad_norm": 0.5381316542625427, |
|
"learning_rate": 8.693319743807116e-05, |
|
"loss": 0.0534, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.763157894736842, |
|
"grad_norm": 0.566612958908081, |
|
"learning_rate": 8.663849714781442e-05, |
|
"loss": 0.05, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.7894736842105265, |
|
"grad_norm": 0.49792104959487915, |
|
"learning_rate": 8.634102213060984e-05, |
|
"loss": 0.0501, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.8157894736842106, |
|
"grad_norm": 0.39989131689071655, |
|
"learning_rate": 8.60407949150001e-05, |
|
"loss": 0.0541, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.8421052631578947, |
|
"grad_norm": 0.3729485273361206, |
|
"learning_rate": 8.573783823795889e-05, |
|
"loss": 0.0568, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.8684210526315788, |
|
"grad_norm": 0.3304407000541687, |
|
"learning_rate": 8.543217504316896e-05, |
|
"loss": 0.0508, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.8947368421052633, |
|
"grad_norm": 0.3478791415691376, |
|
"learning_rate": 8.512382847928461e-05, |
|
"loss": 0.0544, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.9210526315789473, |
|
"grad_norm": 0.35698530077934265, |
|
"learning_rate": 8.48128218981785e-05, |
|
"loss": 0.0522, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.9473684210526314, |
|
"grad_norm": 0.37133410573005676, |
|
"learning_rate": 8.44991788531732e-05, |
|
"loss": 0.0488, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.973684210526316, |
|
"grad_norm": 0.31634142994880676, |
|
"learning_rate": 8.418292309725738e-05, |
|
"loss": 0.0551, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.5727193355560303, |
|
"learning_rate": 8.386407858128706e-05, |
|
"loss": 0.0497, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.026315789473684, |
|
"grad_norm": 0.3365287482738495, |
|
"learning_rate": 8.35426694521716e-05, |
|
"loss": 0.048, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.0526315789473686, |
|
"grad_norm": 0.26241013407707214, |
|
"learning_rate": 8.321872005104509e-05, |
|
"loss": 0.0517, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.0789473684210527, |
|
"grad_norm": 0.39349165558815, |
|
"learning_rate": 8.289225491142292e-05, |
|
"loss": 0.0559, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.1052631578947367, |
|
"grad_norm": 0.4374188482761383, |
|
"learning_rate": 8.256329875734375e-05, |
|
"loss": 0.0544, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.1315789473684212, |
|
"grad_norm": 0.3512132465839386, |
|
"learning_rate": 8.223187650149712e-05, |
|
"loss": 0.0477, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"grad_norm": 0.3832947313785553, |
|
"learning_rate": 8.189801324333681e-05, |
|
"loss": 0.0453, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.1842105263157894, |
|
"grad_norm": 0.27593138813972473, |
|
"learning_rate": 8.156173426717988e-05, |
|
"loss": 0.0529, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.2105263157894735, |
|
"grad_norm": 0.3951173424720764, |
|
"learning_rate": 8.122306504029194e-05, |
|
"loss": 0.0524, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.236842105263158, |
|
"grad_norm": 0.3123633563518524, |
|
"learning_rate": 8.08820312109583e-05, |
|
"loss": 0.0477, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.263157894736842, |
|
"grad_norm": 0.4703565835952759, |
|
"learning_rate": 8.053865860654175e-05, |
|
"loss": 0.0473, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.2894736842105265, |
|
"grad_norm": 0.3525274991989136, |
|
"learning_rate": 8.019297323152642e-05, |
|
"loss": 0.057, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.3157894736842106, |
|
"grad_norm": 0.352622926235199, |
|
"learning_rate": 7.984500126554853e-05, |
|
"loss": 0.0487, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.3421052631578947, |
|
"grad_norm": 0.5331799387931824, |
|
"learning_rate": 7.94947690614136e-05, |
|
"loss": 0.0519, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.3684210526315788, |
|
"grad_norm": 0.5408539772033691, |
|
"learning_rate": 7.914230314310079e-05, |
|
"loss": 0.0467, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.3947368421052633, |
|
"grad_norm": 0.42730775475502014, |
|
"learning_rate": 7.878763020375415e-05, |
|
"loss": 0.0486, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.4210526315789473, |
|
"grad_norm": 0.3785395920276642, |
|
"learning_rate": 7.843077710366105e-05, |
|
"loss": 0.0518, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.4473684210526314, |
|
"grad_norm": 0.3049032986164093, |
|
"learning_rate": 7.807177086821802e-05, |
|
"loss": 0.0433, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.473684210526316, |
|
"grad_norm": 0.3494362533092499, |
|
"learning_rate": 7.771063868588399e-05, |
|
"loss": 0.0453, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.4008173942565918, |
|
"learning_rate": 7.734740790612136e-05, |
|
"loss": 0.0544, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.526315789473684, |
|
"grad_norm": 0.5177074074745178, |
|
"learning_rate": 7.698210603732454e-05, |
|
"loss": 0.0433, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.5526315789473686, |
|
"grad_norm": 0.42569538950920105, |
|
"learning_rate": 7.661476074473695e-05, |
|
"loss": 0.0428, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.5789473684210527, |
|
"grad_norm": 0.36168932914733887, |
|
"learning_rate": 7.624539984835557e-05, |
|
"loss": 0.0466, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.6052631578947367, |
|
"grad_norm": 0.4134399890899658, |
|
"learning_rate": 7.587405132082433e-05, |
|
"loss": 0.0468, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.6315789473684212, |
|
"grad_norm": 0.2959583103656769, |
|
"learning_rate": 7.550074328531545e-05, |
|
"loss": 0.0477, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.6578947368421053, |
|
"grad_norm": 0.4948980510234833, |
|
"learning_rate": 7.512550401339971e-05, |
|
"loss": 0.0453, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"grad_norm": 0.5422468185424805, |
|
"learning_rate": 7.47483619229054e-05, |
|
"loss": 0.049, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.7105263157894735, |
|
"grad_norm": 0.4651060402393341, |
|
"learning_rate": 7.436934557576612e-05, |
|
"loss": 0.0486, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.736842105263158, |
|
"grad_norm": 0.37345531582832336, |
|
"learning_rate": 7.39884836758576e-05, |
|
"loss": 0.0428, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.763157894736842, |
|
"grad_norm": 0.4845902621746063, |
|
"learning_rate": 7.360580506682414e-05, |
|
"loss": 0.0435, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.7894736842105265, |
|
"grad_norm": 0.38114190101623535, |
|
"learning_rate": 7.322133872989398e-05, |
|
"loss": 0.0384, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.8157894736842106, |
|
"grad_norm": 0.4120785892009735, |
|
"learning_rate": 7.283511378168458e-05, |
|
"loss": 0.0422, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.8421052631578947, |
|
"grad_norm": 0.4169304370880127, |
|
"learning_rate": 7.244715947199749e-05, |
|
"loss": 0.0413, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.8684210526315788, |
|
"grad_norm": 0.3038704991340637, |
|
"learning_rate": 7.20575051816033e-05, |
|
"loss": 0.0445, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.8947368421052633, |
|
"grad_norm": 0.3277107775211334, |
|
"learning_rate": 7.16661804200164e-05, |
|
"loss": 0.0443, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.9210526315789473, |
|
"grad_norm": 0.3675183057785034, |
|
"learning_rate": 7.127321482326026e-05, |
|
"loss": 0.046, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.9473684210526314, |
|
"grad_norm": 0.3455309569835663, |
|
"learning_rate": 7.087863815162298e-05, |
|
"loss": 0.0424, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.973684210526316, |
|
"grad_norm": 0.49531644582748413, |
|
"learning_rate": 7.04824802874035e-05, |
|
"loss": 0.0457, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.6025938391685486, |
|
"learning_rate": 7.008477123264848e-05, |
|
"loss": 0.0497, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.026315789473684, |
|
"grad_norm": 0.2605507969856262, |
|
"learning_rate": 6.96855411068802e-05, |
|
"loss": 0.047, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.052631578947368, |
|
"grad_norm": 0.3557581603527069, |
|
"learning_rate": 6.928482014481558e-05, |
|
"loss": 0.0434, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.078947368421052, |
|
"grad_norm": 0.6790323257446289, |
|
"learning_rate": 6.888263869407631e-05, |
|
"loss": 0.0431, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.105263157894737, |
|
"grad_norm": 0.5850968360900879, |
|
"learning_rate": 6.847902721289068e-05, |
|
"loss": 0.0444, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.131578947368421, |
|
"grad_norm": 0.22265614569187164, |
|
"learning_rate": 6.807401626778679e-05, |
|
"loss": 0.0463, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.157894736842105, |
|
"grad_norm": 0.3996434509754181, |
|
"learning_rate": 6.766763653127773e-05, |
|
"loss": 0.0391, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.184210526315789, |
|
"grad_norm": 0.4038737118244171, |
|
"learning_rate": 6.725991877953868e-05, |
|
"loss": 0.0432, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.2105263157894735, |
|
"grad_norm": 0.5624067187309265, |
|
"learning_rate": 6.685089389007612e-05, |
|
"loss": 0.0437, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.2368421052631575, |
|
"grad_norm": 0.32445138692855835, |
|
"learning_rate": 6.644059283938938e-05, |
|
"loss": 0.0439, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.2631578947368425, |
|
"grad_norm": 0.385633647441864, |
|
"learning_rate": 6.602904670062476e-05, |
|
"loss": 0.0362, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.2894736842105265, |
|
"grad_norm": 0.4192872643470764, |
|
"learning_rate": 6.561628664122226e-05, |
|
"loss": 0.0439, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 4.315789473684211, |
|
"grad_norm": 0.46780234575271606, |
|
"learning_rate": 6.520234392055522e-05, |
|
"loss": 0.0447, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.342105263157895, |
|
"grad_norm": 0.30417683720588684, |
|
"learning_rate": 6.478724988756285e-05, |
|
"loss": 0.0413, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.368421052631579, |
|
"grad_norm": 0.3585522472858429, |
|
"learning_rate": 6.437103597837631e-05, |
|
"loss": 0.0451, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.394736842105263, |
|
"grad_norm": 0.2543241083621979, |
|
"learning_rate": 6.39537337139377e-05, |
|
"loss": 0.0454, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 4.421052631578947, |
|
"grad_norm": 0.41101086139678955, |
|
"learning_rate": 6.353537469761315e-05, |
|
"loss": 0.0468, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.447368421052632, |
|
"grad_norm": 0.2859801650047302, |
|
"learning_rate": 6.311599061279932e-05, |
|
"loss": 0.0462, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 4.473684210526316, |
|
"grad_norm": 0.2935991585254669, |
|
"learning_rate": 6.269561322052378e-05, |
|
"loss": 0.0457, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.311879426240921, |
|
"learning_rate": 6.227427435703997e-05, |
|
"loss": 0.0419, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 4.526315789473684, |
|
"grad_norm": 0.4428638517856598, |
|
"learning_rate": 6.185200593141593e-05, |
|
"loss": 0.0412, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.552631578947368, |
|
"grad_norm": 0.44332432746887207, |
|
"learning_rate": 6.142883992311781e-05, |
|
"loss": 0.0391, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.578947368421053, |
|
"grad_norm": 0.31303486227989197, |
|
"learning_rate": 6.100480837958802e-05, |
|
"loss": 0.0478, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.605263157894737, |
|
"grad_norm": 0.4248807430267334, |
|
"learning_rate": 6.057994341381813e-05, |
|
"loss": 0.0419, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.631578947368421, |
|
"grad_norm": 0.38378843665122986, |
|
"learning_rate": 6.015427720191693e-05, |
|
"loss": 0.0388, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.657894736842105, |
|
"grad_norm": 0.232202410697937, |
|
"learning_rate": 5.9727841980673604e-05, |
|
"loss": 0.0491, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 4.684210526315789, |
|
"grad_norm": 0.3846134841442108, |
|
"learning_rate": 5.93006700451164e-05, |
|
"loss": 0.0442, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.7105263157894735, |
|
"grad_norm": 0.24118691682815552, |
|
"learning_rate": 5.887279374606679e-05, |
|
"loss": 0.0379, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 4.7368421052631575, |
|
"grad_norm": 0.3538924753665924, |
|
"learning_rate": 5.844424548768952e-05, |
|
"loss": 0.0371, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.7631578947368425, |
|
"grad_norm": 0.2391858547925949, |
|
"learning_rate": 5.8015057725038534e-05, |
|
"loss": 0.0393, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.7894736842105265, |
|
"grad_norm": 0.43974366784095764, |
|
"learning_rate": 5.7585262961599054e-05, |
|
"loss": 0.0408, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.815789473684211, |
|
"grad_norm": 0.3488527834415436, |
|
"learning_rate": 5.7154893746826014e-05, |
|
"loss": 0.0419, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 4.842105263157895, |
|
"grad_norm": 0.5002457499504089, |
|
"learning_rate": 5.672398267367902e-05, |
|
"loss": 0.0403, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.868421052631579, |
|
"grad_norm": 0.31320708990097046, |
|
"learning_rate": 5.6292562376154037e-05, |
|
"loss": 0.0423, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.894736842105263, |
|
"grad_norm": 0.26685863733291626, |
|
"learning_rate": 5.586066552681179e-05, |
|
"loss": 0.0377, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.921052631578947, |
|
"grad_norm": 0.3947713077068329, |
|
"learning_rate": 5.542832483430363e-05, |
|
"loss": 0.0401, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.947368421052632, |
|
"grad_norm": 0.20824529230594635, |
|
"learning_rate": 5.499557304089419e-05, |
|
"loss": 0.0382, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.973684210526316, |
|
"grad_norm": 0.25169357657432556, |
|
"learning_rate": 5.4562442919981816e-05, |
|
"loss": 0.0339, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.4444282054901123, |
|
"learning_rate": 5.4128967273616625e-05, |
|
"loss": 0.0443, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.026315789473684, |
|
"grad_norm": 0.4260099530220032, |
|
"learning_rate": 5.3695178930016196e-05, |
|
"loss": 0.0364, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 5.052631578947368, |
|
"grad_norm": 0.33936482667922974, |
|
"learning_rate": 5.3261110741079525e-05, |
|
"loss": 0.0367, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 5.078947368421052, |
|
"grad_norm": 0.5758324861526489, |
|
"learning_rate": 5.2826795579898956e-05, |
|
"loss": 0.0383, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 5.105263157894737, |
|
"grad_norm": 0.4980020821094513, |
|
"learning_rate": 5.2392266338270736e-05, |
|
"loss": 0.0422, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 5.131578947368421, |
|
"grad_norm": 0.4701398015022278, |
|
"learning_rate": 5.195755592420387e-05, |
|
"loss": 0.0372, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 5.157894736842105, |
|
"grad_norm": 0.3783451020717621, |
|
"learning_rate": 5.1522697259428146e-05, |
|
"loss": 0.0371, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 5.184210526315789, |
|
"grad_norm": 0.41548261046409607, |
|
"learning_rate": 5.1087723276900646e-05, |
|
"loss": 0.0415, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 5.2105263157894735, |
|
"grad_norm": 0.3551521897315979, |
|
"learning_rate": 5.065266691831181e-05, |
|
"loss": 0.0344, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 5.2368421052631575, |
|
"grad_norm": 0.5270403623580933, |
|
"learning_rate": 5.021756113159062e-05, |
|
"loss": 0.0355, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 5.2631578947368425, |
|
"grad_norm": 0.29675352573394775, |
|
"learning_rate": 4.978243886840939e-05, |
|
"loss": 0.0424, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.2894736842105265, |
|
"grad_norm": 0.3667472302913666, |
|
"learning_rate": 4.934733308168821e-05, |
|
"loss": 0.0449, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 5.315789473684211, |
|
"grad_norm": 0.34768638014793396, |
|
"learning_rate": 4.891227672309935e-05, |
|
"loss": 0.041, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 5.342105263157895, |
|
"grad_norm": 0.38154706358909607, |
|
"learning_rate": 4.8477302740571866e-05, |
|
"loss": 0.0382, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 5.368421052631579, |
|
"grad_norm": 0.16855105757713318, |
|
"learning_rate": 4.804244407579613e-05, |
|
"loss": 0.0376, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 5.394736842105263, |
|
"grad_norm": 0.3421989679336548, |
|
"learning_rate": 4.760773366172929e-05, |
|
"loss": 0.0381, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 5.421052631578947, |
|
"grad_norm": 0.3012019991874695, |
|
"learning_rate": 4.717320442010105e-05, |
|
"loss": 0.0353, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 5.447368421052632, |
|
"grad_norm": 0.2907329797744751, |
|
"learning_rate": 4.673888925892048e-05, |
|
"loss": 0.0372, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 5.473684210526316, |
|
"grad_norm": 0.3655814826488495, |
|
"learning_rate": 4.630482106998381e-05, |
|
"loss": 0.0354, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.3697777986526489, |
|
"learning_rate": 4.5871032726383386e-05, |
|
"loss": 0.0426, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 5.526315789473684, |
|
"grad_norm": 0.22509945929050446, |
|
"learning_rate": 4.5437557080018175e-05, |
|
"loss": 0.0358, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.552631578947368, |
|
"grad_norm": 0.30516788363456726, |
|
"learning_rate": 4.500442695910582e-05, |
|
"loss": 0.0411, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 5.578947368421053, |
|
"grad_norm": 0.3428977429866791, |
|
"learning_rate": 4.457167516569637e-05, |
|
"loss": 0.032, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 5.605263157894737, |
|
"grad_norm": 0.2875311076641083, |
|
"learning_rate": 4.413933447318821e-05, |
|
"loss": 0.0339, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 5.631578947368421, |
|
"grad_norm": 0.3058890700340271, |
|
"learning_rate": 4.3707437623845995e-05, |
|
"loss": 0.035, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 5.657894736842105, |
|
"grad_norm": 0.317531019449234, |
|
"learning_rate": 4.3276017326320985e-05, |
|
"loss": 0.0324, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 5.684210526315789, |
|
"grad_norm": 0.3118530809879303, |
|
"learning_rate": 4.2845106253174e-05, |
|
"loss": 0.0345, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 5.7105263157894735, |
|
"grad_norm": 0.2496698647737503, |
|
"learning_rate": 4.2414737038400964e-05, |
|
"loss": 0.0373, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 5.7368421052631575, |
|
"grad_norm": 0.26971638202667236, |
|
"learning_rate": 4.198494227496148e-05, |
|
"loss": 0.0368, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 5.7631578947368425, |
|
"grad_norm": 0.43503808975219727, |
|
"learning_rate": 4.155575451231048e-05, |
|
"loss": 0.0348, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 5.7894736842105265, |
|
"grad_norm": 0.26233163475990295, |
|
"learning_rate": 4.112720625393322e-05, |
|
"loss": 0.0354, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.815789473684211, |
|
"grad_norm": 0.40549346804618835, |
|
"learning_rate": 4.069932995488361e-05, |
|
"loss": 0.0385, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 5.842105263157895, |
|
"grad_norm": 0.3487369120121002, |
|
"learning_rate": 4.0272158019326414e-05, |
|
"loss": 0.0365, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 5.868421052631579, |
|
"grad_norm": 0.35258445143699646, |
|
"learning_rate": 3.9845722798083066e-05, |
|
"loss": 0.035, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 5.894736842105263, |
|
"grad_norm": 0.3888448476791382, |
|
"learning_rate": 3.942005658618188e-05, |
|
"loss": 0.0357, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 5.921052631578947, |
|
"grad_norm": 0.41619524359703064, |
|
"learning_rate": 3.8995191620412e-05, |
|
"loss": 0.0352, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 5.947368421052632, |
|
"grad_norm": 0.5103023052215576, |
|
"learning_rate": 3.8571160076882204e-05, |
|
"loss": 0.0331, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 5.973684210526316, |
|
"grad_norm": 0.458965927362442, |
|
"learning_rate": 3.8147994068584087e-05, |
|
"loss": 0.0301, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.32402315735816956, |
|
"learning_rate": 3.772572564296005e-05, |
|
"loss": 0.0351, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 6.026315789473684, |
|
"grad_norm": 0.2968176603317261, |
|
"learning_rate": 3.730438677947624e-05, |
|
"loss": 0.0413, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 6.052631578947368, |
|
"grad_norm": 0.3646889626979828, |
|
"learning_rate": 3.6884009387200714e-05, |
|
"loss": 0.0357, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.078947368421052, |
|
"grad_norm": 0.26003819704055786, |
|
"learning_rate": 3.646462530238684e-05, |
|
"loss": 0.0334, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 6.105263157894737, |
|
"grad_norm": 0.28209343552589417, |
|
"learning_rate": 3.60462662860623e-05, |
|
"loss": 0.0317, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 6.131578947368421, |
|
"grad_norm": 0.4188937544822693, |
|
"learning_rate": 3.56289640216237e-05, |
|
"loss": 0.0354, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 6.157894736842105, |
|
"grad_norm": 0.44584834575653076, |
|
"learning_rate": 3.521275011243715e-05, |
|
"loss": 0.0395, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 6.184210526315789, |
|
"grad_norm": 0.44341957569122314, |
|
"learning_rate": 3.4797656079444806e-05, |
|
"loss": 0.033, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 6.2105263157894735, |
|
"grad_norm": 0.3678707182407379, |
|
"learning_rate": 3.4383713358777735e-05, |
|
"loss": 0.0344, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 6.2368421052631575, |
|
"grad_norm": 0.30562394857406616, |
|
"learning_rate": 3.397095329937526e-05, |
|
"loss": 0.0339, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 6.2631578947368425, |
|
"grad_norm": 0.4624996781349182, |
|
"learning_rate": 3.355940716061064e-05, |
|
"loss": 0.0341, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 6.2894736842105265, |
|
"grad_norm": 0.20757588744163513, |
|
"learning_rate": 3.31491061099239e-05, |
|
"loss": 0.0296, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 6.315789473684211, |
|
"grad_norm": 0.31677284836769104, |
|
"learning_rate": 3.274008122046132e-05, |
|
"loss": 0.0338, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.342105263157895, |
|
"grad_norm": 0.24551235139369965, |
|
"learning_rate": 3.233236346872227e-05, |
|
"loss": 0.0381, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 6.368421052631579, |
|
"grad_norm": 0.250813364982605, |
|
"learning_rate": 3.192598373221322e-05, |
|
"loss": 0.0333, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 6.394736842105263, |
|
"grad_norm": 0.5114396810531616, |
|
"learning_rate": 3.152097278710933e-05, |
|
"loss": 0.0396, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 6.421052631578947, |
|
"grad_norm": 0.4336398243904114, |
|
"learning_rate": 3.1117361305923684e-05, |
|
"loss": 0.0361, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 6.447368421052632, |
|
"grad_norm": 0.3308594226837158, |
|
"learning_rate": 3.071517985518442e-05, |
|
"loss": 0.0314, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 6.473684210526316, |
|
"grad_norm": 0.4118829667568207, |
|
"learning_rate": 3.0314458893119808e-05, |
|
"loss": 0.0337, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.19590795040130615, |
|
"learning_rate": 2.991522876735154e-05, |
|
"loss": 0.0359, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 6.526315789473684, |
|
"grad_norm": 0.39492878317832947, |
|
"learning_rate": 2.9517519712596498e-05, |
|
"loss": 0.0304, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 6.552631578947368, |
|
"grad_norm": 0.2706056833267212, |
|
"learning_rate": 2.9121361848377014e-05, |
|
"loss": 0.0301, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 6.578947368421053, |
|
"grad_norm": 0.2509630024433136, |
|
"learning_rate": 2.872678517673975e-05, |
|
"loss": 0.0313, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.605263157894737, |
|
"grad_norm": 0.37828516960144043, |
|
"learning_rate": 2.8333819579983623e-05, |
|
"loss": 0.0323, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 6.631578947368421, |
|
"grad_norm": 0.3628530204296112, |
|
"learning_rate": 2.794249481839669e-05, |
|
"loss": 0.0292, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 6.657894736842105, |
|
"grad_norm": 0.35174816846847534, |
|
"learning_rate": 2.7552840528002498e-05, |
|
"loss": 0.0326, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 6.684210526315789, |
|
"grad_norm": 0.3942660391330719, |
|
"learning_rate": 2.7164886218315444e-05, |
|
"loss": 0.0314, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 6.7105263157894735, |
|
"grad_norm": 0.2891586124897003, |
|
"learning_rate": 2.6778661270106025e-05, |
|
"loss": 0.0346, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 6.7368421052631575, |
|
"grad_norm": 0.3602871000766754, |
|
"learning_rate": 2.6394194933175875e-05, |
|
"loss": 0.0338, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 6.7631578947368425, |
|
"grad_norm": 0.3285452127456665, |
|
"learning_rate": 2.601151632414241e-05, |
|
"loss": 0.0301, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 6.7894736842105265, |
|
"grad_norm": 0.3526369035243988, |
|
"learning_rate": 2.5630654424233903e-05, |
|
"loss": 0.0331, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 6.815789473684211, |
|
"grad_norm": 0.2297569215297699, |
|
"learning_rate": 2.5251638077094602e-05, |
|
"loss": 0.0311, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 6.842105263157895, |
|
"grad_norm": 0.24023893475532532, |
|
"learning_rate": 2.4874495986600294e-05, |
|
"loss": 0.0299, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.868421052631579, |
|
"grad_norm": 0.3183501660823822, |
|
"learning_rate": 2.4499256714684565e-05, |
|
"loss": 0.0329, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 6.894736842105263, |
|
"grad_norm": 0.335347056388855, |
|
"learning_rate": 2.4125948679175686e-05, |
|
"loss": 0.0302, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 6.921052631578947, |
|
"grad_norm": 0.31279057264328003, |
|
"learning_rate": 2.3754600151644445e-05, |
|
"loss": 0.0317, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 6.947368421052632, |
|
"grad_norm": 0.25722017884254456, |
|
"learning_rate": 2.3385239255263077e-05, |
|
"loss": 0.034, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 6.973684210526316, |
|
"grad_norm": 0.4903739392757416, |
|
"learning_rate": 2.3017893962675458e-05, |
|
"loss": 0.0272, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.31394830346107483, |
|
"learning_rate": 2.2652592093878666e-05, |
|
"loss": 0.0343, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 7.026315789473684, |
|
"grad_norm": 0.24519610404968262, |
|
"learning_rate": 2.228936131411601e-05, |
|
"loss": 0.0283, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 7.052631578947368, |
|
"grad_norm": 0.3303487002849579, |
|
"learning_rate": 2.1928229131782007e-05, |
|
"loss": 0.0311, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 7.078947368421052, |
|
"grad_norm": 0.18910463154315948, |
|
"learning_rate": 2.1569222896338966e-05, |
|
"loss": 0.0272, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 7.105263157894737, |
|
"grad_norm": 0.2918621003627777, |
|
"learning_rate": 2.1212369796245864e-05, |
|
"loss": 0.0298, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 7.131578947368421, |
|
"grad_norm": 0.42113614082336426, |
|
"learning_rate": 2.0857696856899232e-05, |
|
"loss": 0.0295, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 7.157894736842105, |
|
"grad_norm": 0.3034079670906067, |
|
"learning_rate": 2.0505230938586418e-05, |
|
"loss": 0.0235, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 7.184210526315789, |
|
"grad_norm": 0.4173114001750946, |
|
"learning_rate": 2.0154998734451474e-05, |
|
"loss": 0.0302, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 7.2105263157894735, |
|
"grad_norm": 0.3530351519584656, |
|
"learning_rate": 1.980702676847358e-05, |
|
"loss": 0.0322, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 7.2368421052631575, |
|
"grad_norm": 0.29570046067237854, |
|
"learning_rate": 1.9461341393458254e-05, |
|
"loss": 0.0272, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 7.2631578947368425, |
|
"grad_norm": 0.2424498051404953, |
|
"learning_rate": 1.9117968789041712e-05, |
|
"loss": 0.028, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 7.2894736842105265, |
|
"grad_norm": 0.3934868574142456, |
|
"learning_rate": 1.877693495970809e-05, |
|
"loss": 0.0272, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 7.315789473684211, |
|
"grad_norm": 0.21242979168891907, |
|
"learning_rate": 1.8438265732820126e-05, |
|
"loss": 0.0327, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 7.342105263157895, |
|
"grad_norm": 0.37688693404197693, |
|
"learning_rate": 1.8101986756663197e-05, |
|
"loss": 0.0351, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 7.368421052631579, |
|
"grad_norm": 0.22811609506607056, |
|
"learning_rate": 1.776812349850289e-05, |
|
"loss": 0.0276, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.394736842105263, |
|
"grad_norm": 0.22656874358654022, |
|
"learning_rate": 1.7436701242656272e-05, |
|
"loss": 0.0287, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 7.421052631578947, |
|
"grad_norm": 0.35913366079330444, |
|
"learning_rate": 1.7107745088577087e-05, |
|
"loss": 0.0309, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 7.447368421052632, |
|
"grad_norm": 0.23046566545963287, |
|
"learning_rate": 1.678127994895492e-05, |
|
"loss": 0.0242, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 7.473684210526316, |
|
"grad_norm": 0.5616809725761414, |
|
"learning_rate": 1.6457330547828402e-05, |
|
"loss": 0.0292, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.25659796595573425, |
|
"learning_rate": 1.6135921418712956e-05, |
|
"loss": 0.0265, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 7.526315789473684, |
|
"grad_norm": 0.32121288776397705, |
|
"learning_rate": 1.5817076902742622e-05, |
|
"loss": 0.0283, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 7.552631578947368, |
|
"grad_norm": 0.4093388319015503, |
|
"learning_rate": 1.5500821146826805e-05, |
|
"loss": 0.0338, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 7.578947368421053, |
|
"grad_norm": 0.4066798686981201, |
|
"learning_rate": 1.5187178101821503e-05, |
|
"loss": 0.0354, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 7.605263157894737, |
|
"grad_norm": 0.2548167109489441, |
|
"learning_rate": 1.4876171520715399e-05, |
|
"loss": 0.0289, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 7.631578947368421, |
|
"grad_norm": 0.375201940536499, |
|
"learning_rate": 1.4567824956831043e-05, |
|
"loss": 0.0308, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.657894736842105, |
|
"grad_norm": 0.2906716763973236, |
|
"learning_rate": 1.4262161762041121e-05, |
|
"loss": 0.03, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 7.684210526315789, |
|
"grad_norm": 0.25092822313308716, |
|
"learning_rate": 1.3959205084999911e-05, |
|
"loss": 0.0262, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 7.7105263157894735, |
|
"grad_norm": 0.27973148226737976, |
|
"learning_rate": 1.3658977869390166e-05, |
|
"loss": 0.0259, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 7.7368421052631575, |
|
"grad_norm": 0.24641232192516327, |
|
"learning_rate": 1.336150285218558e-05, |
|
"loss": 0.0288, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 7.7631578947368425, |
|
"grad_norm": 0.17812731862068176, |
|
"learning_rate": 1.3066802561928854e-05, |
|
"loss": 0.0302, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 7.7894736842105265, |
|
"grad_norm": 0.2735963761806488, |
|
"learning_rate": 1.2774899317025468e-05, |
|
"loss": 0.0281, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 7.815789473684211, |
|
"grad_norm": 0.2304324209690094, |
|
"learning_rate": 1.2485815224053582e-05, |
|
"loss": 0.0286, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 7.842105263157895, |
|
"grad_norm": 0.28357961773872375, |
|
"learning_rate": 1.2199572176089741e-05, |
|
"loss": 0.0228, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 7.868421052631579, |
|
"grad_norm": 0.20956674218177795, |
|
"learning_rate": 1.1916191851050873e-05, |
|
"loss": 0.0276, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 7.894736842105263, |
|
"grad_norm": 0.25499334931373596, |
|
"learning_rate": 1.163569571005269e-05, |
|
"loss": 0.0263, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.921052631578947, |
|
"grad_norm": 0.406367689371109, |
|
"learning_rate": 1.1358104995784186e-05, |
|
"loss": 0.03, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 7.947368421052632, |
|
"grad_norm": 0.49673014879226685, |
|
"learning_rate": 1.1083440730898974e-05, |
|
"loss": 0.029, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 7.973684210526316, |
|
"grad_norm": 0.21329355239868164, |
|
"learning_rate": 1.0811723716423233e-05, |
|
"loss": 0.0254, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.38303622603416443, |
|
"learning_rate": 1.0542974530180327e-05, |
|
"loss": 0.031, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 8.026315789473685, |
|
"grad_norm": 0.30326029658317566, |
|
"learning_rate": 1.027721352523237e-05, |
|
"loss": 0.031, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 8.052631578947368, |
|
"grad_norm": 0.18923179805278778, |
|
"learning_rate": 1.0014460828338928e-05, |
|
"loss": 0.0239, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 8.078947368421053, |
|
"grad_norm": 0.2916359603404999, |
|
"learning_rate": 9.75473633843268e-06, |
|
"loss": 0.0288, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 8.105263157894736, |
|
"grad_norm": 0.39194855093955994, |
|
"learning_rate": 9.498059725112467e-06, |
|
"loss": 0.0255, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 8.131578947368421, |
|
"grad_norm": 0.23311467468738556, |
|
"learning_rate": 9.244450427153683e-06, |
|
"loss": 0.0271, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 8.157894736842104, |
|
"grad_norm": 0.25806355476379395, |
|
"learning_rate": 8.99392765103605e-06, |
|
"loss": 0.0253, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.18421052631579, |
|
"grad_norm": 0.19991613924503326, |
|
"learning_rate": 8.746510369489103e-06, |
|
"loss": 0.0306, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 8.210526315789474, |
|
"grad_norm": 0.26584678888320923, |
|
"learning_rate": 8.502217320055427e-06, |
|
"loss": 0.0295, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 8.236842105263158, |
|
"grad_norm": 0.28366735577583313, |
|
"learning_rate": 8.261067003671447e-06, |
|
"loss": 0.0267, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 8.263157894736842, |
|
"grad_norm": 0.2124335616827011, |
|
"learning_rate": 8.0230776832664e-06, |
|
"loss": 0.027, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 8.289473684210526, |
|
"grad_norm": 0.6940687894821167, |
|
"learning_rate": 7.78826738237926e-06, |
|
"loss": 0.0239, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 8.31578947368421, |
|
"grad_norm": 0.3029802739620209, |
|
"learning_rate": 7.556653883793724e-06, |
|
"loss": 0.0293, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 8.342105263157896, |
|
"grad_norm": 0.33494675159454346, |
|
"learning_rate": 7.328254728191464e-06, |
|
"loss": 0.0322, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 8.368421052631579, |
|
"grad_norm": 0.5959463715553284, |
|
"learning_rate": 7.103087212823778e-06, |
|
"loss": 0.0282, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 8.394736842105264, |
|
"grad_norm": 0.3299950957298279, |
|
"learning_rate": 6.881168390201581e-06, |
|
"loss": 0.0289, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 8.421052631578947, |
|
"grad_norm": 0.28407934308052063, |
|
"learning_rate": 6.66251506680397e-06, |
|
"loss": 0.0277, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.447368421052632, |
|
"grad_norm": 0.24567903578281403, |
|
"learning_rate": 6.447143801805516e-06, |
|
"loss": 0.0293, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 8.473684210526315, |
|
"grad_norm": 0.20823001861572266, |
|
"learning_rate": 6.23507090582206e-06, |
|
"loss": 0.0257, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.29089733958244324, |
|
"learning_rate": 6.026312439675552e-06, |
|
"loss": 0.0268, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 8.526315789473685, |
|
"grad_norm": 0.3374447524547577, |
|
"learning_rate": 5.820884213177713e-06, |
|
"loss": 0.0276, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 8.552631578947368, |
|
"grad_norm": 0.32109740376472473, |
|
"learning_rate": 5.618801783932725e-06, |
|
"loss": 0.0302, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 8.578947368421053, |
|
"grad_norm": 0.25355905294418335, |
|
"learning_rate": 5.420080456158971e-06, |
|
"loss": 0.0263, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 8.605263157894736, |
|
"grad_norm": 0.6211085319519043, |
|
"learning_rate": 5.224735279530063e-06, |
|
"loss": 0.0284, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 8.631578947368421, |
|
"grad_norm": 0.23430858552455902, |
|
"learning_rate": 5.032781048035034e-06, |
|
"loss": 0.0264, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 8.657894736842106, |
|
"grad_norm": 0.24149766564369202, |
|
"learning_rate": 4.84423229885802e-06, |
|
"loss": 0.027, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 8.68421052631579, |
|
"grad_norm": 0.3725610077381134, |
|
"learning_rate": 4.659103311277274e-06, |
|
"loss": 0.0253, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.710526315789474, |
|
"grad_norm": 0.2504403293132782, |
|
"learning_rate": 4.477408105583741e-06, |
|
"loss": 0.0276, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 8.736842105263158, |
|
"grad_norm": 0.21490493416786194, |
|
"learning_rate": 4.29916044201934e-06, |
|
"loss": 0.0272, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 8.763157894736842, |
|
"grad_norm": 0.30955877900123596, |
|
"learning_rate": 4.124373819734795e-06, |
|
"loss": 0.0308, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 8.789473684210526, |
|
"grad_norm": 0.210093155503273, |
|
"learning_rate": 3.953061475767339e-06, |
|
"loss": 0.0264, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 8.81578947368421, |
|
"grad_norm": 0.29672837257385254, |
|
"learning_rate": 3.785236384038232e-06, |
|
"loss": 0.0295, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 8.842105263157894, |
|
"grad_norm": 0.5261228680610657, |
|
"learning_rate": 3.620911254370224e-06, |
|
"loss": 0.0249, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 8.868421052631579, |
|
"grad_norm": 0.22544153034687042, |
|
"learning_rate": 3.460098531525019e-06, |
|
"loss": 0.028, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 8.894736842105264, |
|
"grad_norm": 0.24784667789936066, |
|
"learning_rate": 3.302810394260736e-06, |
|
"loss": 0.0252, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 8.921052631578947, |
|
"grad_norm": 0.21960538625717163, |
|
"learning_rate": 3.1490587544096782e-06, |
|
"loss": 0.0255, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 8.947368421052632, |
|
"grad_norm": 0.4938012361526489, |
|
"learning_rate": 2.9988552559761294e-06, |
|
"loss": 0.0261, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.973684210526315, |
|
"grad_norm": 0.2430790662765503, |
|
"learning_rate": 2.85221127425459e-06, |
|
"loss": 0.0283, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.3423117995262146, |
|
"learning_rate": 2.7091379149682685e-06, |
|
"loss": 0.0262, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 9.026315789473685, |
|
"grad_norm": 0.1678830087184906, |
|
"learning_rate": 2.5696460134279955e-06, |
|
"loss": 0.0226, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 9.052631578947368, |
|
"grad_norm": 0.1864800751209259, |
|
"learning_rate": 2.4337461337116894e-06, |
|
"loss": 0.028, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 9.078947368421053, |
|
"grad_norm": 0.3498280942440033, |
|
"learning_rate": 2.3014485678642563e-06, |
|
"loss": 0.0251, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 9.105263157894736, |
|
"grad_norm": 0.2959420680999756, |
|
"learning_rate": 2.1727633351182e-06, |
|
"loss": 0.0267, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 9.131578947368421, |
|
"grad_norm": 0.46708524227142334, |
|
"learning_rate": 2.0477001811347985e-06, |
|
"loss": 0.0271, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 9.157894736842104, |
|
"grad_norm": 0.21950027346611023, |
|
"learning_rate": 1.9262685772660606e-06, |
|
"loss": 0.0241, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 9.18421052631579, |
|
"grad_norm": 0.19924266636371613, |
|
"learning_rate": 1.8084777198374315e-06, |
|
"loss": 0.0276, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 9.210526315789474, |
|
"grad_norm": 0.29325613379478455, |
|
"learning_rate": 1.6943365294513236e-06, |
|
"loss": 0.0278, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.236842105263158, |
|
"grad_norm": 0.19263319671154022, |
|
"learning_rate": 1.5838536503115675e-06, |
|
"loss": 0.0275, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 9.263157894736842, |
|
"grad_norm": 0.3293076753616333, |
|
"learning_rate": 1.4770374495687134e-06, |
|
"loss": 0.0232, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 9.289473684210526, |
|
"grad_norm": 0.23276633024215698, |
|
"learning_rate": 1.3738960166864101e-06, |
|
"loss": 0.0245, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 9.31578947368421, |
|
"grad_norm": 0.3246747553348541, |
|
"learning_rate": 1.274437162828751e-06, |
|
"loss": 0.0258, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 9.342105263157896, |
|
"grad_norm": 0.3524140417575836, |
|
"learning_rate": 1.1786684202687026e-06, |
|
"loss": 0.023, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 9.368421052631579, |
|
"grad_norm": 0.27717113494873047, |
|
"learning_rate": 1.0865970418177051e-06, |
|
"loss": 0.0292, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 9.394736842105264, |
|
"grad_norm": 0.29303857684135437, |
|
"learning_rate": 9.98230000276351e-07, |
|
"loss": 0.0239, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 9.421052631578947, |
|
"grad_norm": 0.5621191263198853, |
|
"learning_rate": 9.135739879063465e-07, |
|
"loss": 0.0242, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 9.447368421052632, |
|
"grad_norm": 0.41081079840660095, |
|
"learning_rate": 8.326354159236882e-07, |
|
"loss": 0.0232, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 9.473684210526315, |
|
"grad_norm": 0.48332345485687256, |
|
"learning_rate": 7.554204140131138e-07, |
|
"loss": 0.026, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.36336687207221985, |
|
"learning_rate": 6.819348298638839e-07, |
|
"loss": 0.0226, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 9.526315789473685, |
|
"grad_norm": 0.4939121901988983, |
|
"learning_rate": 6.121842287269419e-07, |
|
"loss": 0.0274, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 9.552631578947368, |
|
"grad_norm": 0.60195392370224, |
|
"learning_rate": 5.46173892993429e-07, |
|
"loss": 0.0229, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 9.578947368421053, |
|
"grad_norm": 0.3372774124145508, |
|
"learning_rate": 4.839088217946208e-07, |
|
"loss": 0.0236, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 9.605263157894736, |
|
"grad_norm": 0.2751666307449341, |
|
"learning_rate": 4.253937306233691e-07, |
|
"loss": 0.0267, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 9.631578947368421, |
|
"grad_norm": 0.2811025381088257, |
|
"learning_rate": 3.706330509769429e-07, |
|
"loss": 0.0269, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 9.657894736842106, |
|
"grad_norm": 0.27488309144973755, |
|
"learning_rate": 3.1963093002145285e-07, |
|
"loss": 0.0294, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 9.68421052631579, |
|
"grad_norm": 0.29794642329216003, |
|
"learning_rate": 2.7239123027775204e-07, |
|
"loss": 0.0226, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 9.710526315789474, |
|
"grad_norm": 0.49477118253707886, |
|
"learning_rate": 2.289175293289314e-07, |
|
"loss": 0.0263, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 9.736842105263158, |
|
"grad_norm": 0.44544729590415955, |
|
"learning_rate": 1.8921311954937516e-07, |
|
"loss": 0.024, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.763157894736842, |
|
"grad_norm": 0.380344033241272, |
|
"learning_rate": 1.5328100785542697e-07, |
|
"loss": 0.0243, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 9.789473684210526, |
|
"grad_norm": 0.2532555162906647, |
|
"learning_rate": 1.211239154776611e-07, |
|
"loss": 0.0256, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 9.81578947368421, |
|
"grad_norm": 0.3260039985179901, |
|
"learning_rate": 9.27442777547971e-08, |
|
"loss": 0.0253, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 9.842105263157894, |
|
"grad_norm": 0.2649020254611969, |
|
"learning_rate": 6.814424394926966e-08, |
|
"loss": 0.0298, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 9.868421052631579, |
|
"grad_norm": 0.257893443107605, |
|
"learning_rate": 4.732567708445878e-08, |
|
"loss": 0.0223, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 9.894736842105264, |
|
"grad_norm": 0.20920208096504211, |
|
"learning_rate": 3.029015380359157e-08, |
|
"loss": 0.0249, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 9.921052631578947, |
|
"grad_norm": 0.19053097069263458, |
|
"learning_rate": 1.7038964250343238e-08, |
|
"loss": 0.029, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 9.947368421052632, |
|
"grad_norm": 0.3256742060184479, |
|
"learning_rate": 7.573111971148627e-09, |
|
"loss": 0.0254, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 9.973684210526315, |
|
"grad_norm": 0.3680499792098999, |
|
"learning_rate": 1.8933138391574732e-09, |
|
"loss": 0.025, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.5628379583358765, |
|
"learning_rate": 0.0, |
|
"loss": 0.0263, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 3800, |
|
"total_flos": 4.0064924863782144e+17, |
|
"train_loss": 0.05822706136264299, |
|
"train_runtime": 3981.0494, |
|
"train_samples_per_second": 46.716, |
|
"train_steps_per_second": 0.955 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3800, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.0064924863782144e+17, |
|
"train_batch_size": 49, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|