{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 191805,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{"epoch": 0.013034071061755428, "grad_norm": 2.9903717041015625, "learning_rate": 4.9869659289382446e-05, "loss": 6.0967, "step": 500},
{"epoch": 0.026068142123510857, "grad_norm": 4.055346965789795, "learning_rate": 4.973931857876489e-05, "loss": 5.5622, "step": 1000},
{"epoch": 0.039102213185266285, "grad_norm": 5.480607032775879, "learning_rate": 4.960897786814734e-05, "loss": 5.4371, "step": 1500},
{"epoch": 0.052136284247021714, "grad_norm": 4.727134704589844, "learning_rate": 4.9478637157529784e-05, "loss": 5.3535, "step": 2000},
{"epoch": 0.06517035530877714, "grad_norm": 4.737260341644287, "learning_rate": 4.934829644691223e-05, "loss": 5.2508, "step": 2500},
{"epoch": 0.07820442637053257, "grad_norm": 5.066771984100342, "learning_rate": 4.921795573629467e-05, "loss": 5.199, "step": 3000},
{"epoch": 0.091238497432288, "grad_norm": 3.627026319503784, "learning_rate": 4.908761502567713e-05, "loss": 5.084, "step": 3500},
{"epoch": 0.10427256849404343, "grad_norm": 4.254016876220703, "learning_rate": 4.895727431505957e-05, "loss": 4.9441, "step": 4000},
{"epoch": 0.11730663955579886, "grad_norm": 6.351306438446045, "learning_rate": 4.8826933604442015e-05, "loss": 4.8021, "step": 4500},
{"epoch": 0.13034071061755428, "grad_norm": 7.492619037628174, "learning_rate": 4.869659289382446e-05, "loss": 4.6446, "step": 5000},
{"epoch": 0.14337478167930973, "grad_norm": 6.017455577850342, "learning_rate": 4.856625218320691e-05, "loss": 4.4574, "step": 5500},
{"epoch": 0.15640885274106514, "grad_norm": 5.2971343994140625, "learning_rate": 4.843591147258935e-05, "loss": 4.2184, "step": 6000},
{"epoch": 0.16944292380282058, "grad_norm": 9.367820739746094, "learning_rate": 4.8305570761971796e-05, "loss": 4.101, "step": 6500},
{"epoch": 0.182476994864576, "grad_norm": 7.676972389221191, "learning_rate": 4.817523005135424e-05, "loss": 3.9548, "step": 7000},
{"epoch": 0.19551106592633144, "grad_norm": 6.3607563972473145, "learning_rate": 4.804488934073669e-05, "loss": 3.8584, "step": 7500},
{"epoch": 0.20854513698808685, "grad_norm": 5.45451021194458, "learning_rate": 4.7914548630119134e-05, "loss": 3.7841, "step": 8000},
{"epoch": 0.2215792080498423, "grad_norm": 16.199485778808594, "learning_rate": 4.778420791950158e-05, "loss": 3.6685, "step": 8500},
{"epoch": 0.2346132791115977, "grad_norm": 6.077032089233398, "learning_rate": 4.765386720888402e-05, "loss": 3.6017, "step": 9000},
{"epoch": 0.24764735017335315, "grad_norm": 11.489569664001465, "learning_rate": 4.752352649826647e-05, "loss": 3.5553, "step": 9500},
{"epoch": 0.26068142123510857, "grad_norm": 4.917782783508301, "learning_rate": 4.7393185787648915e-05, "loss": 3.4537, "step": 10000},
{"epoch": 0.273715492296864, "grad_norm": 5.945028781890869, "learning_rate": 4.7262845077031366e-05, "loss": 3.4442, "step": 10500},
{"epoch": 0.28674956335861945, "grad_norm": 7.648957252502441, "learning_rate": 4.713250436641381e-05, "loss": 3.3772, "step": 11000},
{"epoch": 0.29978363442037487, "grad_norm": 7.488467216491699, "learning_rate": 4.700216365579625e-05, "loss": 3.3026, "step": 11500},
{"epoch": 0.3128177054821303, "grad_norm": 5.8792619705200195, "learning_rate": 4.68718229451787e-05, "loss": 3.2446, "step": 12000},
{"epoch": 0.3258517765438857, "grad_norm": 10.038032531738281, "learning_rate": 4.674148223456115e-05, "loss": 3.216, "step": 12500},
{"epoch": 0.33888584760564117, "grad_norm": 7.69769811630249, "learning_rate": 4.661114152394359e-05, "loss": 3.1869, "step": 13000},
{"epoch": 0.3519199186673966, "grad_norm": 6.179595470428467, "learning_rate": 4.6480800813326034e-05, "loss": 3.1464, "step": 13500},
{"epoch": 0.364953989729152, "grad_norm": 5.665715217590332, "learning_rate": 4.6350460102708484e-05, "loss": 3.079, "step": 14000},
{"epoch": 0.3779880607909074, "grad_norm": 4.681985855102539, "learning_rate": 4.622011939209093e-05, "loss": 3.0724, "step": 14500},
{"epoch": 0.3910221318526629, "grad_norm": 11.111820220947266, "learning_rate": 4.608977868147337e-05, "loss": 3.0356, "step": 15000},
{"epoch": 0.4040562029144183, "grad_norm": 5.951188564300537, "learning_rate": 4.5959437970855815e-05, "loss": 3.01, "step": 15500},
{"epoch": 0.4170902739761737, "grad_norm": 5.438151836395264, "learning_rate": 4.5829097260238266e-05, "loss": 2.9605, "step": 16000},
{"epoch": 0.4301243450379291, "grad_norm": 10.49527645111084, "learning_rate": 4.569875654962071e-05, "loss": 2.9453, "step": 16500},
{"epoch": 0.4431584160996846, "grad_norm": 6.611765384674072, "learning_rate": 4.556841583900316e-05, "loss": 2.9529, "step": 17000},
{"epoch": 0.45619248716144, "grad_norm": 5.289289474487305, "learning_rate": 4.54380751283856e-05, "loss": 2.9081, "step": 17500},
{"epoch": 0.4692265582231954, "grad_norm": 5.65715217590332, "learning_rate": 4.530773441776805e-05, "loss": 2.8152, "step": 18000},
{"epoch": 0.48226062928495084, "grad_norm": 5.513209819793701, "learning_rate": 4.51773937071505e-05, "loss": 2.8664, "step": 18500},
{"epoch": 0.4952947003467063, "grad_norm": 4.413240909576416, "learning_rate": 4.504705299653294e-05, "loss": 2.8854, "step": 19000},
{"epoch": 0.5083287714084617, "grad_norm": 5.602241039276123, "learning_rate": 4.4916712285915384e-05, "loss": 2.8295, "step": 19500},
{"epoch": 0.5213628424702171, "grad_norm": 8.221460342407227, "learning_rate": 4.478637157529783e-05, "loss": 2.7826, "step": 20000},
{"epoch": 0.5343969135319726, "grad_norm": 5.350883483886719, "learning_rate": 4.465603086468028e-05, "loss": 2.7846, "step": 20500},
{"epoch": 0.547430984593728, "grad_norm": 6.6059393882751465, "learning_rate": 4.452569015406272e-05, "loss": 2.7562, "step": 21000},
{"epoch": 0.5604650556554834, "grad_norm": 7.050083637237549, "learning_rate": 4.4395349443445166e-05, "loss": 2.7102, "step": 21500},
{"epoch": 0.5734991267172389, "grad_norm": 6.74811315536499, "learning_rate": 4.426500873282761e-05, "loss": 2.7215, "step": 22000},
{"epoch": 0.5865331977789943, "grad_norm": 7.959073543548584, "learning_rate": 4.413466802221006e-05, "loss": 2.7185, "step": 22500},
{"epoch": 0.5995672688407497, "grad_norm": 7.594911098480225, "learning_rate": 4.40043273115925e-05, "loss": 2.6624, "step": 23000},
{"epoch": 0.6126013399025051, "grad_norm": 5.935075283050537, "learning_rate": 4.3873986600974954e-05, "loss": 2.6398, "step": 23500},
{"epoch": 0.6256354109642606, "grad_norm": 7.0315961837768555, "learning_rate": 4.37436458903574e-05, "loss": 2.6571, "step": 24000},
{"epoch": 0.638669482026016, "grad_norm": 6.930845260620117, "learning_rate": 4.361330517973984e-05, "loss": 2.6009, "step": 24500},
{"epoch": 0.6517035530877714, "grad_norm": 14.607309341430664, "learning_rate": 4.348296446912229e-05, "loss": 2.6493, "step": 25000},
{"epoch": 0.6647376241495269, "grad_norm": 5.613809108734131, "learning_rate": 4.3352623758504735e-05, "loss": 2.6042, "step": 25500},
{"epoch": 0.6777716952112823, "grad_norm": 6.0553693771362305, "learning_rate": 4.322228304788718e-05, "loss": 2.6153, "step": 26000},
{"epoch": 0.6908057662730377, "grad_norm": 8.716107368469238, "learning_rate": 4.309194233726962e-05, "loss": 2.5757, "step": 26500},
{"epoch": 0.7038398373347932, "grad_norm": 7.430722713470459, "learning_rate": 4.296160162665207e-05, "loss": 2.5682, "step": 27000},
{"epoch": 0.7168739083965486, "grad_norm": 9.687034606933594, "learning_rate": 4.2831260916034516e-05, "loss": 2.5377, "step": 27500},
{"epoch": 0.729907979458304, "grad_norm": 3.729767084121704, "learning_rate": 4.270092020541696e-05, "loss": 2.5217, "step": 28000},
{"epoch": 0.7429420505200595, "grad_norm": 9.692636489868164, "learning_rate": 4.25705794947994e-05, "loss": 2.4829, "step": 28500},
{"epoch": 0.7559761215818148, "grad_norm": 8.260266304016113, "learning_rate": 4.2440238784181854e-05, "loss": 2.4971, "step": 29000},
{"epoch": 0.7690101926435703, "grad_norm": 5.885035037994385, "learning_rate": 4.23098980735643e-05, "loss": 2.4823, "step": 29500},
{"epoch": 0.7820442637053258, "grad_norm": 11.001029968261719, "learning_rate": 4.217955736294674e-05, "loss": 2.4583, "step": 30000},
{"epoch": 0.7950783347670811, "grad_norm": 9.69256591796875, "learning_rate": 4.204921665232919e-05, "loss": 2.447, "step": 30500},
{"epoch": 0.8081124058288366, "grad_norm": 15.954379081726074, "learning_rate": 4.191887594171164e-05, "loss": 2.4427, "step": 31000},
{"epoch": 0.8211464768905921, "grad_norm": 5.421440124511719, "learning_rate": 4.1788535231094085e-05, "loss": 2.4181, "step": 31500},
{"epoch": 0.8341805479523474, "grad_norm": 9.169551849365234, "learning_rate": 4.165819452047653e-05, "loss": 2.4105, "step": 32000},
{"epoch": 0.8472146190141029, "grad_norm": 5.778009414672852, "learning_rate": 4.152785380985897e-05, "loss": 2.4145, "step": 32500},
{"epoch": 0.8602486900758582, "grad_norm": 6.441959857940674, "learning_rate": 4.139751309924142e-05, "loss": 2.4334, "step": 33000},
{"epoch": 0.8732827611376137, "grad_norm": 7.385718822479248, "learning_rate": 4.1267172388623866e-05, "loss": 2.392, "step": 33500},
{"epoch": 0.8863168321993692, "grad_norm": 15.347734451293945, "learning_rate": 4.113683167800631e-05, "loss": 2.3981, "step": 34000},
{"epoch": 0.8993509032611245, "grad_norm": 10.47854232788086, "learning_rate": 4.1006490967388754e-05, "loss": 2.3511, "step": 34500},
{"epoch": 0.91238497432288, "grad_norm": 11.82073974609375, "learning_rate": 4.0876150256771204e-05, "loss": 2.3632, "step": 35000},
{"epoch": 0.9254190453846355, "grad_norm": 8.932971954345703, "learning_rate": 4.074580954615365e-05, "loss": 2.3272, "step": 35500},
{"epoch": 0.9384531164463908, "grad_norm": 11.068861961364746, "learning_rate": 4.061546883553609e-05, "loss": 2.3321, "step": 36000},
{"epoch": 0.9514871875081463, "grad_norm": 5.649448871612549, "learning_rate": 4.0485128124918535e-05, "loss": 2.3498, "step": 36500},
{"epoch": 0.9645212585699017, "grad_norm": 9.020928382873535, "learning_rate": 4.0354787414300985e-05, "loss": 2.3331, "step": 37000},
{"epoch": 0.9775553296316571, "grad_norm": 12.966954231262207, "learning_rate": 4.0224446703683436e-05, "loss": 2.3095, "step": 37500},
{"epoch": 0.9905894006934126, "grad_norm": 5.641653060913086, "learning_rate": 4.009410599306588e-05, "loss": 2.3127, "step": 38000},
{"epoch": 1.003623471755168, "grad_norm": 8.139008522033691, "learning_rate": 3.996376528244832e-05, "loss": 2.2846, "step": 38500},
{"epoch": 1.0166575428169233, "grad_norm": 7.005831241607666, "learning_rate": 3.9833424571830766e-05, "loss": 2.2518, "step": 39000},
{"epoch": 1.029691613878679, "grad_norm": 3.906301975250244, "learning_rate": 3.970308386121322e-05, "loss": 2.2632, "step": 39500},
{"epoch": 1.0427256849404343, "grad_norm": 4.201974391937256, "learning_rate": 3.957274315059566e-05, "loss": 2.2299, "step": 40000},
{"epoch": 1.0557597560021896, "grad_norm": 6.107882022857666, "learning_rate": 3.9442402439978104e-05, "loss": 2.2016, "step": 40500},
{"epoch": 1.0687938270639452, "grad_norm": 8.289084434509277, "learning_rate": 3.931206172936055e-05, "loss": 2.2227, "step": 41000},
{"epoch": 1.0818278981257006, "grad_norm": 5.386382102966309, "learning_rate": 3.9181721018743e-05, "loss": 2.1849, "step": 41500},
{"epoch": 1.094861969187456, "grad_norm": 5.536214828491211, "learning_rate": 3.905138030812544e-05, "loss": 2.2085, "step": 42000},
{"epoch": 1.1078960402492115, "grad_norm": 67.06414031982422, "learning_rate": 3.8921039597507885e-05, "loss": 2.2039, "step": 42500},
{"epoch": 1.1209301113109669, "grad_norm": 8.36019229888916, "learning_rate": 3.879069888689033e-05, "loss": 2.1925, "step": 43000},
{"epoch": 1.1339641823727222, "grad_norm": 14.266386985778809, "learning_rate": 3.866035817627278e-05, "loss": 2.2101, "step": 43500},
{"epoch": 1.1469982534344778, "grad_norm": 11.47070598602295, "learning_rate": 3.853001746565523e-05, "loss": 2.1402, "step": 44000},
{"epoch": 1.1600323244962332, "grad_norm": 5.293683052062988, "learning_rate": 3.839967675503767e-05, "loss": 2.1872, "step": 44500},
{"epoch": 1.1730663955579885, "grad_norm": 32.234737396240234, "learning_rate": 3.826933604442012e-05, "loss": 2.1357, "step": 45000},
{"epoch": 1.1861004666197439, "grad_norm": 3.9005160331726074, "learning_rate": 3.813899533380256e-05, "loss": 2.1263, "step": 45500},
{"epoch": 1.1991345376814995, "grad_norm": 9.012932777404785, "learning_rate": 3.800865462318501e-05, "loss": 2.1718, "step": 46000},
{"epoch": 1.2121686087432548, "grad_norm": 8.86204719543457, "learning_rate": 3.7878313912567454e-05, "loss": 2.1718, "step": 46500},
{"epoch": 1.2252026798050104, "grad_norm": 29.908674240112305, "learning_rate": 3.77479732019499e-05, "loss": 2.1227, "step": 47000},
{"epoch": 1.2382367508667658, "grad_norm": 3.599839687347412, "learning_rate": 3.761763249133234e-05, "loss": 2.1301, "step": 47500},
{"epoch": 1.2512708219285211, "grad_norm": 12.039328575134277, "learning_rate": 3.748729178071479e-05, "loss": 2.1226, "step": 48000},
{"epoch": 1.2643048929902765, "grad_norm": 3.92248797416687, "learning_rate": 3.7356951070097236e-05, "loss": 2.156, "step": 48500},
{"epoch": 1.277338964052032, "grad_norm": 22.514301300048828, "learning_rate": 3.722661035947968e-05, "loss": 2.1001, "step": 49000},
{"epoch": 1.2903730351137874, "grad_norm": 4.8082990646362305, "learning_rate": 3.709626964886212e-05, "loss": 2.1167, "step": 49500},
{"epoch": 1.303407106175543, "grad_norm": 7.884994983673096, "learning_rate": 3.696592893824457e-05, "loss": 2.1118, "step": 50000},
{"epoch": 1.3164411772372984, "grad_norm": 4.282125949859619, "learning_rate": 3.6835588227627024e-05, "loss": 2.0749, "step": 50500},
{"epoch": 1.3294752482990537, "grad_norm": 19.30133819580078, "learning_rate": 3.670524751700947e-05, "loss": 2.1081, "step": 51000},
{"epoch": 1.342509319360809, "grad_norm": 3.800236463546753, "learning_rate": 3.657490680639191e-05, "loss": 2.0964, "step": 51500},
{"epoch": 1.3555433904225647, "grad_norm": 5.734689235687256, "learning_rate": 3.6444566095774355e-05, "loss": 2.0736, "step": 52000},
{"epoch": 1.36857746148432, "grad_norm": 7.496071815490723, "learning_rate": 3.6314225385156805e-05, "loss": 2.0545, "step": 52500},
{"epoch": 1.3816115325460754, "grad_norm": 7.645195007324219, "learning_rate": 3.618388467453925e-05, "loss": 2.0407, "step": 53000},
{"epoch": 1.394645603607831, "grad_norm": 22.738969802856445, "learning_rate": 3.605354396392169e-05, "loss": 2.0554, "step": 53500},
{"epoch": 1.4076796746695863, "grad_norm": 9.185379028320312, "learning_rate": 3.5923203253304136e-05, "loss": 2.0364, "step": 54000},
{"epoch": 1.4207137457313417, "grad_norm": 9.092364311218262, "learning_rate": 3.5792862542686586e-05, "loss": 2.023, "step": 54500},
{"epoch": 1.433747816793097, "grad_norm": 3.8213064670562744, "learning_rate": 3.566252183206903e-05, "loss": 2.0429, "step": 55000},
{"epoch": 1.4467818878548526, "grad_norm": 15.87769603729248, "learning_rate": 3.553218112145147e-05, "loss": 1.9853, "step": 55500},
{"epoch": 1.459815958916608, "grad_norm": 8.585647583007812, "learning_rate": 3.540184041083392e-05, "loss": 2.0239, "step": 56000},
{"epoch": 1.4728500299783636, "grad_norm": 4.249543190002441, "learning_rate": 3.527149970021637e-05, "loss": 2.0305, "step": 56500},
{"epoch": 1.485884101040119, "grad_norm": 6.320367336273193, "learning_rate": 3.514115898959881e-05, "loss": 2.0173, "step": 57000},
{"epoch": 1.4989181721018743, "grad_norm": 5.058931350708008, "learning_rate": 3.501081827898126e-05, "loss": 1.9641, "step": 57500},
{"epoch": 1.5119522431636296, "grad_norm": 10.568583488464355, "learning_rate": 3.4880477568363705e-05, "loss": 2.035, "step": 58000},
{"epoch": 1.524986314225385, "grad_norm": 6.535768985748291, "learning_rate": 3.475013685774615e-05, "loss": 1.9971, "step": 58500},
{"epoch": 1.5380203852871406, "grad_norm": 11.262877464294434, "learning_rate": 3.46197961471286e-05, "loss": 2.0076, "step": 59000},
{"epoch": 1.5510544563488962, "grad_norm": 8.998533248901367, "learning_rate": 3.448945543651104e-05, "loss": 1.986, "step": 59500},
{"epoch": 1.5640885274106515, "grad_norm": 5.243868827819824, "learning_rate": 3.4359114725893486e-05, "loss": 2.0148, "step": 60000},
{"epoch": 1.5771225984724069, "grad_norm": 6.43707275390625, "learning_rate": 3.422877401527593e-05, "loss": 1.9952, "step": 60500},
{"epoch": 1.5901566695341622, "grad_norm": 10.8756742477417, "learning_rate": 3.409843330465838e-05, "loss": 1.9688, "step": 61000},
{"epoch": 1.6031907405959176, "grad_norm": 3.6488418579101562, "learning_rate": 3.3968092594040824e-05, "loss": 1.9545, "step": 61500},
{"epoch": 1.6162248116576732, "grad_norm": 3.8945696353912354, "learning_rate": 3.383775188342327e-05, "loss": 1.9692, "step": 62000},
{"epoch": 1.6292588827194285, "grad_norm": 4.477757453918457, "learning_rate": 3.370741117280571e-05, "loss": 1.9559, "step": 62500},
{"epoch": 1.6422929537811841, "grad_norm": 5.086141586303711, "learning_rate": 3.357707046218816e-05, "loss": 1.929, "step": 63000},
{"epoch": 1.6553270248429395, "grad_norm": 5.249891757965088, "learning_rate": 3.3446729751570605e-05, "loss": 1.9686, "step": 63500},
{"epoch": 1.6683610959046948, "grad_norm": 9.6456880569458, "learning_rate": 3.3316389040953055e-05, "loss": 1.952, "step": 64000},
{"epoch": 1.6813951669664502, "grad_norm": 5.007114410400391, "learning_rate": 3.31860483303355e-05, "loss": 1.9229, "step": 64500},
{"epoch": 1.6944292380282058, "grad_norm": 4.589148044586182, "learning_rate": 3.305570761971795e-05, "loss": 1.9296, "step": 65000},
{"epoch": 1.7074633090899611, "grad_norm": 10.281172752380371, "learning_rate": 3.292536690910039e-05, "loss": 1.9153, "step": 65500},
{"epoch": 1.7204973801517167, "grad_norm": 7.041563034057617, "learning_rate": 3.2795026198482837e-05, "loss": 1.9276, "step": 66000},
{"epoch": 1.733531451213472, "grad_norm": 8.523409843444824, "learning_rate": 3.266468548786528e-05, "loss": 1.8871, "step": 66500},
{"epoch": 1.7465655222752274, "grad_norm": 18.92120361328125, "learning_rate": 3.253434477724773e-05, "loss": 1.8963, "step": 67000},
{"epoch": 1.7595995933369828, "grad_norm": 17.547399520874023, "learning_rate": 3.2404004066630174e-05, "loss": 1.9069, "step": 67500},
{"epoch": 1.7726336643987382, "grad_norm": 9.223323822021484, "learning_rate": 3.227366335601262e-05, "loss": 1.9232, "step": 68000},
{"epoch": 1.7856677354604937, "grad_norm": 17.263656616210938, "learning_rate": 3.214332264539506e-05, "loss": 1.89, "step": 68500},
{"epoch": 1.7987018065222493, "grad_norm": 19.6173152923584, "learning_rate": 3.201298193477751e-05, "loss": 1.8764, "step": 69000},
{"epoch": 1.8117358775840047, "grad_norm": 10.714072227478027, "learning_rate": 3.1882641224159955e-05, "loss": 1.9165, "step": 69500},
{"epoch": 1.82476994864576, "grad_norm": 5.039360523223877, "learning_rate": 3.17523005135424e-05, "loss": 1.8422, "step": 70000},
{"epoch": 1.8378040197075154, "grad_norm": 28.72756576538086, "learning_rate": 3.162195980292485e-05, "loss": 1.8819, "step": 70500},
{"epoch": 1.8508380907692707, "grad_norm": 4.069336414337158, "learning_rate": 3.149161909230729e-05, "loss": 1.8769, "step": 71000},
{"epoch": 1.8638721618310263, "grad_norm": 4.223635196685791, "learning_rate": 3.136127838168974e-05, "loss": 1.8799, "step": 71500},
{"epoch": 1.8769062328927817, "grad_norm": 10.401415824890137, "learning_rate": 3.123093767107219e-05, "loss": 1.905, "step": 72000},
{"epoch": 1.8899403039545373, "grad_norm": 5.064211368560791, "learning_rate": 3.110059696045463e-05, "loss": 1.827, "step": 72500},
{"epoch": 1.9029743750162926, "grad_norm": 4.138282299041748, "learning_rate": 3.0970256249837074e-05, "loss": 1.8237, "step": 73000},
{"epoch": 1.916008446078048, "grad_norm": 3.365440845489502, "learning_rate": 3.0839915539219525e-05, "loss": 1.8421, "step": 73500},
{"epoch": 1.9290425171398033, "grad_norm": 7.819665431976318, "learning_rate": 3.070957482860197e-05, "loss": 1.8413, "step": 74000},
{"epoch": 1.942076588201559, "grad_norm": 8.81440544128418, "learning_rate": 3.057923411798441e-05, "loss": 1.8633, "step": 74500},
{"epoch": 1.9551106592633143, "grad_norm": 12.814815521240234, "learning_rate": 3.044889340736686e-05, "loss": 1.8255, "step": 75000},
{"epoch": 1.9681447303250699, "grad_norm": 7.332582950592041, "learning_rate": 3.0318552696749302e-05, "loss": 1.8228, "step": 75500},
{"epoch": 1.9811788013868252, "grad_norm": 6.4567694664001465, "learning_rate": 3.018821198613175e-05, "loss": 1.8514, "step": 76000},
{"epoch": 1.9942128724485806, "grad_norm": 33.37932205200195, "learning_rate": 3.0057871275514193e-05, "loss": 1.8347, "step": 76500},
{"epoch": 2.007246943510336, "grad_norm": 3.908621072769165, "learning_rate": 2.992753056489664e-05, "loss": 1.8015, "step": 77000},
{"epoch": 2.0202810145720913, "grad_norm": 3.9100475311279297, "learning_rate": 2.979718985427909e-05, "loss": 1.8148, "step": 77500},
{"epoch": 2.0333150856338467, "grad_norm": 4.988982200622559, "learning_rate": 2.9666849143661534e-05, "loss": 1.7508, "step": 78000},
{"epoch": 2.0463491566956025, "grad_norm": 5.134647846221924, "learning_rate": 2.953650843304398e-05, "loss": 1.7613, "step": 78500},
{"epoch": 2.059383227757358, "grad_norm": 6.9095845222473145, "learning_rate": 2.9406167722426425e-05, "loss": 1.8106, "step": 79000},
{"epoch": 2.072417298819113, "grad_norm": 14.57297420501709, "learning_rate": 2.927582701180887e-05, "loss": 1.7387, "step": 79500},
{"epoch": 2.0854513698808685, "grad_norm": 46.801937103271484, "learning_rate": 2.9145486301191315e-05, "loss": 1.7732, "step": 80000},
{"epoch": 2.098485440942624, "grad_norm": 10.51559829711914, "learning_rate": 2.9015145590573762e-05, "loss": 1.779, "step": 80500},
{"epoch": 2.1115195120043793, "grad_norm": 3.4089362621307373, "learning_rate": 2.8884804879956206e-05, "loss": 1.7613, "step": 81000},
{"epoch": 2.124553583066135, "grad_norm": 6.211880207061768, "learning_rate": 2.8754464169338653e-05, "loss": 1.7656, "step": 81500},
{"epoch": 2.1375876541278904, "grad_norm": 4.486207962036133, "learning_rate": 2.8624123458721096e-05, "loss": 1.7653, "step": 82000},
{"epoch": 2.150621725189646, "grad_norm": 4.438023090362549, "learning_rate": 2.8493782748103543e-05, "loss": 1.758, "step": 82500},
{"epoch": 2.163655796251401, "grad_norm": 5.200678825378418, "learning_rate": 2.8363442037485987e-05, "loss": 1.7487, "step": 83000},
{"epoch": 2.1766898673131565, "grad_norm": 11.503108024597168, "learning_rate": 2.8233101326868434e-05, "loss": 1.7539, "step": 83500},
{"epoch": 2.189723938374912, "grad_norm": 3.5593841075897217, "learning_rate": 2.8102760616250884e-05, "loss": 1.7604, "step": 84000},
{"epoch": 2.2027580094366677, "grad_norm": 4.380959510803223, "learning_rate": 2.7972419905633328e-05, "loss": 1.7688, "step": 84500},
{"epoch": 2.215792080498423, "grad_norm": 8.921208381652832, "learning_rate": 2.7842079195015775e-05, "loss": 1.7414, "step": 85000},
{"epoch": 2.2288261515601784, "grad_norm": 4.622405529022217, "learning_rate": 2.771173848439822e-05, "loss": 1.7623, "step": 85500},
{"epoch": 2.2418602226219337, "grad_norm": 27.651330947875977, "learning_rate": 2.7581397773780666e-05, "loss": 1.7172, "step": 86000},
{"epoch": 2.254894293683689, "grad_norm": 4.457437992095947, "learning_rate": 2.745105706316311e-05, "loss": 1.7444, "step": 86500},
{"epoch": 2.2679283647454445, "grad_norm": 5.793179988861084, "learning_rate": 2.7320716352545556e-05, "loss": 1.7386, "step": 87000},
{"epoch": 2.2809624358072, "grad_norm": 3.3070342540740967, "learning_rate": 2.7190375641928e-05, "loss": 1.7066, "step": 87500},
{"epoch": 2.2939965068689556, "grad_norm": 4.475468158721924, "learning_rate": 2.7060034931310447e-05, "loss": 1.7212, "step": 88000},
{"epoch": 2.307030577930711, "grad_norm": 4.4862847328186035, "learning_rate": 2.692969422069289e-05, "loss": 1.7265, "step": 88500},
{"epoch": 2.3200646489924663, "grad_norm": 3.608401298522949, "learning_rate": 2.6799353510075337e-05, "loss": 1.7324, "step": 89000},
{"epoch": 2.3330987200542217, "grad_norm": 4.134375095367432, "learning_rate": 2.666901279945778e-05, "loss": 1.6866, "step": 89500},
{"epoch": 2.346132791115977, "grad_norm": 4.030068874359131, "learning_rate": 2.6538672088840228e-05, "loss": 1.6955, "step": 90000},
{"epoch": 2.3591668621777324, "grad_norm": 7.18529748916626, "learning_rate": 2.640833137822267e-05, "loss": 1.7119, "step": 90500},
{"epoch": 2.3722009332394878, "grad_norm": 3.633330821990967, "learning_rate": 2.6277990667605122e-05, "loss": 1.737, "step": 91000},
{"epoch": 2.3852350043012436, "grad_norm": 5.056845188140869, "learning_rate": 2.614764995698757e-05, "loss": 1.7121, "step": 91500},
{"epoch": 2.398269075362999, "grad_norm": 3.203246831893921, "learning_rate": 2.6017309246370013e-05, "loss": 1.7096, "step": 92000},
{"epoch": 2.4113031464247543, "grad_norm": 3.830634355545044, "learning_rate": 2.588696853575246e-05, "loss": 1.7047, "step": 92500},
{"epoch": 2.4243372174865097, "grad_norm": 3.5095880031585693, "learning_rate": 2.5756627825134903e-05, "loss": 1.6875, "step": 93000},
{"epoch": 2.437371288548265, "grad_norm": 13.952683448791504, "learning_rate": 2.562628711451735e-05, "loss": 1.727, "step": 93500},
{"epoch": 2.450405359610021, "grad_norm": 4.152392387390137, "learning_rate": 2.5495946403899794e-05, "loss": 1.674, "step": 94000},
{"epoch": 2.463439430671776, "grad_norm": 28.32253074645996, "learning_rate": 2.536560569328224e-05, "loss": 1.6635, "step": 94500},
{"epoch": 2.4764735017335315, "grad_norm": 37.356117248535156, "learning_rate": 2.5235264982664684e-05, "loss": 1.6936, "step": 95000},
{"epoch": 2.489507572795287, "grad_norm": 11.425202369689941, "learning_rate": 2.510492427204713e-05, "loss": 1.6635, "step": 95500},
{"epoch": 2.5025416438570423, "grad_norm": 3.700289726257324, "learning_rate": 2.497458356142958e-05, "loss": 1.7051, "step": 96000},
{"epoch": 2.5155757149187976, "grad_norm": 16.234506607055664, "learning_rate": 2.4844242850812025e-05, "loss": 1.676, "step": 96500},
{"epoch": 2.528609785980553, "grad_norm": 3.4809882640838623, "learning_rate": 2.471390214019447e-05, "loss": 1.6795, "step": 97000},
{"epoch": 2.5416438570423088, "grad_norm": 4.420949459075928, "learning_rate": 2.4583561429576916e-05, "loss": 1.6926, "step": 97500},
{"epoch": 2.554677928104064, "grad_norm": 24.02429962158203, "learning_rate": 2.445322071895936e-05, "loss": 1.6479, "step": 98000},
{"epoch": 2.5677119991658195, "grad_norm": 4.912638187408447, "learning_rate": 2.4322880008341807e-05, "loss": 1.6598, "step": 98500},
{"epoch": 2.580746070227575, "grad_norm": 22.43536376953125, "learning_rate": 2.419253929772425e-05, "loss": 1.6532, "step": 99000},
{"epoch": 2.59378014128933, "grad_norm": 4.317445755004883, "learning_rate": 2.40621985871067e-05, "loss": 1.6554, "step": 99500},
{"epoch": 2.606814212351086, "grad_norm": 14.290596008300781, "learning_rate": 2.3931857876489144e-05, "loss": 1.6265, "step": 100000},
{"epoch": 2.619848283412841, "grad_norm": 4.331130504608154, "learning_rate": 2.380151716587159e-05, "loss": 1.6706, "step": 100500},
{"epoch": 2.6328823544745967, "grad_norm": 7.016634941101074, "learning_rate": 2.3671176455254035e-05, "loss": 1.649, "step": 101000},
{"epoch": 2.645916425536352, "grad_norm": 5.680657386779785, "learning_rate": 2.3540835744636482e-05, "loss": 1.6126, "step": 101500},
{"epoch": 2.6589504965981074, "grad_norm": 4.337413311004639, "learning_rate": 2.3410495034018925e-05, "loss": 1.6317, "step": 102000},
{"epoch": 2.671984567659863, "grad_norm": 20.466943740844727, "learning_rate": 2.3280154323401372e-05, "loss": 1.6348, "step": 102500},
{"epoch": 2.685018638721618, "grad_norm": 4.808228969573975, "learning_rate": 2.314981361278382e-05, "loss": 1.5979, "step": 103000},
{"epoch": 2.698052709783374, "grad_norm": 4.296200752258301, "learning_rate": 2.3019472902166263e-05, "loss": 1.6281, "step": 103500},
{"epoch": 2.7110867808451293, "grad_norm": 32.726078033447266, "learning_rate": 2.288913219154871e-05, "loss": 1.5966, "step": 104000},
{"epoch": 2.7241208519068847, "grad_norm": 4.275684833526611, "learning_rate": 2.2758791480931154e-05, "loss": 1.6108, "step": 104500},
{"epoch": 2.73715492296864, "grad_norm": 3.496002197265625, "learning_rate": 2.26284507703136e-05, "loss": 1.6026, "step": 105000},
{"epoch": 2.7501889940303954, "grad_norm": 9.172469139099121, "learning_rate": 2.2498110059696044e-05, "loss": 1.631, "step": 105500},
{"epoch": 2.7632230650921508, "grad_norm": 16.79161834716797, "learning_rate": 2.2367769349078495e-05, "loss": 1.6357, "step": 106000},
{"epoch": 2.776257136153906, "grad_norm": 14.198761940002441, "learning_rate": 2.2237428638460938e-05, "loss": 1.6423, "step": 106500},
{"epoch": 2.789291207215662, "grad_norm": 5.301556587219238, "learning_rate": 2.2107087927843385e-05, "loss": 1.6125, "step": 107000},
{"epoch": 2.8023252782774173, "grad_norm": 26.385272979736328, "learning_rate": 2.197674721722583e-05, "loss": 1.6334, "step": 107500},
{"epoch": 2.8153593493391726, "grad_norm": 9.757530212402344, "learning_rate": 2.1846406506608276e-05, "loss": 1.586, "step": 108000},
{"epoch": 2.828393420400928, "grad_norm": 20.982559204101562, "learning_rate": 2.171606579599072e-05, "loss": 1.6066, "step": 108500},
{"epoch": 2.8414274914626834, "grad_norm": 3.695369243621826, "learning_rate": 2.1585725085373166e-05, "loss": 1.6307, "step": 109000},
{"epoch": 2.8544615625244387, "grad_norm": 14.864655494689941, "learning_rate": 2.1455384374755613e-05, "loss": 1.5847, "step": 109500},
{"epoch": 2.867495633586194, "grad_norm": 3.9043121337890625, "learning_rate": 2.1325043664138057e-05, "loss": 1.5904, "step": 110000},
{"epoch": 2.88052970464795, "grad_norm": 4.432578086853027, "learning_rate": 2.1194702953520504e-05, "loss": 1.6037, "step": 110500},
{"epoch": 2.8935637757097052, "grad_norm": 6.775419235229492, "learning_rate": 2.1064362242902948e-05, "loss": 1.6052, "step": 111000},
{"epoch": 2.9065978467714606, "grad_norm": 5.090266227722168, "learning_rate": 2.0934021532285395e-05, "loss": 1.5814, "step": 111500},
{"epoch": 2.919631917833216, "grad_norm": 7.805962085723877, "learning_rate": 2.0803680821667838e-05, "loss": 1.6016, "step": 112000},
{"epoch": 2.9326659888949713, "grad_norm": 6.22263240814209, "learning_rate": 2.067334011105029e-05, "loss": 1.564, "step": 112500},
{"epoch": 2.945700059956727, "grad_norm": 23.055776596069336, "learning_rate": 2.0542999400432732e-05, "loss": 1.555, "step": 113000},
{"epoch": 2.958734131018482, "grad_norm": 20.39297866821289, "learning_rate": 2.041265868981518e-05, "loss": 1.5306, "step": 113500},
{"epoch": 2.971768202080238, "grad_norm": 5.571432113647461, "learning_rate": 2.0282317979197623e-05, "loss": 1.577, "step": 114000},
{"epoch": 2.984802273141993, "grad_norm": 15.77784252166748, "learning_rate": 2.015197726858007e-05, "loss": 1.6165, "step": 114500},
{"epoch": 2.9978363442037486, "grad_norm": 4.388451099395752, "learning_rate": 2.0021636557962513e-05, "loss": 1.544, "step": 115000},
{"epoch": 3.010870415265504, "grad_norm": 2.794743776321411, "learning_rate": 1.989129584734496e-05, "loss": 1.561, "step": 115500},
{"epoch": 3.0239044863272593, "grad_norm": 38.998512268066406, "learning_rate": 1.9760955136727407e-05, "loss": 1.5344, "step": 116000},
{"epoch": 3.036938557389015, "grad_norm": 10.872420310974121, "learning_rate": 1.9630614426109854e-05, "loss": 1.5191, "step": 116500},
{"epoch": 3.0499726284507704, "grad_norm": 4.433558464050293, "learning_rate": 1.9500273715492298e-05, "loss": 1.5093, "step": 117000},
{"epoch": 3.063006699512526, "grad_norm": 3.8315622806549072, "learning_rate": 1.9369933004874745e-05, "loss": 1.5344, "step": 117500},
{"epoch": 3.076040770574281, "grad_norm": 24.29652976989746, "learning_rate": 1.923959229425719e-05, "loss": 1.5557, "step": 118000},
{"epoch": 3.0890748416360365, "grad_norm": 4.876192092895508, "learning_rate": 1.9109251583639636e-05, "loss": 1.5381, "step": 118500},
{"epoch": 3.102108912697792, "grad_norm": 4.730300426483154, "learning_rate": 1.897891087302208e-05, "loss": 1.4977, "step": 119000},
{"epoch": 3.1151429837595472, "grad_norm": 15.773541450500488, "learning_rate": 1.8848570162404526e-05, "loss": 1.5262, "step": 119500},
{"epoch": 3.128177054821303, "grad_norm": 3.4133520126342773, "learning_rate": 1.8718229451786973e-05, "loss": 1.5142, "step": 120000},
{"epoch": 3.1412111258830584, "grad_norm": 4.271722316741943, "learning_rate": 1.8587888741169417e-05, "loss": 1.5108, "step": 120500},
{"epoch": 3.1542451969448138, "grad_norm": 4.478157997131348, "learning_rate": 1.8457548030551864e-05, "loss": 1.5111, "step": 121000},
{"epoch": 3.167279268006569, "grad_norm": 6.74271821975708, "learning_rate": 1.8327207319934307e-05, "loss": 1.5359, "step": 121500},
{"epoch": 3.1803133390683245, "grad_norm": 10.100676536560059, "learning_rate": 1.8196866609316754e-05, "loss": 1.4856, "step": 122000},
{"epoch": 3.19334741013008, "grad_norm": 5.077882289886475, "learning_rate": 1.8066525898699198e-05, "loss": 1.5054, "step": 122500},
{"epoch": 3.2063814811918356, "grad_norm": 4.155623912811279, "learning_rate": 1.793618518808165e-05, "loss": 1.5089, "step": 123000},
{"epoch": 3.219415552253591, "grad_norm": 3.6238481998443604, "learning_rate": 1.7805844477464092e-05, "loss": 1.4933, "step": 123500},
{"epoch": 3.2324496233153464, "grad_norm": 4.119343280792236, "learning_rate": 1.767550376684654e-05, "loss": 1.5215, "step": 124000},
{"epoch": 3.2454836943771017, "grad_norm": 3.789219379425049, "learning_rate": 1.7545163056228983e-05, "loss": 1.4686, "step": 124500},
{"epoch": 3.258517765438857, "grad_norm": 23.477462768554688, "learning_rate": 1.741482234561143e-05, "loss": 1.4928, "step": 125000},
{"epoch": 3.2715518365006124, "grad_norm": 34.81294250488281, "learning_rate": 1.7284481634993873e-05, "loss": 1.5147, "step": 125500},
{"epoch": 3.2845859075623682, "grad_norm": 3.911698579788208, "learning_rate": 1.715414092437632e-05, "loss": 1.498, "step": 126000},
{"epoch": 3.2976199786241236, "grad_norm": 17.540603637695312, "learning_rate": 1.7023800213758767e-05, "loss": 1.5224, "step": 126500},
{"epoch": 3.310654049685879, "grad_norm": 5.028404712677002, "learning_rate": 1.689345950314121e-05, "loss": 1.4782, "step": 127000},
{"epoch": 3.3236881207476343, "grad_norm": 11.53537654876709, "learning_rate": 1.6763118792523658e-05, "loss": 1.4837, "step": 127500},
{"epoch": 3.3367221918093897, "grad_norm": 3.8512253761291504, "learning_rate": 1.66327780819061e-05, "loss": 1.4528, "step": 128000},
{"epoch": 3.349756262871145, "grad_norm": 3.932035207748413, "learning_rate": 1.650243737128855e-05, "loss": 1.5026, "step": 128500},
{"epoch": 3.3627903339329004, "grad_norm": 4.325034141540527, "learning_rate": 1.6372096660670992e-05, "loss": 1.4717, "step": 129000},
{"epoch": 3.375824404994656, "grad_norm": 7.62436580657959, "learning_rate": 1.6241755950053442e-05, "loss": 1.4677, "step": 129500},
{"epoch": 3.3888584760564116, "grad_norm": 4.481779098510742, "learning_rate": 1.6111415239435886e-05, "loss": 1.487, "step": 130000},
{"epoch": 3.401892547118167, "grad_norm": 4.1522536277771, "learning_rate": 1.5981074528818333e-05, "loss": 1.4724, "step": 130500},
{"epoch": 3.4149266181799223, "grad_norm": 22.38875961303711, "learning_rate": 1.5850733818200777e-05, "loss": 1.4694, "step": 131000},
{"epoch": 3.4279606892416776, "grad_norm": 5.144596099853516, "learning_rate": 1.5720393107583224e-05, "loss": 1.4792, "step": 131500},
{"epoch": 3.440994760303433, "grad_norm": 4.0159912109375, "learning_rate": 1.5590052396965667e-05, "loss": 1.4535, "step": 132000},
{"epoch": 3.454028831365189, "grad_norm": 4.164160251617432, "learning_rate": 1.5459711686348114e-05, "loss": 1.4516, "step": 132500},
{"epoch": 3.467062902426944, "grad_norm": 4.1465349197387695, "learning_rate": 1.532937097573056e-05, "loss": 1.4383, "step": 133000},
{"epoch": 3.4800969734886995, "grad_norm": 5.3553466796875, "learning_rate": 1.5199030265113007e-05, "loss": 1.4588, "step": 133500},
{"epoch": 3.493131044550455, "grad_norm": 4.2381110191345215, "learning_rate": 1.5068689554495452e-05, "loss": 1.4607, "step": 134000},
{"epoch": 3.5061651156122102, "grad_norm": 4.227059364318848, "learning_rate": 1.4938348843877897e-05, "loss": 1.4855, "step": 134500},
{"epoch": 3.5191991866739656, "grad_norm": 4.23318338394165, "learning_rate": 1.4808008133260342e-05, "loss": 1.4452, "step": 135000},
{"epoch": 3.5322332577357214, "grad_norm": 4.2789788246154785, "learning_rate": 1.4677667422642788e-05, "loss": 1.4471, "step": 135500},
{"epoch": 3.5452673287974767, "grad_norm": 14.372062683105469, "learning_rate": 1.4547326712025236e-05, "loss": 1.4663, "step": 136000},
{"epoch": 3.558301399859232, "grad_norm": 4.719635963439941, "learning_rate": 1.4416986001407682e-05, "loss": 1.4628, "step": 136500},
{"epoch": 3.5713354709209875, "grad_norm": 4.603359222412109, "learning_rate": 1.4286645290790127e-05, "loss": 1.4464, "step": 137000},
{"epoch": 3.584369541982743, "grad_norm": 4.167656421661377, "learning_rate": 1.4156304580172572e-05, "loss": 1.4816, "step": 137500},
{"epoch": 3.597403613044498, "grad_norm": 3.9802513122558594, "learning_rate": 1.4025963869555018e-05, "loss": 1.4404, "step": 138000},
{"epoch": 3.6104376841062535, "grad_norm": 4.956002235412598, "learning_rate": 1.3895623158937463e-05, "loss": 1.4463, "step": 138500},
{"epoch": 3.6234717551680093, "grad_norm": 4.82868766784668, "learning_rate": 1.3765282448319908e-05, "loss": 1.429, "step": 139000},
{"epoch": 3.6365058262297647, "grad_norm": 9.303766250610352, "learning_rate": 1.3634941737702355e-05, "loss": 1.4492, "step": 139500},
{"epoch": 3.64953989729152, "grad_norm": 4.728789806365967, "learning_rate": 1.35046010270848e-05, "loss": 1.4599, "step": 140000},
{"epoch": 3.6625739683532754, "grad_norm": 4.169735431671143, "learning_rate": 1.3374260316467246e-05, "loss": 1.4346, "step": 140500},
{"epoch": 3.675608039415031, "grad_norm": 4.134032249450684, "learning_rate": 1.3243919605849691e-05, "loss": 1.426, "step": 141000},
{"epoch": 3.6886421104767866, "grad_norm": 7.31259822845459, "learning_rate": 1.3113578895232136e-05, "loss": 1.4489, "step": 141500},
{"epoch": 3.7016761815385415, "grad_norm": 41.01179885864258, "learning_rate": 1.2983238184614582e-05, "loss": 1.4594, "step": 142000},
{"epoch": 3.7147102526002973, "grad_norm": 4.123907566070557, "learning_rate": 1.2852897473997027e-05, "loss": 1.4445, "step": 142500},
{"epoch": 3.7277443236620527, "grad_norm": 12.47805404663086, "learning_rate": 1.2722556763379476e-05, "loss": 1.416, "step": 143000},
{"epoch": 3.740778394723808, "grad_norm": 4.795707702636719, "learning_rate": 1.2592216052761921e-05, "loss": 1.449, "step": 143500},
{"epoch": 3.7538124657855634, "grad_norm": 3.754809856414795, "learning_rate": 1.2461875342144366e-05, "loss": 1.4353, "step": 144000},
{"epoch": 3.7668465368473187, "grad_norm": 4.847051620483398, "learning_rate": 1.2331534631526812e-05, "loss": 1.4081, "step": 144500},
{"epoch": 3.7798806079090745, "grad_norm": 5.240978240966797, "learning_rate": 1.2201193920909257e-05, "loss": 1.4497, "step": 145000},
{"epoch": 3.79291467897083, "grad_norm": 4.278606414794922, "learning_rate": 1.2070853210291704e-05, "loss": 1.4296, "step": 145500},
{"epoch": 3.8059487500325853, "grad_norm": 24.963735580444336, "learning_rate": 1.194051249967415e-05, "loss": 1.4273, "step": 146000},
{"epoch": 3.8189828210943406, "grad_norm": 3.3722941875457764, "learning_rate": 1.1810171789056595e-05, "loss": 1.3939, "step": 146500},
{"epoch": 3.832016892156096, "grad_norm": 3.9926798343658447, "learning_rate": 1.1679831078439042e-05, "loss": 1.4149, "step": 147000},
{"epoch": 3.8450509632178513, "grad_norm": 7.269467353820801, "learning_rate": 1.1549490367821487e-05, "loss": 1.4004, "step": 147500},
{"epoch": 3.8580850342796067, "grad_norm": 5.596455097198486, "learning_rate": 1.1419149657203932e-05, "loss": 1.4133, "step": 148000},
{"epoch": 3.8711191053413625, "grad_norm": 5.81203556060791, "learning_rate": 1.1288808946586377e-05, "loss": 1.4313, "step": 148500},
{"epoch": 3.884153176403118, "grad_norm": 4.842901229858398, "learning_rate": 1.1158468235968823e-05, "loss": 1.4139, "step": 149000},
{"epoch": 3.897187247464873, "grad_norm": 3.6464438438415527, "learning_rate": 1.1028127525351268e-05, "loss": 1.4189, "step": 149500},
{"epoch": 3.9102213185266286, "grad_norm": 5.625620365142822, "learning_rate": 1.0897786814733713e-05, "loss": 1.4119, "step": 150000},
{"epoch": 3.923255389588384, "grad_norm": 3.84614896774292, "learning_rate": 1.076744610411616e-05, "loss": 1.4094, "step": 150500},
{"epoch": 3.9362894606501397, "grad_norm": 5.183802127838135, "learning_rate": 1.0637105393498606e-05, "loss": 1.4157, "step": 151000},
{"epoch": 3.9493235317118947, "grad_norm": 4.6199140548706055, "learning_rate": 1.0506764682881051e-05, "loss": 1.4067, "step": 151500},
{"epoch": 3.9623576027736505, "grad_norm": 5.642277717590332, "learning_rate": 1.0376423972263498e-05, "loss": 1.3994, "step": 152000},
{"epoch": 3.975391673835406, "grad_norm": 4.15669584274292, "learning_rate": 1.0246083261645943e-05, "loss": 1.4304, "step": 152500},
{"epoch": 3.988425744897161, "grad_norm": 4.729000568389893, "learning_rate": 1.0115742551028389e-05, "loss": 1.3979, "step": 153000},
{"epoch": 4.001459815958917, "grad_norm": 3.2223262786865234, "learning_rate": 9.985401840410834e-06, "loss": 1.3897, "step": 153500},
{"epoch": 4.014493887020672, "grad_norm": 4.223217964172363, "learning_rate": 9.855061129793281e-06, "loss": 1.3567, "step": 154000},
{"epoch": 4.027527958082428, "grad_norm": 3.201354742050171, "learning_rate": 9.724720419175726e-06, "loss": 1.3796, "step": 154500},
{"epoch": 4.040562029144183, "grad_norm": 31.99419593811035, "learning_rate": 9.594379708558171e-06, "loss": 1.3475, "step": 155000},
{"epoch": 4.053596100205938, "grad_norm": 19.76371192932129, "learning_rate": 9.464038997940618e-06, "loss": 1.3278, "step": 155500},
{"epoch": 4.066630171267693, "grad_norm": 3.462979316711426, "learning_rate": 9.333698287323064e-06, "loss": 1.3632, "step": 156000},
{"epoch": 4.079664242329449, "grad_norm": 27.641897201538086, "learning_rate": 9.203357576705509e-06, "loss": 1.3203, "step": 156500},
{"epoch": 4.092698313391205, "grad_norm": 3.934295654296875, "learning_rate": 9.073016866087954e-06, "loss": 1.3793, "step": 157000},
{"epoch": 4.10573238445296, "grad_norm": 3.3237240314483643, "learning_rate": 8.9426761554704e-06, "loss": 1.3375, "step": 157500},
{"epoch": 4.118766455514716, "grad_norm": 5.202388286590576, "learning_rate": 8.812335444852845e-06, "loss": 1.3852, "step": 158000},
{"epoch": 4.131800526576471, "grad_norm": 28.595399856567383, "learning_rate": 8.68199473423529e-06, "loss": 1.3644, "step": 158500},
{"epoch": 4.144834597638226, "grad_norm": 3.2022364139556885, "learning_rate": 8.551654023617737e-06, "loss": 1.3734, "step": 159000},
{"epoch": 4.157868668699982, "grad_norm": 4.231220245361328, "learning_rate": 8.421313313000183e-06, "loss": 1.349, "step": 159500},
{"epoch": 4.170902739761737, "grad_norm": 4.515881538391113, "learning_rate": 8.290972602382628e-06, "loss": 1.3392, "step": 160000},
{"epoch": 4.183936810823493, "grad_norm": 3.6497957706451416, "learning_rate": 8.160631891765075e-06, "loss": 1.3495, "step": 160500},
{"epoch": 4.196970881885248, "grad_norm": 16.680282592773438, "learning_rate": 8.03029118114752e-06, "loss": 1.3566, "step": 161000},
{"epoch": 4.210004952947004, "grad_norm": 18.566879272460938, "learning_rate": 7.899950470529966e-06, "loss": 1.3248, "step": 161500},
{"epoch": 4.2230390240087585, "grad_norm": 3.9700820446014404, "learning_rate": 7.769609759912413e-06, "loss": 1.3767, "step": 162000},
{"epoch": 4.236073095070514, "grad_norm": 42.5576286315918, "learning_rate": 7.639269049294858e-06, "loss": 1.3346, "step": 162500},
{"epoch": 4.24910716613227, "grad_norm": 7.013011455535889, "learning_rate": 7.508928338677302e-06, "loss": 1.3752, "step": 163000},
{"epoch": 4.262141237194025, "grad_norm": 12.351140975952148, "learning_rate": 7.3785876280597476e-06, "loss": 1.3213, "step": 163500},
{"epoch": 4.275175308255781, "grad_norm": 48.051631927490234, "learning_rate": 7.2482469174421946e-06, "loss": 1.3453, "step": 164000},
{"epoch": 4.288209379317536, "grad_norm": 3.8004846572875977, "learning_rate": 7.11790620682464e-06, "loss": 1.3231, "step": 164500},
{"epoch": 4.301243450379292, "grad_norm": 3.8865389823913574, "learning_rate": 6.987565496207085e-06, "loss": 1.3353, "step": 165000},
{"epoch": 4.3142775214410465, "grad_norm": 4.471733093261719, "learning_rate": 6.857224785589532e-06, "loss": 1.3411, "step": 165500},
{"epoch": 4.327311592502802, "grad_norm": 4.856067657470703, "learning_rate": 6.7268840749719775e-06, "loss": 1.3254, "step": 166000},
{"epoch": 4.340345663564558, "grad_norm": 4.089067459106445, "learning_rate": 6.596543364354423e-06, "loss": 1.3676, "step": 166500},
{"epoch": 4.353379734626313, "grad_norm": 4.231725215911865, "learning_rate": 6.466202653736869e-06, "loss": 1.3331, "step": 167000},
{"epoch": 4.366413805688069, "grad_norm": 4.140297889709473, "learning_rate": 6.335861943119314e-06, "loss": 1.3338, "step": 167500},
{"epoch": 4.379447876749824, "grad_norm": 3.1667165756225586, "learning_rate": 6.2055212325017595e-06, "loss": 1.3658, "step": 168000},
{"epoch": 4.3924819478115795, "grad_norm": 4.982083797454834, "learning_rate": 6.075180521884206e-06, "loss": 1.3098, "step": 168500},
{"epoch": 4.405516018873335, "grad_norm": 19.951147079467773, "learning_rate": 5.944839811266651e-06, "loss": 1.315, "step": 169000},
{"epoch": 4.41855008993509, "grad_norm": 5.146533489227295, "learning_rate": 5.814499100649097e-06, "loss": 1.3322, "step": 169500},
{"epoch": 4.431584160996846, "grad_norm": 4.29327917098999, "learning_rate": 5.684158390031543e-06, "loss": 1.3165, "step": 170000},
{"epoch": 4.444618232058601, "grad_norm": 4.86635160446167, "learning_rate": 5.5538176794139886e-06, "loss": 1.3266, "step": 170500},
{"epoch": 4.457652303120357, "grad_norm": 5.066024303436279, "learning_rate": 5.423476968796435e-06, "loss": 1.3201, "step": 171000},
{"epoch": 4.470686374182112, "grad_norm": 5.111464500427246, "learning_rate": 5.293136258178879e-06, "loss": 1.3188, "step": 171500},
{"epoch": 4.4837204452438675, "grad_norm": 4.428502082824707, "learning_rate": 5.162795547561325e-06, "loss": 1.3162, "step": 172000},
{"epoch": 4.496754516305623, "grad_norm": 2.84608793258667, "learning_rate": 5.0324548369437715e-06, "loss": 1.3052, "step": 172500},
{"epoch": 4.509788587367378, "grad_norm": 4.425991058349609, "learning_rate": 4.902114126326217e-06, "loss": 1.3252, "step": 173000},
{"epoch": 4.522822658429134, "grad_norm": 21.735198974609375, "learning_rate": 4.771773415708663e-06, "loss": 1.3333, "step": 173500},
{"epoch": 4.535856729490889, "grad_norm": 4.519357204437256, "learning_rate": 4.641432705091108e-06, "loss": 1.3115, "step": 174000},
{"epoch": 4.548890800552645, "grad_norm": 25.662084579467773, "learning_rate": 4.511091994473554e-06, "loss": 1.3134, "step": 174500},
{"epoch": 4.5619248716144, "grad_norm": 3.4979422092437744, "learning_rate": 4.3807512838560005e-06, "loss": 1.3202, "step": 175000},
{"epoch": 4.574958942676155, "grad_norm": 4.444785118103027, "learning_rate": 4.250410573238446e-06, "loss": 1.3174, "step": 175500},
{"epoch": 4.587993013737911, "grad_norm": 6.712714672088623, "learning_rate": 4.120069862620891e-06, "loss": 1.3343, "step": 176000},
{"epoch": 4.601027084799666, "grad_norm": 4.870098114013672, "learning_rate": 3.9897291520033364e-06, "loss": 1.3312, "step": 176500},
{"epoch": 4.614061155861422, "grad_norm": 4.5157928466796875, "learning_rate": 3.859388441385783e-06, "loss": 1.3133, "step": 177000},
{"epoch": 4.627095226923177, "grad_norm": 3.297917366027832, "learning_rate": 3.7290477307682287e-06, "loss": 1.34, "step": 177500},
{"epoch": 4.640129297984933, "grad_norm": 5.5820698738098145, "learning_rate": 3.598707020150674e-06, "loss": 1.2856, "step": 178000},
{"epoch": 4.653163369046688, "grad_norm": 68.55699157714844, "learning_rate": 3.4683663095331198e-06, "loss": 1.3293, "step": 178500},
{"epoch": 4.666197440108443, "grad_norm": 4.395013332366943, "learning_rate": 3.338025598915565e-06, "loss": 1.3156, "step": 179000},
{"epoch": 4.679231511170199, "grad_norm": 4.131389141082764, "learning_rate": 3.2076848882980112e-06, "loss": 1.3349, "step": 179500},
{"epoch": 4.692265582231954, "grad_norm": 3.2444746494293213, "learning_rate": 3.077344177680457e-06, "loss": 1.2882, "step": 180000},
{"epoch": 4.70529965329371, "grad_norm": 6.894190788269043, "learning_rate": 2.9470034670629027e-06, "loss": 1.3064, "step": 180500},
{"epoch": 4.718333724355465, "grad_norm": 4.13007926940918, "learning_rate": 2.816662756445348e-06, "loss": 1.3319, "step": 181000},
{"epoch": 4.731367795417221, "grad_norm": 4.010223388671875, "learning_rate": 2.686322045827794e-06, "loss": 1.3289, "step": 181500},
{"epoch": 4.7444018664789755, "grad_norm": 5.212350845336914, "learning_rate": 2.55598133521024e-06, "loss": 1.3052, "step": 182000},
{"epoch": 4.757435937540731, "grad_norm": 4.112293243408203, "learning_rate": 2.4256406245926856e-06, "loss": 1.3178, "step": 182500},
{"epoch": 4.770470008602487, "grad_norm": 4.711720943450928, "learning_rate": 2.295299913975131e-06, "loss": 1.3017, "step": 183000},
{"epoch": 4.783504079664242, "grad_norm": 4.1918439865112305, "learning_rate": 2.1649592033575766e-06, "loss": 1.3368, "step": 183500},
{"epoch": 4.796538150725998, "grad_norm": 4.53779411315918, "learning_rate": 2.0346184927400227e-06, "loss": 1.3103, "step": 184000},
{"epoch": 4.809572221787754, "grad_norm": 2.9776086807250977, "learning_rate": 1.9042777821224683e-06, "loss": 1.3325, "step": 184500},
{"epoch": 4.822606292849509, "grad_norm": 5.410048007965088, "learning_rate": 1.773937071504914e-06, "loss": 1.324, "step": 185000},
{"epoch": 4.835640363911264, "grad_norm": 5.260219573974609, "learning_rate": 1.6435963608873595e-06, "loss": 1.3339, "step": 185500},
{"epoch": 4.848674434973019, "grad_norm": 5.610768795013428, "learning_rate": 1.5132556502698054e-06, "loss": 1.2985, "step": 186000},
{"epoch": 4.861708506034775, "grad_norm": 6.287191390991211, "learning_rate": 1.382914939652251e-06, "loss": 1.2973, "step": 186500},
{"epoch": 4.87474257709653, "grad_norm": 32.12895202636719, "learning_rate": 1.2525742290346967e-06, "loss": 1.2914, "step": 187000},
{"epoch": 4.887776648158286, "grad_norm": 15.296839714050293, "learning_rate": 1.1222335184171426e-06, "loss": 1.3231, "step": 187500},
{"epoch": 4.900810719220042, "grad_norm": 4.650936126708984, "learning_rate": 9.918928077995881e-07, "loss": 1.2902, "step": 188000},
{"epoch": 4.9138447902817965, "grad_norm": 25.2452335357666, "learning_rate": 8.615520971820338e-07, "loss": 1.2964, "step": 188500},
{"epoch": 4.926878861343552, "grad_norm": 4.3756890296936035, "learning_rate": 7.312113865644796e-07, "loss": 1.3137, "step": 189000},
{"epoch": 4.939912932405307, "grad_norm": 32.994510650634766, "learning_rate": 6.008706759469253e-07, "loss": 1.3033, "step": 189500},
{"epoch": 4.952947003467063, "grad_norm": 3.0575180053710938, "learning_rate": 4.70529965329371e-07, "loss": 1.2992, "step": 190000},
{"epoch": 4.965981074528818, "grad_norm": 4.4134135246276855, "learning_rate": 3.401892547118167e-07, "loss": 1.2839, "step": 190500},
{"epoch": 4.979015145590574, "grad_norm": 40.072750091552734, "learning_rate": 2.0984854409426243e-07, "loss": 1.3057, "step": 191000},
{"epoch": 4.99204921665233, "grad_norm": 19.755613327026367, "learning_rate": 7.950783347670812e-08, "loss": 1.2946, "step": 191500}
],
"logging_steps": 500,
"max_steps": 191805,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.066567392204288e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}