biomed-gemma-3-4b-it / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999365683476055,
"eval_steps": 500,
"global_step": 3941,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002537266095781795,
"grad_norm": 93.43679809570312,
"learning_rate": 2.278481012658228e-07,
"loss": 2.9759,
"step": 10
},
{
"epoch": 0.00507453219156359,
"grad_norm": 27.491559982299805,
"learning_rate": 4.810126582278482e-07,
"loss": 2.6286,
"step": 20
},
{
"epoch": 0.007611798287345386,
"grad_norm": 7.7028632164001465,
"learning_rate": 7.341772151898735e-07,
"loss": 2.043,
"step": 30
},
{
"epoch": 0.01014906438312718,
"grad_norm": 4.423813343048096,
"learning_rate": 9.873417721518988e-07,
"loss": 1.8015,
"step": 40
},
{
"epoch": 0.012686330478908976,
"grad_norm": 3.1954057216644287,
"learning_rate": 1.240506329113924e-06,
"loss": 1.7109,
"step": 50
},
{
"epoch": 0.015223596574690771,
"grad_norm": 2.8202602863311768,
"learning_rate": 1.4936708860759495e-06,
"loss": 1.6439,
"step": 60
},
{
"epoch": 0.017760862670472565,
"grad_norm": 2.797365427017212,
"learning_rate": 1.7468354430379747e-06,
"loss": 1.6011,
"step": 70
},
{
"epoch": 0.02029812876625436,
"grad_norm": 2.647296667098999,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5568,
"step": 80
},
{
"epoch": 0.022835394862036156,
"grad_norm": 2.60683274269104,
"learning_rate": 2.2531645569620258e-06,
"loss": 1.5331,
"step": 90
},
{
"epoch": 0.02537266095781795,
"grad_norm": 2.625460386276245,
"learning_rate": 2.5063291139240508e-06,
"loss": 1.5384,
"step": 100
},
{
"epoch": 0.027909927053599747,
"grad_norm": 2.527884006500244,
"learning_rate": 2.7594936708860766e-06,
"loss": 1.4949,
"step": 110
},
{
"epoch": 0.030447193149381543,
"grad_norm": 2.7804925441741943,
"learning_rate": 3.0126582278481016e-06,
"loss": 1.4907,
"step": 120
},
{
"epoch": 0.032984459245163335,
"grad_norm": 2.730539321899414,
"learning_rate": 3.265822784810127e-06,
"loss": 1.4794,
"step": 130
},
{
"epoch": 0.03552172534094513,
"grad_norm": 2.6378061771392822,
"learning_rate": 3.518987341772152e-06,
"loss": 1.4549,
"step": 140
},
{
"epoch": 0.038058991436726926,
"grad_norm": 2.7467641830444336,
"learning_rate": 3.7721518987341775e-06,
"loss": 1.4678,
"step": 150
},
{
"epoch": 0.04059625753250872,
"grad_norm": 2.5447590351104736,
"learning_rate": 4.025316455696203e-06,
"loss": 1.4312,
"step": 160
},
{
"epoch": 0.04313352362829052,
"grad_norm": 2.4532039165496826,
"learning_rate": 4.278481012658228e-06,
"loss": 1.456,
"step": 170
},
{
"epoch": 0.04567078972407231,
"grad_norm": 2.5699751377105713,
"learning_rate": 4.531645569620253e-06,
"loss": 1.4444,
"step": 180
},
{
"epoch": 0.04820805581985411,
"grad_norm": 2.583108425140381,
"learning_rate": 4.784810126582279e-06,
"loss": 1.4284,
"step": 190
},
{
"epoch": 0.0507453219156359,
"grad_norm": 2.6186587810516357,
"learning_rate": 5.037974683544305e-06,
"loss": 1.4493,
"step": 200
},
{
"epoch": 0.0532825880114177,
"grad_norm": 2.6218819618225098,
"learning_rate": 5.29113924050633e-06,
"loss": 1.4177,
"step": 210
},
{
"epoch": 0.055819854107199494,
"grad_norm": 2.581024646759033,
"learning_rate": 5.544303797468355e-06,
"loss": 1.4254,
"step": 220
},
{
"epoch": 0.05835712020298129,
"grad_norm": 2.5171055793762207,
"learning_rate": 5.79746835443038e-06,
"loss": 1.4257,
"step": 230
},
{
"epoch": 0.060894386298763085,
"grad_norm": 2.5574731826782227,
"learning_rate": 6.050632911392406e-06,
"loss": 1.4126,
"step": 240
},
{
"epoch": 0.06343165239454487,
"grad_norm": 2.3908963203430176,
"learning_rate": 6.303797468354431e-06,
"loss": 1.4096,
"step": 250
},
{
"epoch": 0.06596891849032667,
"grad_norm": 2.561500310897827,
"learning_rate": 6.5569620253164564e-06,
"loss": 1.4148,
"step": 260
},
{
"epoch": 0.06850618458610847,
"grad_norm": 2.5578243732452393,
"learning_rate": 6.810126582278481e-06,
"loss": 1.4126,
"step": 270
},
{
"epoch": 0.07104345068189026,
"grad_norm": 2.5203826427459717,
"learning_rate": 7.0632911392405065e-06,
"loss": 1.4237,
"step": 280
},
{
"epoch": 0.07358071677767206,
"grad_norm": 2.444420337677002,
"learning_rate": 7.316455696202533e-06,
"loss": 1.4053,
"step": 290
},
{
"epoch": 0.07611798287345385,
"grad_norm": 2.3801279067993164,
"learning_rate": 7.569620253164558e-06,
"loss": 1.4021,
"step": 300
},
{
"epoch": 0.07865524896923565,
"grad_norm": 2.3352255821228027,
"learning_rate": 7.822784810126582e-06,
"loss": 1.4092,
"step": 310
},
{
"epoch": 0.08119251506501744,
"grad_norm": 2.3657045364379883,
"learning_rate": 8.075949367088608e-06,
"loss": 1.3745,
"step": 320
},
{
"epoch": 0.08372978116079924,
"grad_norm": 2.3734323978424072,
"learning_rate": 8.329113924050633e-06,
"loss": 1.3878,
"step": 330
},
{
"epoch": 0.08626704725658103,
"grad_norm": 2.341235637664795,
"learning_rate": 8.582278481012659e-06,
"loss": 1.397,
"step": 340
},
{
"epoch": 0.08880431335236283,
"grad_norm": 2.304081916809082,
"learning_rate": 8.835443037974685e-06,
"loss": 1.4008,
"step": 350
},
{
"epoch": 0.09134157944814462,
"grad_norm": 2.5316545963287354,
"learning_rate": 9.08860759493671e-06,
"loss": 1.3804,
"step": 360
},
{
"epoch": 0.09387884554392642,
"grad_norm": 2.275545597076416,
"learning_rate": 9.341772151898735e-06,
"loss": 1.386,
"step": 370
},
{
"epoch": 0.09641611163970822,
"grad_norm": 2.518347978591919,
"learning_rate": 9.59493670886076e-06,
"loss": 1.3956,
"step": 380
},
{
"epoch": 0.09895337773549001,
"grad_norm": 2.351276397705078,
"learning_rate": 9.848101265822785e-06,
"loss": 1.3803,
"step": 390
},
{
"epoch": 0.1014906438312718,
"grad_norm": 2.251296043395996,
"learning_rate": 9.999968603457859e-06,
"loss": 1.3898,
"step": 400
},
{
"epoch": 0.1040279099270536,
"grad_norm": 2.2908174991607666,
"learning_rate": 9.999615396887012e-06,
"loss": 1.3835,
"step": 410
},
{
"epoch": 0.1065651760228354,
"grad_norm": 2.127427577972412,
"learning_rate": 9.998869765883566e-06,
"loss": 1.349,
"step": 420
},
{
"epoch": 0.1091024421186172,
"grad_norm": 2.285038471221924,
"learning_rate": 9.997731768972785e-06,
"loss": 1.3973,
"step": 430
},
{
"epoch": 0.11163970821439899,
"grad_norm": 2.093776226043701,
"learning_rate": 9.996201495477102e-06,
"loss": 1.3757,
"step": 440
},
{
"epoch": 0.11417697431018078,
"grad_norm": 2.1693060398101807,
"learning_rate": 9.994279065509094e-06,
"loss": 1.3786,
"step": 450
},
{
"epoch": 0.11671424040596258,
"grad_norm": 2.2407472133636475,
"learning_rate": 9.991964629962067e-06,
"loss": 1.3793,
"step": 460
},
{
"epoch": 0.11925150650174438,
"grad_norm": 2.1516313552856445,
"learning_rate": 9.989258370498208e-06,
"loss": 1.3562,
"step": 470
},
{
"epoch": 0.12178877259752617,
"grad_norm": 2.272516965866089,
"learning_rate": 9.986160499534318e-06,
"loss": 1.3968,
"step": 480
},
{
"epoch": 0.12432603869330797,
"grad_norm": 2.11755108833313,
"learning_rate": 9.982671260225156e-06,
"loss": 1.3714,
"step": 490
},
{
"epoch": 0.12686330478908975,
"grad_norm": 2.0977492332458496,
"learning_rate": 9.97879092644434e-06,
"loss": 1.3549,
"step": 500
},
{
"epoch": 0.12940057088487156,
"grad_norm": 2.2265727519989014,
"learning_rate": 9.974519802762853e-06,
"loss": 1.3699,
"step": 510
},
{
"epoch": 0.13193783698065334,
"grad_norm": 2.15244197845459,
"learning_rate": 9.969858224425138e-06,
"loss": 1.3608,
"step": 520
},
{
"epoch": 0.13447510307643515,
"grad_norm": 2.1284122467041016,
"learning_rate": 9.96480655732279e-06,
"loss": 1.3766,
"step": 530
},
{
"epoch": 0.13701236917221693,
"grad_norm": 2.03669810295105,
"learning_rate": 9.959365197965824e-06,
"loss": 1.3596,
"step": 540
},
{
"epoch": 0.13954963526799874,
"grad_norm": 1.9937801361083984,
"learning_rate": 9.953534573451568e-06,
"loss": 1.3501,
"step": 550
},
{
"epoch": 0.14208690136378052,
"grad_norm": 2.097184419631958,
"learning_rate": 9.947315141431126e-06,
"loss": 1.3443,
"step": 560
},
{
"epoch": 0.14462416745956233,
"grad_norm": 2.1515302658081055,
"learning_rate": 9.940707390073465e-06,
"loss": 1.3548,
"step": 570
},
{
"epoch": 0.1471614335553441,
"grad_norm": 2.027521848678589,
"learning_rate": 9.933711838027096e-06,
"loss": 1.3415,
"step": 580
},
{
"epoch": 0.14969869965112592,
"grad_norm": 2.1078367233276367,
"learning_rate": 9.926329034379361e-06,
"loss": 1.3578,
"step": 590
},
{
"epoch": 0.1522359657469077,
"grad_norm": 2.1043877601623535,
"learning_rate": 9.918559558613344e-06,
"loss": 1.3673,
"step": 600
},
{
"epoch": 0.1547732318426895,
"grad_norm": 2.100511074066162,
"learning_rate": 9.910404020562377e-06,
"loss": 1.3556,
"step": 610
},
{
"epoch": 0.1573104979384713,
"grad_norm": 1.9388527870178223,
"learning_rate": 9.901863060362176e-06,
"loss": 1.3608,
"step": 620
},
{
"epoch": 0.1598477640342531,
"grad_norm": 2.0223309993743896,
"learning_rate": 9.8929373484006e-06,
"loss": 1.3301,
"step": 630
},
{
"epoch": 0.16238503013003489,
"grad_norm": 2.006113290786743,
"learning_rate": 9.883627585265032e-06,
"loss": 1.3345,
"step": 640
},
{
"epoch": 0.1649222962258167,
"grad_norm": 2.0276896953582764,
"learning_rate": 9.873934501687381e-06,
"loss": 1.3437,
"step": 650
},
{
"epoch": 0.16745956232159848,
"grad_norm": 1.9418350458145142,
"learning_rate": 9.863858858486736e-06,
"loss": 1.3307,
"step": 660
},
{
"epoch": 0.16999682841738029,
"grad_norm": 1.919950008392334,
"learning_rate": 9.853401446509641e-06,
"loss": 1.3478,
"step": 670
},
{
"epoch": 0.17253409451316207,
"grad_norm": 1.9946993589401245,
"learning_rate": 9.842563086568024e-06,
"loss": 1.3491,
"step": 680
},
{
"epoch": 0.17507136060894388,
"grad_norm": 1.9728672504425049,
"learning_rate": 9.831344629374778e-06,
"loss": 1.3603,
"step": 690
},
{
"epoch": 0.17760862670472566,
"grad_norm": 1.9747499227523804,
"learning_rate": 9.81974695547697e-06,
"loss": 1.3278,
"step": 700
},
{
"epoch": 0.18014589280050744,
"grad_norm": 1.931283712387085,
"learning_rate": 9.807770975186743e-06,
"loss": 1.3389,
"step": 710
},
{
"epoch": 0.18268315889628925,
"grad_norm": 1.8983511924743652,
"learning_rate": 9.795417628509857e-06,
"loss": 1.3369,
"step": 720
},
{
"epoch": 0.18522042499207103,
"grad_norm": 2.0960946083068848,
"learning_rate": 9.78268788507191e-06,
"loss": 1.3519,
"step": 730
},
{
"epoch": 0.18775769108785284,
"grad_norm": 1.9553834199905396,
"learning_rate": 9.769582744042224e-06,
"loss": 1.3383,
"step": 740
},
{
"epoch": 0.19029495718363462,
"grad_norm": 1.8932089805603027,
"learning_rate": 9.756103234055432e-06,
"loss": 1.34,
"step": 750
},
{
"epoch": 0.19283222327941643,
"grad_norm": 1.9457850456237793,
"learning_rate": 9.742250413130728e-06,
"loss": 1.323,
"step": 760
},
{
"epoch": 0.1953694893751982,
"grad_norm": 1.8577946424484253,
"learning_rate": 9.728025368588829e-06,
"loss": 1.3251,
"step": 770
},
{
"epoch": 0.19790675547098002,
"grad_norm": 2.0047388076782227,
"learning_rate": 9.713429216966624e-06,
"loss": 1.3202,
"step": 780
},
{
"epoch": 0.2004440215667618,
"grad_norm": 1.9848380088806152,
"learning_rate": 9.698463103929542e-06,
"loss": 1.3433,
"step": 790
},
{
"epoch": 0.2029812876625436,
"grad_norm": 1.8254072666168213,
"learning_rate": 9.68312820418163e-06,
"loss": 1.3186,
"step": 800
},
{
"epoch": 0.2055185537583254,
"grad_norm": 1.911875605583191,
"learning_rate": 9.667425721373333e-06,
"loss": 1.3379,
"step": 810
},
{
"epoch": 0.2080558198541072,
"grad_norm": 1.9765597581863403,
"learning_rate": 9.651356888007041e-06,
"loss": 1.3319,
"step": 820
},
{
"epoch": 0.210593085949889,
"grad_norm": 2.052441358566284,
"learning_rate": 9.634922965340334e-06,
"loss": 1.3152,
"step": 830
},
{
"epoch": 0.2131303520456708,
"grad_norm": 1.8989280462265015,
"learning_rate": 9.618125243286989e-06,
"loss": 1.3092,
"step": 840
},
{
"epoch": 0.21566761814145258,
"grad_norm": 1.8644095659255981,
"learning_rate": 9.60096504031573e-06,
"loss": 1.3248,
"step": 850
},
{
"epoch": 0.2182048842372344,
"grad_norm": 1.9270919561386108,
"learning_rate": 9.58344370334675e-06,
"loss": 1.3525,
"step": 860
},
{
"epoch": 0.22074215033301617,
"grad_norm": 1.8386811017990112,
"learning_rate": 9.565562607645974e-06,
"loss": 1.3214,
"step": 870
},
{
"epoch": 0.22327941642879798,
"grad_norm": 1.8396153450012207,
"learning_rate": 9.547323156717133e-06,
"loss": 1.3247,
"step": 880
},
{
"epoch": 0.22581668252457976,
"grad_norm": 1.906518578529358,
"learning_rate": 9.52872678219158e-06,
"loss": 1.3132,
"step": 890
},
{
"epoch": 0.22835394862036157,
"grad_norm": 1.9358681440353394,
"learning_rate": 9.50977494371594e-06,
"loss": 1.3199,
"step": 900
},
{
"epoch": 0.23089121471614335,
"grad_norm": 1.8726778030395508,
"learning_rate": 9.490469128837525e-06,
"loss": 1.3058,
"step": 910
},
{
"epoch": 0.23342848081192516,
"grad_norm": 1.8988134860992432,
"learning_rate": 9.470810852887586e-06,
"loss": 1.3035,
"step": 920
},
{
"epoch": 0.23596574690770694,
"grad_norm": 1.8984090089797974,
"learning_rate": 9.450801658862371e-06,
"loss": 1.321,
"step": 930
},
{
"epoch": 0.23850301300348875,
"grad_norm": 1.8265104293823242,
"learning_rate": 9.430443117302006e-06,
"loss": 1.3089,
"step": 940
},
{
"epoch": 0.24104027909927053,
"grad_norm": 2.0038204193115234,
"learning_rate": 9.409736826167233e-06,
"loss": 1.3185,
"step": 950
},
{
"epoch": 0.24357754519505234,
"grad_norm": 1.9437892436981201,
"learning_rate": 9.388684410713977e-06,
"loss": 1.3148,
"step": 960
},
{
"epoch": 0.24611481129083412,
"grad_norm": 1.902868628501892,
"learning_rate": 9.367287523365782e-06,
"loss": 1.3092,
"step": 970
},
{
"epoch": 0.24865207738661593,
"grad_norm": 1.8983051776885986,
"learning_rate": 9.345547843584108e-06,
"loss": 1.3091,
"step": 980
},
{
"epoch": 0.25118934348239774,
"grad_norm": 1.9476397037506104,
"learning_rate": 9.323467077736513e-06,
"loss": 1.3149,
"step": 990
},
{
"epoch": 0.2537266095781795,
"grad_norm": 1.9828585386276245,
"learning_rate": 9.301046958962707e-06,
"loss": 1.3149,
"step": 1000
},
{
"epoch": 0.2562638756739613,
"grad_norm": 1.8465077877044678,
"learning_rate": 9.278289247038537e-06,
"loss": 1.3113,
"step": 1010
},
{
"epoch": 0.2588011417697431,
"grad_norm": 1.7844876050949097,
"learning_rate": 9.255195728237837e-06,
"loss": 1.3075,
"step": 1020
},
{
"epoch": 0.2613384078655249,
"grad_norm": 1.8422950506210327,
"learning_rate": 9.231768215192243e-06,
"loss": 1.3071,
"step": 1030
},
{
"epoch": 0.2638756739613067,
"grad_norm": 1.9389331340789795,
"learning_rate": 9.2080085467489e-06,
"loss": 1.3315,
"step": 1040
},
{
"epoch": 0.2664129400570885,
"grad_norm": 1.8695167303085327,
"learning_rate": 9.183918587826142e-06,
"loss": 1.3203,
"step": 1050
},
{
"epoch": 0.2689502061528703,
"grad_norm": 1.843494176864624,
"learning_rate": 9.159500229267103e-06,
"loss": 1.3073,
"step": 1060
},
{
"epoch": 0.27148747224865205,
"grad_norm": 1.819220781326294,
"learning_rate": 9.134755387691315e-06,
"loss": 1.317,
"step": 1070
},
{
"epoch": 0.27402473834443386,
"grad_norm": 1.8392246961593628,
"learning_rate": 9.109686005344258e-06,
"loss": 1.3055,
"step": 1080
},
{
"epoch": 0.27656200444021567,
"grad_norm": 1.9258952140808105,
"learning_rate": 9.084294049944919e-06,
"loss": 1.303,
"step": 1090
},
{
"epoch": 0.2790992705359975,
"grad_norm": 1.7235634326934814,
"learning_rate": 9.05858151453134e-06,
"loss": 1.2958,
"step": 1100
},
{
"epoch": 0.28163653663177923,
"grad_norm": 1.8805299997329712,
"learning_rate": 9.032550417304189e-06,
"loss": 1.3123,
"step": 1110
},
{
"epoch": 0.28417380272756104,
"grad_norm": 1.7971928119659424,
"learning_rate": 9.006202801468342e-06,
"loss": 1.3181,
"step": 1120
},
{
"epoch": 0.28671106882334285,
"grad_norm": 1.8092015981674194,
"learning_rate": 8.979540735072512e-06,
"loss": 1.2802,
"step": 1130
},
{
"epoch": 0.28924833491912466,
"grad_norm": 1.7668564319610596,
"learning_rate": 8.952566310846931e-06,
"loss": 1.2911,
"step": 1140
},
{
"epoch": 0.2917856010149064,
"grad_norm": 1.8799540996551514,
"learning_rate": 8.925281646039078e-06,
"loss": 1.2966,
"step": 1150
},
{
"epoch": 0.2943228671106882,
"grad_norm": 1.9077820777893066,
"learning_rate": 8.897688882247515e-06,
"loss": 1.2889,
"step": 1160
},
{
"epoch": 0.29686013320647003,
"grad_norm": 1.818803071975708,
"learning_rate": 8.869790185253766e-06,
"loss": 1.2922,
"step": 1170
},
{
"epoch": 0.29939739930225184,
"grad_norm": 1.8670169115066528,
"learning_rate": 8.841587744852339e-06,
"loss": 1.3137,
"step": 1180
},
{
"epoch": 0.3019346653980336,
"grad_norm": 1.8387978076934814,
"learning_rate": 8.813083774678841e-06,
"loss": 1.2988,
"step": 1190
},
{
"epoch": 0.3044719314938154,
"grad_norm": 1.9049063920974731,
"learning_rate": 8.784280512036235e-06,
"loss": 1.3002,
"step": 1200
},
{
"epoch": 0.3070091975895972,
"grad_norm": 1.8128398656845093,
"learning_rate": 8.755180217719218e-06,
"loss": 1.2896,
"step": 1210
},
{
"epoch": 0.309546463685379,
"grad_norm": 1.8100541830062866,
"learning_rate": 8.72578517583679e-06,
"loss": 1.3028,
"step": 1220
},
{
"epoch": 0.3120837297811608,
"grad_norm": 1.8533449172973633,
"learning_rate": 8.696097693632944e-06,
"loss": 1.2791,
"step": 1230
},
{
"epoch": 0.3146209958769426,
"grad_norm": 1.8735203742980957,
"learning_rate": 8.666120101305596e-06,
"loss": 1.3084,
"step": 1240
},
{
"epoch": 0.3171582619727244,
"grad_norm": 1.8829270601272583,
"learning_rate": 8.635854751823666e-06,
"loss": 1.3125,
"step": 1250
},
{
"epoch": 0.3196955280685062,
"grad_norm": 1.8787264823913574,
"learning_rate": 8.60530402074241e-06,
"loss": 1.2904,
"step": 1260
},
{
"epoch": 0.32223279416428796,
"grad_norm": 1.7555420398712158,
"learning_rate": 8.574470306016936e-06,
"loss": 1.3098,
"step": 1270
},
{
"epoch": 0.32477006026006977,
"grad_norm": 1.8944544792175293,
"learning_rate": 8.543356027814009e-06,
"loss": 1.2818,
"step": 1280
},
{
"epoch": 0.3273073263558516,
"grad_norm": 1.8189594745635986,
"learning_rate": 8.511963628322076e-06,
"loss": 1.2925,
"step": 1290
},
{
"epoch": 0.3298445924516334,
"grad_norm": 1.8110857009887695,
"learning_rate": 8.480295571559581e-06,
"loss": 1.2868,
"step": 1300
},
{
"epoch": 0.33238185854741514,
"grad_norm": 1.7613568305969238,
"learning_rate": 8.448354343181568e-06,
"loss": 1.2935,
"step": 1310
},
{
"epoch": 0.33491912464319695,
"grad_norm": 1.743290662765503,
"learning_rate": 8.416142450284565e-06,
"loss": 1.3024,
"step": 1320
},
{
"epoch": 0.33745639073897876,
"grad_norm": 1.7666162252426147,
"learning_rate": 8.383662421209813e-06,
"loss": 1.2934,
"step": 1330
},
{
"epoch": 0.33999365683476057,
"grad_norm": 1.842170000076294,
"learning_rate": 8.350916805344812e-06,
"loss": 1.3163,
"step": 1340
},
{
"epoch": 0.3425309229305423,
"grad_norm": 1.8108347654342651,
"learning_rate": 8.317908172923207e-06,
"loss": 1.2687,
"step": 1350
},
{
"epoch": 0.34506818902632413,
"grad_norm": 1.832701325416565,
"learning_rate": 8.28463911482306e-06,
"loss": 1.287,
"step": 1360
},
{
"epoch": 0.34760545512210594,
"grad_norm": 1.7689234018325806,
"learning_rate": 8.251112242363488e-06,
"loss": 1.3073,
"step": 1370
},
{
"epoch": 0.35014272121788775,
"grad_norm": 1.7938982248306274,
"learning_rate": 8.217330187099689e-06,
"loss": 1.2734,
"step": 1380
},
{
"epoch": 0.3526799873136695,
"grad_norm": 1.7464276552200317,
"learning_rate": 8.183295600616399e-06,
"loss": 1.2746,
"step": 1390
},
{
"epoch": 0.3552172534094513,
"grad_norm": 1.8033825159072876,
"learning_rate": 8.149011154319763e-06,
"loss": 1.2833,
"step": 1400
},
{
"epoch": 0.3577545195052331,
"grad_norm": 1.850847601890564,
"learning_rate": 8.114479539227653e-06,
"loss": 1.3033,
"step": 1410
},
{
"epoch": 0.3602917856010149,
"grad_norm": 1.7663108110427856,
"learning_rate": 8.079703465758447e-06,
"loss": 1.2756,
"step": 1420
},
{
"epoch": 0.3628290516967967,
"grad_norm": 1.8576610088348389,
"learning_rate": 8.044685663518289e-06,
"loss": 1.2871,
"step": 1430
},
{
"epoch": 0.3653663177925785,
"grad_norm": 1.7135543823242188,
"learning_rate": 8.009428881086836e-06,
"loss": 1.2825,
"step": 1440
},
{
"epoch": 0.3679035838883603,
"grad_norm": 1.8041423559188843,
"learning_rate": 7.97393588580152e-06,
"loss": 1.2726,
"step": 1450
},
{
"epoch": 0.37044084998414206,
"grad_norm": 1.852770447731018,
"learning_rate": 7.93820946354034e-06,
"loss": 1.2754,
"step": 1460
},
{
"epoch": 0.37297811607992387,
"grad_norm": 1.8034069538116455,
"learning_rate": 7.902252418503198e-06,
"loss": 1.2881,
"step": 1470
},
{
"epoch": 0.3755153821757057,
"grad_norm": 1.834839105606079,
"learning_rate": 7.86606757299178e-06,
"loss": 1.2717,
"step": 1480
},
{
"epoch": 0.3780526482714875,
"grad_norm": 1.8019623756408691,
"learning_rate": 7.829657767188052e-06,
"loss": 1.2863,
"step": 1490
},
{
"epoch": 0.38058991436726924,
"grad_norm": 1.8359981775283813,
"learning_rate": 7.793025858931317e-06,
"loss": 1.2896,
"step": 1500
},
{
"epoch": 0.38312718046305105,
"grad_norm": 1.7588212490081787,
"learning_rate": 7.756174723493908e-06,
"loss": 1.298,
"step": 1510
},
{
"epoch": 0.38566444655883286,
"grad_norm": 1.810434103012085,
"learning_rate": 7.719107253355494e-06,
"loss": 1.294,
"step": 1520
},
{
"epoch": 0.3882017126546147,
"grad_norm": 1.7165814638137817,
"learning_rate": 7.68182635797606e-06,
"loss": 1.2529,
"step": 1530
},
{
"epoch": 0.3907389787503964,
"grad_norm": 1.7785570621490479,
"learning_rate": 7.644334963567542e-06,
"loss": 1.2726,
"step": 1540
},
{
"epoch": 0.39327624484617824,
"grad_norm": 1.8550214767456055,
"learning_rate": 7.606636012864126e-06,
"loss": 1.2866,
"step": 1550
},
{
"epoch": 0.39581351094196005,
"grad_norm": 1.8188886642456055,
"learning_rate": 7.568732464891293e-06,
"loss": 1.2867,
"step": 1560
},
{
"epoch": 0.39835077703774185,
"grad_norm": 1.7937846183776855,
"learning_rate": 7.530627294733549e-06,
"loss": 1.2764,
"step": 1570
},
{
"epoch": 0.4008880431335236,
"grad_norm": 1.8102056980133057,
"learning_rate": 7.492323493300912e-06,
"loss": 1.2663,
"step": 1580
},
{
"epoch": 0.4034253092293054,
"grad_norm": 1.8100765943527222,
"learning_rate": 7.453824067094152e-06,
"loss": 1.2772,
"step": 1590
},
{
"epoch": 0.4059625753250872,
"grad_norm": 1.7928545475006104,
"learning_rate": 7.4151320379688105e-06,
"loss": 1.2831,
"step": 1600
},
{
"epoch": 0.40849984142086904,
"grad_norm": 1.7782552242279053,
"learning_rate": 7.376250442898006e-06,
"loss": 1.2701,
"step": 1610
},
{
"epoch": 0.4110371075166508,
"grad_norm": 1.7261615991592407,
"learning_rate": 7.33718233373407e-06,
"loss": 1.2761,
"step": 1620
},
{
"epoch": 0.4135743736124326,
"grad_norm": 1.8159124851226807,
"learning_rate": 7.297930776968989e-06,
"loss": 1.2817,
"step": 1630
},
{
"epoch": 0.4161116397082144,
"grad_norm": 1.8756343126296997,
"learning_rate": 7.258498853493729e-06,
"loss": 1.2881,
"step": 1640
},
{
"epoch": 0.4186489058039962,
"grad_norm": 1.7900598049163818,
"learning_rate": 7.2188896583563984e-06,
"loss": 1.2602,
"step": 1650
},
{
"epoch": 0.421186171899778,
"grad_norm": 1.744611144065857,
"learning_rate": 7.179106300519329e-06,
"loss": 1.2911,
"step": 1660
},
{
"epoch": 0.4237234379955598,
"grad_norm": 1.7962532043457031,
"learning_rate": 7.13915190261504e-06,
"loss": 1.2581,
"step": 1670
},
{
"epoch": 0.4262607040913416,
"grad_norm": 1.8410406112670898,
"learning_rate": 7.099029600701144e-06,
"loss": 1.2632,
"step": 1680
},
{
"epoch": 0.4287979701871234,
"grad_norm": 1.6939560174942017,
"learning_rate": 7.0587425440141955e-06,
"loss": 1.2632,
"step": 1690
},
{
"epoch": 0.43133523628290515,
"grad_norm": 1.7786022424697876,
"learning_rate": 7.0182938947225025e-06,
"loss": 1.2703,
"step": 1700
},
{
"epoch": 0.43387250237868696,
"grad_norm": 1.6461378335952759,
"learning_rate": 6.977686827677926e-06,
"loss": 1.2769,
"step": 1710
},
{
"epoch": 0.4364097684744688,
"grad_norm": 1.9497860670089722,
"learning_rate": 6.936924530166682e-06,
"loss": 1.288,
"step": 1720
},
{
"epoch": 0.4389470345702506,
"grad_norm": 1.6506319046020508,
"learning_rate": 6.896010201659173e-06,
"loss": 1.2687,
"step": 1730
},
{
"epoch": 0.44148430066603234,
"grad_norm": 1.7398324012756348,
"learning_rate": 6.854947053558849e-06,
"loss": 1.27,
"step": 1740
},
{
"epoch": 0.44402156676181415,
"grad_norm": 1.7190606594085693,
"learning_rate": 6.8137383089501526e-06,
"loss": 1.2643,
"step": 1750
},
{
"epoch": 0.44655883285759596,
"grad_norm": 1.7999427318572998,
"learning_rate": 6.772387202345528e-06,
"loss": 1.2713,
"step": 1760
},
{
"epoch": 0.4490960989533777,
"grad_norm": 1.7355235815048218,
"learning_rate": 6.730896979431543e-06,
"loss": 1.2786,
"step": 1770
},
{
"epoch": 0.4516333650491595,
"grad_norm": 1.7803362607955933,
"learning_rate": 6.689270896814139e-06,
"loss": 1.2664,
"step": 1780
},
{
"epoch": 0.45417063114494133,
"grad_norm": 1.8156557083129883,
"learning_rate": 6.647512221763005e-06,
"loss": 1.2663,
"step": 1790
},
{
"epoch": 0.45670789724072314,
"grad_norm": 1.7821699380874634,
"learning_rate": 6.6056242319551315e-06,
"loss": 1.2662,
"step": 1800
},
{
"epoch": 0.4592451633365049,
"grad_norm": 1.8154984712600708,
"learning_rate": 6.563610215217551e-06,
"loss": 1.2605,
"step": 1810
},
{
"epoch": 0.4617824294322867,
"grad_norm": 1.7243108749389648,
"learning_rate": 6.5214734692692594e-06,
"loss": 1.272,
"step": 1820
},
{
"epoch": 0.4643196955280685,
"grad_norm": 1.7438788414001465,
"learning_rate": 6.479217301462386e-06,
"loss": 1.2607,
"step": 1830
},
{
"epoch": 0.4668569616238503,
"grad_norm": 1.7700178623199463,
"learning_rate": 6.43684502852259e-06,
"loss": 1.2536,
"step": 1840
},
{
"epoch": 0.4693942277196321,
"grad_norm": 1.6637680530548096,
"learning_rate": 6.394359976288729e-06,
"loss": 1.2542,
"step": 1850
},
{
"epoch": 0.4719314938154139,
"grad_norm": 1.7491919994354248,
"learning_rate": 6.3517654794518156e-06,
"loss": 1.26,
"step": 1860
},
{
"epoch": 0.4744687599111957,
"grad_norm": 1.7363123893737793,
"learning_rate": 6.309064881293265e-06,
"loss": 1.2713,
"step": 1870
},
{
"epoch": 0.4770060260069775,
"grad_norm": 1.7664891481399536,
"learning_rate": 6.266261533422487e-06,
"loss": 1.2626,
"step": 1880
},
{
"epoch": 0.47954329210275926,
"grad_norm": 1.828539252281189,
"learning_rate": 6.223358795513812e-06,
"loss": 1.2598,
"step": 1890
},
{
"epoch": 0.48208055819854106,
"grad_norm": 1.7587262392044067,
"learning_rate": 6.18036003504278e-06,
"loss": 1.2582,
"step": 1900
},
{
"epoch": 0.4846178242943229,
"grad_norm": 1.7577661275863647,
"learning_rate": 6.1372686270218385e-06,
"loss": 1.2454,
"step": 1910
},
{
"epoch": 0.4871550903901047,
"grad_norm": 1.6523966789245605,
"learning_rate": 6.094087953735423e-06,
"loss": 1.2712,
"step": 1920
},
{
"epoch": 0.48969235648588644,
"grad_norm": 1.8142590522766113,
"learning_rate": 6.050821404474483e-06,
"loss": 1.2506,
"step": 1930
},
{
"epoch": 0.49222962258166825,
"grad_norm": 1.7929701805114746,
"learning_rate": 6.00747237527045e-06,
"loss": 1.2698,
"step": 1940
},
{
"epoch": 0.49476688867745006,
"grad_norm": 1.817660927772522,
"learning_rate": 5.964044268628688e-06,
"loss": 1.2539,
"step": 1950
},
{
"epoch": 0.49730415477323187,
"grad_norm": 1.7909172773361206,
"learning_rate": 5.920540493261415e-06,
"loss": 1.2707,
"step": 1960
},
{
"epoch": 0.4998414208690136,
"grad_norm": 1.8175256252288818,
"learning_rate": 5.8769644638201635e-06,
"loss": 1.2575,
"step": 1970
},
{
"epoch": 0.5023786869647955,
"grad_norm": 1.7416564226150513,
"learning_rate": 5.8333196006277536e-06,
"loss": 1.2512,
"step": 1980
},
{
"epoch": 0.5049159530605772,
"grad_norm": 1.751071810722351,
"learning_rate": 5.789609329409826e-06,
"loss": 1.2531,
"step": 1990
},
{
"epoch": 0.507453219156359,
"grad_norm": 1.8549920320510864,
"learning_rate": 5.7458370810259635e-06,
"loss": 1.2397,
"step": 2000
},
{
"epoch": 0.5099904852521409,
"grad_norm": 1.7362955808639526,
"learning_rate": 5.702006291200389e-06,
"loss": 1.2399,
"step": 2010
},
{
"epoch": 0.5125277513479226,
"grad_norm": 1.802452564239502,
"learning_rate": 5.6581204002523e-06,
"loss": 1.2408,
"step": 2020
},
{
"epoch": 0.5150650174437044,
"grad_norm": 1.7542202472686768,
"learning_rate": 5.614182852825835e-06,
"loss": 1.2542,
"step": 2030
},
{
"epoch": 0.5176022835394862,
"grad_norm": 1.816231369972229,
"learning_rate": 5.570197097619688e-06,
"loss": 1.2637,
"step": 2040
},
{
"epoch": 0.520139549635268,
"grad_norm": 1.6630245447158813,
"learning_rate": 5.526166587116436e-06,
"loss": 1.2488,
"step": 2050
},
{
"epoch": 0.5226768157310498,
"grad_norm": 1.8530631065368652,
"learning_rate": 5.4820947773115374e-06,
"loss": 1.2675,
"step": 2060
},
{
"epoch": 0.5252140818268316,
"grad_norm": 1.8050600290298462,
"learning_rate": 5.437985127442065e-06,
"loss": 1.2491,
"step": 2070
},
{
"epoch": 0.5277513479226134,
"grad_norm": 1.755374789237976,
"learning_rate": 5.393841099715205e-06,
"loss": 1.2401,
"step": 2080
},
{
"epoch": 0.5302886140183952,
"grad_norm": 1.7990094423294067,
"learning_rate": 5.349666159036482e-06,
"loss": 1.2447,
"step": 2090
},
{
"epoch": 0.532825880114177,
"grad_norm": 1.8541128635406494,
"learning_rate": 5.305463772737812e-06,
"loss": 1.2422,
"step": 2100
},
{
"epoch": 0.5353631462099587,
"grad_norm": 1.7271368503570557,
"learning_rate": 5.261237410305344e-06,
"loss": 1.2508,
"step": 2110
},
{
"epoch": 0.5379004123057406,
"grad_norm": 1.8599367141723633,
"learning_rate": 5.2169905431071356e-06,
"loss": 1.2523,
"step": 2120
},
{
"epoch": 0.5404376784015223,
"grad_norm": 1.7589484453201294,
"learning_rate": 5.172726644120678e-06,
"loss": 1.2369,
"step": 2130
},
{
"epoch": 0.5429749444973041,
"grad_norm": 1.8130236864089966,
"learning_rate": 5.128449187660309e-06,
"loss": 1.2411,
"step": 2140
},
{
"epoch": 0.545512210593086,
"grad_norm": 1.8312796354293823,
"learning_rate": 5.084161649104502e-06,
"loss": 1.2534,
"step": 2150
},
{
"epoch": 0.5480494766888677,
"grad_norm": 1.8068058490753174,
"learning_rate": 5.039867504623084e-06,
"loss": 1.2269,
"step": 2160
},
{
"epoch": 0.5505867427846496,
"grad_norm": 1.774015188217163,
"learning_rate": 4.995570230904386e-06,
"loss": 1.2254,
"step": 2170
},
{
"epoch": 0.5531240088804313,
"grad_norm": 1.788631558418274,
"learning_rate": 4.951273304882358e-06,
"loss": 1.2449,
"step": 2180
},
{
"epoch": 0.5556612749762131,
"grad_norm": 1.6815987825393677,
"learning_rate": 4.906980203463659e-06,
"loss": 1.2437,
"step": 2190
},
{
"epoch": 0.558198541071995,
"grad_norm": 1.7828891277313232,
"learning_rate": 4.862694403254747e-06,
"loss": 1.2457,
"step": 2200
},
{
"epoch": 0.5607358071677767,
"grad_norm": 1.7972067594528198,
"learning_rate": 4.818419380289009e-06,
"loss": 1.2651,
"step": 2210
},
{
"epoch": 0.5632730732635585,
"grad_norm": 1.8437052965164185,
"learning_rate": 4.774158609753908e-06,
"loss": 1.2506,
"step": 2220
},
{
"epoch": 0.5658103393593403,
"grad_norm": 1.7658838033676147,
"learning_rate": 4.729915565718223e-06,
"loss": 1.2347,
"step": 2230
},
{
"epoch": 0.5683476054551221,
"grad_norm": 1.7554123401641846,
"learning_rate": 4.685693720859369e-06,
"loss": 1.2374,
"step": 2240
},
{
"epoch": 0.570884871550904,
"grad_norm": 1.7203478813171387,
"learning_rate": 4.641496546190813e-06,
"loss": 1.2364,
"step": 2250
},
{
"epoch": 0.5734221376466857,
"grad_norm": 1.759904384613037,
"learning_rate": 4.597327510789635e-06,
"loss": 1.2236,
"step": 2260
},
{
"epoch": 0.5759594037424675,
"grad_norm": 1.8005478382110596,
"learning_rate": 4.553190081524242e-06,
"loss": 1.2424,
"step": 2270
},
{
"epoch": 0.5784966698382493,
"grad_norm": 1.809352159500122,
"learning_rate": 4.5090877227822424e-06,
"loss": 1.2413,
"step": 2280
},
{
"epoch": 0.5810339359340311,
"grad_norm": 1.8049602508544922,
"learning_rate": 4.46502389619853e-06,
"loss": 1.2542,
"step": 2290
},
{
"epoch": 0.5835712020298128,
"grad_norm": 1.803334355354309,
"learning_rate": 4.421002060383569e-06,
"loss": 1.2353,
"step": 2300
},
{
"epoch": 0.5861084681255947,
"grad_norm": 1.7442089319229126,
"learning_rate": 4.3770256706519375e-06,
"loss": 1.2263,
"step": 2310
},
{
"epoch": 0.5886457342213764,
"grad_norm": 1.7808609008789062,
"learning_rate": 4.3330981787511006e-06,
"loss": 1.2266,
"step": 2320
},
{
"epoch": 0.5911830003171583,
"grad_norm": 1.8719425201416016,
"learning_rate": 4.289223032590491e-06,
"loss": 1.2609,
"step": 2330
},
{
"epoch": 0.5937202664129401,
"grad_norm": 1.7992342710494995,
"learning_rate": 4.245403675970877e-06,
"loss": 1.2318,
"step": 2340
},
{
"epoch": 0.5962575325087218,
"grad_norm": 1.8241426944732666,
"learning_rate": 4.201643548314051e-06,
"loss": 1.2339,
"step": 2350
},
{
"epoch": 0.5987947986045037,
"grad_norm": 1.826092004776001,
"learning_rate": 4.157946084392871e-06,
"loss": 1.2481,
"step": 2360
},
{
"epoch": 0.6013320647002854,
"grad_norm": 1.6740612983703613,
"learning_rate": 4.114314714061659e-06,
"loss": 1.2213,
"step": 2370
},
{
"epoch": 0.6038693307960672,
"grad_norm": 1.747037649154663,
"learning_rate": 4.0707528619869976e-06,
"loss": 1.2248,
"step": 2380
},
{
"epoch": 0.6064065968918491,
"grad_norm": 1.7676377296447754,
"learning_rate": 4.027263947378907e-06,
"loss": 1.2239,
"step": 2390
},
{
"epoch": 0.6089438629876308,
"grad_norm": 1.770889163017273,
"learning_rate": 3.9838513837224814e-06,
"loss": 1.2395,
"step": 2400
},
{
"epoch": 0.6114811290834127,
"grad_norm": 1.6925628185272217,
"learning_rate": 3.940518578509963e-06,
"loss": 1.2347,
"step": 2410
},
{
"epoch": 0.6140183951791944,
"grad_norm": 1.7867980003356934,
"learning_rate": 3.8972689329732725e-06,
"loss": 1.2392,
"step": 2420
},
{
"epoch": 0.6165556612749762,
"grad_norm": 1.7436933517456055,
"learning_rate": 3.854105841817056e-06,
"loss": 1.224,
"step": 2430
},
{
"epoch": 0.619092927370758,
"grad_norm": 1.7485663890838623,
"learning_rate": 3.811032692952227e-06,
"loss": 1.2104,
"step": 2440
},
{
"epoch": 0.6216301934665398,
"grad_norm": 1.7039844989776611,
"learning_rate": 3.7680528672300404e-06,
"loss": 1.2377,
"step": 2450
},
{
"epoch": 0.6241674595623216,
"grad_norm": 1.7440192699432373,
"learning_rate": 3.7251697381767373e-06,
"loss": 1.2385,
"step": 2460
},
{
"epoch": 0.6267047256581034,
"grad_norm": 1.716308832168579,
"learning_rate": 3.6823866717287437e-06,
"loss": 1.2349,
"step": 2470
},
{
"epoch": 0.6292419917538852,
"grad_norm": 1.8369241952896118,
"learning_rate": 3.6397070259684793e-06,
"loss": 1.233,
"step": 2480
},
{
"epoch": 0.6317792578496669,
"grad_norm": 1.7305907011032104,
"learning_rate": 3.5971341508607814e-06,
"loss": 1.2129,
"step": 2490
},
{
"epoch": 0.6343165239454488,
"grad_norm": 1.9149237871170044,
"learning_rate": 3.5546713879899563e-06,
"loss": 1.2193,
"step": 2500
},
{
"epoch": 0.6368537900412305,
"grad_norm": 1.9159984588623047,
"learning_rate": 3.512322070297503e-06,
"loss": 1.2177,
"step": 2510
},
{
"epoch": 0.6393910561370124,
"grad_norm": 1.7947359085083008,
"learning_rate": 3.4700895218205026e-06,
"loss": 1.2315,
"step": 2520
},
{
"epoch": 0.6419283222327942,
"grad_norm": 1.7361483573913574,
"learning_rate": 3.4279770574307096e-06,
"loss": 1.2353,
"step": 2530
},
{
"epoch": 0.6444655883285759,
"grad_norm": 1.8074885606765747,
"learning_rate": 3.385987982574372e-06,
"loss": 1.2171,
"step": 2540
},
{
"epoch": 0.6470028544243578,
"grad_norm": 1.7764816284179688,
"learning_rate": 3.3441255930127752e-06,
"loss": 1.2393,
"step": 2550
},
{
"epoch": 0.6495401205201395,
"grad_norm": 2.0956342220306396,
"learning_rate": 3.3023931745635606e-06,
"loss": 1.227,
"step": 2560
},
{
"epoch": 0.6520773866159213,
"grad_norm": 1.8369258642196655,
"learning_rate": 3.2607940028428154e-06,
"loss": 1.2378,
"step": 2570
},
{
"epoch": 0.6546146527117032,
"grad_norm": 1.7695237398147583,
"learning_rate": 3.2193313430079737e-06,
"loss": 1.2432,
"step": 2580
},
{
"epoch": 0.6571519188074849,
"grad_norm": 1.834971308708191,
"learning_rate": 3.178008449501517e-06,
"loss": 1.2215,
"step": 2590
},
{
"epoch": 0.6596891849032668,
"grad_norm": 1.7134376764297485,
"learning_rate": 3.1368285657955464e-06,
"loss": 1.2204,
"step": 2600
},
{
"epoch": 0.6622264509990485,
"grad_norm": 1.8207467794418335,
"learning_rate": 3.0957949241371845e-06,
"loss": 1.2371,
"step": 2610
},
{
"epoch": 0.6647637170948303,
"grad_norm": 1.8170554637908936,
"learning_rate": 3.0549107452948866e-06,
"loss": 1.235,
"step": 2620
},
{
"epoch": 0.6673009831906122,
"grad_norm": 1.8893216848373413,
"learning_rate": 3.014179238305629e-06,
"loss": 1.2257,
"step": 2630
},
{
"epoch": 0.6698382492863939,
"grad_norm": 1.7861133813858032,
"learning_rate": 2.9736036002230332e-06,
"loss": 1.2061,
"step": 2640
},
{
"epoch": 0.6723755153821757,
"grad_norm": 1.8051108121871948,
"learning_rate": 2.933187015866431e-06,
"loss": 1.2432,
"step": 2650
},
{
"epoch": 0.6749127814779575,
"grad_norm": 1.710418939590454,
"learning_rate": 2.892932657570878e-06,
"loss": 1.2179,
"step": 2660
},
{
"epoch": 0.6774500475737393,
"grad_norm": 1.7585214376449585,
"learning_rate": 2.8528436849381518e-06,
"loss": 1.2522,
"step": 2670
},
{
"epoch": 0.6799873136695211,
"grad_norm": 1.8252116441726685,
"learning_rate": 2.8129232445887623e-06,
"loss": 1.2288,
"step": 2680
},
{
"epoch": 0.6825245797653029,
"grad_norm": 1.8297280073165894,
"learning_rate": 2.773174469914964e-06,
"loss": 1.2273,
"step": 2690
},
{
"epoch": 0.6850618458610847,
"grad_norm": 1.8258917331695557,
"learning_rate": 2.7336004808348094e-06,
"loss": 1.2183,
"step": 2700
},
{
"epoch": 0.6875991119568665,
"grad_norm": 1.7145591974258423,
"learning_rate": 2.6942043835472725e-06,
"loss": 1.2234,
"step": 2710
},
{
"epoch": 0.6901363780526483,
"grad_norm": 1.8191933631896973,
"learning_rate": 2.654989270288435e-06,
"loss": 1.2301,
"step": 2720
},
{
"epoch": 0.69267364414843,
"grad_norm": 1.793045997619629,
"learning_rate": 2.615958219088776e-06,
"loss": 1.2253,
"step": 2730
},
{
"epoch": 0.6952109102442119,
"grad_norm": 1.7396857738494873,
"learning_rate": 2.577114293531571e-06,
"loss": 1.2183,
"step": 2740
},
{
"epoch": 0.6977481763399936,
"grad_norm": 1.7470922470092773,
"learning_rate": 2.538460542512435e-06,
"loss": 1.2193,
"step": 2750
},
{
"epoch": 0.7002854424357755,
"grad_norm": 1.885988473892212,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.2203,
"step": 2760
},
{
"epoch": 0.7028227085315573,
"grad_norm": 1.7453058958053589,
"learning_rate": 2.461735684797794e-06,
"loss": 1.2308,
"step": 2770
},
{
"epoch": 0.705359974627339,
"grad_norm": 1.9530824422836304,
"learning_rate": 2.4236706003072733e-06,
"loss": 1.2472,
"step": 2780
},
{
"epoch": 0.7078972407231209,
"grad_norm": 1.9082190990447998,
"learning_rate": 2.385807734292097e-06,
"loss": 1.211,
"step": 2790
},
{
"epoch": 0.7104345068189026,
"grad_norm": 1.8059364557266235,
"learning_rate": 2.3481500586436067e-06,
"loss": 1.2307,
"step": 2800
},
{
"epoch": 0.7129717729146844,
"grad_norm": 2.033775806427002,
"learning_rate": 2.3107005291475653e-06,
"loss": 1.2313,
"step": 2810
},
{
"epoch": 0.7155090390104663,
"grad_norm": 1.7707144021987915,
"learning_rate": 2.273462085252146e-06,
"loss": 1.2019,
"step": 2820
},
{
"epoch": 0.718046305106248,
"grad_norm": 1.8923752307891846,
"learning_rate": 2.236437649837223e-06,
"loss": 1.2496,
"step": 2830
},
{
"epoch": 0.7205835712020298,
"grad_norm": 1.820609450340271,
"learning_rate": 2.1996301289849474e-06,
"loss": 1.2232,
"step": 2840
},
{
"epoch": 0.7231208372978116,
"grad_norm": 1.8817142248153687,
"learning_rate": 2.1630424117516436e-06,
"loss": 1.2134,
"step": 2850
},
{
"epoch": 0.7256581033935934,
"grad_norm": 1.7665314674377441,
"learning_rate": 2.126677369941047e-06,
"loss": 1.192,
"step": 2860
},
{
"epoch": 0.7281953694893752,
"grad_norm": 1.8656190633773804,
"learning_rate": 2.0905378578788947e-06,
"loss": 1.218,
"step": 2870
},
{
"epoch": 0.730732635585157,
"grad_norm": 1.7951234579086304,
"learning_rate": 2.0546267121888863e-06,
"loss": 1.2099,
"step": 2880
},
{
"epoch": 0.7332699016809388,
"grad_norm": 1.8712400197982788,
"learning_rate": 2.0189467515700283e-06,
"loss": 1.2071,
"step": 2890
},
{
"epoch": 0.7358071677767206,
"grad_norm": 1.8098704814910889,
"learning_rate": 1.9835007765754035e-06,
"loss": 1.2345,
"step": 2900
},
{
"epoch": 0.7383444338725024,
"grad_norm": 1.8739259243011475,
"learning_rate": 1.9482915693923442e-06,
"loss": 1.2138,
"step": 2910
},
{
"epoch": 0.7408816999682841,
"grad_norm": 1.8259673118591309,
"learning_rate": 1.913321893624059e-06,
"loss": 1.2103,
"step": 2920
},
{
"epoch": 0.743418966064066,
"grad_norm": 1.879623532295227,
"learning_rate": 1.878594494072713e-06,
"loss": 1.2094,
"step": 2930
},
{
"epoch": 0.7459562321598477,
"grad_norm": 1.7758175134658813,
"learning_rate": 1.8441120965239912e-06,
"loss": 1.219,
"step": 2940
},
{
"epoch": 0.7484934982556296,
"grad_norm": 1.7712740898132324,
"learning_rate": 1.8098774075331383e-06,
"loss": 1.2312,
"step": 2950
},
{
"epoch": 0.7510307643514114,
"grad_norm": 1.8342068195343018,
"learning_rate": 1.7758931142125308e-06,
"loss": 1.2284,
"step": 2960
},
{
"epoch": 0.7535680304471931,
"grad_norm": 1.922353982925415,
"learning_rate": 1.7421618840207576e-06,
"loss": 1.2251,
"step": 2970
},
{
"epoch": 0.756105296542975,
"grad_norm": 1.7844698429107666,
"learning_rate": 1.7086863645532425e-06,
"loss": 1.2057,
"step": 2980
},
{
"epoch": 0.7586425626387567,
"grad_norm": 1.8080805540084839,
"learning_rate": 1.6754691833344472e-06,
"loss": 1.2355,
"step": 2990
},
{
"epoch": 0.7611798287345385,
"grad_norm": 1.791804313659668,
"learning_rate": 1.642512947611622e-06,
"loss": 1.2406,
"step": 3000
},
{
"epoch": 0.7637170948303204,
"grad_norm": 1.8191893100738525,
"learning_rate": 1.6098202441501599e-06,
"loss": 1.2101,
"step": 3010
},
{
"epoch": 0.7662543609261021,
"grad_norm": 1.8108185529708862,
"learning_rate": 1.5773936390305678e-06,
"loss": 1.1916,
"step": 3020
},
{
"epoch": 0.768791627021884,
"grad_norm": 1.7712724208831787,
"learning_rate": 1.5452356774470468e-06,
"loss": 1.2103,
"step": 3030
},
{
"epoch": 0.7713288931176657,
"grad_norm": 1.8046748638153076,
"learning_rate": 1.5133488835077204e-06,
"loss": 1.2147,
"step": 3040
},
{
"epoch": 0.7738661592134475,
"grad_norm": 1.8228182792663574,
"learning_rate": 1.4817357600365061e-06,
"loss": 1.2153,
"step": 3050
},
{
"epoch": 0.7764034253092293,
"grad_norm": 1.9025073051452637,
"learning_rate": 1.4503987883766857e-06,
"loss": 1.2207,
"step": 3060
},
{
"epoch": 0.7789406914050111,
"grad_norm": 1.8209235668182373,
"learning_rate": 1.4193404281961172e-06,
"loss": 1.2225,
"step": 3070
},
{
"epoch": 0.7814779575007929,
"grad_norm": 1.8447314500808716,
"learning_rate": 1.3885631172941932e-06,
"loss": 1.2265,
"step": 3080
},
{
"epoch": 0.7840152235965747,
"grad_norm": 1.8785367012023926,
"learning_rate": 1.3580692714104887e-06,
"loss": 1.2053,
"step": 3090
},
{
"epoch": 0.7865524896923565,
"grad_norm": 1.9189660549163818,
"learning_rate": 1.3278612840351468e-06,
"loss": 1.2253,
"step": 3100
},
{
"epoch": 0.7890897557881383,
"grad_norm": 1.7978157997131348,
"learning_rate": 1.2979415262210089e-06,
"loss": 1.2183,
"step": 3110
},
{
"epoch": 0.7916270218839201,
"grad_norm": 1.769068956375122,
"learning_rate": 1.2683123463975144e-06,
"loss": 1.2057,
"step": 3120
},
{
"epoch": 0.7941642879797018,
"grad_norm": 1.7503260374069214,
"learning_rate": 1.2389760701863717e-06,
"loss": 1.2295,
"step": 3130
},
{
"epoch": 0.7967015540754837,
"grad_norm": 1.8129879236221313,
"learning_rate": 1.2099350002190063e-06,
"loss": 1.2066,
"step": 3140
},
{
"epoch": 0.7992388201712655,
"grad_norm": 1.824507713317871,
"learning_rate": 1.1811914159558374e-06,
"loss": 1.2385,
"step": 3150
},
{
"epoch": 0.8017760862670472,
"grad_norm": 1.8247697353363037,
"learning_rate": 1.1527475735073574e-06,
"loss": 1.2372,
"step": 3160
},
{
"epoch": 0.8043133523628291,
"grad_norm": 1.7803229093551636,
"learning_rate": 1.1246057054570414e-06,
"loss": 1.2009,
"step": 3170
},
{
"epoch": 0.8068506184586108,
"grad_norm": 1.7703157663345337,
"learning_rate": 1.0967680206861198e-06,
"loss": 1.1976,
"step": 3180
},
{
"epoch": 0.8093878845543926,
"grad_norm": 1.7612484693527222,
"learning_rate": 1.069236704200195e-06,
"loss": 1.206,
"step": 3190
},
{
"epoch": 0.8119251506501745,
"grad_norm": 1.739809274673462,
"learning_rate": 1.0420139169577393e-06,
"loss": 1.2068,
"step": 3200
},
{
"epoch": 0.8144624167459562,
"grad_norm": 1.7609456777572632,
"learning_rate": 1.01510179570048e-06,
"loss": 1.2115,
"step": 3210
},
{
"epoch": 0.8169996828417381,
"grad_norm": 1.8573811054229736,
"learning_rate": 9.88502452785685e-07,
"loss": 1.2151,
"step": 3220
},
{
"epoch": 0.8195369489375198,
"grad_norm": 1.7674131393432617,
"learning_rate": 9.62217976020357e-07,
"loss": 1.221,
"step": 3230
},
{
"epoch": 0.8220742150333016,
"grad_norm": 1.9328287839889526,
"learning_rate": 9.362504284973683e-07,
"loss": 1.2013,
"step": 3240
},
{
"epoch": 0.8246114811290834,
"grad_norm": 1.881496787071228,
"learning_rate": 9.1060184843352e-07,
"loss": 1.2104,
"step": 3250
},
{
"epoch": 0.8271487472248652,
"grad_norm": 1.797257661819458,
"learning_rate": 8.852742490095628e-07,
"loss": 1.2026,
"step": 3260
},
{
"epoch": 0.829686013320647,
"grad_norm": 1.7446339130401611,
"learning_rate": 8.602696182121812e-07,
"loss": 1.2078,
"step": 3270
},
{
"epoch": 0.8322232794164288,
"grad_norm": 1.8723726272583008,
"learning_rate": 8.35589918677952e-07,
"loss": 1.2117,
"step": 3280
},
{
"epoch": 0.8347605455122106,
"grad_norm": 1.7332195043563843,
"learning_rate": 8.112370875393e-07,
"loss": 1.2154,
"step": 3290
},
{
"epoch": 0.8372978116079924,
"grad_norm": 1.7967216968536377,
"learning_rate": 7.872130362724422e-07,
"loss": 1.1956,
"step": 3300
},
{
"epoch": 0.8398350777037742,
"grad_norm": 1.889953851699829,
"learning_rate": 7.635196505473652e-07,
"loss": 1.2149,
"step": 3310
},
{
"epoch": 0.842372343799556,
"grad_norm": 1.8281513452529907,
"learning_rate": 7.401587900798091e-07,
"loss": 1.2106,
"step": 3320
},
{
"epoch": 0.8449096098953378,
"grad_norm": 1.917148232460022,
"learning_rate": 7.171322884852988e-07,
"loss": 1.2256,
"step": 3330
},
{
"epoch": 0.8474468759911196,
"grad_norm": 1.8084789514541626,
"learning_rate": 6.944419531352236e-07,
"loss": 1.2162,
"step": 3340
},
{
"epoch": 0.8499841420869013,
"grad_norm": 1.7765891551971436,
"learning_rate": 6.720895650149744e-07,
"loss": 1.2236,
"step": 3350
},
{
"epoch": 0.8525214081826832,
"grad_norm": 1.8284307718276978,
"learning_rate": 6.500768785841482e-07,
"loss": 1.2111,
"step": 3360
},
{
"epoch": 0.8550586742784649,
"grad_norm": 1.8736671209335327,
"learning_rate": 6.284056216388451e-07,
"loss": 1.2111,
"step": 3370
},
{
"epoch": 0.8575959403742468,
"grad_norm": 1.8187748193740845,
"learning_rate": 6.070774951760505e-07,
"loss": 1.2058,
"step": 3380
},
{
"epoch": 0.8601332064700286,
"grad_norm": 1.811501145362854,
"learning_rate": 5.860941732601166e-07,
"loss": 1.1993,
"step": 3390
},
{
"epoch": 0.8626704725658103,
"grad_norm": 1.8596574068069458,
"learning_rate": 5.654573028913735e-07,
"loss": 1.2039,
"step": 3400
},
{
"epoch": 0.8652077386615922,
"grad_norm": 1.8533954620361328,
"learning_rate": 5.451685038768473e-07,
"loss": 1.228,
"step": 3410
},
{
"epoch": 0.8677450047573739,
"grad_norm": 1.7679084539413452,
"learning_rate": 5.252293687031196e-07,
"loss": 1.1993,
"step": 3420
},
{
"epoch": 0.8702822708531557,
"grad_norm": 1.8449029922485352,
"learning_rate": 5.05641462411336e-07,
"loss": 1.2015,
"step": 3430
},
{
"epoch": 0.8728195369489375,
"grad_norm": 1.8610782623291016,
"learning_rate": 4.864063224743626e-07,
"loss": 1.2049,
"step": 3440
},
{
"epoch": 0.8753568030447193,
"grad_norm": 1.7010682821273804,
"learning_rate": 4.6752545867610963e-07,
"loss": 1.2047,
"step": 3450
},
{
"epoch": 0.8778940691405012,
"grad_norm": 1.7259236574172974,
"learning_rate": 4.4900035299302036e-07,
"loss": 1.204,
"step": 3460
},
{
"epoch": 0.8804313352362829,
"grad_norm": 1.7741477489471436,
"learning_rate": 4.308324594777635e-07,
"loss": 1.2025,
"step": 3470
},
{
"epoch": 0.8829686013320647,
"grad_norm": 1.8965697288513184,
"learning_rate": 4.130232041450866e-07,
"loss": 1.2034,
"step": 3480
},
{
"epoch": 0.8855058674278465,
"grad_norm": 1.7573304176330566,
"learning_rate": 3.9557398485989884e-07,
"loss": 1.1985,
"step": 3490
},
{
"epoch": 0.8880431335236283,
"grad_norm": 1.7425026893615723,
"learning_rate": 3.784861712275467e-07,
"loss": 1.1938,
"step": 3500
},
{
"epoch": 0.89058039961941,
"grad_norm": 1.7624136209487915,
"learning_rate": 3.61761104486314e-07,
"loss": 1.2098,
"step": 3510
},
{
"epoch": 0.8931176657151919,
"grad_norm": 1.8496613502502441,
"learning_rate": 3.454000974021432e-07,
"loss": 1.2179,
"step": 3520
},
{
"epoch": 0.8956549318109737,
"grad_norm": 1.796911597251892,
"learning_rate": 3.294044341655983e-07,
"loss": 1.1989,
"step": 3530
},
{
"epoch": 0.8981921979067554,
"grad_norm": 1.854840874671936,
"learning_rate": 3.1377537029107174e-07,
"loss": 1.1939,
"step": 3540
},
{
"epoch": 0.9007294640025373,
"grad_norm": 1.7666873931884766,
"learning_rate": 2.985141325182267e-07,
"loss": 1.2132,
"step": 3550
},
{
"epoch": 0.903266730098319,
"grad_norm": 1.8497194051742554,
"learning_rate": 2.836219187157202e-07,
"loss": 1.2106,
"step": 3560
},
{
"epoch": 0.9058039961941009,
"grad_norm": 1.865463376045227,
"learning_rate": 2.69099897787175e-07,
"loss": 1.1999,
"step": 3570
},
{
"epoch": 0.9083412622898827,
"grad_norm": 1.7132046222686768,
"learning_rate": 2.5494920957943314e-07,
"loss": 1.1973,
"step": 3580
},
{
"epoch": 0.9108785283856644,
"grad_norm": 1.8095914125442505,
"learning_rate": 2.411709647930882e-07,
"loss": 1.2137,
"step": 3590
},
{
"epoch": 0.9134157944814463,
"grad_norm": 1.745731234550476,
"learning_rate": 2.2776624489530664e-07,
"loss": 1.2098,
"step": 3600
},
{
"epoch": 0.915953060577228,
"grad_norm": 1.834447979927063,
"learning_rate": 2.1473610203494032e-07,
"loss": 1.2122,
"step": 3610
},
{
"epoch": 0.9184903266730098,
"grad_norm": 1.8160932064056396,
"learning_rate": 2.0208155895994343e-07,
"loss": 1.1939,
"step": 3620
},
{
"epoch": 0.9210275927687916,
"grad_norm": 1.8465439081192017,
"learning_rate": 1.8980360893709582e-07,
"loss": 1.2068,
"step": 3630
},
{
"epoch": 0.9235648588645734,
"grad_norm": 1.8164421319961548,
"learning_rate": 1.7790321567404011e-07,
"loss": 1.2053,
"step": 3640
},
{
"epoch": 0.9261021249603553,
"grad_norm": 1.8823806047439575,
"learning_rate": 1.6638131324364094e-07,
"loss": 1.2077,
"step": 3650
},
{
"epoch": 0.928639391056137,
"grad_norm": 1.8699020147323608,
"learning_rate": 1.55238806010668e-07,
"loss": 1.1787,
"step": 3660
},
{
"epoch": 0.9311766571519188,
"grad_norm": 1.856771469116211,
"learning_rate": 1.444765685608096e-07,
"loss": 1.2126,
"step": 3670
},
{
"epoch": 0.9337139232477006,
"grad_norm": 1.7968028783798218,
"learning_rate": 1.340954456320287e-07,
"loss": 1.2085,
"step": 3680
},
{
"epoch": 0.9362511893434824,
"grad_norm": 1.8331055641174316,
"learning_rate": 1.2409625204825802e-07,
"loss": 1.2081,
"step": 3690
},
{
"epoch": 0.9387884554392641,
"grad_norm": 1.7524057626724243,
"learning_rate": 1.1447977265544141e-07,
"loss": 1.2121,
"step": 3700
},
{
"epoch": 0.941325721535046,
"grad_norm": 1.8265066146850586,
"learning_rate": 1.052467622599329e-07,
"loss": 1.1897,
"step": 3710
},
{
"epoch": 0.9438629876308278,
"grad_norm": 1.8773318529129028,
"learning_rate": 9.639794556925041e-08,
"loss": 1.2053,
"step": 3720
},
{
"epoch": 0.9464002537266096,
"grad_norm": 1.8355180025100708,
"learning_rate": 8.793401713519333e-08,
"loss": 1.2044,
"step": 3730
},
{
"epoch": 0.9489375198223914,
"grad_norm": 1.8861024379730225,
"learning_rate": 7.985564129932566e-08,
"loss": 1.2143,
"step": 3740
},
{
"epoch": 0.9514747859181731,
"grad_norm": 1.8514657020568848,
"learning_rate": 7.216345214083264e-08,
"loss": 1.2143,
"step": 3750
},
{
"epoch": 0.954012052013955,
"grad_norm": 1.7812689542770386,
"learning_rate": 6.485805342674901e-08,
"loss": 1.1858,
"step": 3760
},
{
"epoch": 0.9565493181097368,
"grad_norm": 1.838179349899292,
"learning_rate": 5.7940018564570654e-08,
"loss": 1.2116,
"step": 3770
},
{
"epoch": 0.9590865842055185,
"grad_norm": 1.7216005325317383,
"learning_rate": 5.1409890557246876e-08,
"loss": 1.2106,
"step": 3780
},
{
"epoch": 0.9616238503013004,
"grad_norm": 1.8689485788345337,
"learning_rate": 4.526818196055938e-08,
"loss": 1.2112,
"step": 3790
},
{
"epoch": 0.9641611163970821,
"grad_norm": 1.773857593536377,
"learning_rate": 3.951537484289114e-08,
"loss": 1.2254,
"step": 3800
},
{
"epoch": 0.966698382492864,
"grad_norm": 1.8553355932235718,
"learning_rate": 3.4151920747390044e-08,
"loss": 1.1961,
"step": 3810
},
{
"epoch": 0.9692356485886457,
"grad_norm": 1.7645963430404663,
"learning_rate": 2.9178240656523305e-08,
"loss": 1.2079,
"step": 3820
},
{
"epoch": 0.9717729146844275,
"grad_norm": 1.8326435089111328,
"learning_rate": 2.4594724959037253e-08,
"loss": 1.2201,
"step": 3830
},
{
"epoch": 0.9743101807802094,
"grad_norm": 1.8248659372329712,
"learning_rate": 2.0401733419315727e-08,
"loss": 1.2018,
"step": 3840
},
{
"epoch": 0.9768474468759911,
"grad_norm": 1.7907987833023071,
"learning_rate": 1.659959514913767e-08,
"loss": 1.2187,
"step": 3850
},
{
"epoch": 0.9793847129717729,
"grad_norm": 1.8781341314315796,
"learning_rate": 1.3188608581851114e-08,
"loss": 1.1951,
"step": 3860
},
{
"epoch": 0.9819219790675547,
"grad_norm": 1.8266582489013672,
"learning_rate": 1.016904144894304e-08,
"loss": 1.2178,
"step": 3870
},
{
"epoch": 0.9844592451633365,
"grad_norm": 1.7504289150238037,
"learning_rate": 7.541130759027848e-09,
"loss": 1.2076,
"step": 3880
},
{
"epoch": 0.9869965112591182,
"grad_norm": 1.7687255144119263,
"learning_rate": 5.305082779244464e-09,
"loss": 1.1923,
"step": 3890
},
{
"epoch": 0.9895337773549001,
"grad_norm": 1.7166252136230469,
"learning_rate": 3.4610730190648423e-09,
"loss": 1.223,
"step": 3900
},
{
"epoch": 0.9920710434506819,
"grad_norm": 1.792605996131897,
"learning_rate": 2.0092462165194337e-09,
"loss": 1.2253,
"step": 3910
},
{
"epoch": 0.9946083095464637,
"grad_norm": 1.7338823080062866,
"learning_rate": 9.497163268351595e-10,
"loss": 1.186,
"step": 3920
},
{
"epoch": 0.9971455756422455,
"grad_norm": 1.8996716737747192,
"learning_rate": 2.825665134920108e-10,
"loss": 1.2094,
"step": 3930
},
{
"epoch": 0.9996828417380272,
"grad_norm": 1.8665469884872437,
"learning_rate": 7.849141696048002e-12,
"loss": 1.2135,
"step": 3940
},
{
"epoch": 0.9999365683476055,
"step": 3941,
"total_flos": 6.337025018626572e+18,
"train_loss": 1.2871940559434636,
"train_runtime": 18302.3162,
"train_samples_per_second": 27.564,
"train_steps_per_second": 0.215
}
],
"logging_steps": 10,
"max_steps": 3941,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.337025018626572e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}