{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1455,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03436426116838488,
      "grad_norm": 5.543055534362793,
      "learning_rate": 2.7397260273972603e-05,
      "loss": 1.3448,
      "step": 10
    },
    {
      "epoch": 0.06872852233676977,
      "grad_norm": 2.9320967197418213,
      "learning_rate": 5.479452054794521e-05,
      "loss": 0.574,
      "step": 20
    },
    {
      "epoch": 0.10309278350515463,
      "grad_norm": 2.3950774669647217,
      "learning_rate": 8.219178082191781e-05,
      "loss": 0.3403,
      "step": 30
    },
    {
      "epoch": 0.13745704467353953,
      "grad_norm": 1.2579808235168457,
      "learning_rate": 0.00010958904109589041,
      "loss": 0.2498,
      "step": 40
    },
    {
      "epoch": 0.1718213058419244,
      "grad_norm": 1.5484106540679932,
      "learning_rate": 0.000136986301369863,
      "loss": 0.2205,
      "step": 50
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 0.9280014634132385,
      "learning_rate": 0.00016438356164383562,
      "loss": 0.2014,
      "step": 60
    },
    {
      "epoch": 0.24054982817869416,
      "grad_norm": 0.8061118125915527,
      "learning_rate": 0.0001917808219178082,
      "loss": 0.1826,
      "step": 70
    },
    {
      "epoch": 0.27491408934707906,
      "grad_norm": 1.369801640510559,
      "learning_rate": 0.00019998733979961563,
      "loss": 0.1547,
      "step": 80
    },
    {
      "epoch": 0.30927835051546393,
      "grad_norm": 0.7772133350372314,
      "learning_rate": 0.0001999253383717226,
      "loss": 0.1451,
      "step": 90
    },
    {
      "epoch": 0.3436426116838488,
      "grad_norm": 1.0345059633255005,
      "learning_rate": 0.00019981170237143067,
      "loss": 0.1422,
      "step": 100
    },
    {
      "epoch": 0.37800687285223367,
      "grad_norm": 0.7138351798057556,
      "learning_rate": 0.00019964649051804355,
      "loss": 0.1441,
      "step": 110
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 0.7671028971672058,
      "learning_rate": 0.000199429788181734,
      "loss": 0.1323,
      "step": 120
    },
    {
      "epoch": 0.44673539518900346,
      "grad_norm": 0.7595092058181763,
      "learning_rate": 0.0001991617073394306,
      "loss": 0.1167,
      "step": 130
    },
    {
      "epoch": 0.48109965635738833,
      "grad_norm": 0.9330806732177734,
      "learning_rate": 0.00019884238651695556,
      "loss": 0.1146,
      "step": 140
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 0.6702235341072083,
      "learning_rate": 0.00019847199071744415,
      "loss": 0.1254,
      "step": 150
    },
    {
      "epoch": 0.5498281786941581,
      "grad_norm": 0.493888258934021,
      "learning_rate": 0.00019805071133608242,
      "loss": 0.1005,
      "step": 160
    },
    {
      "epoch": 0.584192439862543,
      "grad_norm": 0.5991718769073486,
      "learning_rate": 0.0001975787660612072,
      "loss": 0.0953,
      "step": 170
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 0.6898994445800781,
      "learning_rate": 0.00019705639876181969,
      "loss": 0.0816,
      "step": 180
    },
    {
      "epoch": 0.6529209621993127,
      "grad_norm": 0.6507448554039001,
      "learning_rate": 0.00019648387936157068,
      "loss": 0.0898,
      "step": 190
    },
    {
      "epoch": 0.6872852233676976,
      "grad_norm": 0.6411319971084595,
      "learning_rate": 0.00019586150369928245,
      "loss": 0.0887,
      "step": 200
    },
    {
      "epoch": 0.7216494845360825,
      "grad_norm": 0.5917101502418518,
      "learning_rate": 0.00019518959337607957,
      "loss": 0.079,
      "step": 210
    },
    {
      "epoch": 0.7560137457044673,
      "grad_norm": 0.6061806678771973,
      "learning_rate": 0.0001944684955892075,
      "loss": 0.0946,
      "step": 220
    },
    {
      "epoch": 0.7903780068728522,
      "grad_norm": 0.4965127408504486,
      "learning_rate": 0.0001936985829526247,
      "loss": 0.0769,
      "step": 230
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 0.6563584208488464,
      "learning_rate": 0.00019288025330446126,
      "loss": 0.0865,
      "step": 240
    },
    {
      "epoch": 0.8591065292096219,
      "grad_norm": 0.38654449582099915,
      "learning_rate": 0.00019201392950144363,
      "loss": 0.0767,
      "step": 250
    },
    {
      "epoch": 0.8934707903780069,
      "grad_norm": 0.4376271069049835,
      "learning_rate": 0.0001911000592003909,
      "loss": 0.0729,
      "step": 260
    },
    {
      "epoch": 0.9278350515463918,
      "grad_norm": 0.3565465211868286,
      "learning_rate": 0.00019013911462689668,
      "loss": 0.0697,
      "step": 270
    },
    {
      "epoch": 0.9621993127147767,
      "grad_norm": 0.6215035915374756,
      "learning_rate": 0.000189131592331315,
      "loss": 0.087,
      "step": 280
    },
    {
      "epoch": 0.9965635738831615,
      "grad_norm": 0.5725772976875305,
      "learning_rate": 0.00018807801293217735,
      "loss": 0.0703,
      "step": 290
    },
    {
      "epoch": 1.0309278350515463,
      "grad_norm": 0.3892384469509125,
      "learning_rate": 0.00018697892084717238,
      "loss": 0.0657,
      "step": 300
    },
    {
      "epoch": 1.0652920962199313,
      "grad_norm": 0.4619602859020233,
      "learning_rate": 0.00018583488401182843,
      "loss": 0.0686,
      "step": 310
    },
    {
      "epoch": 1.0996563573883162,
      "grad_norm": 0.48653048276901245,
      "learning_rate": 0.0001846464935860431,
      "loss": 0.0712,
      "step": 320
    },
    {
      "epoch": 1.134020618556701,
      "grad_norm": 0.42437320947647095,
      "learning_rate": 0.0001834143636486124,
      "loss": 0.0745,
      "step": 330
    },
    {
      "epoch": 1.168384879725086,
      "grad_norm": 0.5314765572547913,
      "learning_rate": 0.00018213913087991685,
      "loss": 0.0583,
      "step": 340
    },
    {
      "epoch": 1.2027491408934707,
      "grad_norm": 0.618599534034729,
      "learning_rate": 0.00018082145423292868,
      "loss": 0.0735,
      "step": 350
    },
    {
      "epoch": 1.2371134020618557,
      "grad_norm": 0.4557077884674072,
      "learning_rate": 0.0001794620145927101,
      "loss": 0.0724,
      "step": 360
    },
    {
      "epoch": 1.2714776632302405,
      "grad_norm": 0.3549683690071106,
      "learning_rate": 0.00017806151442457827,
      "loss": 0.0591,
      "step": 370
    },
    {
      "epoch": 1.3058419243986255,
      "grad_norm": 0.4492852985858917,
      "learning_rate": 0.00017662067741111974,
      "loss": 0.0586,
      "step": 380
    },
    {
      "epoch": 1.3402061855670104,
      "grad_norm": 0.392857164144516,
      "learning_rate": 0.00017514024807824055,
      "loss": 0.0765,
      "step": 390
    },
    {
      "epoch": 1.3745704467353952,
      "grad_norm": 0.3970784544944763,
      "learning_rate": 0.00017362099141044626,
      "loss": 0.0534,
      "step": 400
    },
    {
      "epoch": 1.40893470790378,
      "grad_norm": 0.44281744956970215,
      "learning_rate": 0.00017206369245555036,
      "loss": 0.0611,
      "step": 410
    },
    {
      "epoch": 1.443298969072165,
      "grad_norm": 0.3246597945690155,
      "learning_rate": 0.0001704691559190155,
      "loss": 0.0671,
      "step": 420
    },
    {
      "epoch": 1.47766323024055,
      "grad_norm": 0.33960649371147156,
      "learning_rate": 0.0001688382057481364,
      "loss": 0.0661,
      "step": 430
    },
    {
      "epoch": 1.5120274914089347,
      "grad_norm": 0.5256112813949585,
      "learning_rate": 0.00016717168470628077,
      "loss": 0.0675,
      "step": 440
    },
    {
      "epoch": 1.5463917525773194,
      "grad_norm": 0.5827927589416504,
      "learning_rate": 0.0001654704539374066,
      "loss": 0.0667,
      "step": 450
    },
    {
      "epoch": 1.5807560137457046,
      "grad_norm": 0.6631433963775635,
      "learning_rate": 0.00016373539252108202,
      "loss": 0.0636,
      "step": 460
    },
    {
      "epoch": 1.6151202749140894,
      "grad_norm": 0.5339364409446716,
      "learning_rate": 0.00016196739701823716,
      "loss": 0.0726,
      "step": 470
    },
    {
      "epoch": 1.6494845360824741,
      "grad_norm": 0.6372013092041016,
      "learning_rate": 0.00016016738100788297,
      "loss": 0.0556,
      "step": 480
    },
    {
      "epoch": 1.6838487972508591,
      "grad_norm": 0.46744048595428467,
      "learning_rate": 0.00015833627461503595,
      "loss": 0.059,
      "step": 490
    },
    {
      "epoch": 1.718213058419244,
      "grad_norm": 0.32253313064575195,
      "learning_rate": 0.0001564750240300934,
      "loss": 0.0475,
      "step": 500
    },
    {
      "epoch": 1.7525773195876289,
      "grad_norm": 0.4637961685657501,
      "learning_rate": 0.00015458459101990693,
      "loss": 0.0514,
      "step": 510
    },
    {
      "epoch": 1.7869415807560136,
      "grad_norm": 0.43408897519111633,
      "learning_rate": 0.00015266595243080714,
      "loss": 0.0509,
      "step": 520
    },
    {
      "epoch": 1.8213058419243986,
      "grad_norm": 0.5546535849571228,
      "learning_rate": 0.00015072009968383656,
      "loss": 0.0572,
      "step": 530
    },
    {
      "epoch": 1.8556701030927836,
      "grad_norm": 0.3202098309993744,
      "learning_rate": 0.00014874803826245089,
      "loss": 0.0615,
      "step": 540
    },
    {
      "epoch": 1.8900343642611683,
      "grad_norm": 0.4085174798965454,
      "learning_rate": 0.00014675078719295415,
      "loss": 0.0561,
      "step": 550
    },
    {
      "epoch": 1.9243986254295533,
      "grad_norm": 0.4084959030151367,
      "learning_rate": 0.00014472937851793557,
      "loss": 0.0616,
      "step": 560
    },
    {
      "epoch": 1.9587628865979383,
      "grad_norm": 0.4582497179508209,
      "learning_rate": 0.00014268485676298078,
      "loss": 0.0675,
      "step": 570
    },
    {
      "epoch": 1.993127147766323,
      "grad_norm": 0.25662359595298767,
      "learning_rate": 0.0001406182783969324,
      "loss": 0.0543,
      "step": 580
    },
    {
      "epoch": 2.027491408934708,
      "grad_norm": 0.2858852744102478,
      "learning_rate": 0.00013853071128597924,
      "loss": 0.0447,
      "step": 590
    },
    {
      "epoch": 2.0618556701030926,
      "grad_norm": 0.4853512942790985,
      "learning_rate": 0.0001364232341418564,
      "loss": 0.0537,
      "step": 600
    },
    {
      "epoch": 2.0962199312714778,
      "grad_norm": 0.40022608637809753,
      "learning_rate": 0.00013429693596444067,
      "loss": 0.0647,
      "step": 610
    },
    {
      "epoch": 2.1305841924398625,
      "grad_norm": 0.44074031710624695,
      "learning_rate": 0.00013215291547903006,
      "loss": 0.063,
      "step": 620
    },
    {
      "epoch": 2.1649484536082473,
      "grad_norm": 0.3592728078365326,
      "learning_rate": 0.00012999228056859784,
      "loss": 0.0608,
      "step": 630
    },
    {
      "epoch": 2.1993127147766325,
      "grad_norm": 0.3472447395324707,
      "learning_rate": 0.00012781614770131442,
      "loss": 0.0541,
      "step": 640
    },
    {
      "epoch": 2.2336769759450172,
      "grad_norm": 0.30898717045783997,
      "learning_rate": 0.00012562564135363313,
      "loss": 0.0454,
      "step": 650
    },
    {
      "epoch": 2.268041237113402,
      "grad_norm": 0.3706619441509247,
      "learning_rate": 0.0001234218934292376,
      "loss": 0.0524,
      "step": 660
    },
    {
      "epoch": 2.3024054982817868,
      "grad_norm": 0.35367798805236816,
      "learning_rate": 0.00012120604267415172,
      "loss": 0.0351,
      "step": 670
    },
    {
      "epoch": 2.336769759450172,
      "grad_norm": 0.36357077956199646,
      "learning_rate": 0.00011897923408831346,
      "loss": 0.0558,
      "step": 680
    },
    {
      "epoch": 2.3711340206185567,
      "grad_norm": 0.4092961251735687,
      "learning_rate": 0.0001167426183339174,
      "loss": 0.041,
      "step": 690
    },
    {
      "epoch": 2.4054982817869415,
      "grad_norm": 0.2752332389354706,
      "learning_rate": 0.00011449735114083127,
      "loss": 0.0407,
      "step": 700
    },
    {
      "epoch": 2.4398625429553267,
      "grad_norm": 0.38444244861602783,
      "learning_rate": 0.00011224459270939384,
      "loss": 0.044,
      "step": 710
    },
    {
      "epoch": 2.4742268041237114,
      "grad_norm": 0.3202449679374695,
      "learning_rate": 0.000109985507110903,
      "loss": 0.0422,
      "step": 720
    },
    {
      "epoch": 2.508591065292096,
      "grad_norm": 0.2754347324371338,
      "learning_rate": 0.00010772126168610325,
      "loss": 0.0468,
      "step": 730
    },
    {
      "epoch": 2.542955326460481,
      "grad_norm": 0.32674992084503174,
      "learning_rate": 0.00010545302644198405,
      "loss": 0.0461,
      "step": 740
    },
    {
      "epoch": 2.5773195876288657,
      "grad_norm": 0.27970951795578003,
      "learning_rate": 0.00010318197344720018,
      "loss": 0.0396,
      "step": 750
    },
    {
      "epoch": 2.611683848797251,
      "grad_norm": 0.3448905646800995,
      "learning_rate": 0.0001009092762264271,
      "loss": 0.039,
      "step": 760
    },
    {
      "epoch": 2.6460481099656357,
      "grad_norm": 0.32179659605026245,
      "learning_rate": 9.863610915396365e-05,
      "loss": 0.0446,
      "step": 770
    },
    {
      "epoch": 2.680412371134021,
      "grad_norm": 0.3091253340244293,
      "learning_rate": 9.63636468468959e-05,
      "loss": 0.0536,
      "step": 780
    },
    {
      "epoch": 2.7147766323024056,
      "grad_norm": 0.4326021671295166,
      "learning_rate": 9.409306355813529e-05,
      "loss": 0.0401,
      "step": 790
    },
    {
      "epoch": 2.7491408934707904,
      "grad_norm": 0.2855621874332428,
      "learning_rate": 9.18255325696454e-05,
      "loss": 0.0475,
      "step": 800
    },
    {
      "epoch": 2.783505154639175,
      "grad_norm": 0.33704933524131775,
      "learning_rate": 8.956222558616998e-05,
      "loss": 0.036,
      "step": 810
    },
    {
      "epoch": 2.81786941580756,
      "grad_norm": 0.3991442620754242,
      "learning_rate": 8.730431212977625e-05,
      "loss": 0.0582,
      "step": 820
    },
    {
      "epoch": 2.852233676975945,
      "grad_norm": 0.29364827275276184,
      "learning_rate": 8.505295893552594e-05,
      "loss": 0.0442,
      "step": 830
    },
    {
      "epoch": 2.88659793814433,
      "grad_norm": 0.38249287009239197,
      "learning_rate": 8.280932934858652e-05,
      "loss": 0.043,
      "step": 840
    },
    {
      "epoch": 2.9209621993127146,
      "grad_norm": 0.5033205151557922,
      "learning_rate": 8.05745827230941e-05,
      "loss": 0.0407,
      "step": 850
    },
    {
      "epoch": 2.9553264604811,
      "grad_norm": 0.24252034723758698,
      "learning_rate": 7.834987382307861e-05,
      "loss": 0.0459,
      "step": 860
    },
    {
      "epoch": 2.9896907216494846,
      "grad_norm": 0.24571438133716583,
      "learning_rate": 7.613635222576072e-05,
      "loss": 0.0452,
      "step": 870
    },
    {
      "epoch": 3.0240549828178693,
      "grad_norm": 0.32780882716178894,
      "learning_rate": 7.393516172752919e-05,
      "loss": 0.0427,
      "step": 880
    },
    {
      "epoch": 3.058419243986254,
      "grad_norm": 0.28867006301879883,
      "learning_rate": 7.174743975290513e-05,
      "loss": 0.0411,
      "step": 890
    },
    {
      "epoch": 3.0927835051546393,
      "grad_norm": 0.37426048517227173,
      "learning_rate": 6.957431676679896e-05,
      "loss": 0.0386,
      "step": 900
    },
    {
      "epoch": 3.127147766323024,
      "grad_norm": 0.24257159233093262,
      "learning_rate": 6.741691569036338e-05,
      "loss": 0.0329,
      "step": 910
    },
    {
      "epoch": 3.161512027491409,
      "grad_norm": 0.2398187667131424,
      "learning_rate": 6.527635132074493e-05,
      "loss": 0.0494,
      "step": 920
    },
    {
      "epoch": 3.195876288659794,
      "grad_norm": 0.35756927728652954,
      "learning_rate": 6.315372975503285e-05,
      "loss": 0.0494,
      "step": 930
    },
    {
      "epoch": 3.2302405498281788,
      "grad_norm": 0.33009472489356995,
      "learning_rate": 6.1050147818704e-05,
      "loss": 0.0375,
      "step": 940
    },
    {
      "epoch": 3.2646048109965635,
      "grad_norm": 0.30801263451576233,
      "learning_rate": 5.896669249885851e-05,
      "loss": 0.0361,
      "step": 950
    },
    {
      "epoch": 3.2989690721649483,
      "grad_norm": 0.31775572896003723,
      "learning_rate": 5.690444038253935e-05,
      "loss": 0.0473,
      "step": 960
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.3134918212890625,
      "learning_rate": 5.4864457100425783e-05,
      "loss": 0.0298,
      "step": 970
    },
    {
      "epoch": 3.3676975945017182,
      "grad_norm": 0.2736685574054718,
      "learning_rate": 5.284779677618841e-05,
      "loss": 0.0334,
      "step": 980
    },
    {
      "epoch": 3.402061855670103,
      "grad_norm": 0.5353654623031616,
      "learning_rate": 5.0855501481790305e-05,
      "loss": 0.0395,
      "step": 990
    },
    {
      "epoch": 3.436426116838488,
      "grad_norm": 0.40775638818740845,
      "learning_rate": 4.8888600699015496e-05,
      "loss": 0.0365,
      "step": 1000
    },
    {
      "epoch": 3.470790378006873,
      "grad_norm": 0.25919926166534424,
      "learning_rate": 4.694811078750338e-05,
      "loss": 0.0422,
      "step": 1010
    },
    {
      "epoch": 3.5051546391752577,
      "grad_norm": 0.3091573417186737,
      "learning_rate": 4.50350344595635e-05,
      "loss": 0.0318,
      "step": 1020
    },
    {
      "epoch": 3.5395189003436425,
      "grad_norm": 0.33824992179870605,
      "learning_rate": 4.315036026204262e-05,
      "loss": 0.034,
      "step": 1030
    },
    {
      "epoch": 3.5738831615120272,
      "grad_norm": 0.2815360128879547,
      "learning_rate": 4.129506206551138e-05,
      "loss": 0.0307,
      "step": 1040
    },
    {
      "epoch": 3.6082474226804124,
      "grad_norm": 0.15872405469417572,
      "learning_rate": 3.947009856103465e-05,
      "loss": 0.04,
      "step": 1050
    },
    {
      "epoch": 3.642611683848797,
      "grad_norm": 0.24633029103279114,
      "learning_rate": 3.767641276478563e-05,
      "loss": 0.047,
      "step": 1060
    },
    {
      "epoch": 3.6769759450171824,
      "grad_norm": 0.22606323659420013,
      "learning_rate": 3.591493153075966e-05,
      "loss": 0.0313,
      "step": 1070
    },
    {
      "epoch": 3.711340206185567,
      "grad_norm": 0.36013609170913696,
      "learning_rate": 3.41865650718396e-05,
      "loss": 0.033,
      "step": 1080
    },
    {
      "epoch": 3.745704467353952,
      "grad_norm": 0.2635957896709442,
      "learning_rate": 3.24922064894601e-05,
      "loss": 0.0377,
      "step": 1090
    },
    {
      "epoch": 3.7800687285223367,
      "grad_norm": 0.22290170192718506,
      "learning_rate": 3.083273131211382e-05,
      "loss": 0.032,
      "step": 1100
    },
    {
      "epoch": 3.8144329896907214,
      "grad_norm": 0.21059395372867584,
      "learning_rate": 2.920899704293849e-05,
      "loss": 0.0339,
      "step": 1110
    },
    {
      "epoch": 3.8487972508591066,
      "grad_norm": 0.22615396976470947,
      "learning_rate": 2.762184271661785e-05,
      "loss": 0.0282,
      "step": 1120
    },
    {
      "epoch": 3.8831615120274914,
      "grad_norm": 0.14210452139377594,
      "learning_rate": 2.6072088465826038e-05,
      "loss": 0.031,
      "step": 1130
    },
    {
      "epoch": 3.917525773195876,
      "grad_norm": 0.199430912733078,
      "learning_rate": 2.4560535097439108e-05,
      "loss": 0.0286,
      "step": 1140
    },
    {
      "epoch": 3.9518900343642613,
      "grad_norm": 0.22842490673065186,
      "learning_rate": 2.308796367873296e-05,
      "loss": 0.0343,
      "step": 1150
    },
    {
      "epoch": 3.986254295532646,
      "grad_norm": 0.20589038729667664,
      "learning_rate": 2.165513513378121e-05,
      "loss": 0.0266,
      "step": 1160
    },
    {
      "epoch": 4.020618556701031,
      "grad_norm": 0.19770939648151398,
      "learning_rate": 2.0262789850261798e-05,
      "loss": 0.0334,
      "step": 1170
    },
    {
      "epoch": 4.054982817869416,
      "grad_norm": 0.32386648654937744,
      "learning_rate": 1.8911647296875147e-05,
      "loss": 0.0274,
      "step": 1180
    },
    {
      "epoch": 4.0893470790378,
      "grad_norm": 0.20792323350906372,
      "learning_rate": 1.7602405651572275e-05,
      "loss": 0.0295,
      "step": 1190
    },
    {
      "epoch": 4.123711340206185,
      "grad_norm": 0.17066961526870728,
      "learning_rate": 1.6335741440784035e-05,
      "loss": 0.0385,
      "step": 1200
    },
    {
      "epoch": 4.158075601374571,
      "grad_norm": 0.3135523796081543,
      "learning_rate": 1.511230918983867e-05,
      "loss": 0.0432,
      "step": 1210
    },
    {
      "epoch": 4.1924398625429555,
      "grad_norm": 0.2972412407398224,
      "learning_rate": 1.3932741084747913e-05,
      "loss": 0.0316,
      "step": 1220
    },
    {
      "epoch": 4.22680412371134,
      "grad_norm": 0.19240647554397583,
      "learning_rate": 1.2797646645536566e-05,
      "loss": 0.0278,
      "step": 1230
    },
    {
      "epoch": 4.261168384879725,
      "grad_norm": 0.3429684638977051,
      "learning_rate": 1.1707612411284253e-05,
      "loss": 0.0369,
      "step": 1240
    },
    {
      "epoch": 4.29553264604811,
      "grad_norm": 0.2781321704387665,
      "learning_rate": 1.0663201637042252e-05,
      "loss": 0.0335,
      "step": 1250
    },
    {
      "epoch": 4.329896907216495,
      "grad_norm": 0.2124054729938507,
      "learning_rate": 9.664954002781745e-06,
      "loss": 0.0264,
      "step": 1260
    },
    {
      "epoch": 4.364261168384879,
      "grad_norm": 0.20779696106910706,
      "learning_rate": 8.713385334524283e-06,
      "loss": 0.0235,
      "step": 1270
    },
    {
      "epoch": 4.398625429553265,
      "grad_norm": 0.20655418932437897,
      "learning_rate": 7.808987337798158e-06,
      "loss": 0.029,
      "step": 1280
    },
    {
      "epoch": 4.43298969072165,
      "grad_norm": 0.3112201392650604,
      "learning_rate": 6.952227343558671e-06,
      "loss": 0.0273,
      "step": 1290
    },
    {
      "epoch": 4.4673539518900345,
      "grad_norm": 0.20721083879470825,
      "learning_rate": 6.143548066703475e-06,
      "loss": 0.0317,
      "step": 1300
    },
    {
      "epoch": 4.501718213058419,
      "grad_norm": 0.24077868461608887,
      "learning_rate": 5.383367377307857e-06,
      "loss": 0.0258,
      "step": 1310
    },
    {
      "epoch": 4.536082474226804,
      "grad_norm": 0.19729691743850708,
      "learning_rate": 4.672078084698095e-06,
      "loss": 0.0287,
      "step": 1320
    },
    {
      "epoch": 4.570446735395189,
      "grad_norm": 0.3005140423774719,
      "learning_rate": 4.010047734474454e-06,
      "loss": 0.0324,
      "step": 1330
    },
    {
      "epoch": 4.6048109965635735,
      "grad_norm": 0.3983416259288788,
      "learning_rate": 3.397618418588877e-06,
      "loss": 0.0419,
      "step": 1340
    },
    {
      "epoch": 4.639175257731958,
      "grad_norm": 0.24803949892520905,
      "learning_rate": 2.8351065985751766e-06,
      "loss": 0.0305,
      "step": 1350
    },
    {
      "epoch": 4.673539518900344,
      "grad_norm": 0.3299216628074646,
      "learning_rate": 2.322802942023461e-06,
      "loss": 0.0378,
      "step": 1360
    },
    {
      "epoch": 4.707903780068729,
      "grad_norm": 0.33067575097084045,
      "learning_rate": 1.8609721723830132e-06,
      "loss": 0.0278,
      "step": 1370
    },
    {
      "epoch": 4.742268041237113,
      "grad_norm": 0.3416236340999603,
      "learning_rate": 1.4498529321713584e-06,
      "loss": 0.0257,
      "step": 1380
    },
    {
      "epoch": 4.776632302405498,
      "grad_norm": 0.20940996706485748,
      "learning_rate": 1.0896576596600705e-06,
      "loss": 0.031,
      "step": 1390
    },
    {
      "epoch": 4.810996563573883,
      "grad_norm": 0.186074361205101,
      "learning_rate": 7.80572479101327e-07,
      "loss": 0.0295,
      "step": 1400
    },
    {
      "epoch": 4.845360824742268,
      "grad_norm": 0.19710496068000793,
      "learning_rate": 5.227571045515633e-07,
      "loss": 0.0416,
      "step": 1410
    },
    {
      "epoch": 4.879725085910653,
      "grad_norm": 0.20495687425136566,
      "learning_rate": 3.163447573422351e-07,
      "loss": 0.0329,
      "step": 1420
    },
    {
      "epoch": 4.914089347079038,
      "grad_norm": 0.4691133499145508,
      "learning_rate": 1.614420972401165e-07,
      "loss": 0.0323,
      "step": 1430
    },
    {
      "epoch": 4.948453608247423,
      "grad_norm": 0.23419992625713348,
      "learning_rate": 5.812916733284324e-08,
      "loss": 0.0205,
      "step": 1440
    },
    {
      "epoch": 4.982817869415808,
      "grad_norm": 0.21251557767391205,
      "learning_rate": 6.459352668164442e-09,
      "loss": 0.0332,
      "step": 1450
    },
    {
      "epoch": 5.0,
      "step": 1455,
      "total_flos": 2.885368061470752e+16,
      "train_loss": 0.07233676388603714,
      "train_runtime": 485.4593,
      "train_samples_per_second": 47.955,
      "train_steps_per_second": 2.997
    }
  ],
  "logging_steps": 10,
  "max_steps": 1455,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.885368061470752e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}