{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 300, "global_step": 1443, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002079002079002079, "grad_norm": 21.0, "learning_rate": 1e-06, "loss": 1.4443, "step": 1 }, { "epoch": 0.010395010395010396, "grad_norm": 7.46875, "learning_rate": 1.9999786113807535e-06, "loss": 1.4249, "step": 5 }, { "epoch": 0.02079002079002079, "grad_norm": 5.1875, "learning_rate": 1.999847906465395e-06, "loss": 1.3779, "step": 10 }, { "epoch": 0.031185031185031187, "grad_norm": 4.125, "learning_rate": 1.999598394713099e-06, "loss": 1.3446, "step": 15 }, { "epoch": 0.04158004158004158, "grad_norm": 3.859375, "learning_rate": 1.9992301057720783e-06, "loss": 1.3445, "step": 20 }, { "epoch": 0.05197505197505198, "grad_norm": 3.75, "learning_rate": 1.9987430834042318e-06, "loss": 1.3221, "step": 25 }, { "epoch": 0.062370062370062374, "grad_norm": 3.703125, "learning_rate": 1.9981373854799487e-06, "loss": 1.3312, "step": 30 }, { "epoch": 0.07276507276507277, "grad_norm": 3.515625, "learning_rate": 1.99741308397123e-06, "loss": 1.3108, "step": 35 }, { "epoch": 0.08316008316008316, "grad_norm": 3.703125, "learning_rate": 1.9965702649431362e-06, "loss": 1.3148, "step": 40 }, { "epoch": 0.09355509355509356, "grad_norm": 3.5625, "learning_rate": 1.9956090285435628e-06, "loss": 1.3075, "step": 45 }, { "epoch": 0.10395010395010396, "grad_norm": 3.703125, "learning_rate": 1.9945294889913377e-06, "loss": 1.3029, "step": 50 }, { "epoch": 0.11434511434511435, "grad_norm": 3.671875, "learning_rate": 1.9933317745626504e-06, "loss": 1.3022, "step": 55 }, { "epoch": 0.12474012474012475, "grad_norm": 3.671875, "learning_rate": 1.9920160275758096e-06, "loss": 1.2885, "step": 60 }, { "epoch": 0.13513513513513514, "grad_norm": 3.515625, "learning_rate": 1.9905824043743316e-06, "loss": 1.2944, "step": 65 }, { "epoch": 0.14553014553014554, "grad_norm": 3.453125, "learning_rate": 1.9890310753083633e-06, "loss": 1.2993, "step": 70 }, { "epoch": 0.15592515592515593, "grad_norm": 3.484375, "learning_rate": 1.987362224714441e-06, "loss": 1.2885, "step": 75 }, { "epoch": 0.16632016632016633, "grad_norm": 3.5, "learning_rate": 1.985576050893585e-06, "loss": 1.2862, "step": 80 }, { "epoch": 0.17671517671517672, "grad_norm": 3.578125, "learning_rate": 1.983672766087738e-06, "loss": 1.2854, "step": 85 }, { "epoch": 0.18711018711018712, "grad_norm": 3.546875, "learning_rate": 1.9816525964545446e-06, "loss": 1.2765, "step": 90 }, { "epoch": 0.19750519750519752, "grad_norm": 3.640625, "learning_rate": 1.979515782040478e-06, "loss": 1.2874, "step": 95 }, { "epoch": 0.2079002079002079, "grad_norm": 3.546875, "learning_rate": 1.977262576752319e-06, "loss": 1.2808, "step": 100 }, { "epoch": 0.2182952182952183, "grad_norm": 3.59375, "learning_rate": 1.97489324832698e-06, "loss": 1.2768, "step": 105 }, { "epoch": 0.2286902286902287, "grad_norm": 3.59375, "learning_rate": 1.972408078299698e-06, "loss": 1.2757, "step": 110 }, { "epoch": 0.2390852390852391, "grad_norm": 3.53125, "learning_rate": 1.969807361970575e-06, "loss": 1.2897, "step": 115 }, { "epoch": 0.2494802494802495, "grad_norm": 3.5, "learning_rate": 1.9670914083694948e-06, "loss": 1.2796, "step": 120 }, { "epoch": 0.2598752598752599, "grad_norm": 3.515625, "learning_rate": 1.9642605402193976e-06, "loss": 1.2684, "step": 125 }, { "epoch": 0.2702702702702703, "grad_norm": 3.53125, "learning_rate": 1.9613150938979343e-06, "loss": 1.2694, "step": 130 }, { "epoch": 0.2806652806652807, "grad_norm": 3.546875, "learning_rate": 1.958255419397498e-06, "loss": 1.263, "step": 135 }, { "epoch": 0.2910602910602911, "grad_norm": 3.578125, "learning_rate": 1.9550818802836337e-06, "loss": 1.271, "step": 140 }, { "epoch": 0.30145530145530147, "grad_norm": 3.640625, "learning_rate": 1.9517948536518397e-06, "loss": 1.2638, "step": 145 }, { "epoch": 0.31185031185031187, "grad_norm": 3.59375, "learning_rate": 1.9483947300827577e-06, "loss": 1.2633, "step": 150 }, { "epoch": 0.32224532224532226, "grad_norm": 3.484375, "learning_rate": 1.944881913595762e-06, "loss": 1.2648, "step": 155 }, { "epoch": 0.33264033264033266, "grad_norm": 3.46875, "learning_rate": 1.941256821600955e-06, "loss": 1.2588, "step": 160 }, { "epoch": 0.34303534303534305, "grad_norm": 3.5, "learning_rate": 1.9375198848495637e-06, "loss": 1.258, "step": 165 }, { "epoch": 0.35343035343035345, "grad_norm": 3.53125, "learning_rate": 1.9336715473827594e-06, "loss": 1.2588, "step": 170 }, { "epoch": 0.36382536382536385, "grad_norm": 3.515625, "learning_rate": 1.929712266478893e-06, "loss": 1.2538, "step": 175 }, { "epoch": 0.37422037422037424, "grad_norm": 3.765625, "learning_rate": 1.9256425125991596e-06, "loss": 1.2539, "step": 180 }, { "epoch": 0.38461538461538464, "grad_norm": 3.546875, "learning_rate": 1.9214627693316952e-06, "loss": 1.2556, "step": 185 }, { "epoch": 0.39501039501039503, "grad_norm": 3.59375, "learning_rate": 1.9171735333341148e-06, "loss": 1.2524, "step": 190 }, { "epoch": 0.40540540540540543, "grad_norm": 3.59375, "learning_rate": 1.9127753142744977e-06, "loss": 1.2471, "step": 195 }, { "epoch": 0.4158004158004158, "grad_norm": 3.515625, "learning_rate": 1.9082686347708253e-06, "loss": 1.246, "step": 200 }, { "epoch": 0.4261954261954262, "grad_norm": 3.421875, "learning_rate": 1.9036540303288815e-06, "loss": 1.247, "step": 205 }, { "epoch": 0.4365904365904366, "grad_norm": 3.546875, "learning_rate": 1.898932049278621e-06, "loss": 1.2496, "step": 210 }, { "epoch": 0.446985446985447, "grad_norm": 3.40625, "learning_rate": 1.8941032527090148e-06, "loss": 1.2492, "step": 215 }, { "epoch": 0.4573804573804574, "grad_norm": 3.515625, "learning_rate": 1.8891682144013777e-06, "loss": 1.2476, "step": 220 }, { "epoch": 0.4677754677754678, "grad_norm": 3.40625, "learning_rate": 1.8841275207611895e-06, "loss": 1.2426, "step": 225 }, { "epoch": 0.4781704781704782, "grad_norm": 3.5, "learning_rate": 1.8789817707484162e-06, "loss": 1.2388, "step": 230 }, { "epoch": 0.4885654885654886, "grad_norm": 3.5625, "learning_rate": 1.8737315758063364e-06, "loss": 1.2374, "step": 235 }, { "epoch": 0.498960498960499, "grad_norm": 3.453125, "learning_rate": 1.8683775597888886e-06, "loss": 1.2437, "step": 240 }, { "epoch": 0.5093555093555093, "grad_norm": 3.5, "learning_rate": 1.8629203588865419e-06, "loss": 1.2335, "step": 245 }, { "epoch": 0.5197505197505198, "grad_norm": 3.53125, "learning_rate": 1.8573606215506985e-06, "loss": 1.2355, "step": 250 }, { "epoch": 0.5301455301455301, "grad_norm": 3.4375, "learning_rate": 1.8516990084166443e-06, "loss": 1.2384, "step": 255 }, { "epoch": 0.5405405405405406, "grad_norm": 3.625, "learning_rate": 1.8459361922250469e-06, "loss": 1.232, "step": 260 }, { "epoch": 0.5509355509355509, "grad_norm": 3.515625, "learning_rate": 1.8400728577420187e-06, "loss": 1.2356, "step": 265 }, { "epoch": 0.5613305613305614, "grad_norm": 3.515625, "learning_rate": 1.8341097016777484e-06, "loss": 1.2408, "step": 270 }, { "epoch": 0.5717255717255717, "grad_norm": 3.46875, "learning_rate": 1.8280474326037155e-06, "loss": 1.2465, "step": 275 }, { "epoch": 0.5821205821205822, "grad_norm": 3.53125, "learning_rate": 1.8218867708684937e-06, "loss": 1.2331, "step": 280 }, { "epoch": 0.5925155925155925, "grad_norm": 3.46875, "learning_rate": 1.8156284485121556e-06, "loss": 1.2323, "step": 285 }, { "epoch": 0.6029106029106029, "grad_norm": 3.4375, "learning_rate": 1.8092732091792884e-06, "loss": 1.2365, "step": 290 }, { "epoch": 0.6133056133056133, "grad_norm": 3.5625, "learning_rate": 1.8028218080306302e-06, "loss": 1.2394, "step": 295 }, { "epoch": 0.6237006237006237, "grad_norm": 3.609375, "learning_rate": 1.7962750116533387e-06, "loss": 1.2263, "step": 300 }, { "epoch": 0.6237006237006237, "eval_loss": 1.2331745624542236, "eval_runtime": 10.8501, "eval_samples_per_second": 85.713, "eval_steps_per_second": 2.765, "step": 300 }, { "epoch": 0.6340956340956341, "grad_norm": 3.5625, "learning_rate": 1.7896335979699001e-06, "loss": 1.2311, "step": 305 }, { "epoch": 0.6444906444906445, "grad_norm": 3.46875, "learning_rate": 1.782898356145694e-06, "loss": 1.2236, "step": 310 }, { "epoch": 0.6548856548856549, "grad_norm": 4.25, "learning_rate": 1.7760700864952205e-06, "loss": 1.2376, "step": 315 }, { "epoch": 0.6652806652806653, "grad_norm": 3.484375, "learning_rate": 1.7691496003870018e-06, "loss": 1.2239, "step": 320 }, { "epoch": 0.6756756756756757, "grad_norm": 3.578125, "learning_rate": 1.7621377201471735e-06, "loss": 1.2311, "step": 325 }, { "epoch": 0.6860706860706861, "grad_norm": 3.796875, "learning_rate": 1.75503527896177e-06, "loss": 1.226, "step": 330 }, { "epoch": 0.6964656964656964, "grad_norm": 3.46875, "learning_rate": 1.7478431207777215e-06, "loss": 1.228, "step": 335 }, { "epoch": 0.7068607068607069, "grad_norm": 3.484375, "learning_rate": 1.7405621002025735e-06, "loss": 1.2216, "step": 340 }, { "epoch": 0.7172557172557172, "grad_norm": 3.46875, "learning_rate": 1.733193082402936e-06, "loss": 1.2223, "step": 345 }, { "epoch": 0.7276507276507277, "grad_norm": 3.53125, "learning_rate": 1.7257369430016817e-06, "loss": 1.2158, "step": 350 }, { "epoch": 0.738045738045738, "grad_norm": 3.4375, "learning_rate": 1.7181945679739003e-06, "loss": 1.2206, "step": 355 }, { "epoch": 0.7484407484407485, "grad_norm": 3.59375, "learning_rate": 1.7105668535416205e-06, "loss": 1.2292, "step": 360 }, { "epoch": 0.7588357588357588, "grad_norm": 3.578125, "learning_rate": 1.7028547060673197e-06, "loss": 1.2272, "step": 365 }, { "epoch": 0.7692307692307693, "grad_norm": 3.609375, "learning_rate": 1.6950590419462229e-06, "loss": 1.2264, "step": 370 }, { "epoch": 0.7796257796257796, "grad_norm": 3.578125, "learning_rate": 1.687180787497413e-06, "loss": 1.2155, "step": 375 }, { "epoch": 0.7900207900207901, "grad_norm": 3.421875, "learning_rate": 1.6792208788537617e-06, "loss": 1.2154, "step": 380 }, { "epoch": 0.8004158004158004, "grad_norm": 3.625, "learning_rate": 1.6711802618506926e-06, "loss": 1.2134, "step": 385 }, { "epoch": 0.8108108108108109, "grad_norm": 3.546875, "learning_rate": 1.663059891913793e-06, "loss": 1.2211, "step": 390 }, { "epoch": 0.8212058212058212, "grad_norm": 3.734375, "learning_rate": 1.6548607339452852e-06, "loss": 1.2222, "step": 395 }, { "epoch": 0.8316008316008316, "grad_norm": 3.515625, "learning_rate": 1.6465837622093722e-06, "loss": 1.2188, "step": 400 }, { "epoch": 0.841995841995842, "grad_norm": 3.515625, "learning_rate": 1.6382299602164706e-06, "loss": 1.2189, "step": 405 }, { "epoch": 0.8523908523908524, "grad_norm": 3.5625, "learning_rate": 1.6298003206063466e-06, "loss": 1.2145, "step": 410 }, { "epoch": 0.8627858627858628, "grad_norm": 3.53125, "learning_rate": 1.6212958450301625e-06, "loss": 1.2168, "step": 415 }, { "epoch": 0.8731808731808732, "grad_norm": 3.515625, "learning_rate": 1.6127175440314594e-06, "loss": 1.2083, "step": 420 }, { "epoch": 0.8835758835758836, "grad_norm": 3.484375, "learning_rate": 1.6040664369260758e-06, "loss": 1.2183, "step": 425 }, { "epoch": 0.893970893970894, "grad_norm": 3.671875, "learning_rate": 1.5953435516810303e-06, "loss": 1.2134, "step": 430 }, { "epoch": 0.9043659043659044, "grad_norm": 3.515625, "learning_rate": 1.586549924792372e-06, "loss": 1.2145, "step": 435 }, { "epoch": 0.9147609147609148, "grad_norm": 3.53125, "learning_rate": 1.5776866011620198e-06, "loss": 1.2148, "step": 440 }, { "epoch": 0.9251559251559252, "grad_norm": 3.6875, "learning_rate": 1.5687546339736013e-06, "loss": 1.212, "step": 445 }, { "epoch": 0.9355509355509356, "grad_norm": 3.578125, "learning_rate": 1.559755084567309e-06, "loss": 1.2183, "step": 450 }, { "epoch": 0.9459459459459459, "grad_norm": 3.640625, "learning_rate": 1.5506890223137857e-06, "loss": 1.2123, "step": 455 }, { "epoch": 0.9563409563409564, "grad_norm": 3.5625, "learning_rate": 1.5415575244870578e-06, "loss": 1.214, "step": 460 }, { "epoch": 0.9667359667359667, "grad_norm": 3.59375, "learning_rate": 1.5323616761365278e-06, "loss": 1.2081, "step": 465 }, { "epoch": 0.9771309771309772, "grad_norm": 3.515625, "learning_rate": 1.5231025699580427e-06, "loss": 1.1995, "step": 470 }, { "epoch": 0.9875259875259875, "grad_norm": 3.484375, "learning_rate": 1.513781306164056e-06, "loss": 1.2226, "step": 475 }, { "epoch": 0.997920997920998, "grad_norm": 3.5, "learning_rate": 1.5043989923528937e-06, "loss": 1.2183, "step": 480 }, { "epoch": 1.0083160083160083, "grad_norm": 3.453125, "learning_rate": 1.4949567433771448e-06, "loss": 1.1826, "step": 485 }, { "epoch": 1.0187110187110187, "grad_norm": 3.609375, "learning_rate": 1.4854556812111887e-06, "loss": 1.1925, "step": 490 }, { "epoch": 1.0291060291060292, "grad_norm": 3.5, "learning_rate": 1.4758969348178766e-06, "loss": 1.19, "step": 495 }, { "epoch": 1.0395010395010396, "grad_norm": 3.53125, "learning_rate": 1.4662816400143836e-06, "loss": 1.1963, "step": 500 }, { "epoch": 1.04989604989605, "grad_norm": 3.640625, "learning_rate": 1.4566109393372433e-06, "loss": 1.1872, "step": 505 }, { "epoch": 1.0602910602910602, "grad_norm": 3.578125, "learning_rate": 1.4468859819065882e-06, "loss": 1.1833, "step": 510 }, { "epoch": 1.0706860706860706, "grad_norm": 3.5625, "learning_rate": 1.4371079232896044e-06, "loss": 1.1815, "step": 515 }, { "epoch": 1.0810810810810811, "grad_norm": 3.53125, "learning_rate": 1.4272779253632212e-06, "loss": 1.1855, "step": 520 }, { "epoch": 1.0914760914760915, "grad_norm": 3.59375, "learning_rate": 1.4173971561760518e-06, "loss": 1.188, "step": 525 }, { "epoch": 1.1018711018711018, "grad_norm": 3.609375, "learning_rate": 1.4074667898096009e-06, "loss": 1.1873, "step": 530 }, { "epoch": 1.1122661122661124, "grad_norm": 3.53125, "learning_rate": 1.397488006238752e-06, "loss": 1.1945, "step": 535 }, { "epoch": 1.1226611226611227, "grad_norm": 3.5, "learning_rate": 1.387461991191559e-06, "loss": 1.1856, "step": 540 }, { "epoch": 1.133056133056133, "grad_norm": 3.609375, "learning_rate": 1.3773899360083524e-06, "loss": 1.1868, "step": 545 }, { "epoch": 1.1434511434511434, "grad_norm": 3.609375, "learning_rate": 1.3672730375001773e-06, "loss": 1.1791, "step": 550 }, { "epoch": 1.1538461538461537, "grad_norm": 3.65625, "learning_rate": 1.357112497806582e-06, "loss": 1.1969, "step": 555 }, { "epoch": 1.1642411642411643, "grad_norm": 3.609375, "learning_rate": 1.3469095242527764e-06, "loss": 1.1828, "step": 560 }, { "epoch": 1.1746361746361746, "grad_norm": 3.515625, "learning_rate": 1.3366653292061682e-06, "loss": 1.1803, "step": 565 }, { "epoch": 1.185031185031185, "grad_norm": 3.53125, "learning_rate": 1.3263811299323063e-06, "loss": 1.1803, "step": 570 }, { "epoch": 1.1954261954261955, "grad_norm": 3.6875, "learning_rate": 1.3160581484502382e-06, "loss": 1.1823, "step": 575 }, { "epoch": 1.2058212058212059, "grad_norm": 3.71875, "learning_rate": 1.3056976113873037e-06, "loss": 1.1832, "step": 580 }, { "epoch": 1.2162162162162162, "grad_norm": 3.625, "learning_rate": 1.2953007498333807e-06, "loss": 1.1841, "step": 585 }, { "epoch": 1.2266112266112266, "grad_norm": 3.59375, "learning_rate": 1.284868799194602e-06, "loss": 1.1804, "step": 590 }, { "epoch": 1.237006237006237, "grad_norm": 3.5, "learning_rate": 1.2744029990465574e-06, "loss": 1.1811, "step": 595 }, { "epoch": 1.2474012474012475, "grad_norm": 3.5625, "learning_rate": 1.2639045929870018e-06, "loss": 1.1794, "step": 600 }, { "epoch": 1.2474012474012475, "eval_loss": 1.2036519050598145, "eval_runtime": 10.8624, "eval_samples_per_second": 85.616, "eval_steps_per_second": 2.762, "step": 600 }, { "epoch": 1.2577962577962578, "grad_norm": 3.609375, "learning_rate": 1.2533748284880842e-06, "loss": 1.1905, "step": 605 }, { "epoch": 1.2681912681912682, "grad_norm": 3.5625, "learning_rate": 1.2428149567481184e-06, "loss": 1.1836, "step": 610 }, { "epoch": 1.2785862785862787, "grad_norm": 3.5625, "learning_rate": 1.2322262325429063e-06, "loss": 1.1823, "step": 615 }, { "epoch": 1.288981288981289, "grad_norm": 3.5625, "learning_rate": 1.2216099140766436e-06, "loss": 1.1836, "step": 620 }, { "epoch": 1.2993762993762994, "grad_norm": 3.71875, "learning_rate": 1.2109672628324104e-06, "loss": 1.1837, "step": 625 }, { "epoch": 1.3097713097713097, "grad_norm": 3.578125, "learning_rate": 1.2002995434222767e-06, "loss": 1.1827, "step": 630 }, { "epoch": 1.32016632016632, "grad_norm": 3.6875, "learning_rate": 1.1896080234370355e-06, "loss": 1.1803, "step": 635 }, { "epoch": 1.3305613305613306, "grad_norm": 3.515625, "learning_rate": 1.178893973295581e-06, "loss": 1.1788, "step": 640 }, { "epoch": 1.340956340956341, "grad_norm": 3.640625, "learning_rate": 1.1681586660939504e-06, "loss": 1.1918, "step": 645 }, { "epoch": 1.3513513513513513, "grad_norm": 3.578125, "learning_rate": 1.1574033774540505e-06, "loss": 1.1796, "step": 650 }, { "epoch": 1.3617463617463619, "grad_norm": 3.65625, "learning_rate": 1.1466293853720795e-06, "loss": 1.1837, "step": 655 }, { "epoch": 1.3721413721413722, "grad_norm": 3.609375, "learning_rate": 1.1358379700666703e-06, "loss": 1.1776, "step": 660 }, { "epoch": 1.3825363825363826, "grad_norm": 3.546875, "learning_rate": 1.1250304138267701e-06, "loss": 1.181, "step": 665 }, { "epoch": 1.392931392931393, "grad_norm": 3.6875, "learning_rate": 1.11420800085927e-06, "loss": 1.1764, "step": 670 }, { "epoch": 1.4033264033264032, "grad_norm": 3.65625, "learning_rate": 1.1033720171364108e-06, "loss": 1.1792, "step": 675 }, { "epoch": 1.4137214137214138, "grad_norm": 3.546875, "learning_rate": 1.092523750242977e-06, "loss": 1.1784, "step": 680 }, { "epoch": 1.4241164241164241, "grad_norm": 3.65625, "learning_rate": 1.0816644892232997e-06, "loss": 1.1855, "step": 685 }, { "epoch": 1.4345114345114345, "grad_norm": 3.640625, "learning_rate": 1.070795524428086e-06, "loss": 1.1782, "step": 690 }, { "epoch": 1.444906444906445, "grad_norm": 3.5625, "learning_rate": 1.0599181473610938e-06, "loss": 1.1837, "step": 695 }, { "epoch": 1.4553014553014554, "grad_norm": 3.59375, "learning_rate": 1.049033650525668e-06, "loss": 1.1786, "step": 700 }, { "epoch": 1.4656964656964657, "grad_norm": 3.5625, "learning_rate": 1.0381433272711585e-06, "loss": 1.1747, "step": 705 }, { "epoch": 1.476091476091476, "grad_norm": 3.71875, "learning_rate": 1.0272484716392408e-06, "loss": 1.1854, "step": 710 }, { "epoch": 1.4864864864864864, "grad_norm": 3.59375, "learning_rate": 1.0163503782101484e-06, "loss": 1.1755, "step": 715 }, { "epoch": 1.496881496881497, "grad_norm": 3.578125, "learning_rate": 1.0054503419488454e-06, "loss": 1.1795, "step": 720 }, { "epoch": 1.5072765072765073, "grad_norm": 3.578125, "learning_rate": 9.945496580511543e-07, "loss": 1.1846, "step": 725 }, { "epoch": 1.5176715176715176, "grad_norm": 3.6875, "learning_rate": 9.836496217898518e-07, "loss": 1.1806, "step": 730 }, { "epoch": 1.5280665280665282, "grad_norm": 3.609375, "learning_rate": 9.72751528360759e-07, "loss": 1.1667, "step": 735 }, { "epoch": 1.5384615384615383, "grad_norm": 3.65625, "learning_rate": 9.618566727288414e-07, "loss": 1.185, "step": 740 }, { "epoch": 1.5488565488565489, "grad_norm": 3.703125, "learning_rate": 9.509663494743321e-07, "loss": 1.191, "step": 745 }, { "epoch": 1.5592515592515592, "grad_norm": 4.9375, "learning_rate": 9.400818526389062e-07, "loss": 1.1769, "step": 750 }, { "epoch": 1.5696465696465696, "grad_norm": 3.546875, "learning_rate": 9.292044755719138e-07, "loss": 1.1741, "step": 755 }, { "epoch": 1.5800415800415801, "grad_norm": 3.546875, "learning_rate": 9.183355107767003e-07, "loss": 1.1771, "step": 760 }, { "epoch": 1.5904365904365905, "grad_norm": 3.59375, "learning_rate": 9.07476249757023e-07, "loss": 1.1725, "step": 765 }, { "epoch": 1.6008316008316008, "grad_norm": 3.59375, "learning_rate": 8.966279828635894e-07, "loss": 1.1801, "step": 770 }, { "epoch": 1.6112266112266114, "grad_norm": 3.609375, "learning_rate": 8.8579199914073e-07, "loss": 1.1697, "step": 775 }, { "epoch": 1.6216216216216215, "grad_norm": 3.546875, "learning_rate": 8.749695861732299e-07, "loss": 1.175, "step": 780 }, { "epoch": 1.632016632016632, "grad_norm": 3.578125, "learning_rate": 8.641620299333295e-07, "loss": 1.1818, "step": 785 }, { "epoch": 1.6424116424116424, "grad_norm": 3.65625, "learning_rate": 8.533706146279207e-07, "loss": 1.1697, "step": 790 }, { "epoch": 1.6528066528066527, "grad_norm": 3.59375, "learning_rate": 8.425966225459493e-07, "loss": 1.1702, "step": 795 }, { "epoch": 1.6632016632016633, "grad_norm": 3.65625, "learning_rate": 8.318413339060495e-07, "loss": 1.1793, "step": 800 }, { "epoch": 1.6735966735966736, "grad_norm": 3.65625, "learning_rate": 8.21106026704419e-07, "loss": 1.1801, "step": 805 }, { "epoch": 1.683991683991684, "grad_norm": 3.59375, "learning_rate": 8.103919765629645e-07, "loss": 1.1797, "step": 810 }, { "epoch": 1.6943866943866945, "grad_norm": 3.5625, "learning_rate": 7.997004565777233e-07, "loss": 1.1659, "step": 815 }, { "epoch": 1.7047817047817047, "grad_norm": 3.671875, "learning_rate": 7.890327371675895e-07, "loss": 1.1743, "step": 820 }, { "epoch": 1.7151767151767152, "grad_norm": 3.671875, "learning_rate": 7.783900859233562e-07, "loss": 1.1849, "step": 825 }, { "epoch": 1.7255717255717256, "grad_norm": 3.625, "learning_rate": 7.677737674570936e-07, "loss": 1.1791, "step": 830 }, { "epoch": 1.735966735966736, "grad_norm": 3.578125, "learning_rate": 7.571850432518819e-07, "loss": 1.1789, "step": 835 }, { "epoch": 1.7463617463617465, "grad_norm": 3.5625, "learning_rate": 7.466251715119156e-07, "loss": 1.1773, "step": 840 }, { "epoch": 1.7567567567567568, "grad_norm": 3.609375, "learning_rate": 7.360954070129981e-07, "loss": 1.175, "step": 845 }, { "epoch": 1.7671517671517671, "grad_norm": 3.609375, "learning_rate": 7.255970009534425e-07, "loss": 1.1816, "step": 850 }, { "epoch": 1.7775467775467777, "grad_norm": 3.734375, "learning_rate": 7.151312008053979e-07, "loss": 1.1848, "step": 855 }, { "epoch": 1.7879417879417878, "grad_norm": 3.5625, "learning_rate": 7.046992501666195e-07, "loss": 1.175, "step": 860 }, { "epoch": 1.7983367983367984, "grad_norm": 3.625, "learning_rate": 6.943023886126965e-07, "loss": 1.1753, "step": 865 }, { "epoch": 1.8087318087318087, "grad_norm": 3.5625, "learning_rate": 6.839418515497618e-07, "loss": 1.1718, "step": 870 }, { "epoch": 1.819126819126819, "grad_norm": 3.59375, "learning_rate": 6.736188700676935e-07, "loss": 1.164, "step": 875 }, { "epoch": 1.8295218295218296, "grad_norm": 3.59375, "learning_rate": 6.633346707938319e-07, "loss": 1.1761, "step": 880 }, { "epoch": 1.83991683991684, "grad_norm": 3.671875, "learning_rate": 6.530904757472236e-07, "loss": 1.1869, "step": 885 }, { "epoch": 1.8503118503118503, "grad_norm": 3.59375, "learning_rate": 6.42887502193418e-07, "loss": 1.1836, "step": 890 }, { "epoch": 1.8607068607068609, "grad_norm": 3.578125, "learning_rate": 6.327269624998227e-07, "loss": 1.1699, "step": 895 }, { "epoch": 1.871101871101871, "grad_norm": 3.65625, "learning_rate": 6.226100639916474e-07, "loss": 1.1743, "step": 900 }, { "epoch": 1.871101871101871, "eval_loss": 1.1942965984344482, "eval_runtime": 10.786, "eval_samples_per_second": 86.223, "eval_steps_per_second": 2.781, "step": 900 }, { "epoch": 1.8814968814968815, "grad_norm": 3.53125, "learning_rate": 6.125380088084408e-07, "loss": 1.1797, "step": 905 }, { "epoch": 1.8918918918918919, "grad_norm": 3.71875, "learning_rate": 6.025119937612481e-07, "loss": 1.1758, "step": 910 }, { "epoch": 1.9022869022869022, "grad_norm": 3.65625, "learning_rate": 5.925332101903994e-07, "loss": 1.1783, "step": 915 }, { "epoch": 1.9126819126819128, "grad_norm": 3.71875, "learning_rate": 5.826028438239479e-07, "loss": 1.1763, "step": 920 }, { "epoch": 1.9230769230769231, "grad_norm": 3.546875, "learning_rate": 5.727220746367791e-07, "loss": 1.18, "step": 925 }, { "epoch": 1.9334719334719335, "grad_norm": 3.546875, "learning_rate": 5.628920767103957e-07, "loss": 1.1782, "step": 930 }, { "epoch": 1.943866943866944, "grad_norm": 3.65625, "learning_rate": 5.531140180934119e-07, "loss": 1.1772, "step": 935 }, { "epoch": 1.9542619542619541, "grad_norm": 3.546875, "learning_rate": 5.433890606627568e-07, "loss": 1.175, "step": 940 }, { "epoch": 1.9646569646569647, "grad_norm": 3.59375, "learning_rate": 5.337183599856164e-07, "loss": 1.1745, "step": 945 }, { "epoch": 1.975051975051975, "grad_norm": 3.5625, "learning_rate": 5.241030651821231e-07, "loss": 1.1662, "step": 950 }, { "epoch": 1.9854469854469854, "grad_norm": 3.609375, "learning_rate": 5.145443187888114e-07, "loss": 1.171, "step": 955 }, { "epoch": 1.995841995841996, "grad_norm": 3.640625, "learning_rate": 5.050432566228552e-07, "loss": 1.1831, "step": 960 }, { "epoch": 2.006237006237006, "grad_norm": 3.609375, "learning_rate": 4.956010076471065e-07, "loss": 1.1592, "step": 965 }, { "epoch": 2.0166320166320166, "grad_norm": 3.53125, "learning_rate": 4.862186938359441e-07, "loss": 1.1667, "step": 970 }, { "epoch": 2.027027027027027, "grad_norm": 3.5625, "learning_rate": 4.768974300419573e-07, "loss": 1.1678, "step": 975 }, { "epoch": 2.0374220374220373, "grad_norm": 3.625, "learning_rate": 4.6763832386347214e-07, "loss": 1.1723, "step": 980 }, { "epoch": 2.047817047817048, "grad_norm": 3.578125, "learning_rate": 4.5844247551294224e-07, "loss": 1.1642, "step": 985 }, { "epoch": 2.0582120582120584, "grad_norm": 3.609375, "learning_rate": 4.493109776862143e-07, "loss": 1.17, "step": 990 }, { "epoch": 2.0686070686070686, "grad_norm": 3.640625, "learning_rate": 4.402449154326913e-07, "loss": 1.1651, "step": 995 }, { "epoch": 2.079002079002079, "grad_norm": 3.640625, "learning_rate": 4.312453660263987e-07, "loss": 1.1783, "step": 1000 }, { "epoch": 2.0893970893970892, "grad_norm": 3.578125, "learning_rate": 4.2231339883798025e-07, "loss": 1.1711, "step": 1005 }, { "epoch": 2.0997920997921, "grad_norm": 3.5625, "learning_rate": 4.13450075207628e-07, "loss": 1.163, "step": 1010 }, { "epoch": 2.1101871101871104, "grad_norm": 3.515625, "learning_rate": 4.0465644831897006e-07, "loss": 1.17, "step": 1015 }, { "epoch": 2.1205821205821205, "grad_norm": 3.5625, "learning_rate": 3.9593356307392436e-07, "loss": 1.1733, "step": 1020 }, { "epoch": 2.130977130977131, "grad_norm": 3.59375, "learning_rate": 3.872824559685409e-07, "loss": 1.1762, "step": 1025 }, { "epoch": 2.141372141372141, "grad_norm": 3.671875, "learning_rate": 3.7870415496983743e-07, "loss": 1.1734, "step": 1030 }, { "epoch": 2.1517671517671517, "grad_norm": 3.578125, "learning_rate": 3.701996793936535e-07, "loss": 1.1724, "step": 1035 }, { "epoch": 2.1621621621621623, "grad_norm": 3.5625, "learning_rate": 3.6177003978352917e-07, "loss": 1.1718, "step": 1040 }, { "epoch": 2.1725571725571724, "grad_norm": 3.5625, "learning_rate": 3.5341623779062813e-07, "loss": 1.1688, "step": 1045 }, { "epoch": 2.182952182952183, "grad_norm": 3.609375, "learning_rate": 3.45139266054715e-07, "loss": 1.1732, "step": 1050 }, { "epoch": 2.1933471933471935, "grad_norm": 3.59375, "learning_rate": 3.3694010808620733e-07, "loss": 1.1619, "step": 1055 }, { "epoch": 2.2037422037422036, "grad_norm": 3.625, "learning_rate": 3.288197381493075e-07, "loss": 1.1673, "step": 1060 }, { "epoch": 2.214137214137214, "grad_norm": 3.609375, "learning_rate": 3.207791211462383e-07, "loss": 1.1725, "step": 1065 }, { "epoch": 2.2245322245322248, "grad_norm": 3.546875, "learning_rate": 3.128192125025869e-07, "loss": 1.1673, "step": 1070 }, { "epoch": 2.234927234927235, "grad_norm": 3.578125, "learning_rate": 3.049409580537773e-07, "loss": 1.1735, "step": 1075 }, { "epoch": 2.2453222453222454, "grad_norm": 3.5625, "learning_rate": 2.9714529393268016e-07, "loss": 1.1583, "step": 1080 }, { "epoch": 2.2557172557172556, "grad_norm": 3.484375, "learning_rate": 2.8943314645837955e-07, "loss": 1.1715, "step": 1085 }, { "epoch": 2.266112266112266, "grad_norm": 3.609375, "learning_rate": 2.8180543202609984e-07, "loss": 1.164, "step": 1090 }, { "epoch": 2.2765072765072767, "grad_norm": 3.53125, "learning_rate": 2.742630569983182e-07, "loss": 1.1695, "step": 1095 }, { "epoch": 2.286902286902287, "grad_norm": 3.578125, "learning_rate": 2.66806917597064e-07, "loss": 1.169, "step": 1100 }, { "epoch": 2.2972972972972974, "grad_norm": 3.578125, "learning_rate": 2.594378997974267e-07, "loss": 1.1615, "step": 1105 }, { "epoch": 2.3076923076923075, "grad_norm": 3.6875, "learning_rate": 2.5215687922227845e-07, "loss": 1.1712, "step": 1110 }, { "epoch": 2.318087318087318, "grad_norm": 3.515625, "learning_rate": 2.4496472103823027e-07, "loss": 1.1688, "step": 1115 }, { "epoch": 2.3284823284823286, "grad_norm": 3.609375, "learning_rate": 2.378622798528266e-07, "loss": 1.1631, "step": 1120 }, { "epoch": 2.3388773388773387, "grad_norm": 3.6875, "learning_rate": 2.3085039961299814e-07, "loss": 1.1671, "step": 1125 }, { "epoch": 2.3492723492723493, "grad_norm": 3.640625, "learning_rate": 2.239299135047794e-07, "loss": 1.1623, "step": 1130 }, { "epoch": 2.35966735966736, "grad_norm": 3.578125, "learning_rate": 2.1710164385430585e-07, "loss": 1.1716, "step": 1135 }, { "epoch": 2.37006237006237, "grad_norm": 3.546875, "learning_rate": 2.103664020300997e-07, "loss": 1.1674, "step": 1140 }, { "epoch": 2.3804573804573805, "grad_norm": 3.453125, "learning_rate": 2.037249883466614e-07, "loss": 1.1623, "step": 1145 }, { "epoch": 2.390852390852391, "grad_norm": 3.65625, "learning_rate": 1.971781919693697e-07, "loss": 1.1808, "step": 1150 }, { "epoch": 2.401247401247401, "grad_norm": 3.625, "learning_rate": 1.9072679082071163e-07, "loss": 1.169, "step": 1155 }, { "epoch": 2.4116424116424118, "grad_norm": 3.671875, "learning_rate": 1.8437155148784433e-07, "loss": 1.1717, "step": 1160 }, { "epoch": 2.422037422037422, "grad_norm": 3.59375, "learning_rate": 1.781132291315064e-07, "loss": 1.1706, "step": 1165 }, { "epoch": 2.4324324324324325, "grad_norm": 3.625, "learning_rate": 1.7195256739628439e-07, "loss": 1.1722, "step": 1170 }, { "epoch": 2.442827442827443, "grad_norm": 3.71875, "learning_rate": 1.6589029832225155e-07, "loss": 1.1615, "step": 1175 }, { "epoch": 2.453222453222453, "grad_norm": 3.625, "learning_rate": 1.599271422579812e-07, "loss": 1.1691, "step": 1180 }, { "epoch": 2.4636174636174637, "grad_norm": 3.640625, "learning_rate": 1.5406380777495297e-07, "loss": 1.1647, "step": 1185 }, { "epoch": 2.474012474012474, "grad_norm": 3.59375, "learning_rate": 1.4830099158335563e-07, "loss": 1.1707, "step": 1190 }, { "epoch": 2.4844074844074844, "grad_norm": 3.578125, "learning_rate": 1.426393784493015e-07, "loss": 1.1564, "step": 1195 }, { "epoch": 2.494802494802495, "grad_norm": 3.59375, "learning_rate": 1.3707964111345805e-07, "loss": 1.1721, "step": 1200 }, { "epoch": 2.494802494802495, "eval_loss": 1.1930803060531616, "eval_runtime": 10.8355, "eval_samples_per_second": 85.829, "eval_steps_per_second": 2.769, "step": 1200 }, { "epoch": 2.505197505197505, "grad_norm": 3.65625, "learning_rate": 1.3162244021111123e-07, "loss": 1.1677, "step": 1205 }, { "epoch": 2.5155925155925156, "grad_norm": 3.59375, "learning_rate": 1.2626842419366369e-07, "loss": 1.1551, "step": 1210 }, { "epoch": 2.525987525987526, "grad_norm": 3.640625, "learning_rate": 1.2101822925158378e-07, "loss": 1.1678, "step": 1215 }, { "epoch": 2.5363825363825363, "grad_norm": 3.65625, "learning_rate": 1.1587247923881016e-07, "loss": 1.1731, "step": 1220 }, { "epoch": 2.546777546777547, "grad_norm": 3.6875, "learning_rate": 1.1083178559862227e-07, "loss": 1.1707, "step": 1225 }, { "epoch": 2.5571725571725574, "grad_norm": 3.671875, "learning_rate": 1.0589674729098507e-07, "loss": 1.1733, "step": 1230 }, { "epoch": 2.5675675675675675, "grad_norm": 3.578125, "learning_rate": 1.0106795072137896e-07, "loss": 1.1741, "step": 1235 }, { "epoch": 2.577962577962578, "grad_norm": 3.625, "learning_rate": 9.634596967111853e-08, "loss": 1.1704, "step": 1240 }, { "epoch": 2.5883575883575882, "grad_norm": 3.640625, "learning_rate": 9.173136522917457e-08, "loss": 1.1679, "step": 1245 }, { "epoch": 2.598752598752599, "grad_norm": 3.546875, "learning_rate": 8.722468572550213e-08, "loss": 1.1682, "step": 1250 }, { "epoch": 2.609147609147609, "grad_norm": 3.640625, "learning_rate": 8.28264666658851e-08, "loss": 1.1653, "step": 1255 }, { "epoch": 2.6195426195426195, "grad_norm": 3.734375, "learning_rate": 7.853723066830486e-08, "loss": 1.1672, "step": 1260 }, { "epoch": 2.62993762993763, "grad_norm": 3.640625, "learning_rate": 7.435748740084046e-08, "loss": 1.1606, "step": 1265 }, { "epoch": 2.64033264033264, "grad_norm": 3.65625, "learning_rate": 7.028773352110684e-08, "loss": 1.1634, "step": 1270 }, { "epoch": 2.6507276507276507, "grad_norm": 3.515625, "learning_rate": 6.632845261724051e-08, "loss": 1.1635, "step": 1275 }, { "epoch": 2.6611226611226613, "grad_norm": 4.1875, "learning_rate": 6.248011515043617e-08, "loss": 1.1641, "step": 1280 }, { "epoch": 2.6715176715176714, "grad_norm": 3.65625, "learning_rate": 5.8743178399044966e-08, "loss": 1.1642, "step": 1285 }, { "epoch": 2.681912681912682, "grad_norm": 3.625, "learning_rate": 5.511808640423765e-08, "loss": 1.1727, "step": 1290 }, { "epoch": 2.6923076923076925, "grad_norm": 3.59375, "learning_rate": 5.160526991724246e-08, "loss": 1.1732, "step": 1295 }, { "epoch": 2.7027027027027026, "grad_norm": 3.609375, "learning_rate": 4.8205146348160195e-08, "loss": 1.1699, "step": 1300 }, { "epoch": 2.713097713097713, "grad_norm": 3.6875, "learning_rate": 4.491811971636605e-08, "loss": 1.166, "step": 1305 }, { "epoch": 2.7234927234927238, "grad_norm": 3.625, "learning_rate": 4.174458060250208e-08, "loss": 1.1712, "step": 1310 }, { "epoch": 2.733887733887734, "grad_norm": 3.59375, "learning_rate": 3.868490610206565e-08, "loss": 1.1595, "step": 1315 }, { "epoch": 2.7442827442827444, "grad_norm": 3.578125, "learning_rate": 3.5739459780602665e-08, "loss": 1.1684, "step": 1320 }, { "epoch": 2.7546777546777546, "grad_norm": 3.578125, "learning_rate": 3.290859163050508e-08, "loss": 1.1744, "step": 1325 }, { "epoch": 2.765072765072765, "grad_norm": 3.671875, "learning_rate": 3.0192638029424735e-08, "loss": 1.1664, "step": 1330 }, { "epoch": 2.7754677754677752, "grad_norm": 3.53125, "learning_rate": 2.7591921700302222e-08, "loss": 1.1612, "step": 1335 }, { "epoch": 2.785862785862786, "grad_norm": 3.609375, "learning_rate": 2.5106751673020012e-08, "loss": 1.174, "step": 1340 }, { "epoch": 2.7962577962577964, "grad_norm": 3.59375, "learning_rate": 2.273742324768124e-08, "loss": 1.1602, "step": 1345 }, { "epoch": 2.8066528066528065, "grad_norm": 3.546875, "learning_rate": 2.048421795952171e-08, "loss": 1.1708, "step": 1350 }, { "epoch": 2.817047817047817, "grad_norm": 3.609375, "learning_rate": 1.8347403545455497e-08, "loss": 1.1622, "step": 1355 }, { "epoch": 2.8274428274428276, "grad_norm": 3.640625, "learning_rate": 1.6327233912261984e-08, "loss": 1.1668, "step": 1360 }, { "epoch": 2.8378378378378377, "grad_norm": 3.609375, "learning_rate": 1.4423949106414868e-08, "loss": 1.1708, "step": 1365 }, { "epoch": 2.8482328482328483, "grad_norm": 3.640625, "learning_rate": 1.2637775285558983e-08, "loss": 1.1663, "step": 1370 }, { "epoch": 2.858627858627859, "grad_norm": 3.5625, "learning_rate": 1.0968924691636572e-08, "loss": 1.1621, "step": 1375 }, { "epoch": 2.869022869022869, "grad_norm": 3.671875, "learning_rate": 9.417595625668462e-09, "loss": 1.1769, "step": 1380 }, { "epoch": 2.8794178794178795, "grad_norm": 3.71875, "learning_rate": 7.983972424190354e-09, "loss": 1.1784, "step": 1385 }, { "epoch": 2.88981288981289, "grad_norm": 3.65625, "learning_rate": 6.668225437349351e-09, "loss": 1.1734, "step": 1390 }, { "epoch": 2.9002079002079, "grad_norm": 3.75, "learning_rate": 5.470511008662026e-09, "loss": 1.1747, "step": 1395 }, { "epoch": 2.9106029106029108, "grad_norm": 3.671875, "learning_rate": 4.390971456437076e-09, "loss": 1.171, "step": 1400 }, { "epoch": 2.920997920997921, "grad_norm": 3.671875, "learning_rate": 3.429735056863725e-09, "loss": 1.1667, "step": 1405 }, { "epoch": 2.9313929313929314, "grad_norm": 3.671875, "learning_rate": 2.5869160287702586e-09, "loss": 1.1683, "step": 1410 }, { "epoch": 2.9417879417879416, "grad_norm": 3.625, "learning_rate": 1.8626145200513199e-09, "loss": 1.1702, "step": 1415 }, { "epoch": 2.952182952182952, "grad_norm": 3.5, "learning_rate": 1.2569165957680983e-09, "loss": 1.1712, "step": 1420 }, { "epoch": 2.9625779625779627, "grad_norm": 3.640625, "learning_rate": 7.698942279216192e-10, "loss": 1.1725, "step": 1425 }, { "epoch": 2.972972972972973, "grad_norm": 3.5, "learning_rate": 4.016052869005859e-10, "loss": 1.1693, "step": 1430 }, { "epoch": 2.9833679833679834, "grad_norm": 3.578125, "learning_rate": 1.520935346051022e-10, "loss": 1.1682, "step": 1435 }, { "epoch": 2.993762993762994, "grad_norm": 3.609375, "learning_rate": 2.1388619246498486e-11, "loss": 1.1627, "step": 1440 }, { "epoch": 3.0, "step": 1443, "total_flos": 8.072899219804914e+18, "train_loss": 1.2002355462548142, "train_runtime": 6547.9575, "train_samples_per_second": 14.077, "train_steps_per_second": 0.22 } ], "logging_steps": 5, "max_steps": 1443, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.072899219804914e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }