{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 300, |
|
"global_step": 1443, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002079002079002079, |
|
"grad_norm": 21.0, |
|
"learning_rate": 1e-06, |
|
"loss": 1.4443, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010395010395010396, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 1.9999786113807535e-06, |
|
"loss": 1.4249, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02079002079002079, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.999847906465395e-06, |
|
"loss": 1.3779, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.031185031185031187, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.999598394713099e-06, |
|
"loss": 1.3446, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04158004158004158, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.9992301057720783e-06, |
|
"loss": 1.3445, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05197505197505198, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.9987430834042318e-06, |
|
"loss": 1.3221, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.062370062370062374, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9981373854799487e-06, |
|
"loss": 1.3312, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07276507276507277, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.99741308397123e-06, |
|
"loss": 1.3108, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08316008316008316, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9965702649431362e-06, |
|
"loss": 1.3148, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09355509355509356, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.9956090285435628e-06, |
|
"loss": 1.3075, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.10395010395010396, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9945294889913377e-06, |
|
"loss": 1.3029, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11434511434511435, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.9933317745626504e-06, |
|
"loss": 1.3022, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.12474012474012475, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.9920160275758096e-06, |
|
"loss": 1.2885, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.9905824043743316e-06, |
|
"loss": 1.2944, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.14553014553014554, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 1.9890310753083633e-06, |
|
"loss": 1.2993, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15592515592515593, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.987362224714441e-06, |
|
"loss": 1.2885, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.16632016632016633, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.985576050893585e-06, |
|
"loss": 1.2862, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.17671517671517672, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.983672766087738e-06, |
|
"loss": 1.2854, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.18711018711018712, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.9816525964545446e-06, |
|
"loss": 1.2765, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.19750519750519752, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.979515782040478e-06, |
|
"loss": 1.2874, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2079002079002079, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.977262576752319e-06, |
|
"loss": 1.2808, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2182952182952183, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.97489324832698e-06, |
|
"loss": 1.2768, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2286902286902287, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.972408078299698e-06, |
|
"loss": 1.2757, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2390852390852391, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.969807361970575e-06, |
|
"loss": 1.2897, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2494802494802495, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.9670914083694948e-06, |
|
"loss": 1.2796, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2598752598752599, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.9642605402193976e-06, |
|
"loss": 1.2684, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.9613150938979343e-06, |
|
"loss": 1.2694, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2806652806652807, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.958255419397498e-06, |
|
"loss": 1.263, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2910602910602911, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.9550818802836337e-06, |
|
"loss": 1.271, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.30145530145530147, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.9517948536518397e-06, |
|
"loss": 1.2638, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.31185031185031187, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.9483947300827577e-06, |
|
"loss": 1.2633, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.32224532224532226, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.944881913595762e-06, |
|
"loss": 1.2648, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.33264033264033266, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.941256821600955e-06, |
|
"loss": 1.2588, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.34303534303534305, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.9375198848495637e-06, |
|
"loss": 1.258, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.35343035343035345, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.9336715473827594e-06, |
|
"loss": 1.2588, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.36382536382536385, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.929712266478893e-06, |
|
"loss": 1.2538, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.37422037422037424, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 1.9256425125991596e-06, |
|
"loss": 1.2539, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.9214627693316952e-06, |
|
"loss": 1.2556, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.39501039501039503, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.9171735333341148e-06, |
|
"loss": 1.2524, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.9127753142744977e-06, |
|
"loss": 1.2471, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4158004158004158, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.9082686347708253e-06, |
|
"loss": 1.246, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4261954261954262, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.9036540303288815e-06, |
|
"loss": 1.247, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.4365904365904366, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.898932049278621e-06, |
|
"loss": 1.2496, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.446985446985447, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.8941032527090148e-06, |
|
"loss": 1.2492, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.4573804573804574, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.8891682144013777e-06, |
|
"loss": 1.2476, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4677754677754678, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.8841275207611895e-06, |
|
"loss": 1.2426, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4781704781704782, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.8789817707484162e-06, |
|
"loss": 1.2388, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4885654885654886, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.8737315758063364e-06, |
|
"loss": 1.2374, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.498960498960499, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 1.8683775597888886e-06, |
|
"loss": 1.2437, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5093555093555093, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.8629203588865419e-06, |
|
"loss": 1.2335, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.5197505197505198, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.8573606215506985e-06, |
|
"loss": 1.2355, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5301455301455301, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.8516990084166443e-06, |
|
"loss": 1.2384, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.8459361922250469e-06, |
|
"loss": 1.232, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5509355509355509, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.8400728577420187e-06, |
|
"loss": 1.2356, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5613305613305614, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.8341097016777484e-06, |
|
"loss": 1.2408, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5717255717255717, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.8280474326037155e-06, |
|
"loss": 1.2465, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5821205821205822, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.8218867708684937e-06, |
|
"loss": 1.2331, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5925155925155925, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.8156284485121556e-06, |
|
"loss": 1.2323, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6029106029106029, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.8092732091792884e-06, |
|
"loss": 1.2365, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6133056133056133, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.8028218080306302e-06, |
|
"loss": 1.2394, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.6237006237006237, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.7962750116533387e-06, |
|
"loss": 1.2263, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6237006237006237, |
|
"eval_loss": 1.2331745624542236, |
|
"eval_runtime": 10.8501, |
|
"eval_samples_per_second": 85.713, |
|
"eval_steps_per_second": 2.765, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6340956340956341, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.7896335979699001e-06, |
|
"loss": 1.2311, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.6444906444906445, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.782898356145694e-06, |
|
"loss": 1.2236, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6548856548856549, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.7760700864952205e-06, |
|
"loss": 1.2376, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.6652806652806653, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.7691496003870018e-06, |
|
"loss": 1.2239, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.7621377201471735e-06, |
|
"loss": 1.2311, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6860706860706861, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.75503527896177e-06, |
|
"loss": 1.226, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6964656964656964, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.7478431207777215e-06, |
|
"loss": 1.228, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.7068607068607069, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.7405621002025735e-06, |
|
"loss": 1.2216, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7172557172557172, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.733193082402936e-06, |
|
"loss": 1.2223, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.7276507276507277, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.7257369430016817e-06, |
|
"loss": 1.2158, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.738045738045738, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.7181945679739003e-06, |
|
"loss": 1.2206, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7484407484407485, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.7105668535416205e-06, |
|
"loss": 1.2292, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7588357588357588, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.7028547060673197e-06, |
|
"loss": 1.2272, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.6950590419462229e-06, |
|
"loss": 1.2264, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7796257796257796, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.687180787497413e-06, |
|
"loss": 1.2155, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.7900207900207901, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.6792208788537617e-06, |
|
"loss": 1.2154, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8004158004158004, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.6711802618506926e-06, |
|
"loss": 1.2134, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.663059891913793e-06, |
|
"loss": 1.2211, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8212058212058212, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.6548607339452852e-06, |
|
"loss": 1.2222, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8316008316008316, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.6465837622093722e-06, |
|
"loss": 1.2188, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.841995841995842, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.6382299602164706e-06, |
|
"loss": 1.2189, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.8523908523908524, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.6298003206063466e-06, |
|
"loss": 1.2145, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8627858627858628, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.6212958450301625e-06, |
|
"loss": 1.2168, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.8731808731808732, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.6127175440314594e-06, |
|
"loss": 1.2083, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8835758835758836, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.6040664369260758e-06, |
|
"loss": 1.2183, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.893970893970894, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.5953435516810303e-06, |
|
"loss": 1.2134, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9043659043659044, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.586549924792372e-06, |
|
"loss": 1.2145, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.9147609147609148, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.5776866011620198e-06, |
|
"loss": 1.2148, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9251559251559252, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.5687546339736013e-06, |
|
"loss": 1.212, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.9355509355509356, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.559755084567309e-06, |
|
"loss": 1.2183, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.5506890223137857e-06, |
|
"loss": 1.2123, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.9563409563409564, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.5415575244870578e-06, |
|
"loss": 1.214, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9667359667359667, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.5323616761365278e-06, |
|
"loss": 1.2081, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.9771309771309772, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.5231025699580427e-06, |
|
"loss": 1.1995, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9875259875259875, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.513781306164056e-06, |
|
"loss": 1.2226, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.997920997920998, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.5043989923528937e-06, |
|
"loss": 1.2183, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0083160083160083, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 1.4949567433771448e-06, |
|
"loss": 1.1826, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.0187110187110187, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.4854556812111887e-06, |
|
"loss": 1.1925, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0291060291060292, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.4758969348178766e-06, |
|
"loss": 1.19, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.0395010395010396, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.4662816400143836e-06, |
|
"loss": 1.1963, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.04989604989605, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.4566109393372433e-06, |
|
"loss": 1.1872, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.0602910602910602, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.4468859819065882e-06, |
|
"loss": 1.1833, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0706860706860706, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.4371079232896044e-06, |
|
"loss": 1.1815, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.4272779253632212e-06, |
|
"loss": 1.1855, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.0914760914760915, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.4173971561760518e-06, |
|
"loss": 1.188, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.1018711018711018, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.4074667898096009e-06, |
|
"loss": 1.1873, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1122661122661124, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.397488006238752e-06, |
|
"loss": 1.1945, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.1226611226611227, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.387461991191559e-06, |
|
"loss": 1.1856, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.133056133056133, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.3773899360083524e-06, |
|
"loss": 1.1868, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.1434511434511434, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.3672730375001773e-06, |
|
"loss": 1.1791, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.357112497806582e-06, |
|
"loss": 1.1969, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.1642411642411643, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.3469095242527764e-06, |
|
"loss": 1.1828, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.1746361746361746, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.3366653292061682e-06, |
|
"loss": 1.1803, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.185031185031185, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.3263811299323063e-06, |
|
"loss": 1.1803, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.1954261954261955, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.3160581484502382e-06, |
|
"loss": 1.1823, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.2058212058212059, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.3056976113873037e-06, |
|
"loss": 1.1832, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.2162162162162162, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.2953007498333807e-06, |
|
"loss": 1.1841, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.2266112266112266, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.284868799194602e-06, |
|
"loss": 1.1804, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.237006237006237, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.2744029990465574e-06, |
|
"loss": 1.1811, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.2474012474012475, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.2639045929870018e-06, |
|
"loss": 1.1794, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2474012474012475, |
|
"eval_loss": 1.2036519050598145, |
|
"eval_runtime": 10.8624, |
|
"eval_samples_per_second": 85.616, |
|
"eval_steps_per_second": 2.762, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2577962577962578, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.2533748284880842e-06, |
|
"loss": 1.1905, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.2681912681912682, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.2428149567481184e-06, |
|
"loss": 1.1836, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.2785862785862787, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.2322262325429063e-06, |
|
"loss": 1.1823, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.288981288981289, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.2216099140766436e-06, |
|
"loss": 1.1836, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.2993762993762994, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.2109672628324104e-06, |
|
"loss": 1.1837, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.3097713097713097, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.2002995434222767e-06, |
|
"loss": 1.1827, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.32016632016632, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.1896080234370355e-06, |
|
"loss": 1.1803, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.3305613305613306, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.178893973295581e-06, |
|
"loss": 1.1788, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.340956340956341, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.1681586660939504e-06, |
|
"loss": 1.1918, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.1574033774540505e-06, |
|
"loss": 1.1796, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3617463617463619, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.1466293853720795e-06, |
|
"loss": 1.1837, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.3721413721413722, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.1358379700666703e-06, |
|
"loss": 1.1776, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.3825363825363826, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.1250304138267701e-06, |
|
"loss": 1.181, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.392931392931393, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.11420800085927e-06, |
|
"loss": 1.1764, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.4033264033264032, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.1033720171364108e-06, |
|
"loss": 1.1792, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.4137214137214138, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.092523750242977e-06, |
|
"loss": 1.1784, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.4241164241164241, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.0816644892232997e-06, |
|
"loss": 1.1855, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.4345114345114345, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.070795524428086e-06, |
|
"loss": 1.1782, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.444906444906445, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.0599181473610938e-06, |
|
"loss": 1.1837, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.4553014553014554, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.049033650525668e-06, |
|
"loss": 1.1786, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4656964656964657, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.0381433272711585e-06, |
|
"loss": 1.1747, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.476091476091476, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.0272484716392408e-06, |
|
"loss": 1.1854, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.4864864864864864, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.0163503782101484e-06, |
|
"loss": 1.1755, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.496881496881497, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.0054503419488454e-06, |
|
"loss": 1.1795, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.5072765072765073, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 9.945496580511543e-07, |
|
"loss": 1.1846, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.5176715176715176, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 9.836496217898518e-07, |
|
"loss": 1.1806, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.5280665280665282, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 9.72751528360759e-07, |
|
"loss": 1.1667, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 9.618566727288414e-07, |
|
"loss": 1.185, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.5488565488565489, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 9.509663494743321e-07, |
|
"loss": 1.191, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.5592515592515592, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 9.400818526389062e-07, |
|
"loss": 1.1769, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.5696465696465696, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 9.292044755719138e-07, |
|
"loss": 1.1741, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.5800415800415801, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 9.183355107767003e-07, |
|
"loss": 1.1771, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.5904365904365905, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 9.07476249757023e-07, |
|
"loss": 1.1725, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.6008316008316008, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 8.966279828635894e-07, |
|
"loss": 1.1801, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.6112266112266114, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 8.8579199914073e-07, |
|
"loss": 1.1697, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 8.749695861732299e-07, |
|
"loss": 1.175, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.632016632016632, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 8.641620299333295e-07, |
|
"loss": 1.1818, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.6424116424116424, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 8.533706146279207e-07, |
|
"loss": 1.1697, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.6528066528066527, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 8.425966225459493e-07, |
|
"loss": 1.1702, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.6632016632016633, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 8.318413339060495e-07, |
|
"loss": 1.1793, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6735966735966736, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 8.21106026704419e-07, |
|
"loss": 1.1801, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.683991683991684, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 8.103919765629645e-07, |
|
"loss": 1.1797, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.6943866943866945, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 7.997004565777233e-07, |
|
"loss": 1.1659, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.7047817047817047, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 7.890327371675895e-07, |
|
"loss": 1.1743, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.7151767151767152, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 7.783900859233562e-07, |
|
"loss": 1.1849, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.7255717255717256, |
|
"grad_norm": 3.625, |
|
"learning_rate": 7.677737674570936e-07, |
|
"loss": 1.1791, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.735966735966736, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 7.571850432518819e-07, |
|
"loss": 1.1789, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.7463617463617465, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 7.466251715119156e-07, |
|
"loss": 1.1773, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.7567567567567568, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 7.360954070129981e-07, |
|
"loss": 1.175, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.7671517671517671, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 7.255970009534425e-07, |
|
"loss": 1.1816, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7775467775467777, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 7.151312008053979e-07, |
|
"loss": 1.1848, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.7879417879417878, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 7.046992501666195e-07, |
|
"loss": 1.175, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.7983367983367984, |
|
"grad_norm": 3.625, |
|
"learning_rate": 6.943023886126965e-07, |
|
"loss": 1.1753, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.8087318087318087, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 6.839418515497618e-07, |
|
"loss": 1.1718, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.819126819126819, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.736188700676935e-07, |
|
"loss": 1.164, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.8295218295218296, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.633346707938319e-07, |
|
"loss": 1.1761, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.83991683991684, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 6.530904757472236e-07, |
|
"loss": 1.1869, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.8503118503118503, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.42887502193418e-07, |
|
"loss": 1.1836, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.8607068607068609, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 6.327269624998227e-07, |
|
"loss": 1.1699, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.871101871101871, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 6.226100639916474e-07, |
|
"loss": 1.1743, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.871101871101871, |
|
"eval_loss": 1.1942965984344482, |
|
"eval_runtime": 10.786, |
|
"eval_samples_per_second": 86.223, |
|
"eval_steps_per_second": 2.781, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8814968814968815, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 6.125380088084408e-07, |
|
"loss": 1.1797, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 6.025119937612481e-07, |
|
"loss": 1.1758, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.9022869022869022, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 5.925332101903994e-07, |
|
"loss": 1.1783, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.9126819126819128, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 5.826028438239479e-07, |
|
"loss": 1.1763, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 5.727220746367791e-07, |
|
"loss": 1.18, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.9334719334719335, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 5.628920767103957e-07, |
|
"loss": 1.1782, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.943866943866944, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 5.531140180934119e-07, |
|
"loss": 1.1772, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.9542619542619541, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 5.433890606627568e-07, |
|
"loss": 1.175, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.9646569646569647, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 5.337183599856164e-07, |
|
"loss": 1.1745, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.975051975051975, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 5.241030651821231e-07, |
|
"loss": 1.1662, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.9854469854469854, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 5.145443187888114e-07, |
|
"loss": 1.171, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.995841995841996, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 5.050432566228552e-07, |
|
"loss": 1.1831, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.006237006237006, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 4.956010076471065e-07, |
|
"loss": 1.1592, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 2.0166320166320166, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 4.862186938359441e-07, |
|
"loss": 1.1667, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.027027027027027, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 4.768974300419573e-07, |
|
"loss": 1.1678, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.0374220374220373, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.6763832386347214e-07, |
|
"loss": 1.1723, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.047817047817048, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 4.5844247551294224e-07, |
|
"loss": 1.1642, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 2.0582120582120584, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 4.493109776862143e-07, |
|
"loss": 1.17, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.0686070686070686, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 4.402449154326913e-07, |
|
"loss": 1.1651, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 2.079002079002079, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 4.312453660263987e-07, |
|
"loss": 1.1783, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0893970893970892, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 4.2231339883798025e-07, |
|
"loss": 1.1711, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 2.0997920997921, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 4.13450075207628e-07, |
|
"loss": 1.163, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.1101871101871104, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 4.0465644831897006e-07, |
|
"loss": 1.17, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 2.1205821205821205, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.9593356307392436e-07, |
|
"loss": 1.1733, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.130977130977131, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 3.872824559685409e-07, |
|
"loss": 1.1762, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.141372141372141, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 3.7870415496983743e-07, |
|
"loss": 1.1734, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.1517671517671517, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 3.701996793936535e-07, |
|
"loss": 1.1724, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.6177003978352917e-07, |
|
"loss": 1.1718, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.1725571725571724, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.5341623779062813e-07, |
|
"loss": 1.1688, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 2.182952182952183, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 3.45139266054715e-07, |
|
"loss": 1.1732, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.1933471933471935, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 3.3694010808620733e-07, |
|
"loss": 1.1619, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 2.2037422037422036, |
|
"grad_norm": 3.625, |
|
"learning_rate": 3.288197381493075e-07, |
|
"loss": 1.1673, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.214137214137214, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 3.207791211462383e-07, |
|
"loss": 1.1725, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 2.2245322245322248, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 3.128192125025869e-07, |
|
"loss": 1.1673, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.234927234927235, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 3.049409580537773e-07, |
|
"loss": 1.1735, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.2453222453222454, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 2.9714529393268016e-07, |
|
"loss": 1.1583, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.2557172557172556, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 2.8943314645837955e-07, |
|
"loss": 1.1715, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 2.266112266112266, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 2.8180543202609984e-07, |
|
"loss": 1.164, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.2765072765072767, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 2.742630569983182e-07, |
|
"loss": 1.1695, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 2.286902286902287, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 2.66806917597064e-07, |
|
"loss": 1.169, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.2972972972972974, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 2.594378997974267e-07, |
|
"loss": 1.1615, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 2.5215687922227845e-07, |
|
"loss": 1.1712, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.318087318087318, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 2.4496472103823027e-07, |
|
"loss": 1.1688, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 2.3284823284823286, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 2.378622798528266e-07, |
|
"loss": 1.1631, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.3388773388773387, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 2.3085039961299814e-07, |
|
"loss": 1.1671, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 2.3492723492723493, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 2.239299135047794e-07, |
|
"loss": 1.1623, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.35966735966736, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 2.1710164385430585e-07, |
|
"loss": 1.1716, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 2.37006237006237, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2.103664020300997e-07, |
|
"loss": 1.1674, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.3804573804573805, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 2.037249883466614e-07, |
|
"loss": 1.1623, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 2.390852390852391, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.971781919693697e-07, |
|
"loss": 1.1808, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.401247401247401, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.9072679082071163e-07, |
|
"loss": 1.169, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 2.4116424116424118, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.8437155148784433e-07, |
|
"loss": 1.1717, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.422037422037422, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.781132291315064e-07, |
|
"loss": 1.1706, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.7195256739628439e-07, |
|
"loss": 1.1722, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.442827442827443, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.6589029832225155e-07, |
|
"loss": 1.1615, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 2.453222453222453, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.599271422579812e-07, |
|
"loss": 1.1691, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.4636174636174637, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.5406380777495297e-07, |
|
"loss": 1.1647, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 2.474012474012474, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.4830099158335563e-07, |
|
"loss": 1.1707, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.4844074844074844, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.426393784493015e-07, |
|
"loss": 1.1564, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 2.494802494802495, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.3707964111345805e-07, |
|
"loss": 1.1721, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.494802494802495, |
|
"eval_loss": 1.1930803060531616, |
|
"eval_runtime": 10.8355, |
|
"eval_samples_per_second": 85.829, |
|
"eval_steps_per_second": 2.769, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.505197505197505, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.3162244021111123e-07, |
|
"loss": 1.1677, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 2.5155925155925156, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.2626842419366369e-07, |
|
"loss": 1.1551, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.525987525987526, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.2101822925158378e-07, |
|
"loss": 1.1678, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 2.5363825363825363, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.1587247923881016e-07, |
|
"loss": 1.1731, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.546777546777547, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.1083178559862227e-07, |
|
"loss": 1.1707, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 2.5571725571725574, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.0589674729098507e-07, |
|
"loss": 1.1733, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.5675675675675675, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.0106795072137896e-07, |
|
"loss": 1.1741, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 2.577962577962578, |
|
"grad_norm": 3.625, |
|
"learning_rate": 9.634596967111853e-08, |
|
"loss": 1.1704, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.5883575883575882, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 9.173136522917457e-08, |
|
"loss": 1.1679, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 2.598752598752599, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 8.722468572550213e-08, |
|
"loss": 1.1682, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.609147609147609, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 8.28264666658851e-08, |
|
"loss": 1.1653, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 2.6195426195426195, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 7.853723066830486e-08, |
|
"loss": 1.1672, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.62993762993763, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 7.435748740084046e-08, |
|
"loss": 1.1606, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 2.64033264033264, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 7.028773352110684e-08, |
|
"loss": 1.1634, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.6507276507276507, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 6.632845261724051e-08, |
|
"loss": 1.1635, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 2.6611226611226613, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 6.248011515043617e-08, |
|
"loss": 1.1641, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.6715176715176714, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 5.8743178399044966e-08, |
|
"loss": 1.1642, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 2.681912681912682, |
|
"grad_norm": 3.625, |
|
"learning_rate": 5.511808640423765e-08, |
|
"loss": 1.1727, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 5.160526991724246e-08, |
|
"loss": 1.1732, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 4.8205146348160195e-08, |
|
"loss": 1.1699, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.713097713097713, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 4.491811971636605e-08, |
|
"loss": 1.166, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 2.7234927234927238, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.174458060250208e-08, |
|
"loss": 1.1712, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.733887733887734, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 3.868490610206565e-08, |
|
"loss": 1.1595, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 2.7442827442827444, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 3.5739459780602665e-08, |
|
"loss": 1.1684, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.7546777546777546, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 3.290859163050508e-08, |
|
"loss": 1.1744, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 2.765072765072765, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 3.0192638029424735e-08, |
|
"loss": 1.1664, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.7754677754677752, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 2.7591921700302222e-08, |
|
"loss": 1.1612, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 2.785862785862786, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 2.5106751673020012e-08, |
|
"loss": 1.174, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.7962577962577964, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 2.273742324768124e-08, |
|
"loss": 1.1602, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 2.8066528066528065, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2.048421795952171e-08, |
|
"loss": 1.1708, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.817047817047817, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.8347403545455497e-08, |
|
"loss": 1.1622, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 2.8274428274428276, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.6327233912261984e-08, |
|
"loss": 1.1668, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.8378378378378377, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.4423949106414868e-08, |
|
"loss": 1.1708, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 2.8482328482328483, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.2637775285558983e-08, |
|
"loss": 1.1663, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.858627858627859, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.0968924691636572e-08, |
|
"loss": 1.1621, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 2.869022869022869, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 9.417595625668462e-09, |
|
"loss": 1.1769, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.8794178794178795, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 7.983972424190354e-09, |
|
"loss": 1.1784, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 2.88981288981289, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 6.668225437349351e-09, |
|
"loss": 1.1734, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.9002079002079, |
|
"grad_norm": 3.75, |
|
"learning_rate": 5.470511008662026e-09, |
|
"loss": 1.1747, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 2.9106029106029108, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 4.390971456437076e-09, |
|
"loss": 1.171, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.920997920997921, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 3.429735056863725e-09, |
|
"loss": 1.1667, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 2.9313929313929314, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 2.5869160287702586e-09, |
|
"loss": 1.1683, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.9417879417879416, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.8626145200513199e-09, |
|
"loss": 1.1702, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 2.952182952182952, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.2569165957680983e-09, |
|
"loss": 1.1712, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.9625779625779627, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 7.698942279216192e-10, |
|
"loss": 1.1725, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 3.5, |
|
"learning_rate": 4.016052869005859e-10, |
|
"loss": 1.1693, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.9833679833679834, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.520935346051022e-10, |
|
"loss": 1.1682, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 2.993762993762994, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 2.1388619246498486e-11, |
|
"loss": 1.1627, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1443, |
|
"total_flos": 8.072899219804914e+18, |
|
"train_loss": 1.2002355462548142, |
|
"train_runtime": 6547.9575, |
|
"train_samples_per_second": 14.077, |
|
"train_steps_per_second": 0.22 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1443, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.072899219804914e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |