{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.05660991522665195,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005660991522665195,
      "grad_norm": 0.4538244605064392,
      "learning_rate": 2e-05,
      "loss": 3.4057,
      "step": 10
    },
    {
      "epoch": 0.001132198304533039,
      "grad_norm": 0.5858215689659119,
      "learning_rate": 4e-05,
      "loss": 3.0604,
      "step": 20
    },
    {
      "epoch": 0.0016982974567995583,
      "grad_norm": 0.612471342086792,
      "learning_rate": 6e-05,
      "loss": 3.3496,
      "step": 30
    },
    {
      "epoch": 0.002264396609066078,
      "grad_norm": 0.9295674562454224,
      "learning_rate": 8e-05,
      "loss": 2.9204,
      "step": 40
    },
    {
      "epoch": 0.0028304957613325974,
      "grad_norm": 0.7428744435310364,
      "learning_rate": 0.0001,
      "loss": 2.7989,
      "step": 50
    },
    {
      "epoch": 0.0033965949135991167,
      "grad_norm": 39.52121353149414,
      "learning_rate": 0.00012,
      "loss": 2.3753,
      "step": 60
    },
    {
      "epoch": 0.003962694065865636,
      "grad_norm": 1.2908339500427246,
      "learning_rate": 0.00014,
      "loss": 2.3282,
      "step": 70
    },
    {
      "epoch": 0.004528793218132156,
      "grad_norm": 1.0259243249893188,
      "learning_rate": 0.00016,
      "loss": 2.0235,
      "step": 80
    },
    {
      "epoch": 0.0050948923703986754,
      "grad_norm": 5.856877326965332,
      "learning_rate": 0.00018,
      "loss": 2.3282,
      "step": 90
    },
    {
      "epoch": 0.005660991522665195,
      "grad_norm": 1.6842005252838135,
      "learning_rate": 0.0002,
      "loss": 2.1914,
      "step": 100
    },
    {
      "epoch": 0.006227090674931714,
      "grad_norm": 1.1379739046096802,
      "learning_rate": 0.0001999390827019096,
      "loss": 2.1828,
      "step": 110
    },
    {
      "epoch": 0.006793189827198233,
      "grad_norm": 1.5961283445358276,
      "learning_rate": 0.00019975640502598244,
      "loss": 2.1712,
      "step": 120
    },
    {
      "epoch": 0.0073592889794647535,
      "grad_norm": 1.2035201787948608,
      "learning_rate": 0.00019945218953682734,
      "loss": 1.8216,
      "step": 130
    },
    {
      "epoch": 0.007925388131731272,
      "grad_norm": 3.0688140392303467,
      "learning_rate": 0.00019902680687415705,
      "loss": 2.0315,
      "step": 140
    },
    {
      "epoch": 0.008491487283997793,
      "grad_norm": 1.050741195678711,
      "learning_rate": 0.00019848077530122083,
      "loss": 1.9974,
      "step": 150
    },
    {
      "epoch": 0.009057586436264312,
      "grad_norm": 1.5697944164276123,
      "learning_rate": 0.00019781476007338058,
      "loss": 1.8635,
      "step": 160
    },
    {
      "epoch": 0.009623685588530832,
      "grad_norm": 1.8822702169418335,
      "learning_rate": 0.00019702957262759965,
      "loss": 1.7645,
      "step": 170
    },
    {
      "epoch": 0.010189784740797351,
      "grad_norm": 6.6875224113464355,
      "learning_rate": 0.0001961261695938319,
      "loss": 2.0548,
      "step": 180
    },
    {
      "epoch": 0.01075588389306387,
      "grad_norm": 1.6249600648880005,
      "learning_rate": 0.00019510565162951537,
      "loss": 1.9403,
      "step": 190
    },
    {
      "epoch": 0.01132198304533039,
      "grad_norm": 1.3810372352600098,
      "learning_rate": 0.00019396926207859084,
      "loss": 1.8465,
      "step": 200
    },
    {
      "epoch": 0.011888082197596909,
      "grad_norm": 1.6512261629104614,
      "learning_rate": 0.00019271838545667876,
      "loss": 1.764,
      "step": 210
    },
    {
      "epoch": 0.012454181349863428,
      "grad_norm": 2.0129904747009277,
      "learning_rate": 0.0001913545457642601,
      "loss": 1.6833,
      "step": 220
    },
    {
      "epoch": 0.013020280502129947,
      "grad_norm": 2.3854947090148926,
      "learning_rate": 0.0001898794046299167,
      "loss": 1.9643,
      "step": 230
    },
    {
      "epoch": 0.013586379654396467,
      "grad_norm": 11.102593421936035,
      "learning_rate": 0.00018829475928589271,
      "loss": 1.8319,
      "step": 240
    },
    {
      "epoch": 0.014152478806662988,
      "grad_norm": 1.0218788385391235,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.7456,
      "step": 250
    },
    {
      "epoch": 0.014718577958929507,
      "grad_norm": 2.1926181316375732,
      "learning_rate": 0.0001848048096156426,
      "loss": 1.7705,
      "step": 260
    },
    {
      "epoch": 0.015284677111196026,
      "grad_norm": 1.2894445657730103,
      "learning_rate": 0.00018290375725550417,
      "loss": 1.759,
      "step": 270
    },
    {
      "epoch": 0.015850776263462544,
      "grad_norm": 1.2821546792984009,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.7809,
      "step": 280
    },
    {
      "epoch": 0.016416875415729067,
      "grad_norm": 1.6037951707839966,
      "learning_rate": 0.00017880107536067218,
      "loss": 1.6338,
      "step": 290
    },
    {
      "epoch": 0.016982974567995586,
      "grad_norm": 1.6165717840194702,
      "learning_rate": 0.0001766044443118978,
      "loss": 1.59,
      "step": 300
    },
    {
      "epoch": 0.017549073720262105,
      "grad_norm": 1.3858145475387573,
      "learning_rate": 0.00017431448254773944,
      "loss": 1.7459,
      "step": 310
    },
    {
      "epoch": 0.018115172872528625,
      "grad_norm": 1.5735255479812622,
      "learning_rate": 0.0001719339800338651,
      "loss": 1.7295,
      "step": 320
    },
    {
      "epoch": 0.018681272024795144,
      "grad_norm": 1.284432053565979,
      "learning_rate": 0.00016946583704589973,
      "loss": 1.769,
      "step": 330
    },
    {
      "epoch": 0.019247371177061663,
      "grad_norm": 0.984366238117218,
      "learning_rate": 0.00016691306063588583,
      "loss": 1.6516,
      "step": 340
    },
    {
      "epoch": 0.019813470329328182,
      "grad_norm": 1.6160484552383423,
      "learning_rate": 0.00016427876096865394,
      "loss": 1.529,
      "step": 350
    },
    {
      "epoch": 0.020379569481594702,
      "grad_norm": 1.8237900733947754,
      "learning_rate": 0.0001615661475325658,
      "loss": 1.7505,
      "step": 360
    },
    {
      "epoch": 0.02094566863386122,
      "grad_norm": 1.297549843788147,
      "learning_rate": 0.00015877852522924732,
      "loss": 1.7133,
      "step": 370
    },
    {
      "epoch": 0.02151176778612774,
      "grad_norm": 2.0918915271759033,
      "learning_rate": 0.0001559192903470747,
      "loss": 1.6225,
      "step": 380
    },
    {
      "epoch": 0.02207786693839426,
      "grad_norm": 1.0555062294006348,
      "learning_rate": 0.0001529919264233205,
      "loss": 1.6404,
      "step": 390
    },
    {
      "epoch": 0.02264396609066078,
      "grad_norm": 1.5048798322677612,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.6544,
      "step": 400
    },
    {
      "epoch": 0.0232100652429273,
      "grad_norm": 1.7201550006866455,
      "learning_rate": 0.00014694715627858908,
      "loss": 1.5341,
      "step": 410
    },
    {
      "epoch": 0.023776164395193818,
      "grad_norm": 1.2342567443847656,
      "learning_rate": 0.00014383711467890774,
      "loss": 1.5762,
      "step": 420
    },
    {
      "epoch": 0.024342263547460337,
      "grad_norm": 0.8844823241233826,
      "learning_rate": 0.00014067366430758004,
      "loss": 1.4964,
      "step": 430
    },
    {
      "epoch": 0.024908362699726856,
      "grad_norm": 2.017977714538574,
      "learning_rate": 0.00013746065934159123,
      "loss": 1.5507,
      "step": 440
    },
    {
      "epoch": 0.025474461851993375,
      "grad_norm": 1.0709903240203857,
      "learning_rate": 0.00013420201433256689,
      "loss": 1.51,
      "step": 450
    },
    {
      "epoch": 0.026040561004259895,
      "grad_norm": 0.863139271736145,
      "learning_rate": 0.00013090169943749476,
      "loss": 1.5191,
      "step": 460
    },
    {
      "epoch": 0.026606660156526414,
      "grad_norm": 1.096670389175415,
      "learning_rate": 0.0001275637355816999,
      "loss": 1.6605,
      "step": 470
    },
    {
      "epoch": 0.027172759308792933,
      "grad_norm": 0.9680797457695007,
      "learning_rate": 0.00012419218955996676,
      "loss": 1.6433,
      "step": 480
    },
    {
      "epoch": 0.027738858461059456,
      "grad_norm": 1.6639995574951172,
      "learning_rate": 0.00012079116908177593,
      "loss": 1.5499,
      "step": 490
    },
    {
      "epoch": 0.028304957613325975,
      "grad_norm": 1.3906327486038208,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.5884,
      "step": 500
    },
    {
      "epoch": 0.028871056765592495,
      "grad_norm": 1.403045654296875,
      "learning_rate": 0.00011391731009600654,
      "loss": 1.4895,
      "step": 510
    },
    {
      "epoch": 0.029437155917859014,
      "grad_norm": 1.4162797927856445,
      "learning_rate": 0.00011045284632676536,
      "loss": 1.7302,
      "step": 520
    },
    {
      "epoch": 0.030003255070125533,
      "grad_norm": 1.0728657245635986,
      "learning_rate": 0.00010697564737441252,
      "loss": 1.6658,
      "step": 530
    },
    {
      "epoch": 0.030569354222392053,
      "grad_norm": 0.9048101902008057,
      "learning_rate": 0.00010348994967025012,
      "loss": 1.6423,
      "step": 540
    },
    {
      "epoch": 0.031135453374658572,
      "grad_norm": 4.7274250984191895,
      "learning_rate": 0.0001,
      "loss": 1.6433,
      "step": 550
    },
    {
      "epoch": 0.03170155252692509,
      "grad_norm": 1.6377885341644287,
      "learning_rate": 9.651005032974994e-05,
      "loss": 1.6475,
      "step": 560
    },
    {
      "epoch": 0.03226765167919161,
      "grad_norm": 2.1424057483673096,
      "learning_rate": 9.302435262558747e-05,
      "loss": 1.5888,
      "step": 570
    },
    {
      "epoch": 0.03283375083145813,
      "grad_norm": 0.9035511612892151,
      "learning_rate": 8.954715367323468e-05,
      "loss": 1.6005,
      "step": 580
    },
    {
      "epoch": 0.03339984998372465,
      "grad_norm": 1.209153413772583,
      "learning_rate": 8.608268990399349e-05,
      "loss": 1.6187,
      "step": 590
    },
    {
      "epoch": 0.03396594913599117,
      "grad_norm": 1.4936399459838867,
      "learning_rate": 8.263518223330697e-05,
      "loss": 1.6622,
      "step": 600
    },
    {
      "epoch": 0.03453204828825769,
      "grad_norm": 0.9744408130645752,
      "learning_rate": 7.920883091822408e-05,
      "loss": 1.5553,
      "step": 610
    },
    {
      "epoch": 0.03509814744052421,
      "grad_norm": 1.1214041709899902,
      "learning_rate": 7.580781044003324e-05,
      "loss": 1.4749,
      "step": 620
    },
    {
      "epoch": 0.03566424659279073,
      "grad_norm": 1.5141066312789917,
      "learning_rate": 7.243626441830009e-05,
      "loss": 1.6162,
      "step": 630
    },
    {
      "epoch": 0.03623034574505725,
      "grad_norm": 1.1192114353179932,
      "learning_rate": 6.909830056250527e-05,
      "loss": 1.6655,
      "step": 640
    },
    {
      "epoch": 0.03679644489732377,
      "grad_norm": 0.8808345794677734,
      "learning_rate": 6.579798566743314e-05,
      "loss": 1.5585,
      "step": 650
    },
    {
      "epoch": 0.03736254404959029,
      "grad_norm": 1.2207506895065308,
      "learning_rate": 6.25393406584088e-05,
      "loss": 1.5282,
      "step": 660
    },
    {
      "epoch": 0.03792864320185681,
      "grad_norm": 0.8557892441749573,
      "learning_rate": 5.9326335692419995e-05,
      "loss": 1.6304,
      "step": 670
    },
    {
      "epoch": 0.038494742354123326,
      "grad_norm": 1.4455292224884033,
      "learning_rate": 5.616288532109225e-05,
      "loss": 1.7623,
      "step": 680
    },
    {
      "epoch": 0.039060841506389846,
      "grad_norm": 1.7669495344161987,
      "learning_rate": 5.305284372141095e-05,
      "loss": 1.5934,
      "step": 690
    },
    {
      "epoch": 0.039626940658656365,
      "grad_norm": 2.2413482666015625,
      "learning_rate": 5.000000000000002e-05,
      "loss": 1.692,
      "step": 700
    },
    {
      "epoch": 0.040193039810922884,
      "grad_norm": 0.9742591977119446,
      "learning_rate": 4.700807357667952e-05,
      "loss": 1.5471,
      "step": 710
    },
    {
      "epoch": 0.040759138963189404,
      "grad_norm": 1.437410593032837,
      "learning_rate": 4.4080709652925336e-05,
      "loss": 1.5466,
      "step": 720
    },
    {
      "epoch": 0.04132523811545592,
      "grad_norm": 1.625700831413269,
      "learning_rate": 4.12214747707527e-05,
      "loss": 1.5798,
      "step": 730
    },
    {
      "epoch": 0.04189133726772244,
      "grad_norm": 0.8843604922294617,
      "learning_rate": 3.843385246743417e-05,
      "loss": 1.5987,
      "step": 740
    },
    {
      "epoch": 0.04245743641998896,
      "grad_norm": 1.7022398710250854,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.7433,
      "step": 750
    },
    {
      "epoch": 0.04302353557225548,
      "grad_norm": 0.8799572587013245,
      "learning_rate": 3.308693936411421e-05,
      "loss": 1.4654,
      "step": 760
    },
    {
      "epoch": 0.043589634724522,
      "grad_norm": 0.8340188264846802,
      "learning_rate": 3.053416295410026e-05,
      "loss": 1.5339,
      "step": 770
    },
    {
      "epoch": 0.04415573387678852,
      "grad_norm": 1.684546947479248,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 1.5587,
      "step": 780
    },
    {
      "epoch": 0.04472183302905504,
      "grad_norm": 1.205370306968689,
      "learning_rate": 2.5685517452260567e-05,
      "loss": 1.3387,
      "step": 790
    },
    {
      "epoch": 0.04528793218132156,
      "grad_norm": 1.310747504234314,
      "learning_rate": 2.339555568810221e-05,
      "loss": 1.538,
      "step": 800
    },
    {
      "epoch": 0.04585403133358808,
      "grad_norm": 2.451975107192993,
      "learning_rate": 2.119892463932781e-05,
      "loss": 1.6725,
      "step": 810
    },
    {
      "epoch": 0.0464201304858546,
      "grad_norm": 2.1125898361206055,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 1.4746,
      "step": 820
    },
    {
      "epoch": 0.046986229638121116,
      "grad_norm": 0.9616561532020569,
      "learning_rate": 1.7096242744495837e-05,
      "loss": 1.4724,
      "step": 830
    },
    {
      "epoch": 0.047552328790387635,
      "grad_norm": 1.1507611274719238,
      "learning_rate": 1.5195190384357404e-05,
      "loss": 1.4674,
      "step": 840
    },
    {
      "epoch": 0.048118427942654154,
      "grad_norm": 1.4164016246795654,
      "learning_rate": 1.339745962155613e-05,
      "loss": 1.5415,
      "step": 850
    },
    {
      "epoch": 0.048684527094920674,
      "grad_norm": 1.2756904363632202,
      "learning_rate": 1.1705240714107302e-05,
      "loss": 1.4425,
      "step": 860
    },
    {
      "epoch": 0.04925062624718719,
      "grad_norm": 1.3417842388153076,
      "learning_rate": 1.0120595370083318e-05,
      "loss": 1.4356,
      "step": 870
    },
    {
      "epoch": 0.04981672539945371,
      "grad_norm": 2.374289035797119,
      "learning_rate": 8.645454235739903e-06,
      "loss": 1.4808,
      "step": 880
    },
    {
      "epoch": 0.05038282455172023,
      "grad_norm": 3.114461660385132,
      "learning_rate": 7.281614543321269e-06,
      "loss": 1.6683,
      "step": 890
    },
    {
      "epoch": 0.05094892370398675,
      "grad_norm": 1.7116568088531494,
      "learning_rate": 6.030737921409169e-06,
      "loss": 1.6627,
      "step": 900
    },
    {
      "epoch": 0.05151502285625327,
      "grad_norm": 1.497970461845398,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 1.4333,
      "step": 910
    },
    {
      "epoch": 0.05208112200851979,
      "grad_norm": 1.462307333946228,
      "learning_rate": 3.873830406168111e-06,
      "loss": 1.3954,
      "step": 920
    },
    {
      "epoch": 0.05264722116078631,
      "grad_norm": 0.9947201609611511,
      "learning_rate": 2.970427372400353e-06,
      "loss": 1.5113,
      "step": 930
    },
    {
      "epoch": 0.05321332031305283,
      "grad_norm": 1.0234606266021729,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 1.5614,
      "step": 940
    },
    {
      "epoch": 0.05377941946531935,
      "grad_norm": 1.0096086263656616,
      "learning_rate": 1.5192246987791981e-06,
      "loss": 1.4893,
      "step": 950
    },
    {
      "epoch": 0.05434551861758587,
      "grad_norm": 1.2131421566009521,
      "learning_rate": 9.731931258429638e-07,
      "loss": 1.5647,
      "step": 960
    },
    {
      "epoch": 0.05491161776985239,
      "grad_norm": 1.0574455261230469,
      "learning_rate": 5.478104631726711e-07,
      "loss": 1.3718,
      "step": 970
    },
    {
      "epoch": 0.05547771692211891,
      "grad_norm": 1.1654020547866821,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 1.6251,
      "step": 980
    },
    {
      "epoch": 0.05604381607438543,
      "grad_norm": 1.3310041427612305,
      "learning_rate": 6.09172980904238e-08,
      "loss": 1.6333,
      "step": 990
    },
    {
      "epoch": 0.05660991522665195,
      "grad_norm": 1.0006204843521118,
      "learning_rate": 0.0,
      "loss": 1.534,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.297589686332416e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}