{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7859030837004406,
  "eval_steps": 500,
  "global_step": 1520,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011747430249632892,
      "grad_norm": 1.5699902772903442,
      "learning_rate": 0.00027,
      "loss": 3.0983,
      "step": 10
    },
    {
      "epoch": 0.023494860499265784,
      "grad_norm": 1.6029695272445679,
      "learning_rate": 0.00029991523567092526,
      "loss": 2.062,
      "step": 20
    },
    {
      "epoch": 0.03524229074889868,
      "grad_norm": 1.593436360359192,
      "learning_rate": 0.00029962234616583063,
      "loss": 1.2074,
      "step": 30
    },
    {
      "epoch": 0.04698972099853157,
      "grad_norm": 0.5851414799690247,
      "learning_rate": 0.00029912069357315393,
      "loss": 0.888,
      "step": 40
    },
    {
      "epoch": 0.05873715124816446,
      "grad_norm": 0.25992292165756226,
      "learning_rate": 0.0002984109778320875,
      "loss": 0.7685,
      "step": 50
    },
    {
      "epoch": 0.07048458149779736,
      "grad_norm": 0.21082307398319244,
      "learning_rate": 0.00029749418918542057,
      "loss": 0.7096,
      "step": 60
    },
    {
      "epoch": 0.08223201174743025,
      "grad_norm": 0.16843102872371674,
      "learning_rate": 0.0002963716067978866,
      "loss": 0.6901,
      "step": 70
    },
    {
      "epoch": 0.09397944199706314,
      "grad_norm": 0.12076722830533981,
      "learning_rate": 0.000295044796971387,
      "loss": 0.6702,
      "step": 80
    },
    {
      "epoch": 0.10572687224669604,
      "grad_norm": 0.21371866762638092,
      "learning_rate": 0.000293515610959582,
      "loss": 0.6353,
      "step": 90
    },
    {
      "epoch": 0.11747430249632893,
      "grad_norm": 0.13458965718746185,
      "learning_rate": 0.0002917861823848985,
      "loss": 0.6479,
      "step": 100
    },
    {
      "epoch": 0.12922173274596183,
      "grad_norm": 0.265765517950058,
      "learning_rate": 0.0002898589242615568,
      "loss": 0.6244,
      "step": 110
    },
    {
      "epoch": 0.14096916299559473,
      "grad_norm": 0.1473032385110855,
      "learning_rate": 0.0002877365256287728,
      "loss": 0.6217,
      "step": 120
    },
    {
      "epoch": 0.1527165932452276,
      "grad_norm": 0.1591167151927948,
      "learning_rate": 0.00028542194779883047,
      "loss": 0.6022,
      "step": 130
    },
    {
      "epoch": 0.1644640234948605,
      "grad_norm": 0.13270772993564606,
      "learning_rate": 0.00028291842022526133,
      "loss": 0.6098,
      "step": 140
    },
    {
      "epoch": 0.1762114537444934,
      "grad_norm": 0.1444919854402542,
      "learning_rate": 0.0002802294359968954,
      "loss": 0.5971,
      "step": 150
    },
    {
      "epoch": 0.18795888399412627,
      "grad_norm": 0.1571902334690094,
      "learning_rate": 0.0002773587469640702,
      "loss": 0.5937,
      "step": 160
    },
    {
      "epoch": 0.19970631424375918,
      "grad_norm": 0.11585285514593124,
      "learning_rate": 0.0002743103585037989,
      "loss": 0.6054,
      "step": 170
    },
    {
      "epoch": 0.21145374449339208,
      "grad_norm": 0.10303252190351486,
      "learning_rate": 0.0002710885239312008,
      "loss": 0.5708,
      "step": 180
    },
    {
      "epoch": 0.22320117474302498,
      "grad_norm": 0.09355439245700836,
      "learning_rate": 0.00026769773856499167,
      "loss": 0.5806,
      "step": 190
    },
    {
      "epoch": 0.23494860499265785,
      "grad_norm": 0.09288550913333893,
      "learning_rate": 0.0002641427334553158,
      "loss": 0.5747,
      "step": 200
    },
    {
      "epoch": 0.24669603524229075,
      "grad_norm": 0.10875760763883591,
      "learning_rate": 0.00026042846878266795,
      "loss": 0.5879,
      "step": 210
    },
    {
      "epoch": 0.25844346549192365,
      "grad_norm": 0.09756477177143097,
      "learning_rate": 0.0002565601269371192,
      "loss": 0.5852,
      "step": 220
    },
    {
      "epoch": 0.2701908957415565,
      "grad_norm": 0.10926368832588196,
      "learning_rate": 0.0002525431052874997,
      "loss": 0.5605,
      "step": 230
    },
    {
      "epoch": 0.28193832599118945,
      "grad_norm": 0.09802033007144928,
      "learning_rate": 0.00024838300865062966,
      "loss": 0.5738,
      "step": 240
    },
    {
      "epoch": 0.2936857562408223,
      "grad_norm": 0.10284294933080673,
      "learning_rate": 0.00024408564147110443,
      "loss": 0.5847,
      "step": 250
    },
    {
      "epoch": 0.3054331864904552,
      "grad_norm": 0.09890703111886978,
      "learning_rate": 0.00023965699972254602,
      "loss": 0.5736,
      "step": 260
    },
    {
      "epoch": 0.31718061674008813,
      "grad_norm": 0.09091509878635406,
      "learning_rate": 0.00023510326254162027,
      "loss": 0.5577,
      "step": 270
    },
    {
      "epoch": 0.328928046989721,
      "grad_norm": 0.0930003970861435,
      "learning_rate": 0.00023043078360649285,
      "loss": 0.5651,
      "step": 280
    },
    {
      "epoch": 0.3406754772393539,
      "grad_norm": 0.0988878533244133,
      "learning_rate": 0.00022564608227175316,
      "loss": 0.548,
      "step": 290
    },
    {
      "epoch": 0.3524229074889868,
      "grad_norm": 0.11749754101037979,
      "learning_rate": 0.0002207558344721757,
      "loss": 0.5587,
      "step": 300
    },
    {
      "epoch": 0.3641703377386197,
      "grad_norm": 0.10936658829450607,
      "learning_rate": 0.00021576686340800985,
      "loss": 0.5694,
      "step": 310
    },
    {
      "epoch": 0.37591776798825255,
      "grad_norm": 0.15082670748233795,
      "learning_rate": 0.00021068613002479553,
      "loss": 0.5688,
      "step": 320
    },
    {
      "epoch": 0.3876651982378855,
      "grad_norm": 0.10255635529756546,
      "learning_rate": 0.00020552072330098716,
      "loss": 0.56,
      "step": 330
    },
    {
      "epoch": 0.39941262848751835,
      "grad_norm": 0.10504507273435593,
      "learning_rate": 0.0002002778503569374,
      "loss": 0.557,
      "step": 340
    },
    {
      "epoch": 0.4111600587371512,
      "grad_norm": 0.1146383211016655,
      "learning_rate": 0.00019496482639904194,
      "loss": 0.5497,
      "step": 350
    },
    {
      "epoch": 0.42290748898678415,
      "grad_norm": 0.09596443176269531,
      "learning_rate": 0.00018958906451307489,
      "loss": 0.556,
      "step": 360
    },
    {
      "epoch": 0.434654919236417,
      "grad_norm": 0.10395421087741852,
      "learning_rate": 0.0001841580653209565,
      "loss": 0.5634,
      "step": 370
    },
    {
      "epoch": 0.44640234948604995,
      "grad_norm": 0.08797866106033325,
      "learning_rate": 0.00017867940651538483,
      "loss": 0.5544,
      "step": 380
    },
    {
      "epoch": 0.4581497797356828,
      "grad_norm": 0.1006847620010376,
      "learning_rate": 0.0001731607322869329,
      "loss": 0.5562,
      "step": 390
    },
    {
      "epoch": 0.4698972099853157,
      "grad_norm": 0.09849337488412857,
      "learning_rate": 0.00016760974265836331,
      "loss": 0.5477,
      "step": 400
    },
    {
      "epoch": 0.48164464023494863,
      "grad_norm": 0.0988384336233139,
      "learning_rate": 0.00016203418274104278,
      "loss": 0.5459,
      "step": 410
    },
    {
      "epoch": 0.4933920704845815,
      "grad_norm": 0.09625212848186493,
      "learning_rate": 0.0001564418319284454,
      "loss": 0.5516,
      "step": 420
    },
    {
      "epoch": 0.5051395007342144,
      "grad_norm": 0.09705183655023575,
      "learning_rate": 0.00015084049304182347,
      "loss": 0.5375,
      "step": 430
    },
    {
      "epoch": 0.5168869309838473,
      "grad_norm": 0.17180472612380981,
      "learning_rate": 0.00014523798144319027,
      "loss": 0.539,
      "step": 440
    },
    {
      "epoch": 0.5286343612334802,
      "grad_norm": 0.09553302824497223,
      "learning_rate": 0.00013964211413080522,
      "loss": 0.5418,
      "step": 450
    },
    {
      "epoch": 0.540381791483113,
      "grad_norm": 0.10648112744092941,
      "learning_rate": 0.0001340606988323758,
      "loss": 0.5414,
      "step": 460
    },
    {
      "epoch": 0.5521292217327459,
      "grad_norm": 0.09907692670822144,
      "learning_rate": 0.00012850152311119498,
      "loss": 0.5353,
      "step": 470
    },
    {
      "epoch": 0.5638766519823789,
      "grad_norm": 0.11162377148866653,
      "learning_rate": 0.00012297234350041228,
      "loss": 0.528,
      "step": 480
    },
    {
      "epoch": 0.5756240822320118,
      "grad_norm": 0.10550152510404587,
      "learning_rate": 0.00011748087468060128,
      "loss": 0.533,
      "step": 490
    },
    {
      "epoch": 0.5873715124816447,
      "grad_norm": 0.09718377143144608,
      "learning_rate": 0.0001120347787157222,
      "loss": 0.5409,
      "step": 500
    },
    {
      "epoch": 0.5991189427312775,
      "grad_norm": 0.09185861796140671,
      "learning_rate": 0.0001066416543624984,
      "loss": 0.5354,
      "step": 510
    },
    {
      "epoch": 0.6108663729809104,
      "grad_norm": 0.0927920788526535,
      "learning_rate": 0.00010130902646812369,
      "loss": 0.5454,
      "step": 520
    },
    {
      "epoch": 0.6226138032305433,
      "grad_norm": 0.087093785405159,
      "learning_rate": 9.604433547109344e-05,
      "loss": 0.5295,
      "step": 530
    },
    {
      "epoch": 0.6343612334801763,
      "grad_norm": 0.09994326531887054,
      "learning_rate": 9.085492701980751e-05,
      "loss": 0.5322,
      "step": 540
    },
    {
      "epoch": 0.6461086637298091,
      "grad_norm": 0.09507084637880325,
      "learning_rate": 8.574804172343134e-05,
      "loss": 0.5224,
      "step": 550
    },
    {
      "epoch": 0.657856093979442,
      "grad_norm": 0.08571015298366547,
      "learning_rate": 8.07308050493148e-05,
      "loss": 0.5378,
      "step": 560
    },
    {
      "epoch": 0.6696035242290749,
      "grad_norm": 0.08876761794090271,
      "learning_rate": 7.581021738106408e-05,
      "loss": 0.5265,
      "step": 570
    },
    {
      "epoch": 0.6813509544787077,
      "grad_norm": 0.09467241168022156,
      "learning_rate": 7.099314425113907e-05,
      "loss": 0.5392,
      "step": 580
    },
    {
      "epoch": 0.6930983847283406,
      "grad_norm": 0.08804601430892944,
      "learning_rate": 6.628630676160445e-05,
      "loss": 0.5365,
      "step": 590
    },
    {
      "epoch": 0.7048458149779736,
      "grad_norm": 0.08877623081207275,
      "learning_rate": 6.169627220639871e-05,
      "loss": 0.5354,
      "step": 600
    },
    {
      "epoch": 0.7165932452276065,
      "grad_norm": 0.09122662246227264,
      "learning_rate": 5.722944490820774e-05,
      "loss": 0.5356,
      "step": 610
    },
    {
      "epoch": 0.7283406754772394,
      "grad_norm": 0.08744510263204575,
      "learning_rate": 5.289205728272586e-05,
      "loss": 0.5424,
      "step": 620
    },
    {
      "epoch": 0.7400881057268722,
      "grad_norm": 0.08927814662456512,
      "learning_rate": 4.869016114277345e-05,
      "loss": 0.5268,
      "step": 630
    },
    {
      "epoch": 0.7518355359765051,
      "grad_norm": 0.09256933629512787,
      "learning_rate": 4.462961925440341e-05,
      "loss": 0.5414,
      "step": 640
    },
    {
      "epoch": 0.7635829662261381,
      "grad_norm": 0.08703339844942093,
      "learning_rate": 4.071609715677899e-05,
      "loss": 0.5376,
      "step": 650
    },
    {
      "epoch": 0.775330396475771,
      "grad_norm": 0.08876251429319382,
      "learning_rate": 3.695505525723465e-05,
      "loss": 0.5307,
      "step": 660
    },
    {
      "epoch": 0.7870778267254038,
      "grad_norm": 0.08702490478754044,
      "learning_rate": 3.3351741212551595e-05,
      "loss": 0.5307,
      "step": 670
    },
    {
      "epoch": 0.7988252569750367,
      "grad_norm": 0.08601511269807816,
      "learning_rate": 2.9911182607076516e-05,
      "loss": 0.5372,
      "step": 680
    },
    {
      "epoch": 0.8105726872246696,
      "grad_norm": 0.0857272818684578,
      "learning_rate": 2.663817993790021e-05,
      "loss": 0.528,
      "step": 690
    },
    {
      "epoch": 0.8223201174743024,
      "grad_norm": 0.08725214004516602,
      "learning_rate": 2.3537299916883512e-05,
      "loss": 0.5378,
      "step": 700
    },
    {
      "epoch": 0.8340675477239354,
      "grad_norm": 0.0845843032002449,
      "learning_rate": 2.0612869098875988e-05,
      "loss": 0.5389,
      "step": 710
    },
    {
      "epoch": 0.8458149779735683,
      "grad_norm": 0.08480172604322433,
      "learning_rate": 1.786896784501778e-05,
      "loss": 0.5244,
      "step": 720
    },
    {
      "epoch": 0.8575624082232012,
      "grad_norm": 0.09265288710594177,
      "learning_rate": 1.5309424629547164e-05,
      "loss": 0.5403,
      "step": 730
    },
    {
      "epoch": 0.869309838472834,
      "grad_norm": 0.08523637801408768,
      "learning_rate": 1.2937810698057921e-05,
      "loss": 0.5332,
      "step": 740
    },
    {
      "epoch": 0.8810572687224669,
      "grad_norm": 0.08431612700223923,
      "learning_rate": 1.0757435084658694e-05,
      "loss": 0.5198,
      "step": 750
    },
    {
      "epoch": 0.8928046989720999,
      "grad_norm": 0.08998807519674301,
      "learning_rate": 8.771339994987953e-06,
      "loss": 0.5251,
      "step": 760
    },
    {
      "epoch": 0.9045521292217328,
      "grad_norm": 0.08884080499410629,
      "learning_rate": 6.98229656152543e-06,
      "loss": 0.5449,
      "step": 770
    },
    {
      "epoch": 0.9162995594713657,
      "grad_norm": 0.08583056926727295,
      "learning_rate": 5.392800977123047e-06,
      "loss": 0.5264,
      "step": 780
    },
    {
      "epoch": 0.9280469897209985,
      "grad_norm": 0.08824951946735382,
      "learning_rate": 4.005071012149952e-06,
      "loss": 0.5306,
      "step": 790
    },
    {
      "epoch": 0.9397944199706314,
      "grad_norm": 0.08726619184017181,
      "learning_rate": 2.821042920111427e-06,
      "loss": 0.5205,
      "step": 800
    },
    {
      "epoch": 0.9515418502202643,
      "grad_norm": 0.08729498088359833,
      "learning_rate": 1.8423687360584137e-06,
      "loss": 0.5217,
      "step": 810
    },
    {
      "epoch": 0.9632892804698973,
      "grad_norm": 0.08497074991464615,
      "learning_rate": 1.070413971558115e-06,
      "loss": 0.534,
      "step": 820
    },
    {
      "epoch": 0.9750367107195301,
      "grad_norm": 0.08497001975774765,
      "learning_rate": 5.062557094410058e-07,
      "loss": 0.5241,
      "step": 830
    },
    {
      "epoch": 0.986784140969163,
      "grad_norm": 0.08295251429080963,
      "learning_rate": 1.5068110098296338e-07,
      "loss": 0.5337,
      "step": 840
    },
    {
      "epoch": 0.9985315712187959,
      "grad_norm": 0.10860061645507812,
      "learning_rate": 0.00022638651575377874,
      "loss": 0.5227,
      "step": 850
    },
    {
      "epoch": 1.0105726872246696,
      "grad_norm": 0.11796294897794724,
      "learning_rate": 0.00022478592280680777,
      "loss": 0.588,
      "step": 860
    },
    {
      "epoch": 1.0223201174743024,
      "grad_norm": 0.1594405323266983,
      "learning_rate": 0.0002231739162937319,
      "loss": 0.5307,
      "step": 870
    },
    {
      "epoch": 1.0340675477239354,
      "grad_norm": 0.10787333548069,
      "learning_rate": 0.0002215507422333499,
      "loss": 0.5359,
      "step": 880
    },
    {
      "epoch": 1.0458149779735684,
      "grad_norm": 0.10763130336999893,
      "learning_rate": 0.0002199166483488127,
      "loss": 0.5407,
      "step": 890
    },
    {
      "epoch": 1.0575624082232011,
      "grad_norm": 0.13658902049064636,
      "learning_rate": 0.00021827188402981652,
      "loss": 0.5255,
      "step": 900
    },
    {
      "epoch": 1.0693098384728341,
      "grad_norm": 0.10522827506065369,
      "learning_rate": 0.00021661670029454207,
      "loss": 0.5276,
      "step": 910
    },
    {
      "epoch": 1.0810572687224669,
      "grad_norm": 0.1422538459300995,
      "learning_rate": 0.0002149513497513448,
      "loss": 0.5245,
      "step": 920
    },
    {
      "epoch": 1.0928046989720999,
      "grad_norm": 0.10326780378818512,
      "learning_rate": 0.00021327608656020305,
      "loss": 0.5294,
      "step": 930
    },
    {
      "epoch": 1.1045521292217328,
      "grad_norm": 0.11100132018327713,
      "learning_rate": 0.00021159116639392868,
      "loss": 0.52,
      "step": 940
    },
    {
      "epoch": 1.1162995594713656,
      "grad_norm": 0.09583411365747452,
      "learning_rate": 0.00020989684639914738,
      "loss": 0.5247,
      "step": 950
    },
    {
      "epoch": 1.1280469897209986,
      "grad_norm": 0.10812857002019882,
      "learning_rate": 0.00020819338515705378,
      "loss": 0.5236,
      "step": 960
    },
    {
      "epoch": 1.1397944199706314,
      "grad_norm": 0.12208293378353119,
      "learning_rate": 0.00020648104264394784,
      "loss": 0.5217,
      "step": 970
    },
    {
      "epoch": 1.1515418502202643,
      "grad_norm": 0.11540035158395767,
      "learning_rate": 0.00020476008019155794,
      "loss": 0.5387,
      "step": 980
    },
    {
      "epoch": 1.1632892804698973,
      "grad_norm": 0.10755149275064468,
      "learning_rate": 0.00020303076044715738,
      "loss": 0.5057,
      "step": 990
    },
    {
      "epoch": 1.17503671071953,
      "grad_norm": 0.10145018994808197,
      "learning_rate": 0.0002012933473334804,
      "loss": 0.5202,
      "step": 1000
    },
    {
      "epoch": 1.186784140969163,
      "grad_norm": 0.11095395684242249,
      "learning_rate": 0.00019954810600844277,
      "loss": 0.5314,
      "step": 1010
    },
    {
      "epoch": 1.1985315712187958,
      "grad_norm": 0.097834512591362,
      "learning_rate": 0.00019779530282467456,
      "loss": 0.5178,
      "step": 1020
    },
    {
      "epoch": 1.2102790014684288,
      "grad_norm": 0.09915532171726227,
      "learning_rate": 0.00019603520528887027,
      "loss": 0.5205,
      "step": 1030
    },
    {
      "epoch": 1.2220264317180616,
      "grad_norm": 0.1107698306441307,
      "learning_rate": 0.00019426808202096298,
      "loss": 0.5268,
      "step": 1040
    },
    {
      "epoch": 1.2337738619676946,
      "grad_norm": 0.11669424921274185,
      "learning_rate": 0.0001924942027131284,
      "loss": 0.53,
      "step": 1050
    },
    {
      "epoch": 1.2455212922173275,
      "grad_norm": 0.11590099334716797,
      "learning_rate": 0.00019071383808862534,
      "loss": 0.5085,
      "step": 1060
    },
    {
      "epoch": 1.2572687224669603,
      "grad_norm": 0.1027660220861435,
      "learning_rate": 0.00018892725986047917,
      "loss": 0.5193,
      "step": 1070
    },
    {
      "epoch": 1.2690161527165933,
      "grad_norm": 0.09436651319265366,
      "learning_rate": 0.00018713474069001354,
      "loss": 0.5002,
      "step": 1080
    },
    {
      "epoch": 1.280763582966226,
      "grad_norm": 0.11379121989011765,
      "learning_rate": 0.00018533655414523808,
      "loss": 0.5212,
      "step": 1090
    },
    {
      "epoch": 1.292511013215859,
      "grad_norm": 0.09809733927249908,
      "learning_rate": 0.00018353297465909717,
      "loss": 0.5124,
      "step": 1100
    },
    {
      "epoch": 1.3042584434654918,
      "grad_norm": 0.1027405858039856,
      "learning_rate": 0.00018172427748758713,
      "loss": 0.5177,
      "step": 1110
    },
    {
      "epoch": 1.3160058737151248,
      "grad_norm": 0.10089763253927231,
      "learning_rate": 0.0001799107386677475,
      "loss": 0.4969,
      "step": 1120
    },
    {
      "epoch": 1.3277533039647578,
      "grad_norm": 0.09994267672300339,
      "learning_rate": 0.0001780926349755332,
      "loss": 0.516,
      "step": 1130
    },
    {
      "epoch": 1.3395007342143905,
      "grad_norm": 0.10974204540252686,
      "learning_rate": 0.00017627024388357416,
      "loss": 0.5035,
      "step": 1140
    },
    {
      "epoch": 1.3512481644640235,
      "grad_norm": 0.09834876656532288,
      "learning_rate": 0.00017444384351882817,
      "loss": 0.5121,
      "step": 1150
    },
    {
      "epoch": 1.3629955947136563,
      "grad_norm": 0.09756341576576233,
      "learning_rate": 0.0001726137126201342,
      "loss": 0.5289,
      "step": 1160
    },
    {
      "epoch": 1.3747430249632893,
      "grad_norm": 0.09796813875436783,
      "learning_rate": 0.0001707801304956723,
      "loss": 0.5054,
      "step": 1170
    },
    {
      "epoch": 1.3864904552129222,
      "grad_norm": 0.09447074681520462,
      "learning_rate": 0.00016894337698033663,
      "loss": 0.5067,
      "step": 1180
    },
    {
      "epoch": 1.398237885462555,
      "grad_norm": 0.10086411237716675,
      "learning_rate": 0.00016710373239302772,
      "loss": 0.5191,
      "step": 1190
    },
    {
      "epoch": 1.409985315712188,
      "grad_norm": 0.10048293322324753,
      "learning_rate": 0.00016526147749387155,
      "loss": 0.5073,
      "step": 1200
    },
    {
      "epoch": 1.4217327459618208,
      "grad_norm": 0.09760654717683792,
      "learning_rate": 0.00016341689344137088,
      "loss": 0.5254,
      "step": 1210
    },
    {
      "epoch": 1.4334801762114537,
      "grad_norm": 0.09539603441953659,
      "learning_rate": 0.00016157026174949538,
      "loss": 0.5116,
      "step": 1220
    },
    {
      "epoch": 1.4452276064610867,
      "grad_norm": 0.09473835676908493,
      "learning_rate": 0.00015972186424471855,
      "loss": 0.5011,
      "step": 1230
    },
    {
      "epoch": 1.4569750367107195,
      "grad_norm": 0.10754521191120148,
      "learning_rate": 0.0001578719830230061,
      "loss": 0.5116,
      "step": 1240
    },
    {
      "epoch": 1.4687224669603525,
      "grad_norm": 0.09202101826667786,
      "learning_rate": 0.00015602090040676324,
      "loss": 0.4964,
      "step": 1250
    },
    {
      "epoch": 1.4804698972099852,
      "grad_norm": 0.1060076653957367,
      "learning_rate": 0.00015416889890174792,
      "loss": 0.505,
      "step": 1260
    },
    {
      "epoch": 1.4922173274596182,
      "grad_norm": 0.10437231510877609,
      "learning_rate": 0.0001523162611539557,
      "loss": 0.5065,
      "step": 1270
    },
    {
      "epoch": 1.5039647577092512,
      "grad_norm": 0.14216673374176025,
      "learning_rate": 0.0001504632699064833,
      "loss": 0.5221,
      "step": 1280
    },
    {
      "epoch": 1.515712187958884,
      "grad_norm": 0.10676784813404083,
      "learning_rate": 0.00014861020795637716,
      "loss": 0.5057,
      "step": 1290
    },
    {
      "epoch": 1.5274596182085167,
      "grad_norm": 0.09706170856952667,
      "learning_rate": 0.00014675735811147444,
      "loss": 0.5054,
      "step": 1300
    },
    {
      "epoch": 1.5392070484581497,
      "grad_norm": 0.09725037962198257,
      "learning_rate": 0.00014490500314724117,
      "loss": 0.5083,
      "step": 1310
    },
    {
      "epoch": 1.5509544787077827,
      "grad_norm": 0.09938537329435349,
      "learning_rate": 0.0001430534257636167,
      "loss": 0.5157,
      "step": 1320
    },
    {
      "epoch": 1.5627019089574157,
      "grad_norm": 0.10331527143716812,
      "learning_rate": 0.00014120290854186863,
      "loss": 0.5151,
      "step": 1330
    },
    {
      "epoch": 1.5744493392070484,
      "grad_norm": 0.09629181027412415,
      "learning_rate": 0.00013935373390146634,
      "loss": 0.507,
      "step": 1340
    },
    {
      "epoch": 1.5861967694566812,
      "grad_norm": 0.10021866858005524,
      "learning_rate": 0.00013750618405697912,
      "loss": 0.4973,
      "step": 1350
    },
    {
      "epoch": 1.5979441997063142,
      "grad_norm": 0.10414128750562668,
      "learning_rate": 0.0001356605409750058,
      "loss": 0.5033,
      "step": 1360
    },
    {
      "epoch": 1.6096916299559472,
      "grad_norm": 0.11649428308010101,
      "learning_rate": 0.0001340013252947644,
      "loss": 0.5259,
      "step": 1370
    },
    {
      "epoch": 1.6214390602055802,
      "grad_norm": 0.09817013144493103,
      "learning_rate": 0.00013216008080267535,
      "loss": 0.5164,
      "step": 1380
    },
    {
      "epoch": 1.633186490455213,
      "grad_norm": 0.10239794105291367,
      "learning_rate": 0.0001303215589766901,
      "loss": 0.5011,
      "step": 1390
    },
    {
      "epoch": 1.6449339207048457,
      "grad_norm": 0.09663370996713638,
      "learning_rate": 0.00012848604040558272,
      "loss": 0.5096,
      "step": 1400
    },
    {
      "epoch": 1.6566813509544787,
      "grad_norm": 0.10224564373493195,
      "learning_rate": 0.0001266538052197809,
      "loss": 0.5055,
      "step": 1410
    },
    {
      "epoch": 1.6684287812041116,
      "grad_norm": 0.09594379365444183,
      "learning_rate": 0.00012482513304861364,
      "loss": 0.5051,
      "step": 1420
    },
    {
      "epoch": 1.6801762114537446,
      "grad_norm": 0.10660111159086227,
      "learning_rate": 0.00012300030297763518,
      "loss": 0.5076,
      "step": 1430
    },
    {
      "epoch": 1.6919236417033774,
      "grad_norm": 0.10405760258436203,
      "learning_rate": 0.0001211795935060317,
      "loss": 0.5089,
      "step": 1440
    },
    {
      "epoch": 1.7036710719530102,
      "grad_norm": 0.10634606331586838,
      "learning_rate": 0.00011936328250411801,
      "loss": 0.504,
      "step": 1450
    },
    {
      "epoch": 1.7154185022026431,
      "grad_norm": 0.1119045689702034,
      "learning_rate": 0.00011755164717092988,
      "loss": 0.5105,
      "step": 1460
    },
    {
      "epoch": 1.7271659324522761,
      "grad_norm": 0.10727712512016296,
      "learning_rate": 0.00011574496399191876,
      "loss": 0.5185,
      "step": 1470
    },
    {
      "epoch": 1.738913362701909,
      "grad_norm": 0.09993914514780045,
      "learning_rate": 0.00011394350869675567,
      "loss": 0.5004,
      "step": 1480
    },
    {
      "epoch": 1.7506607929515419,
      "grad_norm": 0.10068885236978531,
      "learning_rate": 0.00011214755621725042,
      "loss": 0.5091,
      "step": 1490
    },
    {
      "epoch": 1.7624082232011746,
      "grad_norm": 0.09819114953279495,
      "learning_rate": 0.00011035738064539201,
      "loss": 0.496,
      "step": 1500
    },
    {
      "epoch": 1.7741556534508076,
      "grad_norm": 0.09929963946342468,
      "learning_rate": 0.00010857325519151842,
      "loss": 0.5033,
      "step": 1510
    },
    {
      "epoch": 1.7859030837004406,
      "grad_norm": 0.09696891903877258,
      "learning_rate": 0.00010679545214261935,
      "loss": 0.5133,
      "step": 1520
    }
  ],
  "logging_steps": 10,
  "max_steps": 2553,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.22152020339327e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}