{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9609397944199705,
  "eval_steps": 500,
  "global_step": 2520,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011747430249632892,
      "grad_norm": 1.5699902772903442,
      "learning_rate": 0.00027,
      "loss": 3.0983,
      "step": 10
    },
    {
      "epoch": 0.023494860499265784,
      "grad_norm": 1.6029695272445679,
      "learning_rate": 0.00029991523567092526,
      "loss": 2.062,
      "step": 20
    },
    {
      "epoch": 0.03524229074889868,
      "grad_norm": 1.593436360359192,
      "learning_rate": 0.00029962234616583063,
      "loss": 1.2074,
      "step": 30
    },
    {
      "epoch": 0.04698972099853157,
      "grad_norm": 0.5851414799690247,
      "learning_rate": 0.00029912069357315393,
      "loss": 0.888,
      "step": 40
    },
    {
      "epoch": 0.05873715124816446,
      "grad_norm": 0.25992292165756226,
      "learning_rate": 0.0002984109778320875,
      "loss": 0.7685,
      "step": 50
    },
    {
      "epoch": 0.07048458149779736,
      "grad_norm": 0.21082307398319244,
      "learning_rate": 0.00029749418918542057,
      "loss": 0.7096,
      "step": 60
    },
    {
      "epoch": 0.08223201174743025,
      "grad_norm": 0.16843102872371674,
      "learning_rate": 0.0002963716067978866,
      "loss": 0.6901,
      "step": 70
    },
    {
      "epoch": 0.09397944199706314,
      "grad_norm": 0.12076722830533981,
      "learning_rate": 0.000295044796971387,
      "loss": 0.6702,
      "step": 80
    },
    {
      "epoch": 0.10572687224669604,
      "grad_norm": 0.21371866762638092,
      "learning_rate": 0.000293515610959582,
      "loss": 0.6353,
      "step": 90
    },
    {
      "epoch": 0.11747430249632893,
      "grad_norm": 0.13458965718746185,
      "learning_rate": 0.0002917861823848985,
      "loss": 0.6479,
      "step": 100
    },
    {
      "epoch": 0.12922173274596183,
      "grad_norm": 0.265765517950058,
      "learning_rate": 0.0002898589242615568,
      "loss": 0.6244,
      "step": 110
    },
    {
      "epoch": 0.14096916299559473,
      "grad_norm": 0.1473032385110855,
      "learning_rate": 0.0002877365256287728,
      "loss": 0.6217,
      "step": 120
    },
    {
      "epoch": 0.1527165932452276,
      "grad_norm": 0.1591167151927948,
      "learning_rate": 0.00028542194779883047,
      "loss": 0.6022,
      "step": 130
    },
    {
      "epoch": 0.1644640234948605,
      "grad_norm": 0.13270772993564606,
      "learning_rate": 0.00028291842022526133,
      "loss": 0.6098,
      "step": 140
    },
    {
      "epoch": 0.1762114537444934,
      "grad_norm": 0.1444919854402542,
      "learning_rate": 0.0002802294359968954,
      "loss": 0.5971,
      "step": 150
    },
    {
      "epoch": 0.18795888399412627,
      "grad_norm": 0.1571902334690094,
      "learning_rate": 0.0002773587469640702,
      "loss": 0.5937,
      "step": 160
    },
    {
      "epoch": 0.19970631424375918,
      "grad_norm": 0.11585285514593124,
      "learning_rate": 0.0002743103585037989,
      "loss": 0.6054,
      "step": 170
    },
    {
      "epoch": 0.21145374449339208,
      "grad_norm": 0.10303252190351486,
      "learning_rate": 0.0002710885239312008,
      "loss": 0.5708,
      "step": 180
    },
    {
      "epoch": 0.22320117474302498,
      "grad_norm": 0.09355439245700836,
      "learning_rate": 0.00026769773856499167,
      "loss": 0.5806,
      "step": 190
    },
    {
      "epoch": 0.23494860499265785,
      "grad_norm": 0.09288550913333893,
      "learning_rate": 0.0002641427334553158,
      "loss": 0.5747,
      "step": 200
    },
    {
      "epoch": 0.24669603524229075,
      "grad_norm": 0.10875760763883591,
      "learning_rate": 0.00026042846878266795,
      "loss": 0.5879,
      "step": 210
    },
    {
      "epoch": 0.25844346549192365,
      "grad_norm": 0.09756477177143097,
      "learning_rate": 0.0002565601269371192,
      "loss": 0.5852,
      "step": 220
    },
    {
      "epoch": 0.2701908957415565,
      "grad_norm": 0.10926368832588196,
      "learning_rate": 0.0002525431052874997,
      "loss": 0.5605,
      "step": 230
    },
    {
      "epoch": 0.28193832599118945,
      "grad_norm": 0.09802033007144928,
      "learning_rate": 0.00024838300865062966,
      "loss": 0.5738,
      "step": 240
    },
    {
      "epoch": 0.2936857562408223,
      "grad_norm": 0.10284294933080673,
      "learning_rate": 0.00024408564147110443,
      "loss": 0.5847,
      "step": 250
    },
    {
      "epoch": 0.3054331864904552,
      "grad_norm": 0.09890703111886978,
      "learning_rate": 0.00023965699972254602,
      "loss": 0.5736,
      "step": 260
    },
    {
      "epoch": 0.31718061674008813,
      "grad_norm": 0.09091509878635406,
      "learning_rate": 0.00023510326254162027,
      "loss": 0.5577,
      "step": 270
    },
    {
      "epoch": 0.328928046989721,
      "grad_norm": 0.0930003970861435,
      "learning_rate": 0.00023043078360649285,
      "loss": 0.5651,
      "step": 280
    },
    {
      "epoch": 0.3406754772393539,
      "grad_norm": 0.0988878533244133,
      "learning_rate": 0.00022564608227175316,
      "loss": 0.548,
      "step": 290
    },
    {
      "epoch": 0.3524229074889868,
      "grad_norm": 0.11749754101037979,
      "learning_rate": 0.0002207558344721757,
      "loss": 0.5587,
      "step": 300
    },
    {
      "epoch": 0.3641703377386197,
      "grad_norm": 0.10936658829450607,
      "learning_rate": 0.00021576686340800985,
      "loss": 0.5694,
      "step": 310
    },
    {
      "epoch": 0.37591776798825255,
      "grad_norm": 0.15082670748233795,
      "learning_rate": 0.00021068613002479553,
      "loss": 0.5688,
      "step": 320
    },
    {
      "epoch": 0.3876651982378855,
      "grad_norm": 0.10255635529756546,
      "learning_rate": 0.00020552072330098716,
      "loss": 0.56,
      "step": 330
    },
    {
      "epoch": 0.39941262848751835,
      "grad_norm": 0.10504507273435593,
      "learning_rate": 0.0002002778503569374,
      "loss": 0.557,
      "step": 340
    },
    {
      "epoch": 0.4111600587371512,
      "grad_norm": 0.1146383211016655,
      "learning_rate": 0.00019496482639904194,
      "loss": 0.5497,
      "step": 350
    },
    {
      "epoch": 0.42290748898678415,
      "grad_norm": 0.09596443176269531,
      "learning_rate": 0.00018958906451307489,
      "loss": 0.556,
      "step": 360
    },
    {
      "epoch": 0.434654919236417,
      "grad_norm": 0.10395421087741852,
      "learning_rate": 0.0001841580653209565,
      "loss": 0.5634,
      "step": 370
    },
    {
      "epoch": 0.44640234948604995,
      "grad_norm": 0.08797866106033325,
      "learning_rate": 0.00017867940651538483,
      "loss": 0.5544,
      "step": 380
    },
    {
      "epoch": 0.4581497797356828,
      "grad_norm": 0.1006847620010376,
      "learning_rate": 0.0001731607322869329,
      "loss": 0.5562,
      "step": 390
    },
    {
      "epoch": 0.4698972099853157,
      "grad_norm": 0.09849337488412857,
      "learning_rate": 0.00016760974265836331,
      "loss": 0.5477,
      "step": 400
    },
    {
      "epoch": 0.48164464023494863,
      "grad_norm": 0.0988384336233139,
      "learning_rate": 0.00016203418274104278,
      "loss": 0.5459,
      "step": 410
    },
    {
      "epoch": 0.4933920704845815,
      "grad_norm": 0.09625212848186493,
      "learning_rate": 0.0001564418319284454,
      "loss": 0.5516,
      "step": 420
    },
    {
      "epoch": 0.5051395007342144,
      "grad_norm": 0.09705183655023575,
      "learning_rate": 0.00015084049304182347,
      "loss": 0.5375,
      "step": 430
    },
    {
      "epoch": 0.5168869309838473,
      "grad_norm": 0.17180472612380981,
      "learning_rate": 0.00014523798144319027,
      "loss": 0.539,
      "step": 440
    },
    {
      "epoch": 0.5286343612334802,
      "grad_norm": 0.09553302824497223,
      "learning_rate": 0.00013964211413080522,
      "loss": 0.5418,
      "step": 450
    },
    {
      "epoch": 0.540381791483113,
      "grad_norm": 0.10648112744092941,
      "learning_rate": 0.0001340606988323758,
      "loss": 0.5414,
      "step": 460
    },
    {
      "epoch": 0.5521292217327459,
      "grad_norm": 0.09907692670822144,
      "learning_rate": 0.00012850152311119498,
      "loss": 0.5353,
      "step": 470
    },
    {
      "epoch": 0.5638766519823789,
      "grad_norm": 0.11162377148866653,
      "learning_rate": 0.00012297234350041228,
      "loss": 0.528,
      "step": 480
    },
    {
      "epoch": 0.5756240822320118,
      "grad_norm": 0.10550152510404587,
      "learning_rate": 0.00011748087468060128,
      "loss": 0.533,
      "step": 490
    },
    {
      "epoch": 0.5873715124816447,
      "grad_norm": 0.09718377143144608,
      "learning_rate": 0.0001120347787157222,
      "loss": 0.5409,
      "step": 500
    },
    {
      "epoch": 0.5991189427312775,
      "grad_norm": 0.09185861796140671,
      "learning_rate": 0.0001066416543624984,
      "loss": 0.5354,
      "step": 510
    },
    {
      "epoch": 0.6108663729809104,
      "grad_norm": 0.0927920788526535,
      "learning_rate": 0.00010130902646812369,
      "loss": 0.5454,
      "step": 520
    },
    {
      "epoch": 0.6226138032305433,
      "grad_norm": 0.087093785405159,
      "learning_rate": 9.604433547109344e-05,
      "loss": 0.5295,
      "step": 530
    },
    {
      "epoch": 0.6343612334801763,
      "grad_norm": 0.09994326531887054,
      "learning_rate": 9.085492701980751e-05,
      "loss": 0.5322,
      "step": 540
    },
    {
      "epoch": 0.6461086637298091,
      "grad_norm": 0.09507084637880325,
      "learning_rate": 8.574804172343134e-05,
      "loss": 0.5224,
      "step": 550
    },
    {
      "epoch": 0.657856093979442,
      "grad_norm": 0.08571015298366547,
      "learning_rate": 8.07308050493148e-05,
      "loss": 0.5378,
      "step": 560
    },
    {
      "epoch": 0.6696035242290749,
      "grad_norm": 0.08876761794090271,
      "learning_rate": 7.581021738106408e-05,
      "loss": 0.5265,
      "step": 570
    },
    {
      "epoch": 0.6813509544787077,
      "grad_norm": 0.09467241168022156,
      "learning_rate": 7.099314425113907e-05,
      "loss": 0.5392,
      "step": 580
    },
    {
      "epoch": 0.6930983847283406,
      "grad_norm": 0.08804601430892944,
      "learning_rate": 6.628630676160445e-05,
      "loss": 0.5365,
      "step": 590
    },
    {
      "epoch": 0.7048458149779736,
      "grad_norm": 0.08877623081207275,
      "learning_rate": 6.169627220639871e-05,
      "loss": 0.5354,
      "step": 600
    },
    {
      "epoch": 0.7165932452276065,
      "grad_norm": 0.09122662246227264,
      "learning_rate": 5.722944490820774e-05,
      "loss": 0.5356,
      "step": 610
    },
    {
      "epoch": 0.7283406754772394,
      "grad_norm": 0.08744510263204575,
      "learning_rate": 5.289205728272586e-05,
      "loss": 0.5424,
      "step": 620
    },
    {
      "epoch": 0.7400881057268722,
      "grad_norm": 0.08927814662456512,
      "learning_rate": 4.869016114277345e-05,
      "loss": 0.5268,
      "step": 630
    },
    {
      "epoch": 0.7518355359765051,
      "grad_norm": 0.09256933629512787,
      "learning_rate": 4.462961925440341e-05,
      "loss": 0.5414,
      "step": 640
    },
    {
      "epoch": 0.7635829662261381,
      "grad_norm": 0.08703339844942093,
      "learning_rate": 4.071609715677899e-05,
      "loss": 0.5376,
      "step": 650
    },
    {
      "epoch": 0.775330396475771,
      "grad_norm": 0.08876251429319382,
      "learning_rate": 3.695505525723465e-05,
      "loss": 0.5307,
      "step": 660
    },
    {
      "epoch": 0.7870778267254038,
      "grad_norm": 0.08702490478754044,
      "learning_rate": 3.3351741212551595e-05,
      "loss": 0.5307,
      "step": 670
    },
    {
      "epoch": 0.7988252569750367,
      "grad_norm": 0.08601511269807816,
      "learning_rate": 2.9911182607076516e-05,
      "loss": 0.5372,
      "step": 680
    },
    {
      "epoch": 0.8105726872246696,
      "grad_norm": 0.0857272818684578,
      "learning_rate": 2.663817993790021e-05,
      "loss": 0.528,
      "step": 690
    },
    {
      "epoch": 0.8223201174743024,
      "grad_norm": 0.08725214004516602,
      "learning_rate": 2.3537299916883512e-05,
      "loss": 0.5378,
      "step": 700
    },
    {
      "epoch": 0.8340675477239354,
      "grad_norm": 0.0845843032002449,
      "learning_rate": 2.0612869098875988e-05,
      "loss": 0.5389,
      "step": 710
    },
    {
      "epoch": 0.8458149779735683,
      "grad_norm": 0.08480172604322433,
      "learning_rate": 1.786896784501778e-05,
      "loss": 0.5244,
      "step": 720
    },
    {
      "epoch": 0.8575624082232012,
      "grad_norm": 0.09265288710594177,
      "learning_rate": 1.5309424629547164e-05,
      "loss": 0.5403,
      "step": 730
    },
    {
      "epoch": 0.869309838472834,
      "grad_norm": 0.08523637801408768,
      "learning_rate": 1.2937810698057921e-05,
      "loss": 0.5332,
      "step": 740
    },
    {
      "epoch": 0.8810572687224669,
      "grad_norm": 0.08431612700223923,
      "learning_rate": 1.0757435084658694e-05,
      "loss": 0.5198,
      "step": 750
    },
    {
      "epoch": 0.8928046989720999,
      "grad_norm": 0.08998807519674301,
      "learning_rate": 8.771339994987953e-06,
      "loss": 0.5251,
      "step": 760
    },
    {
      "epoch": 0.9045521292217328,
      "grad_norm": 0.08884080499410629,
      "learning_rate": 6.98229656152543e-06,
      "loss": 0.5449,
      "step": 770
    },
    {
      "epoch": 0.9162995594713657,
      "grad_norm": 0.08583056926727295,
      "learning_rate": 5.392800977123047e-06,
      "loss": 0.5264,
      "step": 780
    },
    {
      "epoch": 0.9280469897209985,
      "grad_norm": 0.08824951946735382,
      "learning_rate": 4.005071012149952e-06,
      "loss": 0.5306,
      "step": 790
    },
    {
      "epoch": 0.9397944199706314,
      "grad_norm": 0.08726619184017181,
      "learning_rate": 2.821042920111427e-06,
      "loss": 0.5205,
      "step": 800
    },
    {
      "epoch": 0.9515418502202643,
      "grad_norm": 0.08729498088359833,
      "learning_rate": 1.8423687360584137e-06,
      "loss": 0.5217,
      "step": 810
    },
    {
      "epoch": 0.9632892804698973,
      "grad_norm": 0.08497074991464615,
      "learning_rate": 1.070413971558115e-06,
      "loss": 0.534,
      "step": 820
    },
    {
      "epoch": 0.9750367107195301,
      "grad_norm": 0.08497001975774765,
      "learning_rate": 5.062557094410058e-07,
      "loss": 0.5241,
      "step": 830
    },
    {
      "epoch": 0.986784140969163,
      "grad_norm": 0.08295251429080963,
      "learning_rate": 1.5068110098296338e-07,
      "loss": 0.5337,
      "step": 840
    },
    {
      "epoch": 0.9985315712187959,
      "grad_norm": 0.10860061645507812,
      "learning_rate": 0.00022638651575377874,
      "loss": 0.5227,
      "step": 850
    },
    {
      "epoch": 1.0105726872246696,
      "grad_norm": 0.11796294897794724,
      "learning_rate": 0.00022478592280680777,
      "loss": 0.588,
      "step": 860
    },
    {
      "epoch": 1.0223201174743024,
      "grad_norm": 0.1594405323266983,
      "learning_rate": 0.0002231739162937319,
      "loss": 0.5307,
      "step": 870
    },
    {
      "epoch": 1.0340675477239354,
      "grad_norm": 0.10787333548069,
      "learning_rate": 0.0002215507422333499,
      "loss": 0.5359,
      "step": 880
    },
    {
      "epoch": 1.0458149779735684,
      "grad_norm": 0.10763130336999893,
      "learning_rate": 0.0002199166483488127,
      "loss": 0.5407,
      "step": 890
    },
    {
      "epoch": 1.0575624082232011,
      "grad_norm": 0.13658902049064636,
      "learning_rate": 0.00021827188402981652,
      "loss": 0.5255,
      "step": 900
    },
    {
      "epoch": 1.0693098384728341,
      "grad_norm": 0.10522827506065369,
      "learning_rate": 0.00021661670029454207,
      "loss": 0.5276,
      "step": 910
    },
    {
      "epoch": 1.0810572687224669,
      "grad_norm": 0.1422538459300995,
      "learning_rate": 0.0002149513497513448,
      "loss": 0.5245,
      "step": 920
    },
    {
      "epoch": 1.0928046989720999,
      "grad_norm": 0.10326780378818512,
      "learning_rate": 0.00021327608656020305,
      "loss": 0.5294,
      "step": 930
    },
    {
      "epoch": 1.1045521292217328,
      "grad_norm": 0.11100132018327713,
      "learning_rate": 0.00021159116639392868,
      "loss": 0.52,
      "step": 940
    },
    {
      "epoch": 1.1162995594713656,
      "grad_norm": 0.09583411365747452,
      "learning_rate": 0.00020989684639914738,
      "loss": 0.5247,
      "step": 950
    },
    {
      "epoch": 1.1280469897209986,
      "grad_norm": 0.10812857002019882,
      "learning_rate": 0.00020819338515705378,
      "loss": 0.5236,
      "step": 960
    },
    {
      "epoch": 1.1397944199706314,
      "grad_norm": 0.12208293378353119,
      "learning_rate": 0.00020648104264394784,
      "loss": 0.5217,
      "step": 970
    },
    {
      "epoch": 1.1515418502202643,
      "grad_norm": 0.11540035158395767,
      "learning_rate": 0.00020476008019155794,
      "loss": 0.5387,
      "step": 980
    },
    {
      "epoch": 1.1632892804698973,
      "grad_norm": 0.10755149275064468,
      "learning_rate": 0.00020303076044715738,
      "loss": 0.5057,
      "step": 990
    },
    {
      "epoch": 1.17503671071953,
      "grad_norm": 0.10145018994808197,
      "learning_rate": 0.0002012933473334804,
      "loss": 0.5202,
      "step": 1000
    },
    {
      "epoch": 1.186784140969163,
      "grad_norm": 0.11095395684242249,
      "learning_rate": 0.00019954810600844277,
      "loss": 0.5314,
      "step": 1010
    },
    {
      "epoch": 1.1985315712187958,
      "grad_norm": 0.097834512591362,
      "learning_rate": 0.00019779530282467456,
      "loss": 0.5178,
      "step": 1020
    },
    {
      "epoch": 1.2102790014684288,
      "grad_norm": 0.09915532171726227,
      "learning_rate": 0.00019603520528887027,
      "loss": 0.5205,
      "step": 1030
    },
    {
      "epoch": 1.2220264317180616,
      "grad_norm": 0.1107698306441307,
      "learning_rate": 0.00019426808202096298,
      "loss": 0.5268,
      "step": 1040
    },
    {
      "epoch": 1.2337738619676946,
      "grad_norm": 0.11669424921274185,
      "learning_rate": 0.0001924942027131284,
      "loss": 0.53,
      "step": 1050
    },
    {
      "epoch": 1.2455212922173275,
      "grad_norm": 0.11590099334716797,
      "learning_rate": 0.00019071383808862534,
      "loss": 0.5085,
      "step": 1060
    },
    {
      "epoch": 1.2572687224669603,
      "grad_norm": 0.1027660220861435,
      "learning_rate": 0.00018892725986047917,
      "loss": 0.5193,
      "step": 1070
    },
    {
      "epoch": 1.2690161527165933,
      "grad_norm": 0.09436651319265366,
      "learning_rate": 0.00018713474069001354,
      "loss": 0.5002,
      "step": 1080
    },
    {
      "epoch": 1.280763582966226,
      "grad_norm": 0.11379121989011765,
      "learning_rate": 0.00018533655414523808,
      "loss": 0.5212,
      "step": 1090
    },
    {
      "epoch": 1.292511013215859,
      "grad_norm": 0.09809733927249908,
      "learning_rate": 0.00018353297465909717,
      "loss": 0.5124,
      "step": 1100
    },
    {
      "epoch": 1.3042584434654918,
      "grad_norm": 0.1027405858039856,
      "learning_rate": 0.00018172427748758713,
      "loss": 0.5177,
      "step": 1110
    },
    {
      "epoch": 1.3160058737151248,
      "grad_norm": 0.10089763253927231,
      "learning_rate": 0.0001799107386677475,
      "loss": 0.4969,
      "step": 1120
    },
    {
      "epoch": 1.3277533039647578,
      "grad_norm": 0.09994267672300339,
      "learning_rate": 0.0001780926349755332,
      "loss": 0.516,
      "step": 1130
    },
    {
      "epoch": 1.3395007342143905,
      "grad_norm": 0.10974204540252686,
      "learning_rate": 0.00017627024388357416,
      "loss": 0.5035,
      "step": 1140
    },
    {
      "epoch": 1.3512481644640235,
      "grad_norm": 0.09834876656532288,
      "learning_rate": 0.00017444384351882817,
      "loss": 0.5121,
      "step": 1150
    },
    {
      "epoch": 1.3629955947136563,
      "grad_norm": 0.09756341576576233,
      "learning_rate": 0.0001726137126201342,
      "loss": 0.5289,
      "step": 1160
    },
    {
      "epoch": 1.3747430249632893,
      "grad_norm": 0.09796813875436783,
      "learning_rate": 0.0001707801304956723,
      "loss": 0.5054,
      "step": 1170
    },
    {
      "epoch": 1.3864904552129222,
      "grad_norm": 0.09447074681520462,
      "learning_rate": 0.00016894337698033663,
      "loss": 0.5067,
      "step": 1180
    },
    {
      "epoch": 1.398237885462555,
      "grad_norm": 0.10086411237716675,
      "learning_rate": 0.00016710373239302772,
      "loss": 0.5191,
      "step": 1190
    },
    {
      "epoch": 1.409985315712188,
      "grad_norm": 0.10048293322324753,
      "learning_rate": 0.00016526147749387155,
      "loss": 0.5073,
      "step": 1200
    },
    {
      "epoch": 1.4217327459618208,
      "grad_norm": 0.09760654717683792,
      "learning_rate": 0.00016341689344137088,
      "loss": 0.5254,
      "step": 1210
    },
    {
      "epoch": 1.4334801762114537,
      "grad_norm": 0.09539603441953659,
      "learning_rate": 0.00016157026174949538,
      "loss": 0.5116,
      "step": 1220
    },
    {
      "epoch": 1.4452276064610867,
      "grad_norm": 0.09473835676908493,
      "learning_rate": 0.00015972186424471855,
      "loss": 0.5011,
      "step": 1230
    },
    {
      "epoch": 1.4569750367107195,
      "grad_norm": 0.10754521191120148,
      "learning_rate": 0.0001578719830230061,
      "loss": 0.5116,
      "step": 1240
    },
    {
      "epoch": 1.4687224669603525,
      "grad_norm": 0.09202101826667786,
      "learning_rate": 0.00015602090040676324,
      "loss": 0.4964,
      "step": 1250
    },
    {
      "epoch": 1.4804698972099852,
      "grad_norm": 0.1060076653957367,
      "learning_rate": 0.00015416889890174792,
      "loss": 0.505,
      "step": 1260
    },
    {
      "epoch": 1.4922173274596182,
      "grad_norm": 0.10437231510877609,
      "learning_rate": 0.0001523162611539557,
      "loss": 0.5065,
      "step": 1270
    },
    {
      "epoch": 1.5039647577092512,
      "grad_norm": 0.14216673374176025,
      "learning_rate": 0.0001504632699064833,
      "loss": 0.5221,
      "step": 1280
    },
    {
      "epoch": 1.515712187958884,
      "grad_norm": 0.10676784813404083,
      "learning_rate": 0.00014861020795637716,
      "loss": 0.5057,
      "step": 1290
    },
    {
      "epoch": 1.5274596182085167,
      "grad_norm": 0.09706170856952667,
      "learning_rate": 0.00014675735811147444,
      "loss": 0.5054,
      "step": 1300
    },
    {
      "epoch": 1.5392070484581497,
      "grad_norm": 0.09725037962198257,
      "learning_rate": 0.00014490500314724117,
      "loss": 0.5083,
      "step": 1310
    },
    {
      "epoch": 1.5509544787077827,
      "grad_norm": 0.09938537329435349,
      "learning_rate": 0.0001430534257636167,
      "loss": 0.5157,
      "step": 1320
    },
    {
      "epoch": 1.5627019089574157,
      "grad_norm": 0.10331527143716812,
      "learning_rate": 0.00014120290854186863,
      "loss": 0.5151,
      "step": 1330
    },
    {
      "epoch": 1.5744493392070484,
      "grad_norm": 0.09629181027412415,
      "learning_rate": 0.00013935373390146634,
      "loss": 0.507,
      "step": 1340
    },
    {
      "epoch": 1.5861967694566812,
      "grad_norm": 0.10021866858005524,
      "learning_rate": 0.00013750618405697912,
      "loss": 0.4973,
      "step": 1350
    },
    {
      "epoch": 1.5979441997063142,
      "grad_norm": 0.10414128750562668,
      "learning_rate": 0.0001356605409750058,
      "loss": 0.5033,
      "step": 1360
    },
    {
      "epoch": 1.6096916299559472,
      "grad_norm": 0.11649428308010101,
      "learning_rate": 0.0001340013252947644,
      "loss": 0.5259,
      "step": 1370
    },
    {
      "epoch": 1.6214390602055802,
      "grad_norm": 0.09817013144493103,
      "learning_rate": 0.00013216008080267535,
      "loss": 0.5164,
      "step": 1380
    },
    {
      "epoch": 1.633186490455213,
      "grad_norm": 0.10239794105291367,
      "learning_rate": 0.0001303215589766901,
      "loss": 0.5011,
      "step": 1390
    },
    {
      "epoch": 1.6449339207048457,
      "grad_norm": 0.09663370996713638,
      "learning_rate": 0.00012848604040558272,
      "loss": 0.5096,
      "step": 1400
    },
    {
      "epoch": 1.6566813509544787,
      "grad_norm": 0.10224564373493195,
      "learning_rate": 0.0001266538052197809,
      "loss": 0.5055,
      "step": 1410
    },
    {
      "epoch": 1.6684287812041116,
      "grad_norm": 0.09594379365444183,
      "learning_rate": 0.00012482513304861364,
      "loss": 0.5051,
      "step": 1420
    },
    {
      "epoch": 1.6801762114537446,
      "grad_norm": 0.10660111159086227,
      "learning_rate": 0.00012300030297763518,
      "loss": 0.5076,
      "step": 1430
    },
    {
      "epoch": 1.6919236417033774,
      "grad_norm": 0.10405760258436203,
      "learning_rate": 0.0001211795935060317,
      "loss": 0.5089,
      "step": 1440
    },
    {
      "epoch": 1.7036710719530102,
      "grad_norm": 0.10634606331586838,
      "learning_rate": 0.00011936328250411801,
      "loss": 0.504,
      "step": 1450
    },
    {
      "epoch": 1.7154185022026431,
      "grad_norm": 0.1119045689702034,
      "learning_rate": 0.00011755164717092988,
      "loss": 0.5105,
      "step": 1460
    },
    {
      "epoch": 1.7271659324522761,
      "grad_norm": 0.10727712512016296,
      "learning_rate": 0.00011574496399191876,
      "loss": 0.5185,
      "step": 1470
    },
    {
      "epoch": 1.738913362701909,
      "grad_norm": 0.09993914514780045,
      "learning_rate": 0.00011394350869675567,
      "loss": 0.5004,
      "step": 1480
    },
    {
      "epoch": 1.7506607929515419,
      "grad_norm": 0.10068885236978531,
      "learning_rate": 0.00011214755621725042,
      "loss": 0.5091,
      "step": 1490
    },
    {
      "epoch": 1.7624082232011746,
      "grad_norm": 0.09819114953279495,
      "learning_rate": 0.00011035738064539201,
      "loss": 0.496,
      "step": 1500
    },
    {
      "epoch": 1.7741556534508076,
      "grad_norm": 0.09929963946342468,
      "learning_rate": 0.00010857325519151842,
      "loss": 0.5033,
      "step": 1510
    },
    {
      "epoch": 1.7859030837004406,
      "grad_norm": 0.09696891903877258,
      "learning_rate": 0.00010679545214261935,
      "loss": 0.5133,
      "step": 1520
    },
    {
      "epoch": 1.7976505139500736,
      "grad_norm": 0.1030985563993454,
      "learning_rate": 0.0001050242428207814,
      "loss": 0.5142,
      "step": 1530
    },
    {
      "epoch": 1.8093979441997063,
      "grad_norm": 0.10666483640670776,
      "learning_rate": 0.0001032598975417796,
      "loss": 0.5205,
      "step": 1540
    },
    {
      "epoch": 1.821145374449339,
      "grad_norm": 0.1053660586476326,
      "learning_rate": 0.00010150268557382262,
      "loss": 0.498,
      "step": 1550
    },
    {
      "epoch": 1.832892804698972,
      "grad_norm": 0.1028640866279602,
      "learning_rate": 9.975287509645826e-05,
      "loss": 0.5096,
      "step": 1560
    },
    {
      "epoch": 1.844640234948605,
      "grad_norm": 0.10187330096960068,
      "learning_rate": 9.801073315964465e-05,
      "loss": 0.4961,
      "step": 1570
    },
    {
      "epoch": 1.8563876651982378,
      "grad_norm": 0.0993318185210228,
      "learning_rate": 9.627652564299405e-05,
      "loss": 0.5028,
      "step": 1580
    },
    {
      "epoch": 1.8681350954478708,
      "grad_norm": 0.11013616621494293,
      "learning_rate": 9.455051721519528e-05,
      "loss": 0.5011,
      "step": 1590
    },
    {
      "epoch": 1.8798825256975036,
      "grad_norm": 0.10433095693588257,
      "learning_rate": 9.283297129362094e-05,
      "loss": 0.4977,
      "step": 1600
    },
    {
      "epoch": 1.8916299559471366,
      "grad_norm": 0.10458213835954666,
      "learning_rate": 9.112415000412531e-05,
      "loss": 0.5107,
      "step": 1610
    },
    {
      "epoch": 1.9033773861967695,
      "grad_norm": 0.10259649157524109,
      "learning_rate": 8.942431414104001e-05,
      "loss": 0.4994,
      "step": 1620
    },
    {
      "epoch": 1.9151248164464023,
      "grad_norm": 0.09932565689086914,
      "learning_rate": 8.773372312737238e-05,
      "loss": 0.499,
      "step": 1630
    },
    {
      "epoch": 1.9268722466960353,
      "grad_norm": 0.10920233279466629,
      "learning_rate": 8.605263497521283e-05,
      "loss": 0.5061,
      "step": 1640
    },
    {
      "epoch": 1.938619676945668,
      "grad_norm": 0.0942125990986824,
      "learning_rate": 8.438130624635852e-05,
      "loss": 0.4941,
      "step": 1650
    },
    {
      "epoch": 1.950367107195301,
      "grad_norm": 0.12091836333274841,
      "learning_rate": 8.271999201315755e-05,
      "loss": 0.499,
      "step": 1660
    },
    {
      "epoch": 1.962114537444934,
      "grad_norm": 0.10680700093507767,
      "learning_rate": 8.106894581958054e-05,
      "loss": 0.4949,
      "step": 1670
    },
    {
      "epoch": 1.9738619676945668,
      "grad_norm": 0.10140874981880188,
      "learning_rate": 7.942841964252586e-05,
      "loss": 0.5032,
      "step": 1680
    },
    {
      "epoch": 1.9856093979441996,
      "grad_norm": 0.10470426827669144,
      "learning_rate": 7.779866385336391e-05,
      "loss": 0.5072,
      "step": 1690
    },
    {
      "epoch": 1.9973568281938325,
      "grad_norm": 0.10490316152572632,
      "learning_rate": 7.617992717972585e-05,
      "loss": 0.5024,
      "step": 1700
    },
    {
      "epoch": 2.0093979441997063,
      "grad_norm": 0.10185109823942184,
      "learning_rate": 7.457245666754417e-05,
      "loss": 0.541,
      "step": 1710
    },
    {
      "epoch": 2.0211453744493393,
      "grad_norm": 0.10151582956314087,
      "learning_rate": 7.297649764334912e-05,
      "loss": 0.4831,
      "step": 1720
    },
    {
      "epoch": 2.0328928046989723,
      "grad_norm": 0.10989069938659668,
      "learning_rate": 7.139229367682778e-05,
      "loss": 0.486,
      "step": 1730
    },
    {
      "epoch": 2.044640234948605,
      "grad_norm": 0.11440616101026535,
      "learning_rate": 6.982008654365156e-05,
      "loss": 0.4842,
      "step": 1740
    },
    {
      "epoch": 2.056387665198238,
      "grad_norm": 0.1018548235297203,
      "learning_rate": 6.82601161885771e-05,
      "loss": 0.4911,
      "step": 1750
    },
    {
      "epoch": 2.0681350954478708,
      "grad_norm": 0.1053592786192894,
      "learning_rate": 6.671262068882665e-05,
      "loss": 0.4924,
      "step": 1760
    },
    {
      "epoch": 2.0798825256975038,
      "grad_norm": 0.10619944334030151,
      "learning_rate": 6.517783621775382e-05,
      "loss": 0.4736,
      "step": 1770
    },
    {
      "epoch": 2.0916299559471367,
      "grad_norm": 0.11708024144172668,
      "learning_rate": 6.36559970087992e-05,
      "loss": 0.4824,
      "step": 1780
    },
    {
      "epoch": 2.1033773861967693,
      "grad_norm": 0.12601934373378754,
      "learning_rate": 6.214733531974292e-05,
      "loss": 0.4834,
      "step": 1790
    },
    {
      "epoch": 2.1151248164464023,
      "grad_norm": 0.10728344321250916,
      "learning_rate": 6.065208139725811e-05,
      "loss": 0.4889,
      "step": 1800
    },
    {
      "epoch": 2.1268722466960353,
      "grad_norm": 0.10394187271595001,
      "learning_rate": 5.917046344177123e-05,
      "loss": 0.4893,
      "step": 1810
    },
    {
      "epoch": 2.1386196769456682,
      "grad_norm": 0.11126961559057236,
      "learning_rate": 5.770270757263536e-05,
      "loss": 0.4876,
      "step": 1820
    },
    {
      "epoch": 2.150367107195301,
      "grad_norm": 0.10413071513175964,
      "learning_rate": 5.624903779362031e-05,
      "loss": 0.4764,
      "step": 1830
    },
    {
      "epoch": 2.1621145374449338,
      "grad_norm": 0.10565336793661118,
      "learning_rate": 5.480967595872602e-05,
      "loss": 0.4781,
      "step": 1840
    },
    {
      "epoch": 2.1738619676945667,
      "grad_norm": 0.10836539417505264,
      "learning_rate": 5.338484173832413e-05,
      "loss": 0.4854,
      "step": 1850
    },
    {
      "epoch": 2.1856093979441997,
      "grad_norm": 0.11080804467201233,
      "learning_rate": 5.197475258563249e-05,
      "loss": 0.4815,
      "step": 1860
    },
    {
      "epoch": 2.1973568281938327,
      "grad_norm": 0.11599951237440109,
      "learning_rate": 5.057962370352815e-05,
      "loss": 0.4878,
      "step": 1870
    },
    {
      "epoch": 2.2091042584434657,
      "grad_norm": 0.10745177417993546,
      "learning_rate": 4.91996680117041e-05,
      "loss": 0.4737,
      "step": 1880
    },
    {
      "epoch": 2.2208516886930982,
      "grad_norm": 0.10236770659685135,
      "learning_rate": 4.783509611417409e-05,
      "loss": 0.4759,
      "step": 1890
    },
    {
      "epoch": 2.232599118942731,
      "grad_norm": 0.11330056935548782,
      "learning_rate": 4.648611626713082e-05,
      "loss": 0.4725,
      "step": 1900
    },
    {
      "epoch": 2.244346549192364,
      "grad_norm": 0.10368051379919052,
      "learning_rate": 4.515293434716279e-05,
      "loss": 0.4872,
      "step": 1910
    },
    {
      "epoch": 2.256093979441997,
      "grad_norm": 0.10511163622140884,
      "learning_rate": 4.38357538198343e-05,
      "loss": 0.4839,
      "step": 1920
    },
    {
      "epoch": 2.2678414096916297,
      "grad_norm": 0.09937173873186111,
      "learning_rate": 4.253477570863275e-05,
      "loss": 0.4768,
      "step": 1930
    },
    {
      "epoch": 2.2795888399412627,
      "grad_norm": 0.1072772666811943,
      "learning_rate": 4.1250198564289644e-05,
      "loss": 0.4915,
      "step": 1940
    },
    {
      "epoch": 2.2913362701908957,
      "grad_norm": 0.11077064275741577,
      "learning_rate": 3.998221843447808e-05,
      "loss": 0.4773,
      "step": 1950
    },
    {
      "epoch": 2.3030837004405287,
      "grad_norm": 0.10051790624856949,
      "learning_rate": 3.8731028833892955e-05,
      "loss": 0.4728,
      "step": 1960
    },
    {
      "epoch": 2.3148311306901617,
      "grad_norm": 0.11899662017822266,
      "learning_rate": 3.749682071471727e-05,
      "loss": 0.4978,
      "step": 1970
    },
    {
      "epoch": 2.3265785609397946,
      "grad_norm": 0.1075495257973671,
      "learning_rate": 3.627978243747965e-05,
      "loss": 0.491,
      "step": 1980
    },
    {
      "epoch": 2.338325991189427,
      "grad_norm": 0.10509738326072693,
      "learning_rate": 3.5080099742307495e-05,
      "loss": 0.4672,
      "step": 1990
    },
    {
      "epoch": 2.35007342143906,
      "grad_norm": 0.10816201567649841,
      "learning_rate": 3.3897955720579985e-05,
      "loss": 0.4856,
      "step": 2000
    },
    {
      "epoch": 2.361820851688693,
      "grad_norm": 0.11207477003335953,
      "learning_rate": 3.2733530786985124e-05,
      "loss": 0.4893,
      "step": 2010
    },
    {
      "epoch": 2.373568281938326,
      "grad_norm": 0.10209004580974579,
      "learning_rate": 3.1587002651985776e-05,
      "loss": 0.4737,
      "step": 2020
    },
    {
      "epoch": 2.3853157121879587,
      "grad_norm": 0.11698783189058304,
      "learning_rate": 3.0458546294697954e-05,
      "loss": 0.4883,
      "step": 2030
    },
    {
      "epoch": 2.3970631424375917,
      "grad_norm": 0.09947340935468674,
      "learning_rate": 2.9348333936186003e-05,
      "loss": 0.4761,
      "step": 2040
    },
    {
      "epoch": 2.4088105726872246,
      "grad_norm": 0.1026497632265091,
      "learning_rate": 2.82565350131791e-05,
      "loss": 0.4867,
      "step": 2050
    },
    {
      "epoch": 2.4205580029368576,
      "grad_norm": 0.10487735271453857,
      "learning_rate": 2.718331615221218e-05,
      "loss": 0.4759,
      "step": 2060
    },
    {
      "epoch": 2.4323054331864906,
      "grad_norm": 0.10684232413768768,
      "learning_rate": 2.61288411441961e-05,
      "loss": 0.4881,
      "step": 2070
    },
    {
      "epoch": 2.444052863436123,
      "grad_norm": 0.104823999106884,
      "learning_rate": 2.5093270919420383e-05,
      "loss": 0.4973,
      "step": 2080
    },
    {
      "epoch": 2.455800293685756,
      "grad_norm": 0.10103822499513626,
      "learning_rate": 2.4076763522992665e-05,
      "loss": 0.4887,
      "step": 2090
    },
    {
      "epoch": 2.467547723935389,
      "grad_norm": 0.10180474817752838,
      "learning_rate": 2.307947409071825e-05,
      "loss": 0.4791,
      "step": 2100
    },
    {
      "epoch": 2.479295154185022,
      "grad_norm": 0.10081729292869568,
      "learning_rate": 2.210155482542402e-05,
      "loss": 0.4822,
      "step": 2110
    },
    {
      "epoch": 2.491042584434655,
      "grad_norm": 0.10296090692281723,
      "learning_rate": 2.1143154973729735e-05,
      "loss": 0.4776,
      "step": 2120
    },
    {
      "epoch": 2.5027900146842876,
      "grad_norm": 0.10592051595449448,
      "learning_rate": 2.0204420803270327e-05,
      "loss": 0.484,
      "step": 2130
    },
    {
      "epoch": 2.5145374449339206,
      "grad_norm": 0.10436718910932541,
      "learning_rate": 1.9285495580373362e-05,
      "loss": 0.4741,
      "step": 2140
    },
    {
      "epoch": 2.5262848751835536,
      "grad_norm": 0.10932262241840363,
      "learning_rate": 1.8386519548193994e-05,
      "loss": 0.4846,
      "step": 2150
    },
    {
      "epoch": 2.5380323054331866,
      "grad_norm": 0.10435889661312103,
      "learning_rate": 1.7507629905311644e-05,
      "loss": 0.4984,
      "step": 2160
    },
    {
      "epoch": 2.5497797356828196,
      "grad_norm": 0.11018865555524826,
      "learning_rate": 1.664896078479126e-05,
      "loss": 0.4728,
      "step": 2170
    },
    {
      "epoch": 2.561527165932452,
      "grad_norm": 0.10020755231380463,
      "learning_rate": 1.581064323371225e-05,
      "loss": 0.4808,
      "step": 2180
    },
    {
      "epoch": 2.573274596182085,
      "grad_norm": 0.10242980718612671,
      "learning_rate": 1.4992805193168717e-05,
      "loss": 0.483,
      "step": 2190
    },
    {
      "epoch": 2.585022026431718,
      "grad_norm": 0.10522596538066864,
      "learning_rate": 1.4195571478743495e-05,
      "loss": 0.4798,
      "step": 2200
    },
    {
      "epoch": 2.596769456681351,
      "grad_norm": 0.10111811757087708,
      "learning_rate": 1.3419063761459025e-05,
      "loss": 0.4758,
      "step": 2210
    },
    {
      "epoch": 2.6085168869309836,
      "grad_norm": 0.10785708576440811,
      "learning_rate": 1.2663400549208741e-05,
      "loss": 0.4785,
      "step": 2220
    },
    {
      "epoch": 2.6202643171806166,
      "grad_norm": 0.10563918203115463,
      "learning_rate": 1.1928697168670465e-05,
      "loss": 0.4838,
      "step": 2230
    },
    {
      "epoch": 2.6320117474302496,
      "grad_norm": 0.10474205017089844,
      "learning_rate": 1.1215065747705742e-05,
      "loss": 0.4752,
      "step": 2240
    },
    {
      "epoch": 2.6437591776798826,
      "grad_norm": 0.1004628837108612,
      "learning_rate": 1.0522615198247364e-05,
      "loss": 0.4812,
      "step": 2250
    },
    {
      "epoch": 2.6555066079295155,
      "grad_norm": 0.10973802208900452,
      "learning_rate": 9.851451199677573e-06,
      "loss": 0.4984,
      "step": 2260
    },
    {
      "epoch": 2.6672540381791485,
      "grad_norm": 0.0998314619064331,
      "learning_rate": 9.201676182699558e-06,
      "loss": 0.4908,
      "step": 2270
    },
    {
      "epoch": 2.679001468428781,
      "grad_norm": 0.10704085975885391,
      "learning_rate": 8.573389313704981e-06,
      "loss": 0.4659,
      "step": 2280
    },
    {
      "epoch": 2.690748898678414,
      "grad_norm": 0.1085141971707344,
      "learning_rate": 7.966686479639428e-06,
      "loss": 0.4846,
      "step": 2290
    },
    {
      "epoch": 2.702496328928047,
      "grad_norm": 0.10413148999214172,
      "learning_rate": 7.381660273368572e-06,
      "loss": 0.4787,
      "step": 2300
    },
    {
      "epoch": 2.71424375917768,
      "grad_norm": 0.10362162441015244,
      "learning_rate": 6.818399979546885e-06,
      "loss": 0.4793,
      "step": 2310
    },
    {
      "epoch": 2.7259911894273126,
      "grad_norm": 0.11292777210474014,
      "learning_rate": 6.276991560991395e-06,
      "loss": 0.4861,
      "step": 2320
    },
    {
      "epoch": 2.7377386196769455,
      "grad_norm": 0.10381820797920227,
      "learning_rate": 5.7575176455622764e-06,
      "loss": 0.4834,
      "step": 2330
    },
    {
      "epoch": 2.7494860499265785,
      "grad_norm": 0.10142537951469421,
      "learning_rate": 5.260057513552573e-06,
      "loss": 0.4812,
      "step": 2340
    },
    {
      "epoch": 2.7612334801762115,
      "grad_norm": 0.10647214204072952,
      "learning_rate": 4.78468708558864e-06,
      "loss": 0.4782,
      "step": 2350
    },
    {
      "epoch": 2.7729809104258445,
      "grad_norm": 0.10398197919130325,
      "learning_rate": 4.3314789110433675e-06,
      "loss": 0.4849,
      "step": 2360
    },
    {
      "epoch": 2.7847283406754775,
      "grad_norm": 0.10818745195865631,
      "learning_rate": 3.90050215696408e-06,
      "loss": 0.477,
      "step": 2370
    },
    {
      "epoch": 2.79647577092511,
      "grad_norm": 0.10164166986942291,
      "learning_rate": 3.491822597516375e-06,
      "loss": 0.4788,
      "step": 2380
    },
    {
      "epoch": 2.808223201174743,
      "grad_norm": 0.1044260635972023,
      "learning_rate": 3.1055026039459863e-06,
      "loss": 0.482,
      "step": 2390
    },
    {
      "epoch": 2.819970631424376,
      "grad_norm": 0.09673094749450684,
      "learning_rate": 2.741601135059851e-06,
      "loss": 0.4799,
      "step": 2400
    },
    {
      "epoch": 2.831718061674009,
      "grad_norm": 0.1022416204214096,
      "learning_rate": 2.4001737282280055e-06,
      "loss": 0.4966,
      "step": 2410
    },
    {
      "epoch": 2.8434654919236415,
      "grad_norm": 0.10257957875728607,
      "learning_rate": 2.081272490907765e-06,
      "loss": 0.4785,
      "step": 2420
    },
    {
      "epoch": 2.8552129221732745,
      "grad_norm": 0.10026570409536362,
      "learning_rate": 1.784946092691153e-06,
      "loss": 0.4849,
      "step": 2430
    },
    {
      "epoch": 2.8669603524229075,
      "grad_norm": 0.10606394708156586,
      "learning_rate": 1.5112397578771585e-06,
      "loss": 0.4804,
      "step": 2440
    },
    {
      "epoch": 2.8787077826725405,
      "grad_norm": 0.1049143448472023,
      "learning_rate": 1.2601952585698405e-06,
      "loss": 0.488,
      "step": 2450
    },
    {
      "epoch": 2.8904552129221734,
      "grad_norm": 0.10490565001964569,
      "learning_rate": 1.0318509083030447e-06,
      "loss": 0.4713,
      "step": 2460
    },
    {
      "epoch": 2.9022026431718064,
      "grad_norm": 0.10494933277368546,
      "learning_rate": 8.2624155619328e-07,
      "loss": 0.4845,
      "step": 2470
    },
    {
      "epoch": 2.913950073421439,
      "grad_norm": 0.10932092368602753,
      "learning_rate": 6.43398581621124e-07,
      "loss": 0.4881,
      "step": 2480
    },
    {
      "epoch": 2.925697503671072,
      "grad_norm": 0.10375595092773438,
      "learning_rate": 4.833498894421528e-07,
      "loss": 0.4851,
      "step": 2490
    },
    {
      "epoch": 2.937444933920705,
      "grad_norm": 0.10374415665864944,
      "learning_rate": 3.4611990572829815e-07,
      "loss": 0.4789,
      "step": 2500
    },
    {
      "epoch": 2.9491923641703375,
      "grad_norm": 0.10261189937591553,
      "learning_rate": 2.317295740399294e-07,
      "loss": 0.4806,
      "step": 2510
    },
    {
      "epoch": 2.9609397944199705,
      "grad_norm": 0.10277044773101807,
      "learning_rate": 1.4019635222961012e-07,
      "loss": 0.4829,
      "step": 2520
    }
  ],
  "logging_steps": 10,
  "max_steps": 2553,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.341122626662171e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}