{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.990881458966565,
  "eval_steps": 500,
  "global_step": 1642,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030395136778115502,
      "grad_norm": 10.756904602050781,
      "learning_rate": 2.4096385542168677e-05,
      "loss": 1.6027,
      "step": 10
    },
    {
      "epoch": 0.060790273556231005,
      "grad_norm": 9.451399803161621,
      "learning_rate": 4.8192771084337354e-05,
      "loss": 0.4287,
      "step": 20
    },
    {
      "epoch": 0.0911854103343465,
      "grad_norm": 1.1107724905014038,
      "learning_rate": 7.228915662650602e-05,
      "loss": 0.0996,
      "step": 30
    },
    {
      "epoch": 0.12158054711246201,
      "grad_norm": 0.6546974778175354,
      "learning_rate": 9.638554216867471e-05,
      "loss": 0.0405,
      "step": 40
    },
    {
      "epoch": 0.1519756838905775,
      "grad_norm": 0.4774630665779114,
      "learning_rate": 0.0001204819277108434,
      "loss": 0.0249,
      "step": 50
    },
    {
      "epoch": 0.182370820668693,
      "grad_norm": 1.0936344861984253,
      "learning_rate": 0.00014457831325301204,
      "loss": 0.027,
      "step": 60
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 0.6960570216178894,
      "learning_rate": 0.00016867469879518074,
      "loss": 0.0187,
      "step": 70
    },
    {
      "epoch": 0.24316109422492402,
      "grad_norm": 0.45391130447387695,
      "learning_rate": 0.00019277108433734942,
      "loss": 0.0143,
      "step": 80
    },
    {
      "epoch": 0.2735562310030395,
      "grad_norm": 0.3450222611427307,
      "learning_rate": 0.0001999900512985548,
      "loss": 0.0122,
      "step": 90
    },
    {
      "epoch": 0.303951367781155,
      "grad_norm": 0.8266738653182983,
      "learning_rate": 0.00019994132773027597,
      "loss": 0.0186,
      "step": 100
    },
    {
      "epoch": 0.3343465045592705,
      "grad_norm": 0.4097825586795807,
      "learning_rate": 0.00019985202174272956,
      "loss": 0.014,
      "step": 110
    },
    {
      "epoch": 0.364741641337386,
      "grad_norm": 0.6124298572540283,
      "learning_rate": 0.00019972216959972274,
      "loss": 0.0156,
      "step": 120
    },
    {
      "epoch": 0.3951367781155015,
      "grad_norm": 0.5567657351493835,
      "learning_rate": 0.00019955182402933334,
      "loss": 0.0078,
      "step": 130
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 0.6027739644050598,
      "learning_rate": 0.00019934105420249908,
      "loss": 0.0089,
      "step": 140
    },
    {
      "epoch": 0.45592705167173253,
      "grad_norm": 0.5931612849235535,
      "learning_rate": 0.00019908994570492993,
      "loss": 0.0143,
      "step": 150
    },
    {
      "epoch": 0.48632218844984804,
      "grad_norm": 0.44545474648475647,
      "learning_rate": 0.00019879860050235469,
      "loss": 0.0123,
      "step": 160
    },
    {
      "epoch": 0.5167173252279635,
      "grad_norm": 0.29651954770088196,
      "learning_rate": 0.0001984671368991169,
      "loss": 0.008,
      "step": 170
    },
    {
      "epoch": 0.547112462006079,
      "grad_norm": 0.19712242484092712,
      "learning_rate": 0.0001980956894901356,
      "loss": 0.006,
      "step": 180
    },
    {
      "epoch": 0.5775075987841946,
      "grad_norm": 0.4244577884674072,
      "learning_rate": 0.00019768440910625162,
      "loss": 0.0096,
      "step": 190
    },
    {
      "epoch": 0.60790273556231,
      "grad_norm": 0.5452653169631958,
      "learning_rate": 0.00019723346275298052,
      "loss": 0.0128,
      "step": 200
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 0.5842880010604858,
      "learning_rate": 0.00019674303354269833,
      "loss": 0.0115,
      "step": 210
    },
    {
      "epoch": 0.668693009118541,
      "grad_norm": 0.5381531715393066,
      "learning_rate": 0.00019621332062028617,
      "loss": 0.0102,
      "step": 220
    },
    {
      "epoch": 0.6990881458966566,
      "grad_norm": 0.7361140847206116,
      "learning_rate": 0.00019564453908226515,
      "loss": 0.0098,
      "step": 230
    },
    {
      "epoch": 0.729483282674772,
      "grad_norm": 0.45423439145088196,
      "learning_rate": 0.00019503691988945367,
      "loss": 0.0097,
      "step": 240
    },
    {
      "epoch": 0.7598784194528876,
      "grad_norm": 0.38471490144729614,
      "learning_rate": 0.000194390709773183,
      "loss": 0.0117,
      "step": 250
    },
    {
      "epoch": 0.790273556231003,
      "grad_norm": 0.36030113697052,
      "learning_rate": 0.0001937061711351089,
      "loss": 0.009,
      "step": 260
    },
    {
      "epoch": 0.8206686930091185,
      "grad_norm": 0.3028634786605835,
      "learning_rate": 0.00019298358194066016,
      "loss": 0.0069,
      "step": 270
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.278566837310791,
      "learning_rate": 0.0001922232356061672,
      "loss": 0.011,
      "step": 280
    },
    {
      "epoch": 0.8814589665653495,
      "grad_norm": 0.3821546137332916,
      "learning_rate": 0.00019142544087971693,
      "loss": 0.0054,
      "step": 290
    },
    {
      "epoch": 0.9118541033434651,
      "grad_norm": 0.26465708017349243,
      "learning_rate": 0.00019059052171578155,
      "loss": 0.0077,
      "step": 300
    },
    {
      "epoch": 0.9422492401215805,
      "grad_norm": 0.4308549463748932,
      "learning_rate": 0.00018971881714367295,
      "loss": 0.0057,
      "step": 310
    },
    {
      "epoch": 0.9726443768996961,
      "grad_norm": 0.226839080452919,
      "learning_rate": 0.0001888106811298755,
      "loss": 0.0077,
      "step": 320
    },
    {
      "epoch": 1.0030395136778116,
      "grad_norm": 0.39627695083618164,
      "learning_rate": 0.00018786648243431363,
      "loss": 0.0115,
      "step": 330
    },
    {
      "epoch": 1.033434650455927,
      "grad_norm": 0.6822514533996582,
      "learning_rate": 0.00018688660446061235,
      "loss": 0.0103,
      "step": 340
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 0.34325116872787476,
      "learning_rate": 0.00018587144510041128,
      "loss": 0.0095,
      "step": 350
    },
    {
      "epoch": 1.094224924012158,
      "grad_norm": 1.1588419675827026,
      "learning_rate": 0.00018482141657179594,
      "loss": 0.0199,
      "step": 360
    },
    {
      "epoch": 1.1246200607902737,
      "grad_norm": 0.5164496898651123,
      "learning_rate": 0.00018373694525191138,
      "loss": 0.0113,
      "step": 370
    },
    {
      "epoch": 1.155015197568389,
      "grad_norm": 0.38176482915878296,
      "learning_rate": 0.00018261847150382644,
      "loss": 0.0109,
      "step": 380
    },
    {
      "epoch": 1.1854103343465046,
      "grad_norm": 0.43293264508247375,
      "learning_rate": 0.000181466449497719,
      "loss": 0.0126,
      "step": 390
    },
    {
      "epoch": 1.21580547112462,
      "grad_norm": 0.2756524980068207,
      "learning_rate": 0.00018028134702645425,
      "loss": 0.0057,
      "step": 400
    },
    {
      "epoch": 1.2462006079027357,
      "grad_norm": 0.3666749894618988,
      "learning_rate": 0.00017906364531563185,
      "loss": 0.0047,
      "step": 410
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 0.3208726942539215,
      "learning_rate": 0.00017781383882817811,
      "loss": 0.0153,
      "step": 420
    },
    {
      "epoch": 1.3069908814589666,
      "grad_norm": 0.36565855145454407,
      "learning_rate": 0.00017653243506356332,
      "loss": 0.0078,
      "step": 430
    },
    {
      "epoch": 1.337386018237082,
      "grad_norm": 0.30531516671180725,
      "learning_rate": 0.00017521995435172504,
      "loss": 0.0086,
      "step": 440
    },
    {
      "epoch": 1.3677811550151975,
      "grad_norm": 0.4445202648639679,
      "learning_rate": 0.00017387692964178198,
      "loss": 0.0046,
      "step": 450
    },
    {
      "epoch": 1.3981762917933132,
      "grad_norm": 0.30582693219184875,
      "learning_rate": 0.00017250390628562303,
      "loss": 0.0044,
      "step": 460
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.22891288995742798,
      "learning_rate": 0.00017110144181646072,
      "loss": 0.0084,
      "step": 470
    },
    {
      "epoch": 1.458966565349544,
      "grad_norm": 0.34802454710006714,
      "learning_rate": 0.00016967010572243758,
      "loss": 0.0053,
      "step": 480
    },
    {
      "epoch": 1.4893617021276595,
      "grad_norm": 0.4866830110549927,
      "learning_rate": 0.00016821047921537858,
      "loss": 0.0082,
      "step": 490
    },
    {
      "epoch": 1.5197568389057752,
      "grad_norm": 0.11244069784879684,
      "learning_rate": 0.0001667231549947828,
      "loss": 0.0043,
      "step": 500
    },
    {
      "epoch": 1.5501519756838906,
      "grad_norm": 0.37716448307037354,
      "learning_rate": 0.00016520873700715045,
      "loss": 0.0075,
      "step": 510
    },
    {
      "epoch": 1.580547112462006,
      "grad_norm": 0.3906088173389435,
      "learning_rate": 0.00016366784020074282,
      "loss": 0.0051,
      "step": 520
    },
    {
      "epoch": 1.6109422492401215,
      "grad_norm": 0.400751531124115,
      "learning_rate": 0.00016210109027587494,
      "loss": 0.0066,
      "step": 530
    },
    {
      "epoch": 1.641337386018237,
      "grad_norm": 0.3523465096950531,
      "learning_rate": 0.00016050912343084216,
      "loss": 0.0076,
      "step": 540
    },
    {
      "epoch": 1.6717325227963524,
      "grad_norm": 0.1771322637796402,
      "learning_rate": 0.00015889258610358398,
      "loss": 0.0045,
      "step": 550
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.1138119027018547,
      "learning_rate": 0.00015725213470918977,
      "loss": 0.0058,
      "step": 560
    },
    {
      "epoch": 1.7325227963525835,
      "grad_norm": 0.32440638542175293,
      "learning_rate": 0.00015558843537335338,
      "loss": 0.0086,
      "step": 570
    },
    {
      "epoch": 1.7629179331306992,
      "grad_norm": 0.35322776436805725,
      "learning_rate": 0.0001539021636618844,
      "loss": 0.0063,
      "step": 580
    },
    {
      "epoch": 1.7933130699088147,
      "grad_norm": 0.36127084493637085,
      "learning_rate": 0.0001521940043063863,
      "loss": 0.0058,
      "step": 590
    },
    {
      "epoch": 1.8237082066869301,
      "grad_norm": 0.3828403949737549,
      "learning_rate": 0.00015046465092621278,
      "loss": 0.0066,
      "step": 600
    },
    {
      "epoch": 1.8541033434650456,
      "grad_norm": 0.2598365247249603,
      "learning_rate": 0.00014871480574681477,
      "loss": 0.0098,
      "step": 610
    },
    {
      "epoch": 1.884498480243161,
      "grad_norm": 0.42567044496536255,
      "learning_rate": 0.00014694517931459317,
      "loss": 0.0102,
      "step": 620
    },
    {
      "epoch": 1.9148936170212765,
      "grad_norm": 0.30201858282089233,
      "learning_rate": 0.00014515649020837277,
      "loss": 0.0059,
      "step": 630
    },
    {
      "epoch": 1.9452887537993921,
      "grad_norm": 0.2485564947128296,
      "learning_rate": 0.00014334946474761412,
      "loss": 0.0056,
      "step": 640
    },
    {
      "epoch": 1.9756838905775076,
      "grad_norm": 0.23254001140594482,
      "learning_rate": 0.0001415248366974826,
      "loss": 0.0048,
      "step": 650
    },
    {
      "epoch": 2.0060790273556233,
      "grad_norm": 0.2745867967605591,
      "learning_rate": 0.00013968334697089406,
      "loss": 0.0035,
      "step": 660
    },
    {
      "epoch": 2.0364741641337387,
      "grad_norm": 0.21013915538787842,
      "learning_rate": 0.0001378257433276578,
      "loss": 0.0027,
      "step": 670
    },
    {
      "epoch": 2.066869300911854,
      "grad_norm": 0.11735199391841888,
      "learning_rate": 0.00013595278007083933,
      "loss": 0.0063,
      "step": 680
    },
    {
      "epoch": 2.0972644376899696,
      "grad_norm": 0.14921429753303528,
      "learning_rate": 0.00013406521774046636,
      "loss": 0.0023,
      "step": 690
    },
    {
      "epoch": 2.127659574468085,
      "grad_norm": 0.12081817537546158,
      "learning_rate": 0.000132163822804702,
      "loss": 0.0019,
      "step": 700
    },
    {
      "epoch": 2.1580547112462005,
      "grad_norm": 0.21969059109687805,
      "learning_rate": 0.00013024936734861087,
      "loss": 0.0051,
      "step": 710
    },
    {
      "epoch": 2.188449848024316,
      "grad_norm": 0.15178360044956207,
      "learning_rate": 0.00012832262876064427,
      "loss": 0.003,
      "step": 720
    },
    {
      "epoch": 2.2188449848024314,
      "grad_norm": 0.10473588109016418,
      "learning_rate": 0.00012638438941697206,
      "loss": 0.0042,
      "step": 730
    },
    {
      "epoch": 2.2492401215805473,
      "grad_norm": 0.12606075406074524,
      "learning_rate": 0.0001244354363637889,
      "loss": 0.0035,
      "step": 740
    },
    {
      "epoch": 2.2796352583586628,
      "grad_norm": 0.17029185593128204,
      "learning_rate": 0.0001224765609977246,
      "loss": 0.0029,
      "step": 750
    },
    {
      "epoch": 2.310030395136778,
      "grad_norm": 0.35497593879699707,
      "learning_rate": 0.00012050855874448737,
      "loss": 0.0032,
      "step": 760
    },
    {
      "epoch": 2.3404255319148937,
      "grad_norm": 0.17530225217342377,
      "learning_rate": 0.00011853222873587167,
      "loss": 0.0039,
      "step": 770
    },
    {
      "epoch": 2.370820668693009,
      "grad_norm": 0.26195624470710754,
      "learning_rate": 0.00011654837348526044,
      "loss": 0.0063,
      "step": 780
    },
    {
      "epoch": 2.4012158054711246,
      "grad_norm": 0.19774211943149567,
      "learning_rate": 0.00011455779856175488,
      "loss": 0.0019,
      "step": 790
    },
    {
      "epoch": 2.43161094224924,
      "grad_norm": 0.23176009953022003,
      "learning_rate": 0.00011256131226306288,
      "loss": 0.0033,
      "step": 800
    },
    {
      "epoch": 2.4620060790273555,
      "grad_norm": 0.21269629895687103,
      "learning_rate": 0.00011055972528727973,
      "loss": 0.002,
      "step": 810
    },
    {
      "epoch": 2.4924012158054714,
      "grad_norm": 0.15158076584339142,
      "learning_rate": 0.00010855385040369419,
      "loss": 0.0093,
      "step": 820
    },
    {
      "epoch": 2.522796352583587,
      "grad_norm": 0.16666357219219208,
      "learning_rate": 0.00010654450212275324,
      "loss": 0.0045,
      "step": 830
    },
    {
      "epoch": 2.5531914893617023,
      "grad_norm": 0.20873229205608368,
      "learning_rate": 0.00010453249636532007,
      "loss": 0.0027,
      "step": 840
    },
    {
      "epoch": 2.5835866261398177,
      "grad_norm": 0.20907987654209137,
      "learning_rate": 0.00010251865013135931,
      "loss": 0.0027,
      "step": 850
    },
    {
      "epoch": 2.613981762917933,
      "grad_norm": 0.4642738997936249,
      "learning_rate": 0.00010050378116818391,
      "loss": 0.0047,
      "step": 860
    },
    {
      "epoch": 2.6443768996960486,
      "grad_norm": 0.16256296634674072,
      "learning_rate": 9.848870763839877e-05,
      "loss": 0.0032,
      "step": 870
    },
    {
      "epoch": 2.674772036474164,
      "grad_norm": 0.23262229561805725,
      "learning_rate": 9.64742477876756e-05,
      "loss": 0.0046,
      "step": 880
    },
    {
      "epoch": 2.7051671732522795,
      "grad_norm": 0.255894273519516,
      "learning_rate": 9.44612196124941e-05,
      "loss": 0.0049,
      "step": 890
    },
    {
      "epoch": 2.735562310030395,
      "grad_norm": 0.1491995006799698,
      "learning_rate": 9.245044052798435e-05,
      "loss": 0.0034,
      "step": 900
    },
    {
      "epoch": 2.7659574468085104,
      "grad_norm": 0.18624486029148102,
      "learning_rate": 9.044272703600505e-05,
      "loss": 0.0031,
      "step": 910
    },
    {
      "epoch": 2.7963525835866263,
      "grad_norm": 0.3413298428058624,
      "learning_rate": 8.843889439359308e-05,
      "loss": 0.0091,
      "step": 920
    },
    {
      "epoch": 2.8267477203647418,
      "grad_norm": 0.4524446725845337,
      "learning_rate": 8.643975628191802e-05,
      "loss": 0.0084,
      "step": 930
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.2205473631620407,
      "learning_rate": 8.444612447587683e-05,
      "loss": 0.0038,
      "step": 940
    },
    {
      "epoch": 2.8875379939209727,
      "grad_norm": 0.3354128301143646,
      "learning_rate": 8.245880851446255e-05,
      "loss": 0.0039,
      "step": 950
    },
    {
      "epoch": 2.917933130699088,
      "grad_norm": 0.40203210711479187,
      "learning_rate": 8.047861537204107e-05,
      "loss": 0.0069,
      "step": 960
    },
    {
      "epoch": 2.9483282674772036,
      "grad_norm": 0.16645453870296478,
      "learning_rate": 7.850634913066887e-05,
      "loss": 0.003,
      "step": 970
    },
    {
      "epoch": 2.978723404255319,
      "grad_norm": 0.1760583221912384,
      "learning_rate": 7.654281065358575e-05,
      "loss": 0.0038,
      "step": 980
    },
    {
      "epoch": 3.0091185410334345,
      "grad_norm": 0.09641872346401215,
      "learning_rate": 7.458879726001431e-05,
      "loss": 0.002,
      "step": 990
    },
    {
      "epoch": 3.0395136778115504,
      "grad_norm": 0.3135606050491333,
      "learning_rate": 7.264510240139836e-05,
      "loss": 0.0052,
      "step": 1000
    },
    {
      "epoch": 3.069908814589666,
      "grad_norm": 0.1596423238515854,
      "learning_rate": 7.071251533921235e-05,
      "loss": 0.0035,
      "step": 1010
    },
    {
      "epoch": 3.1003039513677813,
      "grad_norm": 0.09552032500505447,
      "learning_rate": 6.879182082447185e-05,
      "loss": 0.0023,
      "step": 1020
    },
    {
      "epoch": 3.1306990881458967,
      "grad_norm": 0.14375700056552887,
      "learning_rate": 6.688379877907548e-05,
      "loss": 0.0034,
      "step": 1030
    },
    {
      "epoch": 3.161094224924012,
      "grad_norm": 0.18640564382076263,
      "learning_rate": 6.498922397910822e-05,
      "loss": 0.0021,
      "step": 1040
    },
    {
      "epoch": 3.1914893617021276,
      "grad_norm": 0.15173585712909698,
      "learning_rate": 6.310886574023362e-05,
      "loss": 0.0019,
      "step": 1050
    },
    {
      "epoch": 3.221884498480243,
      "grad_norm": 0.17078043520450592,
      "learning_rate": 6.124348760530383e-05,
      "loss": 0.0064,
      "step": 1060
    },
    {
      "epoch": 3.2522796352583585,
      "grad_norm": 0.11285611987113953,
      "learning_rate": 5.9393847034313544e-05,
      "loss": 0.0022,
      "step": 1070
    },
    {
      "epoch": 3.282674772036474,
      "grad_norm": 0.20150907337665558,
      "learning_rate": 5.756069509682368e-05,
      "loss": 0.0021,
      "step": 1080
    },
    {
      "epoch": 3.31306990881459,
      "grad_norm": 0.23782743513584137,
      "learning_rate": 5.574477616698061e-05,
      "loss": 0.0014,
      "step": 1090
    },
    {
      "epoch": 3.3434650455927053,
      "grad_norm": 0.05953775346279144,
      "learning_rate": 5.3946827621253514e-05,
      "loss": 0.0025,
      "step": 1100
    },
    {
      "epoch": 3.3738601823708207,
      "grad_norm": 0.09737365692853928,
      "learning_rate": 5.2167579539013456e-05,
      "loss": 0.0016,
      "step": 1110
    },
    {
      "epoch": 3.404255319148936,
      "grad_norm": 0.10946661978960037,
      "learning_rate": 5.0407754406075926e-05,
      "loss": 0.0018,
      "step": 1120
    },
    {
      "epoch": 3.4346504559270516,
      "grad_norm": 0.18973323702812195,
      "learning_rate": 4.866806682132611e-05,
      "loss": 0.0025,
      "step": 1130
    },
    {
      "epoch": 3.465045592705167,
      "grad_norm": 0.10450422018766403,
      "learning_rate": 4.694922320654727e-05,
      "loss": 0.0009,
      "step": 1140
    },
    {
      "epoch": 3.4954407294832825,
      "grad_norm": 0.1521192193031311,
      "learning_rate": 4.5251921519569404e-05,
      "loss": 0.0026,
      "step": 1150
    },
    {
      "epoch": 3.5258358662613984,
      "grad_norm": 0.11275038123130798,
      "learning_rate": 4.35768509708548e-05,
      "loss": 0.0014,
      "step": 1160
    },
    {
      "epoch": 3.556231003039514,
      "grad_norm": 0.209175705909729,
      "learning_rate": 4.1924691743635504e-05,
      "loss": 0.0025,
      "step": 1170
    },
    {
      "epoch": 3.5866261398176293,
      "grad_norm": 0.17129185795783997,
      "learning_rate": 4.029611471771646e-05,
      "loss": 0.0015,
      "step": 1180
    },
    {
      "epoch": 3.617021276595745,
      "grad_norm": 0.08248832076787949,
      "learning_rate": 3.869178119705648e-05,
      "loss": 0.0024,
      "step": 1190
    },
    {
      "epoch": 3.6474164133738602,
      "grad_norm": 0.18126089870929718,
      "learning_rate": 3.711234264123747e-05,
      "loss": 0.0028,
      "step": 1200
    },
    {
      "epoch": 3.6778115501519757,
      "grad_norm": 0.2583954632282257,
      "learning_rate": 3.555844040093129e-05,
      "loss": 0.0041,
      "step": 1210
    },
    {
      "epoch": 3.708206686930091,
      "grad_norm": 0.16434016823768616,
      "learning_rate": 3.403070545747107e-05,
      "loss": 0.0028,
      "step": 1220
    },
    {
      "epoch": 3.7386018237082066,
      "grad_norm": 0.13008223474025726,
      "learning_rate": 3.252975816663375e-05,
      "loss": 0.0015,
      "step": 1230
    },
    {
      "epoch": 3.768996960486322,
      "grad_norm": 0.0734616294503212,
      "learning_rate": 3.1056208006736634e-05,
      "loss": 0.0016,
      "step": 1240
    },
    {
      "epoch": 3.7993920972644375,
      "grad_norm": 0.06645731627941132,
      "learning_rate": 2.9610653331151216e-05,
      "loss": 0.0016,
      "step": 1250
    },
    {
      "epoch": 3.829787234042553,
      "grad_norm": 0.24428494274616241,
      "learning_rate": 2.8193681125334393e-05,
      "loss": 0.002,
      "step": 1260
    },
    {
      "epoch": 3.860182370820669,
      "grad_norm": 0.22978048026561737,
      "learning_rate": 2.6805866768475663e-05,
      "loss": 0.0017,
      "step": 1270
    },
    {
      "epoch": 3.8905775075987843,
      "grad_norm": 0.1229507103562355,
      "learning_rate": 2.5447773799857244e-05,
      "loss": 0.0018,
      "step": 1280
    },
    {
      "epoch": 3.9209726443768997,
      "grad_norm": 0.2323068082332611,
      "learning_rate": 2.4119953690022025e-05,
      "loss": 0.0017,
      "step": 1290
    },
    {
      "epoch": 3.951367781155015,
      "grad_norm": 0.23123906552791595,
      "learning_rate": 2.2822945616841963e-05,
      "loss": 0.002,
      "step": 1300
    },
    {
      "epoch": 3.9817629179331306,
      "grad_norm": 0.10087133944034576,
      "learning_rate": 2.1557276246578307e-05,
      "loss": 0.0018,
      "step": 1310
    },
    {
      "epoch": 4.0121580547112465,
      "grad_norm": 0.058749064803123474,
      "learning_rate": 2.032345952002218e-05,
      "loss": 0.0011,
      "step": 1320
    },
    {
      "epoch": 4.042553191489362,
      "grad_norm": 0.15570074319839478,
      "learning_rate": 1.9121996443802482e-05,
      "loss": 0.004,
      "step": 1330
    },
    {
      "epoch": 4.072948328267477,
      "grad_norm": 0.2478976845741272,
      "learning_rate": 1.7953374886946006e-05,
      "loss": 0.0036,
      "step": 1340
    },
    {
      "epoch": 4.103343465045593,
      "grad_norm": 0.08812441676855087,
      "learning_rate": 1.681806938277205e-05,
      "loss": 0.0015,
      "step": 1350
    },
    {
      "epoch": 4.133738601823708,
      "grad_norm": 0.0631280392408371,
      "learning_rate": 1.5716540936202363e-05,
      "loss": 0.0013,
      "step": 1360
    },
    {
      "epoch": 4.164133738601824,
      "grad_norm": 0.061599250882864,
      "learning_rate": 1.4649236836564263e-05,
      "loss": 0.0078,
      "step": 1370
    },
    {
      "epoch": 4.194528875379939,
      "grad_norm": 0.04393768310546875,
      "learning_rate": 1.361659047596332e-05,
      "loss": 0.0021,
      "step": 1380
    },
    {
      "epoch": 4.224924012158055,
      "grad_norm": 0.0476597435772419,
      "learning_rate": 1.2619021173299051e-05,
      "loss": 0.0014,
      "step": 1390
    },
    {
      "epoch": 4.25531914893617,
      "grad_norm": 0.07678095251321793,
      "learning_rate": 1.1656934003995302e-05,
      "loss": 0.0027,
      "step": 1400
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 0.1129755899310112,
      "learning_rate": 1.0730719635514296e-05,
      "loss": 0.0053,
      "step": 1410
    },
    {
      "epoch": 4.316109422492401,
      "grad_norm": 0.13822700083255768,
      "learning_rate": 9.840754168721289e-06,
      "loss": 0.002,
      "step": 1420
    },
    {
      "epoch": 4.3465045592705165,
      "grad_norm": 0.10752403736114502,
      "learning_rate": 8.987398985164108e-06,
      "loss": 0.0024,
      "step": 1430
    },
    {
      "epoch": 4.376899696048632,
      "grad_norm": 0.06812784075737,
      "learning_rate": 8.171000600329682e-06,
      "loss": 0.001,
      "step": 1440
    },
    {
      "epoch": 4.407294832826747,
      "grad_norm": 0.20128777623176575,
      "learning_rate": 7.391890522937139e-06,
      "loss": 0.0015,
      "step": 1450
    },
    {
      "epoch": 4.437689969604863,
      "grad_norm": 0.0681811273097992,
      "learning_rate": 6.6503851203245205e-06,
      "loss": 0.0073,
      "step": 1460
    },
    {
      "epoch": 4.468085106382979,
      "grad_norm": 0.04031313210725784,
      "learning_rate": 5.946785489983941e-06,
      "loss": 0.001,
      "step": 1470
    },
    {
      "epoch": 4.498480243161095,
      "grad_norm": 0.16159360110759735,
      "learning_rate": 5.2813773372971995e-06,
      "loss": 0.002,
      "step": 1480
    },
    {
      "epoch": 4.52887537993921,
      "grad_norm": 0.051200274378061295,
      "learning_rate": 4.654430859521519e-06,
      "loss": 0.0008,
      "step": 1490
    },
    {
      "epoch": 4.5592705167173255,
      "grad_norm": 0.05586516112089157,
      "learning_rate": 4.066200636072604e-06,
      "loss": 0.0024,
      "step": 1500
    },
    {
      "epoch": 4.589665653495441,
      "grad_norm": 0.22293174266815186,
      "learning_rate": 3.5169255251495283e-06,
      "loss": 0.0012,
      "step": 1510
    },
    {
      "epoch": 4.620060790273556,
      "grad_norm": 0.06470826268196106,
      "learning_rate": 3.006828566743358e-06,
      "loss": 0.0009,
      "step": 1520
    },
    {
      "epoch": 4.650455927051672,
      "grad_norm": 0.11062423884868622,
      "learning_rate": 2.536116892069007e-06,
      "loss": 0.0017,
      "step": 1530
    },
    {
      "epoch": 4.680851063829787,
      "grad_norm": 0.08456968516111374,
      "learning_rate": 2.1049816394570486e-06,
      "loss": 0.0006,
      "step": 1540
    },
    {
      "epoch": 4.711246200607903,
      "grad_norm": 0.0969931110739708,
      "learning_rate": 1.7135978767395588e-06,
      "loss": 0.0037,
      "step": 1550
    },
    {
      "epoch": 4.741641337386018,
      "grad_norm": 0.0655050203204155,
      "learning_rate": 1.3621245301617014e-06,
      "loss": 0.001,
      "step": 1560
    },
    {
      "epoch": 4.772036474164134,
      "grad_norm": 0.038389451801776886,
      "learning_rate": 1.0507043198477617e-06,
      "loss": 0.0007,
      "step": 1570
    },
    {
      "epoch": 4.802431610942249,
      "grad_norm": 0.027961203828454018,
      "learning_rate": 7.794637018477824e-07,
      "loss": 0.0013,
      "step": 1580
    },
    {
      "epoch": 4.832826747720365,
      "grad_norm": 0.24096284806728363,
      "learning_rate": 5.485128167885933e-07,
      "loss": 0.003,
      "step": 1590
    },
    {
      "epoch": 4.86322188449848,
      "grad_norm": 0.031791090965270996,
      "learning_rate": 3.579454451498099e-07,
      "loss": 0.0023,
      "step": 1600
    },
    {
      "epoch": 4.8936170212765955,
      "grad_norm": 0.028634727001190186,
      "learning_rate": 2.0783896918310508e-07,
      "loss": 0.001,
      "step": 1610
    },
    {
      "epoch": 4.924012158054711,
      "grad_norm": 0.11568836122751236,
      "learning_rate": 9.82543414901782e-08,
      "loss": 0.0038,
      "step": 1620
    },
    {
      "epoch": 4.954407294832826,
      "grad_norm": 0.13896380364894867,
      "learning_rate": 2.9236060272186395e-08,
      "loss": 0.005,
      "step": 1630
    },
    {
      "epoch": 4.984802431610943,
      "grad_norm": 0.02793799713253975,
      "learning_rate": 8.121512607317528e-10,
      "loss": 0.0016,
      "step": 1640
    },
    {
      "epoch": 4.990881458966565,
      "step": 1642,
      "total_flos": 5.715020951091302e+16,
      "train_loss": 0.018644058636483826,
      "train_runtime": 735.9075,
      "train_samples_per_second": 35.7,
      "train_steps_per_second": 2.231
    }
  ],
  "logging_steps": 10,
  "max_steps": 1642,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.715020951091302e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}