{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.1493757012485974,
  "eval_steps": 100,
  "global_step": 1024,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03071993856012288,
      "grad_norm": 1362.820556640625,
      "learning_rate": 0.0001999995200527669,
      "loss": 9.71,
      "step": 10
    },
    {
      "epoch": 0.06143987712024576,
      "grad_norm": 770.7006225585938,
      "learning_rate": 0.00019995200907733468,
      "loss": 8.4271,
      "step": 20
    },
    {
      "epoch": 0.09215981568036864,
      "grad_norm": 1022.9829711914062,
      "learning_rate": 0.00019980808237191178,
      "loss": 7.8192,
      "step": 30
    },
    {
      "epoch": 0.12287975424049152,
      "grad_norm": 1215.0618896484375,
      "learning_rate": 0.00019956835802723916,
      "loss": 7.6393,
      "step": 40
    },
    {
      "epoch": 0.1535996928006144,
      "grad_norm": 1212.7867431640625,
      "learning_rate": 0.0001992330661351665,
      "loss": 7.6322,
      "step": 50
    },
    {
      "epoch": 0.18431963136073728,
      "grad_norm": 1856.4029541015625,
      "learning_rate": 0.00019880252851503915,
      "loss": 7.5722,
      "step": 60
    },
    {
      "epoch": 0.21503956992086015,
      "grad_norm": 1200.9031982421875,
      "learning_rate": 0.0001982771584048096,
      "loss": 7.519,
      "step": 70
    },
    {
      "epoch": 0.24575950848098305,
      "grad_norm": 1107.1383056640625,
      "learning_rate": 0.00019765746006440455,
      "loss": 7.3991,
      "step": 80
    },
    {
      "epoch": 0.27647944704110594,
      "grad_norm": 1291.4737548828125,
      "learning_rate": 0.00019694402829172663,
      "loss": 7.3051,
      "step": 90
    },
    {
      "epoch": 0.3071993856012288,
      "grad_norm": 1314.5531005859375,
      "learning_rate": 0.0001961375478517564,
      "loss": 7.246,
      "step": 100
    },
    {
      "epoch": 0.3071993856012288,
      "eval_loss": 7.195280075073242,
      "eval_runtime": 8.8206,
      "eval_samples_per_second": 56.685,
      "eval_steps_per_second": 9.523,
      "step": 100
    },
    {
      "epoch": 0.3379193241613517,
      "grad_norm": 1382.552734375,
      "learning_rate": 0.00019523879281930235,
      "loss": 7.2223,
      "step": 110
    },
    {
      "epoch": 0.36863926272147457,
      "grad_norm": 1070.0693359375,
      "learning_rate": 0.00019424862583602965,
      "loss": 7.148,
      "step": 120
    },
    {
      "epoch": 0.3993592012815974,
      "grad_norm": 755.69580078125,
      "learning_rate": 0.00019316799728248075,
      "loss": 7.1237,
      "step": 130
    },
    {
      "epoch": 0.4300791398417203,
      "grad_norm": 954.8897705078125,
      "learning_rate": 0.00019199794436588243,
      "loss": 7.1325,
      "step": 140
    },
    {
      "epoch": 0.4607990784018432,
      "grad_norm": 774.8735961914062,
      "learning_rate": 0.00019073959012461545,
      "loss": 7.0651,
      "step": 150
    },
    {
      "epoch": 0.4915190169619661,
      "grad_norm": 1204.67236328125,
      "learning_rate": 0.00018939414235030134,
      "loss": 6.9959,
      "step": 160
    },
    {
      "epoch": 0.522238955522089,
      "grad_norm": 1073.4049072265625,
      "learning_rate": 0.0001879628924285419,
      "loss": 6.9933,
      "step": 170
    },
    {
      "epoch": 0.5529588940822119,
      "grad_norm": 1143.1695556640625,
      "learning_rate": 0.00018644721409942323,
      "loss": 7.064,
      "step": 180
    },
    {
      "epoch": 0.5836788326423347,
      "grad_norm": 1076.1048583984375,
      "learning_rate": 0.00018484856213897498,
      "loss": 7.0452,
      "step": 190
    },
    {
      "epoch": 0.6143987712024576,
      "grad_norm": 1768.272216796875,
      "learning_rate": 0.00018316847096284917,
      "loss": 7.0609,
      "step": 200
    },
    {
      "epoch": 0.6143987712024576,
      "eval_loss": 7.0105695724487305,
      "eval_runtime": 8.8282,
      "eval_samples_per_second": 56.637,
      "eval_steps_per_second": 9.515,
      "step": 200
    },
    {
      "epoch": 0.6451187097625805,
      "grad_norm": 1186.74951171875,
      "learning_rate": 0.0001814085531535599,
      "loss": 6.9933,
      "step": 210
    },
    {
      "epoch": 0.6758386483227034,
      "grad_norm": 770.9393920898438,
      "learning_rate": 0.00017957049791269685,
      "loss": 6.9445,
      "step": 220
    },
    {
      "epoch": 0.7065585868828262,
      "grad_norm": 1249.0562744140625,
      "learning_rate": 0.00017765606943959833,
      "loss": 6.9064,
      "step": 230
    },
    {
      "epoch": 0.7372785254429491,
      "grad_norm": 1456.103271484375,
      "learning_rate": 0.00017566710523804043,
      "loss": 6.8975,
      "step": 240
    },
    {
      "epoch": 0.767998464003072,
      "grad_norm": 602.2179565429688,
      "learning_rate": 0.00017360551435256674,
      "loss": 6.9077,
      "step": 250
    },
    {
      "epoch": 0.7987184025631948,
      "grad_norm": 2122.6357421875,
      "learning_rate": 0.00017168962077029147,
      "loss": 6.8866,
      "step": 260
    },
    {
      "epoch": 0.8294383411233177,
      "grad_norm": 1087.0120849609375,
      "learning_rate": 0.00016949554673441534,
      "loss": 6.9582,
      "step": 270
    },
    {
      "epoch": 0.8601582796834406,
      "grad_norm": 713.1141967773438,
      "learning_rate": 0.00016723476959036083,
      "loss": 6.9267,
      "step": 280
    },
    {
      "epoch": 0.8908782182435635,
      "grad_norm": 780.1596069335938,
      "learning_rate": 0.0001649094592737497,
      "loss": 6.8531,
      "step": 290
    },
    {
      "epoch": 0.9215981568036864,
      "grad_norm": 557.41015625,
      "learning_rate": 0.00016252184766033342,
      "loss": 6.8226,
      "step": 300
    },
    {
      "epoch": 0.9215981568036864,
      "eval_loss": 6.788844585418701,
      "eval_runtime": 9.2188,
      "eval_samples_per_second": 54.237,
      "eval_steps_per_second": 9.112,
      "step": 300
    },
    {
      "epoch": 0.9523180953638093,
      "grad_norm": 1023.5853881835938,
      "learning_rate": 0.0001600742264237979,
      "loss": 6.8258,
      "step": 310
    },
    {
      "epoch": 0.9830380339239322,
      "grad_norm": 832.1343994140625,
      "learning_rate": 0.00015756894483617267,
      "loss": 6.9351,
      "step": 320
    },
    {
      "epoch": 1.0149759700480598,
      "grad_norm": 673.265625,
      "learning_rate": 0.0001550084075129563,
      "loss": 6.8737,
      "step": 330
    },
    {
      "epoch": 1.0456959086081827,
      "grad_norm": 460.1312561035156,
      "learning_rate": 0.00015239507210512194,
      "loss": 6.7986,
      "step": 340
    },
    {
      "epoch": 1.0764158471683056,
      "grad_norm": 911.1027221679688,
      "learning_rate": 0.00014973144694021876,
      "loss": 6.7203,
      "step": 350
    },
    {
      "epoch": 1.1071357857284285,
      "grad_norm": 615.1552734375,
      "learning_rate": 0.00014702008861483266,
      "loss": 6.7367,
      "step": 360
    },
    {
      "epoch": 1.1378557242885514,
      "grad_norm": 1422.3477783203125,
      "learning_rate": 0.00014426359954071796,
      "loss": 6.8291,
      "step": 370
    },
    {
      "epoch": 1.1685756628486743,
      "grad_norm": 960.8421020507812,
      "learning_rate": 0.00014146462544695426,
      "loss": 6.9442,
      "step": 380
    },
    {
      "epoch": 1.1992956014087972,
      "grad_norm": 598.9495239257812,
      "learning_rate": 0.00013862585284052714,
      "loss": 6.8753,
      "step": 390
    },
    {
      "epoch": 1.23001553996892,
      "grad_norm": 465.4633483886719,
      "learning_rate": 0.00013575000642776893,
      "loss": 6.7756,
      "step": 400
    },
    {
      "epoch": 1.23001553996892,
      "eval_loss": 6.714211940765381,
      "eval_runtime": 8.2864,
      "eval_samples_per_second": 60.34,
      "eval_steps_per_second": 10.137,
      "step": 400
    },
    {
      "epoch": 1.260735478529043,
      "grad_norm": 409.80078125,
      "learning_rate": 0.0001328398464991355,
      "loss": 6.6953,
      "step": 410
    },
    {
      "epoch": 1.291455417089166,
      "grad_norm": 537.5800170898438,
      "learning_rate": 0.00012989816627982848,
      "loss": 6.6806,
      "step": 420
    },
    {
      "epoch": 1.3221753556492888,
      "grad_norm": 582.1690673828125,
      "learning_rate": 0.00012692778924880603,
      "loss": 6.6567,
      "step": 430
    },
    {
      "epoch": 1.3528952942094117,
      "grad_norm": 995.1676635742188,
      "learning_rate": 0.0001239315664287558,
      "loss": 6.703,
      "step": 440
    },
    {
      "epoch": 1.3836152327695346,
      "grad_norm": 1078.3963623046875,
      "learning_rate": 0.00012091237364963071,
      "loss": 6.8837,
      "step": 450
    },
    {
      "epoch": 1.4143351713296575,
      "grad_norm": 1639.4901123046875,
      "learning_rate": 0.00011787310878837422,
      "loss": 6.9726,
      "step": 460
    },
    {
      "epoch": 1.4450551098897801,
      "grad_norm": 726.534423828125,
      "learning_rate": 0.00011481668898748475,
      "loss": 6.9038,
      "step": 470
    },
    {
      "epoch": 1.475775048449903,
      "grad_norm": 458.4363098144531,
      "learning_rate": 0.00011174604785508813,
      "loss": 6.7909,
      "step": 480
    },
    {
      "epoch": 1.506494987010026,
      "grad_norm": 482.659912109375,
      "learning_rate": 0.00010866413264920678,
      "loss": 6.6934,
      "step": 490
    },
    {
      "epoch": 1.5372149255701488,
      "grad_norm": 362.69256591796875,
      "learning_rate": 0.00010557390144892684,
      "loss": 6.6197,
      "step": 500
    },
    {
      "epoch": 1.5372149255701488,
      "eval_loss": 6.591891765594482,
      "eval_runtime": 8.6234,
      "eval_samples_per_second": 57.982,
      "eval_steps_per_second": 9.741,
      "step": 500
    },
    {
      "epoch": 1.5679348641302717,
      "grad_norm": 673.8775634765625,
      "learning_rate": 0.0001024783203151793,
      "loss": 6.5968,
      "step": 510
    },
    {
      "epoch": 1.5986548026903946,
      "grad_norm": 621.0140380859375,
      "learning_rate": 9.938036044386005e-05,
      "loss": 6.6061,
      "step": 520
    },
    {
      "epoch": 1.6293747412505175,
      "grad_norm": 1936.8753662109375,
      "learning_rate": 9.628299531402117e-05,
      "loss": 6.7405,
      "step": 530
    },
    {
      "epoch": 1.6600946798106404,
      "grad_norm": 871.2101440429688,
      "learning_rate": 9.318919783387094e-05,
      "loss": 6.8414,
      "step": 540
    },
    {
      "epoch": 1.6908146183707633,
      "grad_norm": 646.2464599609375,
      "learning_rate": 9.010193748732155e-05,
      "loss": 6.8444,
      "step": 550
    },
    {
      "epoch": 1.721534556930886,
      "grad_norm": 533.4226684570312,
      "learning_rate": 8.702417748382385e-05,
      "loss": 6.7516,
      "step": 560
    },
    {
      "epoch": 1.7522544954910089,
      "grad_norm": 512.05322265625,
      "learning_rate": 8.395887191422397e-05,
      "loss": 6.6651,
      "step": 570
    },
    {
      "epoch": 1.7829744340511318,
      "grad_norm": 461.73052978515625,
      "learning_rate": 8.090896291537273e-05,
      "loss": 6.6219,
      "step": 580
    },
    {
      "epoch": 1.8136943726112547,
      "grad_norm": 524.8619995117188,
      "learning_rate": 7.787737784620803e-05,
      "loss": 6.6067,
      "step": 590
    },
    {
      "epoch": 1.8444143111713776,
      "grad_norm": 623.9786376953125,
      "learning_rate": 7.486702647802213e-05,
      "loss": 6.6108,
      "step": 600
    },
    {
      "epoch": 1.8444143111713776,
      "eval_loss": 6.602721691131592,
      "eval_runtime": 8.5465,
      "eval_samples_per_second": 58.503,
      "eval_steps_per_second": 9.829,
      "step": 600
    },
    {
      "epoch": 1.8751342497315004,
      "grad_norm": 1003.4849243164062,
      "learning_rate": 7.188079820160904e-05,
      "loss": 6.6348,
      "step": 610
    },
    {
      "epoch": 1.9058541882916233,
      "grad_norm": 1812.9306640625,
      "learning_rate": 6.892155925397436e-05,
      "loss": 6.7396,
      "step": 620
    },
    {
      "epoch": 1.9365741268517462,
      "grad_norm": 1092.574951171875,
      "learning_rate": 6.59921499672677e-05,
      "loss": 6.8439,
      "step": 630
    },
    {
      "epoch": 1.9672940654118691,
      "grad_norm": 758.6673583984375,
      "learning_rate": 6.309538204257977e-05,
      "loss": 6.8437,
      "step": 640
    },
    {
      "epoch": 1.998014003971992,
      "grad_norm": 714.4383544921875,
      "learning_rate": 6.02340358512196e-05,
      "loss": 6.8018,
      "step": 650
    },
    {
      "epoch": 2.0299519400961197,
      "grad_norm": 631.6743774414062,
      "learning_rate": 5.7410857766062966e-05,
      "loss": 6.7339,
      "step": 660
    },
    {
      "epoch": 2.0606718786562426,
      "grad_norm": 506.4139099121094,
      "learning_rate": 5.4628557525532976e-05,
      "loss": 6.6692,
      "step": 670
    },
    {
      "epoch": 2.0913918172163655,
      "grad_norm": 593.3082275390625,
      "learning_rate": 5.188980563274315e-05,
      "loss": 6.6358,
      "step": 680
    },
    {
      "epoch": 2.1221117557764884,
      "grad_norm": 580.3704833984375,
      "learning_rate": 4.9197230792299195e-05,
      "loss": 6.6278,
      "step": 690
    },
    {
      "epoch": 2.1528316943366113,
      "grad_norm": 566.3848266601562,
      "learning_rate": 4.6553417387219886e-05,
      "loss": 6.6662,
      "step": 700
    },
    {
      "epoch": 2.1528316943366113,
      "eval_loss": 6.699697971343994,
      "eval_runtime": 8.6309,
      "eval_samples_per_second": 57.931,
      "eval_steps_per_second": 9.732,
      "step": 700
    },
    {
      "epoch": 2.183551632896734,
      "grad_norm": 938.6303100585938,
      "learning_rate": 4.396090299839852e-05,
      "loss": 6.7142,
      "step": 710
    },
    {
      "epoch": 2.214271571456857,
      "grad_norm": 713.5470581054688,
      "learning_rate": 4.1422175968985955e-05,
      "loss": 6.7151,
      "step": 720
    },
    {
      "epoch": 2.24499151001698,
      "grad_norm": 594.885498046875,
      "learning_rate": 3.8939673016032953e-05,
      "loss": 6.6822,
      "step": 730
    },
    {
      "epoch": 2.275711448577103,
      "grad_norm": 671.8080444335938,
      "learning_rate": 3.651577689168405e-05,
      "loss": 6.6504,
      "step": 740
    },
    {
      "epoch": 2.3064313871372257,
      "grad_norm": 614.1011962890625,
      "learning_rate": 3.415281409616844e-05,
      "loss": 6.6417,
      "step": 750
    },
    {
      "epoch": 2.3371513256973486,
      "grad_norm": 556.9248657226562,
      "learning_rate": 3.185305264478159e-05,
      "loss": 6.6225,
      "step": 760
    },
    {
      "epoch": 2.3678712642574715,
      "grad_norm": 948.7615356445312,
      "learning_rate": 2.9839130153161154e-05,
      "loss": 6.6301,
      "step": 770
    },
    {
      "epoch": 2.3985912028175944,
      "grad_norm": 673.330322265625,
      "learning_rate": 2.766548066920338e-05,
      "loss": 6.6598,
      "step": 780
    },
    {
      "epoch": 2.4293111413777173,
      "grad_norm": 591.10400390625,
      "learning_rate": 2.5561259191710407e-05,
      "loss": 6.6749,
      "step": 790
    },
    {
      "epoch": 2.46003107993784,
      "grad_norm": 600.33154296875,
      "learning_rate": 2.3528485391286147e-05,
      "loss": 6.6622,
      "step": 800
    },
    {
      "epoch": 2.46003107993784,
      "eval_loss": 6.648305892944336,
      "eval_runtime": 8.5752,
      "eval_samples_per_second": 58.308,
      "eval_steps_per_second": 9.796,
      "step": 800
    },
    {
      "epoch": 2.490751018497963,
      "grad_norm": 493.4453125,
      "learning_rate": 2.1569110361735677e-05,
      "loss": 6.66,
      "step": 810
    },
    {
      "epoch": 2.521470957058086,
      "grad_norm": 673.8823852539062,
      "learning_rate": 2e-05,
      "loss": 6.6283,
      "step": 820
    },
    {
      "epoch": 2.552190895618209,
      "grad_norm": 502.336669921875,
      "learning_rate": 2e-05,
      "loss": 6.6223,
      "step": 830
    },
    {
      "epoch": 2.582910834178332,
      "grad_norm": 883.548583984375,
      "learning_rate": 2e-05,
      "loss": 6.6159,
      "step": 840
    },
    {
      "epoch": 2.6136307727384547,
      "grad_norm": 699.3168334960938,
      "learning_rate": 2e-05,
      "loss": 6.6334,
      "step": 850
    },
    {
      "epoch": 2.6443507112985776,
      "grad_norm": 890.4816284179688,
      "learning_rate": 2e-05,
      "loss": 6.628,
      "step": 860
    },
    {
      "epoch": 2.6750706498587,
      "grad_norm": 701.9059448242188,
      "learning_rate": 2e-05,
      "loss": 6.6377,
      "step": 870
    },
    {
      "epoch": 2.7057905884188234,
      "grad_norm": 559.9364013671875,
      "learning_rate": 2e-05,
      "loss": 6.6348,
      "step": 880
    },
    {
      "epoch": 2.736510526978946,
      "grad_norm": 680.9859008789062,
      "learning_rate": 2e-05,
      "loss": 6.6436,
      "step": 890
    },
    {
      "epoch": 2.767230465539069,
      "grad_norm": 1093.75537109375,
      "learning_rate": 2e-05,
      "loss": 6.6431,
      "step": 900
    },
    {
      "epoch": 2.767230465539069,
      "eval_loss": 6.635093688964844,
      "eval_runtime": 9.2903,
      "eval_samples_per_second": 53.819,
      "eval_steps_per_second": 9.042,
      "step": 900
    },
    {
      "epoch": 2.7979504040991916,
      "grad_norm": 742.6687622070312,
      "learning_rate": 2e-05,
      "loss": 6.6505,
      "step": 910
    },
    {
      "epoch": 2.828670342659315,
      "grad_norm": 743.6510620117188,
      "learning_rate": 2e-05,
      "loss": 6.6612,
      "step": 920
    },
    {
      "epoch": 2.8593902812194374,
      "grad_norm": 731.3457641601562,
      "learning_rate": 2e-05,
      "loss": 6.6618,
      "step": 930
    },
    {
      "epoch": 2.8901102197795603,
      "grad_norm": 845.3829956054688,
      "learning_rate": 2e-05,
      "loss": 6.6633,
      "step": 940
    },
    {
      "epoch": 2.920830158339683,
      "grad_norm": 870.3146362304688,
      "learning_rate": 2e-05,
      "loss": 6.681,
      "step": 950
    },
    {
      "epoch": 2.951550096899806,
      "grad_norm": 1200.5750732421875,
      "learning_rate": 2e-05,
      "loss": 6.683,
      "step": 960
    },
    {
      "epoch": 2.982270035459929,
      "grad_norm": 1079.7291259765625,
      "learning_rate": 2e-05,
      "loss": 6.7085,
      "step": 970
    },
    {
      "epoch": 3.0142079715840566,
      "grad_norm": 1077.7926025390625,
      "learning_rate": 2e-05,
      "loss": 6.7156,
      "step": 980
    },
    {
      "epoch": 3.0449279101441795,
      "grad_norm": 1077.4931640625,
      "learning_rate": 2e-05,
      "loss": 6.7211,
      "step": 990
    },
    {
      "epoch": 3.0756478487043024,
      "grad_norm": 1055.1063232421875,
      "learning_rate": 2e-05,
      "loss": 6.7276,
      "step": 1000
    },
    {
      "epoch": 3.0756478487043024,
      "eval_loss": 6.713944435119629,
      "eval_runtime": 8.3473,
      "eval_samples_per_second": 59.9,
      "eval_steps_per_second": 10.063,
      "step": 1000
    },
    {
      "epoch": 3.1063677872644253,
      "grad_norm": 1610.3421630859375,
      "learning_rate": 2e-05,
      "loss": 6.7381,
      "step": 1010
    },
    {
      "epoch": 3.137087725824548,
      "grad_norm": 1750.0655517578125,
      "learning_rate": 2e-05,
      "loss": 6.7705,
      "step": 1020
    }
  ],
  "logging_steps": 10,
  "max_steps": 1024,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 1024,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.7337483099786183e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}