{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.1493757012485974, "eval_steps": 100, "global_step": 1024, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03071993856012288, "grad_norm": 1362.820556640625, "learning_rate": 0.0001999995200527669, "loss": 9.71, "step": 10 }, { "epoch": 0.06143987712024576, "grad_norm": 770.7006225585938, "learning_rate": 0.00019995200907733468, "loss": 8.4271, "step": 20 }, { "epoch": 0.09215981568036864, "grad_norm": 1022.9829711914062, "learning_rate": 0.00019980808237191178, "loss": 7.8192, "step": 30 }, { "epoch": 0.12287975424049152, "grad_norm": 1215.0618896484375, "learning_rate": 0.00019956835802723916, "loss": 7.6393, "step": 40 }, { "epoch": 0.1535996928006144, "grad_norm": 1212.7867431640625, "learning_rate": 0.0001992330661351665, "loss": 7.6322, "step": 50 }, { "epoch": 0.18431963136073728, "grad_norm": 1856.4029541015625, "learning_rate": 0.00019880252851503915, "loss": 7.5722, "step": 60 }, { "epoch": 0.21503956992086015, "grad_norm": 1200.9031982421875, "learning_rate": 0.0001982771584048096, "loss": 7.519, "step": 70 }, { "epoch": 0.24575950848098305, "grad_norm": 1107.1383056640625, "learning_rate": 0.00019765746006440455, "loss": 7.3991, "step": 80 }, { "epoch": 0.27647944704110594, "grad_norm": 1291.4737548828125, "learning_rate": 0.00019694402829172663, "loss": 7.3051, "step": 90 }, { "epoch": 0.3071993856012288, "grad_norm": 1314.5531005859375, "learning_rate": 0.0001961375478517564, "loss": 7.246, "step": 100 }, { "epoch": 0.3071993856012288, "eval_loss": 7.195280075073242, "eval_runtime": 8.8206, "eval_samples_per_second": 56.685, "eval_steps_per_second": 9.523, "step": 100 }, { "epoch": 0.3379193241613517, "grad_norm": 1382.552734375, "learning_rate": 0.00019523879281930235, "loss": 7.2223, "step": 110 }, { "epoch": 0.36863926272147457, "grad_norm": 1070.0693359375, "learning_rate": 0.00019424862583602965, "loss": 7.148, "step": 120 }, { "epoch": 0.3993592012815974, "grad_norm": 755.69580078125, "learning_rate": 0.00019316799728248075, "loss": 7.1237, "step": 130 }, { "epoch": 0.4300791398417203, "grad_norm": 954.8897705078125, "learning_rate": 0.00019199794436588243, "loss": 7.1325, "step": 140 }, { "epoch": 0.4607990784018432, "grad_norm": 774.8735961914062, "learning_rate": 0.00019073959012461545, "loss": 7.0651, "step": 150 }, { "epoch": 0.4915190169619661, "grad_norm": 1204.67236328125, "learning_rate": 0.00018939414235030134, "loss": 6.9959, "step": 160 }, { "epoch": 0.522238955522089, "grad_norm": 1073.4049072265625, "learning_rate": 0.0001879628924285419, "loss": 6.9933, "step": 170 }, { "epoch": 0.5529588940822119, "grad_norm": 1143.1695556640625, "learning_rate": 0.00018644721409942323, "loss": 7.064, "step": 180 }, { "epoch": 0.5836788326423347, "grad_norm": 1076.1048583984375, "learning_rate": 0.00018484856213897498, "loss": 7.0452, "step": 190 }, { "epoch": 0.6143987712024576, "grad_norm": 1768.272216796875, "learning_rate": 0.00018316847096284917, "loss": 7.0609, "step": 200 }, { "epoch": 0.6143987712024576, "eval_loss": 7.0105695724487305, "eval_runtime": 8.8282, "eval_samples_per_second": 56.637, "eval_steps_per_second": 9.515, "step": 200 }, { "epoch": 0.6451187097625805, "grad_norm": 1186.74951171875, "learning_rate": 0.0001814085531535599, "loss": 6.9933, "step": 210 }, { "epoch": 0.6758386483227034, "grad_norm": 770.9393920898438, "learning_rate": 0.00017957049791269685, "loss": 6.9445, "step": 220 }, { "epoch": 0.7065585868828262, "grad_norm": 1249.0562744140625, "learning_rate": 0.00017765606943959833, "loss": 6.9064, "step": 230 }, { "epoch": 0.7372785254429491, "grad_norm": 1456.103271484375, "learning_rate": 0.00017566710523804043, "loss": 6.8975, "step": 240 }, { "epoch": 0.767998464003072, "grad_norm": 602.2179565429688, "learning_rate": 0.00017360551435256674, "loss": 6.9077, "step": 250 }, { "epoch": 0.7987184025631948, "grad_norm": 2122.6357421875, "learning_rate": 0.00017168962077029147, "loss": 6.8866, "step": 260 }, { "epoch": 0.8294383411233177, "grad_norm": 1087.0120849609375, "learning_rate": 0.00016949554673441534, "loss": 6.9582, "step": 270 }, { "epoch": 0.8601582796834406, "grad_norm": 713.1141967773438, "learning_rate": 0.00016723476959036083, "loss": 6.9267, "step": 280 }, { "epoch": 0.8908782182435635, "grad_norm": 780.1596069335938, "learning_rate": 0.0001649094592737497, "loss": 6.8531, "step": 290 }, { "epoch": 0.9215981568036864, "grad_norm": 557.41015625, "learning_rate": 0.00016252184766033342, "loss": 6.8226, "step": 300 }, { "epoch": 0.9215981568036864, "eval_loss": 6.788844585418701, "eval_runtime": 9.2188, "eval_samples_per_second": 54.237, "eval_steps_per_second": 9.112, "step": 300 }, { "epoch": 0.9523180953638093, "grad_norm": 1023.5853881835938, "learning_rate": 0.0001600742264237979, "loss": 6.8258, "step": 310 }, { "epoch": 0.9830380339239322, "grad_norm": 832.1343994140625, "learning_rate": 0.00015756894483617267, "loss": 6.9351, "step": 320 }, { "epoch": 1.0149759700480598, "grad_norm": 673.265625, "learning_rate": 0.0001550084075129563, "loss": 6.8737, "step": 330 }, { "epoch": 1.0456959086081827, "grad_norm": 460.1312561035156, "learning_rate": 0.00015239507210512194, "loss": 6.7986, "step": 340 }, { "epoch": 1.0764158471683056, "grad_norm": 911.1027221679688, "learning_rate": 0.00014973144694021876, "loss": 6.7203, "step": 350 }, { "epoch": 1.1071357857284285, "grad_norm": 615.1552734375, "learning_rate": 0.00014702008861483266, "loss": 6.7367, "step": 360 }, { "epoch": 1.1378557242885514, "grad_norm": 1422.3477783203125, "learning_rate": 0.00014426359954071796, "loss": 6.8291, "step": 370 }, { "epoch": 1.1685756628486743, "grad_norm": 960.8421020507812, "learning_rate": 0.00014146462544695426, "loss": 6.9442, "step": 380 }, { "epoch": 1.1992956014087972, "grad_norm": 598.9495239257812, "learning_rate": 0.00013862585284052714, "loss": 6.8753, "step": 390 }, { "epoch": 1.23001553996892, "grad_norm": 465.4633483886719, "learning_rate": 0.00013575000642776893, "loss": 6.7756, "step": 400 }, { "epoch": 1.23001553996892, "eval_loss": 6.714211940765381, "eval_runtime": 8.2864, "eval_samples_per_second": 60.34, "eval_steps_per_second": 10.137, "step": 400 }, { "epoch": 1.260735478529043, "grad_norm": 409.80078125, "learning_rate": 0.0001328398464991355, "loss": 6.6953, "step": 410 }, { "epoch": 1.291455417089166, "grad_norm": 537.5800170898438, "learning_rate": 0.00012989816627982848, "loss": 6.6806, "step": 420 }, { "epoch": 1.3221753556492888, "grad_norm": 582.1690673828125, "learning_rate": 0.00012692778924880603, "loss": 6.6567, "step": 430 }, { "epoch": 1.3528952942094117, "grad_norm": 995.1676635742188, "learning_rate": 0.0001239315664287558, "loss": 6.703, "step": 440 }, { "epoch": 1.3836152327695346, "grad_norm": 1078.3963623046875, "learning_rate": 0.00012091237364963071, "loss": 6.8837, "step": 450 }, { "epoch": 1.4143351713296575, "grad_norm": 1639.4901123046875, "learning_rate": 0.00011787310878837422, "loss": 6.9726, "step": 460 }, { "epoch": 1.4450551098897801, "grad_norm": 726.534423828125, "learning_rate": 0.00011481668898748475, "loss": 6.9038, "step": 470 }, { "epoch": 1.475775048449903, "grad_norm": 458.4363098144531, "learning_rate": 0.00011174604785508813, "loss": 6.7909, "step": 480 }, { "epoch": 1.506494987010026, "grad_norm": 482.659912109375, "learning_rate": 0.00010866413264920678, "loss": 6.6934, "step": 490 }, { "epoch": 1.5372149255701488, "grad_norm": 362.69256591796875, "learning_rate": 0.00010557390144892684, "loss": 6.6197, "step": 500 }, { "epoch": 1.5372149255701488, "eval_loss": 6.591891765594482, "eval_runtime": 8.6234, "eval_samples_per_second": 57.982, "eval_steps_per_second": 9.741, "step": 500 }, { "epoch": 1.5679348641302717, "grad_norm": 673.8775634765625, "learning_rate": 0.0001024783203151793, "loss": 6.5968, "step": 510 }, { "epoch": 1.5986548026903946, "grad_norm": 621.0140380859375, "learning_rate": 9.938036044386005e-05, "loss": 6.6061, "step": 520 }, { "epoch": 1.6293747412505175, "grad_norm": 1936.8753662109375, "learning_rate": 9.628299531402117e-05, "loss": 6.7405, "step": 530 }, { "epoch": 1.6600946798106404, "grad_norm": 871.2101440429688, "learning_rate": 9.318919783387094e-05, "loss": 6.8414, "step": 540 }, { "epoch": 1.6908146183707633, "grad_norm": 646.2464599609375, "learning_rate": 9.010193748732155e-05, "loss": 6.8444, "step": 550 }, { "epoch": 1.721534556930886, "grad_norm": 533.4226684570312, "learning_rate": 8.702417748382385e-05, "loss": 6.7516, "step": 560 }, { "epoch": 1.7522544954910089, "grad_norm": 512.05322265625, "learning_rate": 8.395887191422397e-05, "loss": 6.6651, "step": 570 }, { "epoch": 1.7829744340511318, "grad_norm": 461.73052978515625, "learning_rate": 8.090896291537273e-05, "loss": 6.6219, "step": 580 }, { "epoch": 1.8136943726112547, "grad_norm": 524.8619995117188, "learning_rate": 7.787737784620803e-05, "loss": 6.6067, "step": 590 }, { "epoch": 1.8444143111713776, "grad_norm": 623.9786376953125, "learning_rate": 7.486702647802213e-05, "loss": 6.6108, "step": 600 }, { "epoch": 1.8444143111713776, "eval_loss": 6.602721691131592, "eval_runtime": 8.5465, "eval_samples_per_second": 58.503, "eval_steps_per_second": 9.829, "step": 600 }, { "epoch": 1.8751342497315004, "grad_norm": 1003.4849243164062, "learning_rate": 7.188079820160904e-05, "loss": 6.6348, "step": 610 }, { "epoch": 1.9058541882916233, "grad_norm": 1812.9306640625, "learning_rate": 6.892155925397436e-05, "loss": 6.7396, "step": 620 }, { "epoch": 1.9365741268517462, "grad_norm": 1092.574951171875, "learning_rate": 6.59921499672677e-05, "loss": 6.8439, "step": 630 }, { "epoch": 1.9672940654118691, "grad_norm": 758.6673583984375, "learning_rate": 6.309538204257977e-05, "loss": 6.8437, "step": 640 }, { "epoch": 1.998014003971992, "grad_norm": 714.4383544921875, "learning_rate": 6.02340358512196e-05, "loss": 6.8018, "step": 650 }, { "epoch": 2.0299519400961197, "grad_norm": 631.6743774414062, "learning_rate": 5.7410857766062966e-05, "loss": 6.7339, "step": 660 }, { "epoch": 2.0606718786562426, "grad_norm": 506.4139099121094, "learning_rate": 5.4628557525532976e-05, "loss": 6.6692, "step": 670 }, { "epoch": 2.0913918172163655, "grad_norm": 593.3082275390625, "learning_rate": 5.188980563274315e-05, "loss": 6.6358, "step": 680 }, { "epoch": 2.1221117557764884, "grad_norm": 580.3704833984375, "learning_rate": 4.9197230792299195e-05, "loss": 6.6278, "step": 690 }, { "epoch": 2.1528316943366113, "grad_norm": 566.3848266601562, "learning_rate": 4.6553417387219886e-05, "loss": 6.6662, "step": 700 }, { "epoch": 2.1528316943366113, "eval_loss": 6.699697971343994, "eval_runtime": 8.6309, "eval_samples_per_second": 57.931, "eval_steps_per_second": 9.732, "step": 700 }, { "epoch": 2.183551632896734, "grad_norm": 938.6303100585938, "learning_rate": 4.396090299839852e-05, "loss": 6.7142, "step": 710 }, { "epoch": 2.214271571456857, "grad_norm": 713.5470581054688, "learning_rate": 4.1422175968985955e-05, "loss": 6.7151, "step": 720 }, { "epoch": 2.24499151001698, "grad_norm": 594.885498046875, "learning_rate": 3.8939673016032953e-05, "loss": 6.6822, "step": 730 }, { "epoch": 2.275711448577103, "grad_norm": 671.8080444335938, "learning_rate": 3.651577689168405e-05, "loss": 6.6504, "step": 740 }, { "epoch": 2.3064313871372257, "grad_norm": 614.1011962890625, "learning_rate": 3.415281409616844e-05, "loss": 6.6417, "step": 750 }, { "epoch": 2.3371513256973486, "grad_norm": 556.9248657226562, "learning_rate": 3.185305264478159e-05, "loss": 6.6225, "step": 760 }, { "epoch": 2.3678712642574715, "grad_norm": 948.7615356445312, "learning_rate": 2.9839130153161154e-05, "loss": 6.6301, "step": 770 }, { "epoch": 2.3985912028175944, "grad_norm": 673.330322265625, "learning_rate": 2.766548066920338e-05, "loss": 6.6598, "step": 780 }, { "epoch": 2.4293111413777173, "grad_norm": 591.10400390625, "learning_rate": 2.5561259191710407e-05, "loss": 6.6749, "step": 790 }, { "epoch": 2.46003107993784, "grad_norm": 600.33154296875, "learning_rate": 2.3528485391286147e-05, "loss": 6.6622, "step": 800 }, { "epoch": 2.46003107993784, "eval_loss": 6.648305892944336, "eval_runtime": 8.5752, "eval_samples_per_second": 58.308, "eval_steps_per_second": 9.796, "step": 800 }, { "epoch": 2.490751018497963, "grad_norm": 493.4453125, "learning_rate": 2.1569110361735677e-05, "loss": 6.66, "step": 810 }, { "epoch": 2.521470957058086, "grad_norm": 673.8823852539062, "learning_rate": 2e-05, "loss": 6.6283, "step": 820 }, { "epoch": 2.552190895618209, "grad_norm": 502.336669921875, "learning_rate": 2e-05, "loss": 6.6223, "step": 830 }, { "epoch": 2.582910834178332, "grad_norm": 883.548583984375, "learning_rate": 2e-05, "loss": 6.6159, "step": 840 }, { "epoch": 2.6136307727384547, "grad_norm": 699.3168334960938, "learning_rate": 2e-05, "loss": 6.6334, "step": 850 }, { "epoch": 2.6443507112985776, "grad_norm": 890.4816284179688, "learning_rate": 2e-05, "loss": 6.628, "step": 860 }, { "epoch": 2.6750706498587, "grad_norm": 701.9059448242188, "learning_rate": 2e-05, "loss": 6.6377, "step": 870 }, { "epoch": 2.7057905884188234, "grad_norm": 559.9364013671875, "learning_rate": 2e-05, "loss": 6.6348, "step": 880 }, { "epoch": 2.736510526978946, "grad_norm": 680.9859008789062, "learning_rate": 2e-05, "loss": 6.6436, "step": 890 }, { "epoch": 2.767230465539069, "grad_norm": 1093.75537109375, "learning_rate": 2e-05, "loss": 6.6431, "step": 900 }, { "epoch": 2.767230465539069, "eval_loss": 6.635093688964844, "eval_runtime": 9.2903, "eval_samples_per_second": 53.819, "eval_steps_per_second": 9.042, "step": 900 }, { "epoch": 2.7979504040991916, "grad_norm": 742.6687622070312, "learning_rate": 2e-05, "loss": 6.6505, "step": 910 }, { "epoch": 2.828670342659315, "grad_norm": 743.6510620117188, "learning_rate": 2e-05, "loss": 6.6612, "step": 920 }, { "epoch": 2.8593902812194374, "grad_norm": 731.3457641601562, "learning_rate": 2e-05, "loss": 6.6618, "step": 930 }, { "epoch": 2.8901102197795603, "grad_norm": 845.3829956054688, "learning_rate": 2e-05, "loss": 6.6633, "step": 940 }, { "epoch": 2.920830158339683, "grad_norm": 870.3146362304688, "learning_rate": 2e-05, "loss": 6.681, "step": 950 }, { "epoch": 2.951550096899806, "grad_norm": 1200.5750732421875, "learning_rate": 2e-05, "loss": 6.683, "step": 960 }, { "epoch": 2.982270035459929, "grad_norm": 1079.7291259765625, "learning_rate": 2e-05, "loss": 6.7085, "step": 970 }, { "epoch": 3.0142079715840566, "grad_norm": 1077.7926025390625, "learning_rate": 2e-05, "loss": 6.7156, "step": 980 }, { "epoch": 3.0449279101441795, "grad_norm": 1077.4931640625, "learning_rate": 2e-05, "loss": 6.7211, "step": 990 }, { "epoch": 3.0756478487043024, "grad_norm": 1055.1063232421875, "learning_rate": 2e-05, "loss": 6.7276, "step": 1000 }, { "epoch": 3.0756478487043024, "eval_loss": 6.713944435119629, "eval_runtime": 8.3473, "eval_samples_per_second": 59.9, "eval_steps_per_second": 10.063, "step": 1000 }, { "epoch": 3.1063677872644253, "grad_norm": 1610.3421630859375, "learning_rate": 2e-05, "loss": 6.7381, "step": 1010 }, { "epoch": 3.137087725824548, "grad_norm": 1750.0655517578125, "learning_rate": 2e-05, "loss": 6.7705, "step": 1020 } ], "logging_steps": 10, "max_steps": 1024, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7337483099786183e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }