{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.1493757012485974,
"eval_steps": 100,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03071993856012288,
"grad_norm": 1362.820556640625,
"learning_rate": 0.0001999995200527669,
"loss": 9.71,
"step": 10
},
{
"epoch": 0.06143987712024576,
"grad_norm": 770.7006225585938,
"learning_rate": 0.00019995200907733468,
"loss": 8.4271,
"step": 20
},
{
"epoch": 0.09215981568036864,
"grad_norm": 1022.9829711914062,
"learning_rate": 0.00019980808237191178,
"loss": 7.8192,
"step": 30
},
{
"epoch": 0.12287975424049152,
"grad_norm": 1215.0618896484375,
"learning_rate": 0.00019956835802723916,
"loss": 7.6393,
"step": 40
},
{
"epoch": 0.1535996928006144,
"grad_norm": 1212.7867431640625,
"learning_rate": 0.0001992330661351665,
"loss": 7.6322,
"step": 50
},
{
"epoch": 0.18431963136073728,
"grad_norm": 1856.4029541015625,
"learning_rate": 0.00019880252851503915,
"loss": 7.5722,
"step": 60
},
{
"epoch": 0.21503956992086015,
"grad_norm": 1200.9031982421875,
"learning_rate": 0.0001982771584048096,
"loss": 7.519,
"step": 70
},
{
"epoch": 0.24575950848098305,
"grad_norm": 1107.1383056640625,
"learning_rate": 0.00019765746006440455,
"loss": 7.3991,
"step": 80
},
{
"epoch": 0.27647944704110594,
"grad_norm": 1291.4737548828125,
"learning_rate": 0.00019694402829172663,
"loss": 7.3051,
"step": 90
},
{
"epoch": 0.3071993856012288,
"grad_norm": 1314.5531005859375,
"learning_rate": 0.0001961375478517564,
"loss": 7.246,
"step": 100
},
{
"epoch": 0.3071993856012288,
"eval_loss": 7.195280075073242,
"eval_runtime": 8.8206,
"eval_samples_per_second": 56.685,
"eval_steps_per_second": 9.523,
"step": 100
},
{
"epoch": 0.3379193241613517,
"grad_norm": 1382.552734375,
"learning_rate": 0.00019523879281930235,
"loss": 7.2223,
"step": 110
},
{
"epoch": 0.36863926272147457,
"grad_norm": 1070.0693359375,
"learning_rate": 0.00019424862583602965,
"loss": 7.148,
"step": 120
},
{
"epoch": 0.3993592012815974,
"grad_norm": 755.69580078125,
"learning_rate": 0.00019316799728248075,
"loss": 7.1237,
"step": 130
},
{
"epoch": 0.4300791398417203,
"grad_norm": 954.8897705078125,
"learning_rate": 0.00019199794436588243,
"loss": 7.1325,
"step": 140
},
{
"epoch": 0.4607990784018432,
"grad_norm": 774.8735961914062,
"learning_rate": 0.00019073959012461545,
"loss": 7.0651,
"step": 150
},
{
"epoch": 0.4915190169619661,
"grad_norm": 1204.67236328125,
"learning_rate": 0.00018939414235030134,
"loss": 6.9959,
"step": 160
},
{
"epoch": 0.522238955522089,
"grad_norm": 1073.4049072265625,
"learning_rate": 0.0001879628924285419,
"loss": 6.9933,
"step": 170
},
{
"epoch": 0.5529588940822119,
"grad_norm": 1143.1695556640625,
"learning_rate": 0.00018644721409942323,
"loss": 7.064,
"step": 180
},
{
"epoch": 0.5836788326423347,
"grad_norm": 1076.1048583984375,
"learning_rate": 0.00018484856213897498,
"loss": 7.0452,
"step": 190
},
{
"epoch": 0.6143987712024576,
"grad_norm": 1768.272216796875,
"learning_rate": 0.00018316847096284917,
"loss": 7.0609,
"step": 200
},
{
"epoch": 0.6143987712024576,
"eval_loss": 7.0105695724487305,
"eval_runtime": 8.8282,
"eval_samples_per_second": 56.637,
"eval_steps_per_second": 9.515,
"step": 200
},
{
"epoch": 0.6451187097625805,
"grad_norm": 1186.74951171875,
"learning_rate": 0.0001814085531535599,
"loss": 6.9933,
"step": 210
},
{
"epoch": 0.6758386483227034,
"grad_norm": 770.9393920898438,
"learning_rate": 0.00017957049791269685,
"loss": 6.9445,
"step": 220
},
{
"epoch": 0.7065585868828262,
"grad_norm": 1249.0562744140625,
"learning_rate": 0.00017765606943959833,
"loss": 6.9064,
"step": 230
},
{
"epoch": 0.7372785254429491,
"grad_norm": 1456.103271484375,
"learning_rate": 0.00017566710523804043,
"loss": 6.8975,
"step": 240
},
{
"epoch": 0.767998464003072,
"grad_norm": 602.2179565429688,
"learning_rate": 0.00017360551435256674,
"loss": 6.9077,
"step": 250
},
{
"epoch": 0.7987184025631948,
"grad_norm": 2122.6357421875,
"learning_rate": 0.00017168962077029147,
"loss": 6.8866,
"step": 260
},
{
"epoch": 0.8294383411233177,
"grad_norm": 1087.0120849609375,
"learning_rate": 0.00016949554673441534,
"loss": 6.9582,
"step": 270
},
{
"epoch": 0.8601582796834406,
"grad_norm": 713.1141967773438,
"learning_rate": 0.00016723476959036083,
"loss": 6.9267,
"step": 280
},
{
"epoch": 0.8908782182435635,
"grad_norm": 780.1596069335938,
"learning_rate": 0.0001649094592737497,
"loss": 6.8531,
"step": 290
},
{
"epoch": 0.9215981568036864,
"grad_norm": 557.41015625,
"learning_rate": 0.00016252184766033342,
"loss": 6.8226,
"step": 300
},
{
"epoch": 0.9215981568036864,
"eval_loss": 6.788844585418701,
"eval_runtime": 9.2188,
"eval_samples_per_second": 54.237,
"eval_steps_per_second": 9.112,
"step": 300
},
{
"epoch": 0.9523180953638093,
"grad_norm": 1023.5853881835938,
"learning_rate": 0.0001600742264237979,
"loss": 6.8258,
"step": 310
},
{
"epoch": 0.9830380339239322,
"grad_norm": 832.1343994140625,
"learning_rate": 0.00015756894483617267,
"loss": 6.9351,
"step": 320
},
{
"epoch": 1.0149759700480598,
"grad_norm": 673.265625,
"learning_rate": 0.0001550084075129563,
"loss": 6.8737,
"step": 330
},
{
"epoch": 1.0456959086081827,
"grad_norm": 460.1312561035156,
"learning_rate": 0.00015239507210512194,
"loss": 6.7986,
"step": 340
},
{
"epoch": 1.0764158471683056,
"grad_norm": 911.1027221679688,
"learning_rate": 0.00014973144694021876,
"loss": 6.7203,
"step": 350
},
{
"epoch": 1.1071357857284285,
"grad_norm": 615.1552734375,
"learning_rate": 0.00014702008861483266,
"loss": 6.7367,
"step": 360
},
{
"epoch": 1.1378557242885514,
"grad_norm": 1422.3477783203125,
"learning_rate": 0.00014426359954071796,
"loss": 6.8291,
"step": 370
},
{
"epoch": 1.1685756628486743,
"grad_norm": 960.8421020507812,
"learning_rate": 0.00014146462544695426,
"loss": 6.9442,
"step": 380
},
{
"epoch": 1.1992956014087972,
"grad_norm": 598.9495239257812,
"learning_rate": 0.00013862585284052714,
"loss": 6.8753,
"step": 390
},
{
"epoch": 1.23001553996892,
"grad_norm": 465.4633483886719,
"learning_rate": 0.00013575000642776893,
"loss": 6.7756,
"step": 400
},
{
"epoch": 1.23001553996892,
"eval_loss": 6.714211940765381,
"eval_runtime": 8.2864,
"eval_samples_per_second": 60.34,
"eval_steps_per_second": 10.137,
"step": 400
},
{
"epoch": 1.260735478529043,
"grad_norm": 409.80078125,
"learning_rate": 0.0001328398464991355,
"loss": 6.6953,
"step": 410
},
{
"epoch": 1.291455417089166,
"grad_norm": 537.5800170898438,
"learning_rate": 0.00012989816627982848,
"loss": 6.6806,
"step": 420
},
{
"epoch": 1.3221753556492888,
"grad_norm": 582.1690673828125,
"learning_rate": 0.00012692778924880603,
"loss": 6.6567,
"step": 430
},
{
"epoch": 1.3528952942094117,
"grad_norm": 995.1676635742188,
"learning_rate": 0.0001239315664287558,
"loss": 6.703,
"step": 440
},
{
"epoch": 1.3836152327695346,
"grad_norm": 1078.3963623046875,
"learning_rate": 0.00012091237364963071,
"loss": 6.8837,
"step": 450
},
{
"epoch": 1.4143351713296575,
"grad_norm": 1639.4901123046875,
"learning_rate": 0.00011787310878837422,
"loss": 6.9726,
"step": 460
},
{
"epoch": 1.4450551098897801,
"grad_norm": 726.534423828125,
"learning_rate": 0.00011481668898748475,
"loss": 6.9038,
"step": 470
},
{
"epoch": 1.475775048449903,
"grad_norm": 458.4363098144531,
"learning_rate": 0.00011174604785508813,
"loss": 6.7909,
"step": 480
},
{
"epoch": 1.506494987010026,
"grad_norm": 482.659912109375,
"learning_rate": 0.00010866413264920678,
"loss": 6.6934,
"step": 490
},
{
"epoch": 1.5372149255701488,
"grad_norm": 362.69256591796875,
"learning_rate": 0.00010557390144892684,
"loss": 6.6197,
"step": 500
},
{
"epoch": 1.5372149255701488,
"eval_loss": 6.591891765594482,
"eval_runtime": 8.6234,
"eval_samples_per_second": 57.982,
"eval_steps_per_second": 9.741,
"step": 500
},
{
"epoch": 1.5679348641302717,
"grad_norm": 673.8775634765625,
"learning_rate": 0.0001024783203151793,
"loss": 6.5968,
"step": 510
},
{
"epoch": 1.5986548026903946,
"grad_norm": 621.0140380859375,
"learning_rate": 9.938036044386005e-05,
"loss": 6.6061,
"step": 520
},
{
"epoch": 1.6293747412505175,
"grad_norm": 1936.8753662109375,
"learning_rate": 9.628299531402117e-05,
"loss": 6.7405,
"step": 530
},
{
"epoch": 1.6600946798106404,
"grad_norm": 871.2101440429688,
"learning_rate": 9.318919783387094e-05,
"loss": 6.8414,
"step": 540
},
{
"epoch": 1.6908146183707633,
"grad_norm": 646.2464599609375,
"learning_rate": 9.010193748732155e-05,
"loss": 6.8444,
"step": 550
},
{
"epoch": 1.721534556930886,
"grad_norm": 533.4226684570312,
"learning_rate": 8.702417748382385e-05,
"loss": 6.7516,
"step": 560
},
{
"epoch": 1.7522544954910089,
"grad_norm": 512.05322265625,
"learning_rate": 8.395887191422397e-05,
"loss": 6.6651,
"step": 570
},
{
"epoch": 1.7829744340511318,
"grad_norm": 461.73052978515625,
"learning_rate": 8.090896291537273e-05,
"loss": 6.6219,
"step": 580
},
{
"epoch": 1.8136943726112547,
"grad_norm": 524.8619995117188,
"learning_rate": 7.787737784620803e-05,
"loss": 6.6067,
"step": 590
},
{
"epoch": 1.8444143111713776,
"grad_norm": 623.9786376953125,
"learning_rate": 7.486702647802213e-05,
"loss": 6.6108,
"step": 600
},
{
"epoch": 1.8444143111713776,
"eval_loss": 6.602721691131592,
"eval_runtime": 8.5465,
"eval_samples_per_second": 58.503,
"eval_steps_per_second": 9.829,
"step": 600
},
{
"epoch": 1.8751342497315004,
"grad_norm": 1003.4849243164062,
"learning_rate": 7.188079820160904e-05,
"loss": 6.6348,
"step": 610
},
{
"epoch": 1.9058541882916233,
"grad_norm": 1812.9306640625,
"learning_rate": 6.892155925397436e-05,
"loss": 6.7396,
"step": 620
},
{
"epoch": 1.9365741268517462,
"grad_norm": 1092.574951171875,
"learning_rate": 6.59921499672677e-05,
"loss": 6.8439,
"step": 630
},
{
"epoch": 1.9672940654118691,
"grad_norm": 758.6673583984375,
"learning_rate": 6.309538204257977e-05,
"loss": 6.8437,
"step": 640
},
{
"epoch": 1.998014003971992,
"grad_norm": 714.4383544921875,
"learning_rate": 6.02340358512196e-05,
"loss": 6.8018,
"step": 650
},
{
"epoch": 2.0299519400961197,
"grad_norm": 631.6743774414062,
"learning_rate": 5.7410857766062966e-05,
"loss": 6.7339,
"step": 660
},
{
"epoch": 2.0606718786562426,
"grad_norm": 506.4139099121094,
"learning_rate": 5.4628557525532976e-05,
"loss": 6.6692,
"step": 670
},
{
"epoch": 2.0913918172163655,
"grad_norm": 593.3082275390625,
"learning_rate": 5.188980563274315e-05,
"loss": 6.6358,
"step": 680
},
{
"epoch": 2.1221117557764884,
"grad_norm": 580.3704833984375,
"learning_rate": 4.9197230792299195e-05,
"loss": 6.6278,
"step": 690
},
{
"epoch": 2.1528316943366113,
"grad_norm": 566.3848266601562,
"learning_rate": 4.6553417387219886e-05,
"loss": 6.6662,
"step": 700
},
{
"epoch": 2.1528316943366113,
"eval_loss": 6.699697971343994,
"eval_runtime": 8.6309,
"eval_samples_per_second": 57.931,
"eval_steps_per_second": 9.732,
"step": 700
},
{
"epoch": 2.183551632896734,
"grad_norm": 938.6303100585938,
"learning_rate": 4.396090299839852e-05,
"loss": 6.7142,
"step": 710
},
{
"epoch": 2.214271571456857,
"grad_norm": 713.5470581054688,
"learning_rate": 4.1422175968985955e-05,
"loss": 6.7151,
"step": 720
},
{
"epoch": 2.24499151001698,
"grad_norm": 594.885498046875,
"learning_rate": 3.8939673016032953e-05,
"loss": 6.6822,
"step": 730
},
{
"epoch": 2.275711448577103,
"grad_norm": 671.8080444335938,
"learning_rate": 3.651577689168405e-05,
"loss": 6.6504,
"step": 740
},
{
"epoch": 2.3064313871372257,
"grad_norm": 614.1011962890625,
"learning_rate": 3.415281409616844e-05,
"loss": 6.6417,
"step": 750
},
{
"epoch": 2.3371513256973486,
"grad_norm": 556.9248657226562,
"learning_rate": 3.185305264478159e-05,
"loss": 6.6225,
"step": 760
},
{
"epoch": 2.3678712642574715,
"grad_norm": 948.7615356445312,
"learning_rate": 2.9839130153161154e-05,
"loss": 6.6301,
"step": 770
},
{
"epoch": 2.3985912028175944,
"grad_norm": 673.330322265625,
"learning_rate": 2.766548066920338e-05,
"loss": 6.6598,
"step": 780
},
{
"epoch": 2.4293111413777173,
"grad_norm": 591.10400390625,
"learning_rate": 2.5561259191710407e-05,
"loss": 6.6749,
"step": 790
},
{
"epoch": 2.46003107993784,
"grad_norm": 600.33154296875,
"learning_rate": 2.3528485391286147e-05,
"loss": 6.6622,
"step": 800
},
{
"epoch": 2.46003107993784,
"eval_loss": 6.648305892944336,
"eval_runtime": 8.5752,
"eval_samples_per_second": 58.308,
"eval_steps_per_second": 9.796,
"step": 800
},
{
"epoch": 2.490751018497963,
"grad_norm": 493.4453125,
"learning_rate": 2.1569110361735677e-05,
"loss": 6.66,
"step": 810
},
{
"epoch": 2.521470957058086,
"grad_norm": 673.8823852539062,
"learning_rate": 2e-05,
"loss": 6.6283,
"step": 820
},
{
"epoch": 2.552190895618209,
"grad_norm": 502.336669921875,
"learning_rate": 2e-05,
"loss": 6.6223,
"step": 830
},
{
"epoch": 2.582910834178332,
"grad_norm": 883.548583984375,
"learning_rate": 2e-05,
"loss": 6.6159,
"step": 840
},
{
"epoch": 2.6136307727384547,
"grad_norm": 699.3168334960938,
"learning_rate": 2e-05,
"loss": 6.6334,
"step": 850
},
{
"epoch": 2.6443507112985776,
"grad_norm": 890.4816284179688,
"learning_rate": 2e-05,
"loss": 6.628,
"step": 860
},
{
"epoch": 2.6750706498587,
"grad_norm": 701.9059448242188,
"learning_rate": 2e-05,
"loss": 6.6377,
"step": 870
},
{
"epoch": 2.7057905884188234,
"grad_norm": 559.9364013671875,
"learning_rate": 2e-05,
"loss": 6.6348,
"step": 880
},
{
"epoch": 2.736510526978946,
"grad_norm": 680.9859008789062,
"learning_rate": 2e-05,
"loss": 6.6436,
"step": 890
},
{
"epoch": 2.767230465539069,
"grad_norm": 1093.75537109375,
"learning_rate": 2e-05,
"loss": 6.6431,
"step": 900
},
{
"epoch": 2.767230465539069,
"eval_loss": 6.635093688964844,
"eval_runtime": 9.2903,
"eval_samples_per_second": 53.819,
"eval_steps_per_second": 9.042,
"step": 900
},
{
"epoch": 2.7979504040991916,
"grad_norm": 742.6687622070312,
"learning_rate": 2e-05,
"loss": 6.6505,
"step": 910
},
{
"epoch": 2.828670342659315,
"grad_norm": 743.6510620117188,
"learning_rate": 2e-05,
"loss": 6.6612,
"step": 920
},
{
"epoch": 2.8593902812194374,
"grad_norm": 731.3457641601562,
"learning_rate": 2e-05,
"loss": 6.6618,
"step": 930
},
{
"epoch": 2.8901102197795603,
"grad_norm": 845.3829956054688,
"learning_rate": 2e-05,
"loss": 6.6633,
"step": 940
},
{
"epoch": 2.920830158339683,
"grad_norm": 870.3146362304688,
"learning_rate": 2e-05,
"loss": 6.681,
"step": 950
},
{
"epoch": 2.951550096899806,
"grad_norm": 1200.5750732421875,
"learning_rate": 2e-05,
"loss": 6.683,
"step": 960
},
{
"epoch": 2.982270035459929,
"grad_norm": 1079.7291259765625,
"learning_rate": 2e-05,
"loss": 6.7085,
"step": 970
},
{
"epoch": 3.0142079715840566,
"grad_norm": 1077.7926025390625,
"learning_rate": 2e-05,
"loss": 6.7156,
"step": 980
},
{
"epoch": 3.0449279101441795,
"grad_norm": 1077.4931640625,
"learning_rate": 2e-05,
"loss": 6.7211,
"step": 990
},
{
"epoch": 3.0756478487043024,
"grad_norm": 1055.1063232421875,
"learning_rate": 2e-05,
"loss": 6.7276,
"step": 1000
},
{
"epoch": 3.0756478487043024,
"eval_loss": 6.713944435119629,
"eval_runtime": 8.3473,
"eval_samples_per_second": 59.9,
"eval_steps_per_second": 10.063,
"step": 1000
},
{
"epoch": 3.1063677872644253,
"grad_norm": 1610.3421630859375,
"learning_rate": 2e-05,
"loss": 6.7381,
"step": 1010
},
{
"epoch": 3.137087725824548,
"grad_norm": 1750.0655517578125,
"learning_rate": 2e-05,
"loss": 6.7705,
"step": 1020
}
],
"logging_steps": 10,
"max_steps": 1024,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1024,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7337483099786183e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}