{ "best_metric": 5.004675388336182, "best_model_checkpoint": "./models/instruct-finetuning/base/german-gpt2/checkpoint-100", "epoch": 2000.0, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 10.0, "grad_norm": 0.031428202986717224, "learning_rate": 2.9999999999999997e-05, "loss": 5.1962, "step": 10 }, { "epoch": 20.0, "grad_norm": 0.03904120624065399, "learning_rate": 5.9999999999999995e-05, "loss": 5.1962, "step": 20 }, { "epoch": 30.0, "grad_norm": 0.06079966947436333, "learning_rate": 8.999999999999999e-05, "loss": 5.1962, "step": 30 }, { "epoch": 40.0, "grad_norm": 0.10312424600124359, "learning_rate": 0.00011999999999999999, "loss": 5.1961, "step": 40 }, { "epoch": 50.0, "grad_norm": 0.16226649284362793, "learning_rate": 0.00015, "loss": 5.1659, "step": 50 }, { "epoch": 60.0, "grad_norm": 0.23943403363227844, "learning_rate": 0.00017999999999999998, "loss": 5.1257, "step": 60 }, { "epoch": 70.0, "grad_norm": 0.3223947584629059, "learning_rate": 0.00020999999999999998, "loss": 5.0529, "step": 70 }, { "epoch": 80.0, "grad_norm": 0.38690948486328125, "learning_rate": 0.00023999999999999998, "loss": 4.9391, "step": 80 }, { "epoch": 90.0, "grad_norm": 0.41154125332832336, "learning_rate": 0.00027, "loss": 4.7838, "step": 90 }, { "epoch": 100.0, "grad_norm": 0.3825448453426361, "learning_rate": 0.0003, "loss": 4.6023, "step": 100 }, { "epoch": 100.0, "eval_loss": 5.004675388336182, "eval_runtime": 0.2223, "eval_samples_per_second": 71.982, "eval_steps_per_second": 4.499, "step": 100 }, { "epoch": 110.0, "grad_norm": 0.30922797322273254, "learning_rate": 0.00029842105263157894, "loss": 4.4241, "step": 110 }, { "epoch": 120.0, "grad_norm": 0.23929822444915771, "learning_rate": 0.00029684210526315785, "loss": 4.284, "step": 120 }, { "epoch": 130.0, "grad_norm": 0.18230345845222473, "learning_rate": 0.0002952631578947368, "loss": 4.1875, "step": 130 }, { "epoch": 140.0, "grad_norm": 0.1398445963859558, "learning_rate": 0.0002936842105263158, "loss": 4.1233, "step": 140 }, { "epoch": 150.0, "grad_norm": 0.11148730665445328, "learning_rate": 0.0002921052631578947, "loss": 4.081, "step": 150 }, { "epoch": 160.0, "grad_norm": 0.08616490662097931, "learning_rate": 0.00029052631578947366, "loss": 4.0534, "step": 160 }, { "epoch": 170.0, "grad_norm": 0.07182959467172623, "learning_rate": 0.00028894736842105263, "loss": 4.0349, "step": 170 }, { "epoch": 180.0, "grad_norm": 0.05815623328089714, "learning_rate": 0.00028736842105263154, "loss": 4.0226, "step": 180 }, { "epoch": 190.0, "grad_norm": 0.04977192357182503, "learning_rate": 0.0002857894736842105, "loss": 4.0141, "step": 190 }, { "epoch": 200.0, "grad_norm": 0.04297927767038345, "learning_rate": 0.0002842105263157894, "loss": 4.0085, "step": 200 }, { "epoch": 200.0, "eval_loss": 5.356107234954834, "eval_runtime": 0.177, "eval_samples_per_second": 90.385, "eval_steps_per_second": 5.649, "step": 200 }, { "epoch": 210.0, "grad_norm": 0.03621674329042435, "learning_rate": 0.0002826315789473684, "loss": 4.0037, "step": 210 }, { "epoch": 220.0, "grad_norm": 0.03204211965203285, "learning_rate": 0.00028105263157894735, "loss": 4.0004, "step": 220 }, { "epoch": 230.0, "grad_norm": 0.028840389102697372, "learning_rate": 0.0002794736842105263, "loss": 3.9973, "step": 230 }, { "epoch": 240.0, "grad_norm": 0.0280955508351326, "learning_rate": 0.00027789473684210523, "loss": 3.9951, "step": 240 }, { "epoch": 250.0, "grad_norm": 0.023594656959176064, "learning_rate": 0.0002763157894736842, "loss": 3.9928, "step": 250 }, { "epoch": 260.0, "grad_norm": 0.022978538647294044, "learning_rate": 0.0002747368421052631, "loss": 3.9907, "step": 260 }, { "epoch": 270.0, "grad_norm": 0.022501131519675255, "learning_rate": 0.00027315789473684207, "loss": 3.9889, "step": 270 }, { "epoch": 280.0, "grad_norm": 0.019320150837302208, "learning_rate": 0.00027157894736842104, "loss": 3.9871, "step": 280 }, { "epoch": 290.0, "grad_norm": 0.019894694909453392, "learning_rate": 0.00027, "loss": 3.9856, "step": 290 }, { "epoch": 300.0, "grad_norm": 0.01797688938677311, "learning_rate": 0.0002684210526315789, "loss": 3.9836, "step": 300 }, { "epoch": 300.0, "eval_loss": 5.540426731109619, "eval_runtime": 0.1855, "eval_samples_per_second": 86.274, "eval_steps_per_second": 5.392, "step": 300 }, { "epoch": 310.0, "grad_norm": 0.018274759873747826, "learning_rate": 0.0002668421052631579, "loss": 3.9822, "step": 310 }, { "epoch": 320.0, "grad_norm": 0.017490526661276817, "learning_rate": 0.0002652631578947368, "loss": 3.9805, "step": 320 }, { "epoch": 330.0, "grad_norm": 0.01812375709414482, "learning_rate": 0.00026368421052631576, "loss": 3.9789, "step": 330 }, { "epoch": 340.0, "grad_norm": 0.018261738121509552, "learning_rate": 0.0002621052631578947, "loss": 3.9771, "step": 340 }, { "epoch": 350.0, "grad_norm": 0.017833173274993896, "learning_rate": 0.0002605263157894737, "loss": 3.9751, "step": 350 }, { "epoch": 360.0, "grad_norm": 0.018583981320261955, "learning_rate": 0.0002589473684210526, "loss": 3.9736, "step": 360 }, { "epoch": 370.0, "grad_norm": 0.01927025429904461, "learning_rate": 0.00025736842105263157, "loss": 3.972, "step": 370 }, { "epoch": 380.0, "grad_norm": 0.0189345870167017, "learning_rate": 0.0002557894736842105, "loss": 3.9699, "step": 380 }, { "epoch": 390.0, "grad_norm": 0.01912664994597435, "learning_rate": 0.00025421052631578945, "loss": 3.9676, "step": 390 }, { "epoch": 400.0, "grad_norm": 0.02028432860970497, "learning_rate": 0.00025263157894736836, "loss": 3.9653, "step": 400 }, { "epoch": 400.0, "eval_loss": 5.637810230255127, "eval_runtime": 0.1801, "eval_samples_per_second": 88.844, "eval_steps_per_second": 5.553, "step": 400 }, { "epoch": 410.0, "grad_norm": 0.02026030793786049, "learning_rate": 0.0002510526315789474, "loss": 3.9632, "step": 410 }, { "epoch": 420.0, "grad_norm": 0.02084442228078842, "learning_rate": 0.0002494736842105263, "loss": 3.9609, "step": 420 }, { "epoch": 430.0, "grad_norm": 0.021354952827095985, "learning_rate": 0.00024789473684210526, "loss": 3.9589, "step": 430 }, { "epoch": 440.0, "grad_norm": 0.021611124277114868, "learning_rate": 0.00024631578947368417, "loss": 3.9561, "step": 440 }, { "epoch": 450.0, "grad_norm": 0.02221105247735977, "learning_rate": 0.00024473684210526314, "loss": 3.9535, "step": 450 }, { "epoch": 460.0, "grad_norm": 0.02258644811809063, "learning_rate": 0.00024315789473684207, "loss": 3.9506, "step": 460 }, { "epoch": 470.0, "grad_norm": 0.023103930056095123, "learning_rate": 0.000241578947368421, "loss": 3.9479, "step": 470 }, { "epoch": 480.0, "grad_norm": 0.02367890067398548, "learning_rate": 0.00023999999999999998, "loss": 3.9446, "step": 480 }, { "epoch": 490.0, "grad_norm": 0.024778548628091812, "learning_rate": 0.00023842105263157895, "loss": 3.9417, "step": 490 }, { "epoch": 500.0, "grad_norm": 0.025496676564216614, "learning_rate": 0.00023684210526315788, "loss": 3.9384, "step": 500 }, { "epoch": 500.0, "eval_loss": 5.694141864776611, "eval_runtime": 0.1839, "eval_samples_per_second": 86.992, "eval_steps_per_second": 5.437, "step": 500 }, { "epoch": 510.0, "grad_norm": 0.025006623938679695, "learning_rate": 0.00023526315789473682, "loss": 3.9352, "step": 510 }, { "epoch": 520.0, "grad_norm": 0.025370489805936813, "learning_rate": 0.00023368421052631576, "loss": 3.9321, "step": 520 }, { "epoch": 530.0, "grad_norm": 0.025797314941883087, "learning_rate": 0.0002321052631578947, "loss": 3.9286, "step": 530 }, { "epoch": 540.0, "grad_norm": 0.025854870676994324, "learning_rate": 0.00023052631578947364, "loss": 3.9247, "step": 540 }, { "epoch": 550.0, "grad_norm": 0.026357613503932953, "learning_rate": 0.00022894736842105263, "loss": 3.9208, "step": 550 }, { "epoch": 560.0, "grad_norm": 0.026599900797009468, "learning_rate": 0.00022736842105263157, "loss": 3.9174, "step": 560 }, { "epoch": 570.0, "grad_norm": 0.026790177449584007, "learning_rate": 0.0002257894736842105, "loss": 3.9133, "step": 570 }, { "epoch": 580.0, "grad_norm": 0.027292467653751373, "learning_rate": 0.00022421052631578945, "loss": 3.91, "step": 580 }, { "epoch": 590.0, "grad_norm": 0.027395937591791153, "learning_rate": 0.0002226315789473684, "loss": 3.9059, "step": 590 }, { "epoch": 600.0, "grad_norm": 0.028047366067767143, "learning_rate": 0.00022105263157894733, "loss": 3.9018, "step": 600 }, { "epoch": 600.0, "eval_loss": 5.727851867675781, "eval_runtime": 0.1864, "eval_samples_per_second": 85.817, "eval_steps_per_second": 5.364, "step": 600 }, { "epoch": 610.0, "grad_norm": 0.028283094987273216, "learning_rate": 0.00021947368421052632, "loss": 3.8979, "step": 610 }, { "epoch": 620.0, "grad_norm": 0.028225364163517952, "learning_rate": 0.00021789473684210526, "loss": 3.8931, "step": 620 }, { "epoch": 630.0, "grad_norm": 0.028118429705500603, "learning_rate": 0.0002163157894736842, "loss": 3.8888, "step": 630 }, { "epoch": 640.0, "grad_norm": 0.028346119448542595, "learning_rate": 0.00021473684210526314, "loss": 3.8845, "step": 640 }, { "epoch": 650.0, "grad_norm": 0.028137408196926117, "learning_rate": 0.00021315789473684208, "loss": 3.8803, "step": 650 }, { "epoch": 660.0, "grad_norm": 0.028345981612801552, "learning_rate": 0.00021157894736842102, "loss": 3.876, "step": 660 }, { "epoch": 670.0, "grad_norm": 0.02837321348488331, "learning_rate": 0.00020999999999999998, "loss": 3.8713, "step": 670 }, { "epoch": 680.0, "grad_norm": 0.028911523520946503, "learning_rate": 0.00020842105263157895, "loss": 3.8666, "step": 680 }, { "epoch": 690.0, "grad_norm": 0.028772979974746704, "learning_rate": 0.0002068421052631579, "loss": 3.862, "step": 690 }, { "epoch": 700.0, "grad_norm": 0.029433540999889374, "learning_rate": 0.00020526315789473683, "loss": 3.8568, "step": 700 }, { "epoch": 700.0, "eval_loss": 5.748946189880371, "eval_runtime": 0.1868, "eval_samples_per_second": 85.653, "eval_steps_per_second": 5.353, "step": 700 }, { "epoch": 710.0, "grad_norm": 0.029892120510339737, "learning_rate": 0.00020368421052631576, "loss": 3.8524, "step": 710 }, { "epoch": 720.0, "grad_norm": 0.029888318851590157, "learning_rate": 0.0002021052631578947, "loss": 3.8475, "step": 720 }, { "epoch": 730.0, "grad_norm": 0.0297097060829401, "learning_rate": 0.00020052631578947367, "loss": 3.8423, "step": 730 }, { "epoch": 740.0, "grad_norm": 0.030762242153286934, "learning_rate": 0.0001989473684210526, "loss": 3.8369, "step": 740 }, { "epoch": 750.0, "grad_norm": 0.03135882690548897, "learning_rate": 0.00019736842105263157, "loss": 3.832, "step": 750 }, { "epoch": 760.0, "grad_norm": 0.030511273071169853, "learning_rate": 0.0001957894736842105, "loss": 3.8267, "step": 760 }, { "epoch": 770.0, "grad_norm": 0.030351830646395683, "learning_rate": 0.00019421052631578945, "loss": 3.8216, "step": 770 }, { "epoch": 780.0, "grad_norm": 0.030749695375561714, "learning_rate": 0.0001926315789473684, "loss": 3.8159, "step": 780 }, { "epoch": 790.0, "grad_norm": 0.03194636106491089, "learning_rate": 0.00019105263157894736, "loss": 3.8106, "step": 790 }, { "epoch": 800.0, "grad_norm": 0.031771063804626465, "learning_rate": 0.0001894736842105263, "loss": 3.805, "step": 800 }, { "epoch": 800.0, "eval_loss": 5.7597551345825195, "eval_runtime": 0.1841, "eval_samples_per_second": 86.921, "eval_steps_per_second": 5.433, "step": 800 }, { "epoch": 810.0, "grad_norm": 0.031563833355903625, "learning_rate": 0.00018789473684210524, "loss": 3.7993, "step": 810 }, { "epoch": 820.0, "grad_norm": 0.032348621636629105, "learning_rate": 0.0001863157894736842, "loss": 3.7937, "step": 820 }, { "epoch": 830.0, "grad_norm": 0.03224165365099907, "learning_rate": 0.00018473684210526314, "loss": 3.788, "step": 830 }, { "epoch": 840.0, "grad_norm": 0.03302668035030365, "learning_rate": 0.00018315789473684208, "loss": 3.782, "step": 840 }, { "epoch": 850.0, "grad_norm": 0.033235520124435425, "learning_rate": 0.00018157894736842105, "loss": 3.7764, "step": 850 }, { "epoch": 860.0, "grad_norm": 0.03329872339963913, "learning_rate": 0.00017999999999999998, "loss": 3.77, "step": 860 }, { "epoch": 870.0, "grad_norm": 0.033791761845350266, "learning_rate": 0.00017842105263157892, "loss": 3.7644, "step": 870 }, { "epoch": 880.0, "grad_norm": 0.03337433189153671, "learning_rate": 0.00017684210526315786, "loss": 3.7581, "step": 880 }, { "epoch": 890.0, "grad_norm": 0.035035695880651474, "learning_rate": 0.00017526315789473683, "loss": 3.7524, "step": 890 }, { "epoch": 900.0, "grad_norm": 0.034206751734018326, "learning_rate": 0.0001736842105263158, "loss": 3.7453, "step": 900 }, { "epoch": 900.0, "eval_loss": 5.7635884284973145, "eval_runtime": 0.1859, "eval_samples_per_second": 86.057, "eval_steps_per_second": 5.379, "step": 900 }, { "epoch": 910.0, "grad_norm": 0.034498460590839386, "learning_rate": 0.00017210526315789473, "loss": 3.7398, "step": 910 }, { "epoch": 920.0, "grad_norm": 0.035180237144231796, "learning_rate": 0.00017052631578947367, "loss": 3.7332, "step": 920 }, { "epoch": 930.0, "grad_norm": 0.03439970314502716, "learning_rate": 0.0001689473684210526, "loss": 3.7268, "step": 930 }, { "epoch": 940.0, "grad_norm": 0.03578351438045502, "learning_rate": 0.00016736842105263155, "loss": 3.72, "step": 940 }, { "epoch": 950.0, "grad_norm": 0.03579821437597275, "learning_rate": 0.00016578947368421052, "loss": 3.7141, "step": 950 }, { "epoch": 960.0, "grad_norm": 0.03594134375452995, "learning_rate": 0.00016421052631578948, "loss": 3.7078, "step": 960 }, { "epoch": 970.0, "grad_norm": 0.03562381863594055, "learning_rate": 0.00016263157894736842, "loss": 3.7012, "step": 970 }, { "epoch": 980.0, "grad_norm": 0.03593499958515167, "learning_rate": 0.00016105263157894736, "loss": 3.6949, "step": 980 }, { "epoch": 990.0, "grad_norm": 0.03597477078437805, "learning_rate": 0.0001594736842105263, "loss": 3.688, "step": 990 }, { "epoch": 1000.0, "grad_norm": 0.03582670912146568, "learning_rate": 0.00015789473684210524, "loss": 3.6811, "step": 1000 }, { "epoch": 1000.0, "eval_loss": 5.763302803039551, "eval_runtime": 0.1818, "eval_samples_per_second": 88.016, "eval_steps_per_second": 5.501, "step": 1000 }, { "epoch": 1010.0, "grad_norm": 0.03663730248808861, "learning_rate": 0.00015631578947368418, "loss": 3.675, "step": 1010 }, { "epoch": 1020.0, "grad_norm": 0.0365881472826004, "learning_rate": 0.00015473684210526317, "loss": 3.6687, "step": 1020 }, { "epoch": 1030.0, "grad_norm": 0.03673651069402695, "learning_rate": 0.0001531578947368421, "loss": 3.6619, "step": 1030 }, { "epoch": 1040.0, "grad_norm": 0.036763161420822144, "learning_rate": 0.00015157894736842105, "loss": 3.6556, "step": 1040 }, { "epoch": 1050.0, "grad_norm": 0.037773504853248596, "learning_rate": 0.00015, "loss": 3.6487, "step": 1050 }, { "epoch": 1060.0, "grad_norm": 0.03719266131520271, "learning_rate": 0.00014842105263157893, "loss": 3.6423, "step": 1060 }, { "epoch": 1070.0, "grad_norm": 0.037161633372306824, "learning_rate": 0.0001468421052631579, "loss": 3.6358, "step": 1070 }, { "epoch": 1080.0, "grad_norm": 0.038159944117069244, "learning_rate": 0.00014526315789473683, "loss": 3.6289, "step": 1080 }, { "epoch": 1090.0, "grad_norm": 0.03829749673604965, "learning_rate": 0.00014368421052631577, "loss": 3.6229, "step": 1090 }, { "epoch": 1100.0, "grad_norm": 0.03794450685381889, "learning_rate": 0.0001421052631578947, "loss": 3.6155, "step": 1100 }, { "epoch": 1100.0, "eval_loss": 5.762149810791016, "eval_runtime": 0.1831, "eval_samples_per_second": 87.366, "eval_steps_per_second": 5.46, "step": 1100 }, { "epoch": 1110.0, "grad_norm": 0.0385553203523159, "learning_rate": 0.00014052631578947367, "loss": 3.6099, "step": 1110 }, { "epoch": 1120.0, "grad_norm": 0.038567330688238144, "learning_rate": 0.00013894736842105261, "loss": 3.6031, "step": 1120 }, { "epoch": 1130.0, "grad_norm": 0.038120087236166, "learning_rate": 0.00013736842105263155, "loss": 3.597, "step": 1130 }, { "epoch": 1140.0, "grad_norm": 0.03771847486495972, "learning_rate": 0.00013578947368421052, "loss": 3.59, "step": 1140 }, { "epoch": 1150.0, "grad_norm": 0.03899717703461647, "learning_rate": 0.00013421052631578946, "loss": 3.5841, "step": 1150 }, { "epoch": 1160.0, "grad_norm": 0.03935745730996132, "learning_rate": 0.0001326315789473684, "loss": 3.5777, "step": 1160 }, { "epoch": 1170.0, "grad_norm": 0.03850308433175087, "learning_rate": 0.00013105263157894736, "loss": 3.5711, "step": 1170 }, { "epoch": 1180.0, "grad_norm": 0.03847126662731171, "learning_rate": 0.0001294736842105263, "loss": 3.5648, "step": 1180 }, { "epoch": 1190.0, "grad_norm": 0.03784647956490517, "learning_rate": 0.00012789473684210524, "loss": 3.5596, "step": 1190 }, { "epoch": 1200.0, "grad_norm": 0.038580793887376785, "learning_rate": 0.00012631578947368418, "loss": 3.5529, "step": 1200 }, { "epoch": 1200.0, "eval_loss": 5.760702610015869, "eval_runtime": 0.186, "eval_samples_per_second": 86.019, "eval_steps_per_second": 5.376, "step": 1200 }, { "epoch": 1210.0, "grad_norm": 0.03951945900917053, "learning_rate": 0.00012473684210526315, "loss": 3.5471, "step": 1210 }, { "epoch": 1220.0, "grad_norm": 0.039367739111185074, "learning_rate": 0.00012315789473684208, "loss": 3.5404, "step": 1220 }, { "epoch": 1230.0, "grad_norm": 0.039361272007226944, "learning_rate": 0.00012157894736842104, "loss": 3.5349, "step": 1230 }, { "epoch": 1240.0, "grad_norm": 0.03928719088435173, "learning_rate": 0.00011999999999999999, "loss": 3.5282, "step": 1240 }, { "epoch": 1250.0, "grad_norm": 0.038380853831768036, "learning_rate": 0.00011842105263157894, "loss": 3.5224, "step": 1250 }, { "epoch": 1260.0, "grad_norm": 0.0398189015686512, "learning_rate": 0.00011684210526315788, "loss": 3.5166, "step": 1260 }, { "epoch": 1270.0, "grad_norm": 0.039852969348430634, "learning_rate": 0.00011526315789473682, "loss": 3.5111, "step": 1270 }, { "epoch": 1280.0, "grad_norm": 0.03926192224025726, "learning_rate": 0.00011368421052631579, "loss": 3.505, "step": 1280 }, { "epoch": 1290.0, "grad_norm": 0.03940172120928764, "learning_rate": 0.00011210526315789472, "loss": 3.4998, "step": 1290 }, { "epoch": 1300.0, "grad_norm": 0.04009696841239929, "learning_rate": 0.00011052631578947366, "loss": 3.4937, "step": 1300 }, { "epoch": 1300.0, "eval_loss": 5.759952068328857, "eval_runtime": 0.1865, "eval_samples_per_second": 85.806, "eval_steps_per_second": 5.363, "step": 1300 }, { "epoch": 1310.0, "grad_norm": 0.04028191417455673, "learning_rate": 0.00010894736842105263, "loss": 3.4881, "step": 1310 }, { "epoch": 1320.0, "grad_norm": 0.03911704197525978, "learning_rate": 0.00010736842105263157, "loss": 3.4829, "step": 1320 }, { "epoch": 1330.0, "grad_norm": 0.04032404348254204, "learning_rate": 0.00010578947368421051, "loss": 3.4779, "step": 1330 }, { "epoch": 1340.0, "grad_norm": 0.04010001942515373, "learning_rate": 0.00010421052631578947, "loss": 3.4719, "step": 1340 }, { "epoch": 1350.0, "grad_norm": 0.039343658834695816, "learning_rate": 0.00010263157894736841, "loss": 3.4671, "step": 1350 }, { "epoch": 1360.0, "grad_norm": 0.03940987586975098, "learning_rate": 0.00010105263157894735, "loss": 3.4615, "step": 1360 }, { "epoch": 1370.0, "grad_norm": 0.03980929031968117, "learning_rate": 9.94736842105263e-05, "loss": 3.4558, "step": 1370 }, { "epoch": 1380.0, "grad_norm": 0.04070553928613663, "learning_rate": 9.789473684210526e-05, "loss": 3.4507, "step": 1380 }, { "epoch": 1390.0, "grad_norm": 0.03986859321594238, "learning_rate": 9.63157894736842e-05, "loss": 3.4461, "step": 1390 }, { "epoch": 1400.0, "grad_norm": 0.04015074670314789, "learning_rate": 9.473684210526315e-05, "loss": 3.4413, "step": 1400 }, { "epoch": 1400.0, "eval_loss": 5.759387016296387, "eval_runtime": 0.1824, "eval_samples_per_second": 87.704, "eval_steps_per_second": 5.481, "step": 1400 }, { "epoch": 1410.0, "grad_norm": 0.04038800671696663, "learning_rate": 9.31578947368421e-05, "loss": 3.4369, "step": 1410 }, { "epoch": 1420.0, "grad_norm": 0.040515296161174774, "learning_rate": 9.157894736842104e-05, "loss": 3.4314, "step": 1420 }, { "epoch": 1430.0, "grad_norm": 0.03986503183841705, "learning_rate": 8.999999999999999e-05, "loss": 3.4281, "step": 1430 }, { "epoch": 1440.0, "grad_norm": 0.039644401520490646, "learning_rate": 8.842105263157893e-05, "loss": 3.4224, "step": 1440 }, { "epoch": 1450.0, "grad_norm": 0.041090868413448334, "learning_rate": 8.68421052631579e-05, "loss": 3.4178, "step": 1450 }, { "epoch": 1460.0, "grad_norm": 0.039826828986406326, "learning_rate": 8.526315789473684e-05, "loss": 3.4133, "step": 1460 }, { "epoch": 1470.0, "grad_norm": 0.03991613909602165, "learning_rate": 8.368421052631578e-05, "loss": 3.4084, "step": 1470 }, { "epoch": 1480.0, "grad_norm": 0.040304556488990784, "learning_rate": 8.210526315789474e-05, "loss": 3.4041, "step": 1480 }, { "epoch": 1490.0, "grad_norm": 0.04031972959637642, "learning_rate": 8.052631578947368e-05, "loss": 3.4001, "step": 1490 }, { "epoch": 1500.0, "grad_norm": 0.03925486281514168, "learning_rate": 7.894736842105262e-05, "loss": 3.3959, "step": 1500 }, { "epoch": 1500.0, "eval_loss": 5.76047420501709, "eval_runtime": 0.1839, "eval_samples_per_second": 87.009, "eval_steps_per_second": 5.438, "step": 1500 }, { "epoch": 1510.0, "grad_norm": 0.039462652057409286, "learning_rate": 7.736842105263159e-05, "loss": 3.3922, "step": 1510 }, { "epoch": 1520.0, "grad_norm": 0.039901990443468094, "learning_rate": 7.578947368421052e-05, "loss": 3.3877, "step": 1520 }, { "epoch": 1530.0, "grad_norm": 0.03943758085370064, "learning_rate": 7.421052631578946e-05, "loss": 3.3843, "step": 1530 }, { "epoch": 1540.0, "grad_norm": 0.03951801732182503, "learning_rate": 7.263157894736842e-05, "loss": 3.3793, "step": 1540 }, { "epoch": 1550.0, "grad_norm": 0.03950580582022667, "learning_rate": 7.105263157894735e-05, "loss": 3.3767, "step": 1550 }, { "epoch": 1560.0, "grad_norm": 0.039812780916690826, "learning_rate": 6.947368421052631e-05, "loss": 3.3727, "step": 1560 }, { "epoch": 1570.0, "grad_norm": 0.0408426858484745, "learning_rate": 6.789473684210526e-05, "loss": 3.3689, "step": 1570 }, { "epoch": 1580.0, "grad_norm": 0.040794286876916885, "learning_rate": 6.63157894736842e-05, "loss": 3.3659, "step": 1580 }, { "epoch": 1590.0, "grad_norm": 0.04000876471400261, "learning_rate": 6.473684210526315e-05, "loss": 3.3622, "step": 1590 }, { "epoch": 1600.0, "grad_norm": 0.03949282690882683, "learning_rate": 6.315789473684209e-05, "loss": 3.3593, "step": 1600 }, { "epoch": 1600.0, "eval_loss": 5.761292934417725, "eval_runtime": 0.1828, "eval_samples_per_second": 87.518, "eval_steps_per_second": 5.47, "step": 1600 }, { "epoch": 1610.0, "grad_norm": 0.039206769317388535, "learning_rate": 6.157894736842104e-05, "loss": 3.3561, "step": 1610 }, { "epoch": 1620.0, "grad_norm": 0.040203340351581573, "learning_rate": 5.9999999999999995e-05, "loss": 3.3521, "step": 1620 }, { "epoch": 1630.0, "grad_norm": 0.04115165024995804, "learning_rate": 5.842105263157894e-05, "loss": 3.3493, "step": 1630 }, { "epoch": 1640.0, "grad_norm": 0.0398443304002285, "learning_rate": 5.684210526315789e-05, "loss": 3.3467, "step": 1640 }, { "epoch": 1650.0, "grad_norm": 0.0401863157749176, "learning_rate": 5.526315789473683e-05, "loss": 3.3445, "step": 1650 }, { "epoch": 1660.0, "grad_norm": 0.04017779976129532, "learning_rate": 5.3684210526315784e-05, "loss": 3.3407, "step": 1660 }, { "epoch": 1670.0, "grad_norm": 0.04098346456885338, "learning_rate": 5.210526315789474e-05, "loss": 3.3381, "step": 1670 }, { "epoch": 1680.0, "grad_norm": 0.04120390862226486, "learning_rate": 5.0526315789473676e-05, "loss": 3.3351, "step": 1680 }, { "epoch": 1690.0, "grad_norm": 0.03920115903019905, "learning_rate": 4.894736842105263e-05, "loss": 3.3325, "step": 1690 }, { "epoch": 1700.0, "grad_norm": 0.040873102843761444, "learning_rate": 4.7368421052631574e-05, "loss": 3.3306, "step": 1700 }, { "epoch": 1700.0, "eval_loss": 5.762476444244385, "eval_runtime": 0.1821, "eval_samples_per_second": 87.857, "eval_steps_per_second": 5.491, "step": 1700 }, { "epoch": 1710.0, "grad_norm": 0.04099594056606293, "learning_rate": 4.578947368421052e-05, "loss": 3.3278, "step": 1710 }, { "epoch": 1720.0, "grad_norm": 0.03981698304414749, "learning_rate": 4.4210526315789466e-05, "loss": 3.3252, "step": 1720 }, { "epoch": 1730.0, "grad_norm": 0.040660880506038666, "learning_rate": 4.263157894736842e-05, "loss": 3.3232, "step": 1730 }, { "epoch": 1740.0, "grad_norm": 0.03986099362373352, "learning_rate": 4.105263157894737e-05, "loss": 3.3206, "step": 1740 }, { "epoch": 1750.0, "grad_norm": 0.040134310722351074, "learning_rate": 3.947368421052631e-05, "loss": 3.3187, "step": 1750 }, { "epoch": 1760.0, "grad_norm": 0.03983493149280548, "learning_rate": 3.789473684210526e-05, "loss": 3.3169, "step": 1760 }, { "epoch": 1770.0, "grad_norm": 0.04003679007291794, "learning_rate": 3.631578947368421e-05, "loss": 3.3149, "step": 1770 }, { "epoch": 1780.0, "grad_norm": 0.03963625803589821, "learning_rate": 3.4736842105263153e-05, "loss": 3.3135, "step": 1780 }, { "epoch": 1790.0, "grad_norm": 0.039849694818258286, "learning_rate": 3.31578947368421e-05, "loss": 3.3102, "step": 1790 }, { "epoch": 1800.0, "grad_norm": 0.04026848450303078, "learning_rate": 3.1578947368421045e-05, "loss": 3.3091, "step": 1800 }, { "epoch": 1800.0, "eval_loss": 5.764220237731934, "eval_runtime": 0.187, "eval_samples_per_second": 85.545, "eval_steps_per_second": 5.347, "step": 1800 }, { "epoch": 1810.0, "grad_norm": 0.04031796008348465, "learning_rate": 2.9999999999999997e-05, "loss": 3.3076, "step": 1810 }, { "epoch": 1820.0, "grad_norm": 0.03976697474718094, "learning_rate": 2.8421052631578946e-05, "loss": 3.3061, "step": 1820 }, { "epoch": 1830.0, "grad_norm": 0.040501099079847336, "learning_rate": 2.6842105263157892e-05, "loss": 3.3045, "step": 1830 }, { "epoch": 1840.0, "grad_norm": 0.04013768583536148, "learning_rate": 2.5263157894736838e-05, "loss": 3.3031, "step": 1840 }, { "epoch": 1850.0, "grad_norm": 0.041588522493839264, "learning_rate": 2.3684210526315787e-05, "loss": 3.3022, "step": 1850 }, { "epoch": 1860.0, "grad_norm": 0.04206200689077377, "learning_rate": 2.2105263157894733e-05, "loss": 3.302, "step": 1860 }, { "epoch": 1870.0, "grad_norm": 0.040176983922719955, "learning_rate": 2.0526315789473685e-05, "loss": 3.2997, "step": 1870 }, { "epoch": 1880.0, "grad_norm": 0.04021242633461952, "learning_rate": 1.894736842105263e-05, "loss": 3.2989, "step": 1880 }, { "epoch": 1890.0, "grad_norm": 0.0406365729868412, "learning_rate": 1.7368421052631577e-05, "loss": 3.2981, "step": 1890 }, { "epoch": 1900.0, "grad_norm": 0.040391016751527786, "learning_rate": 1.5789473684210522e-05, "loss": 3.298, "step": 1900 }, { "epoch": 1900.0, "eval_loss": 5.765007019042969, "eval_runtime": 0.1834, "eval_samples_per_second": 87.238, "eval_steps_per_second": 5.452, "step": 1900 }, { "epoch": 1910.0, "grad_norm": 0.03956448659300804, "learning_rate": 1.4210526315789473e-05, "loss": 3.2951, "step": 1910 }, { "epoch": 1920.0, "grad_norm": 0.0398997962474823, "learning_rate": 1.2631578947368419e-05, "loss": 3.2956, "step": 1920 }, { "epoch": 1930.0, "grad_norm": 0.04039820656180382, "learning_rate": 1.1052631578947366e-05, "loss": 3.2952, "step": 1930 }, { "epoch": 1940.0, "grad_norm": 0.03983141854405403, "learning_rate": 9.473684210526315e-06, "loss": 3.2932, "step": 1940 }, { "epoch": 1950.0, "grad_norm": 0.04014051333069801, "learning_rate": 7.894736842105261e-06, "loss": 3.2938, "step": 1950 }, { "epoch": 1960.0, "grad_norm": 0.04070891812443733, "learning_rate": 6.3157894736842095e-06, "loss": 3.2938, "step": 1960 }, { "epoch": 1970.0, "grad_norm": 0.04128490760922432, "learning_rate": 4.736842105263158e-06, "loss": 3.2926, "step": 1970 }, { "epoch": 1980.0, "grad_norm": 0.03932284563779831, "learning_rate": 3.1578947368421047e-06, "loss": 3.2926, "step": 1980 }, { "epoch": 1990.0, "grad_norm": 0.03982985392212868, "learning_rate": 1.5789473684210524e-06, "loss": 3.2925, "step": 1990 }, { "epoch": 2000.0, "grad_norm": 0.0400531105697155, "learning_rate": 0.0, "loss": 3.2921, "step": 2000 }, { "epoch": 2000.0, "eval_loss": 5.765518665313721, "eval_runtime": 0.1824, "eval_samples_per_second": 87.71, "eval_steps_per_second": 5.482, "step": 2000 } ], "logging_steps": 10, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 2000, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8521884106752000.0, "train_batch_size": 30, "trial_name": null, "trial_params": null }