ecapa-tdnn-voxceleb1-c512-aam / trainer_state.json
{
"best_metric": 0.9757901815736382,
"best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/ecapa-tdnn/voxceleb1/pretrain/c512-aam-len3-bs256-lr5e-4/checkpoint-3450",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 5750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.034782608695652174,
"grad_norm": 6.155358791351318,
"learning_rate": 1.739130434782609e-05,
"loss": 13.2026,
"step": 20
},
{
"epoch": 0.06956521739130435,
"grad_norm": 5.816741943359375,
"learning_rate": 3.478260869565218e-05,
"loss": 13.1252,
"step": 40
},
{
"epoch": 0.10434782608695652,
"grad_norm": 5.273156642913818,
"learning_rate": 5.2173913043478256e-05,
"loss": 13.0001,
"step": 60
},
{
"epoch": 0.1391304347826087,
"grad_norm": 4.86655330657959,
"learning_rate": 6.956521739130436e-05,
"loss": 12.8639,
"step": 80
},
{
"epoch": 0.17391304347826086,
"grad_norm": 4.438321113586426,
"learning_rate": 8.695652173913044e-05,
"loss": 12.7376,
"step": 100
},
{
"epoch": 0.20869565217391303,
"grad_norm": 4.164404392242432,
"learning_rate": 0.00010434782608695651,
"loss": 12.5722,
"step": 120
},
{
"epoch": 0.24347826086956523,
"grad_norm": 3.858990430831909,
"learning_rate": 0.00012173913043478261,
"loss": 12.4229,
"step": 140
},
{
"epoch": 0.2782608695652174,
"grad_norm": 3.6574394702911377,
"learning_rate": 0.0001391304347826087,
"loss": 12.2581,
"step": 160
},
{
"epoch": 0.3130434782608696,
"grad_norm": 3.3787951469421387,
"learning_rate": 0.0001565217391304348,
"loss": 12.0753,
"step": 180
},
{
"epoch": 0.34782608695652173,
"grad_norm": 3.323820114135742,
"learning_rate": 0.00017391304347826088,
"loss": 11.9261,
"step": 200
},
{
"epoch": 0.3826086956521739,
"grad_norm": 3.247619152069092,
"learning_rate": 0.00019130434782608697,
"loss": 11.7417,
"step": 220
},
{
"epoch": 0.41739130434782606,
"grad_norm": 3.2254152297973633,
"learning_rate": 0.00020869565217391303,
"loss": 11.5771,
"step": 240
},
{
"epoch": 0.45217391304347826,
"grad_norm": 3.1803464889526367,
"learning_rate": 0.00022608695652173914,
"loss": 11.3969,
"step": 260
},
{
"epoch": 0.48695652173913045,
"grad_norm": 3.41034197807312,
"learning_rate": 0.00024347826086956522,
"loss": 11.2684,
"step": 280
},
{
"epoch": 0.5217391304347826,
"grad_norm": 3.246403217315674,
"learning_rate": 0.0002608695652173913,
"loss": 11.0744,
"step": 300
},
{
"epoch": 0.5565217391304348,
"grad_norm": 3.202021360397339,
"learning_rate": 0.0002782608695652174,
"loss": 10.8929,
"step": 320
},
{
"epoch": 0.591304347826087,
"grad_norm": 3.1231367588043213,
"learning_rate": 0.0002956521739130435,
"loss": 10.7468,
"step": 340
},
{
"epoch": 0.6260869565217392,
"grad_norm": 3.1820390224456787,
"learning_rate": 0.0003130434782608696,
"loss": 10.606,
"step": 360
},
{
"epoch": 0.6608695652173913,
"grad_norm": 3.2470555305480957,
"learning_rate": 0.0003304347826086956,
"loss": 10.4871,
"step": 380
},
{
"epoch": 0.6956521739130435,
"grad_norm": 3.2452709674835205,
"learning_rate": 0.00034782608695652176,
"loss": 10.2836,
"step": 400
},
{
"epoch": 0.7304347826086957,
"grad_norm": 3.203894853591919,
"learning_rate": 0.00036521739130434785,
"loss": 10.1154,
"step": 420
},
{
"epoch": 0.7652173913043478,
"grad_norm": 3.269970178604126,
"learning_rate": 0.00038260869565217393,
"loss": 9.9283,
"step": 440
},
{
"epoch": 0.8,
"grad_norm": 3.261357545852661,
"learning_rate": 0.0004,
"loss": 9.8674,
"step": 460
},
{
"epoch": 0.8347826086956521,
"grad_norm": 3.393953323364258,
"learning_rate": 0.00041739130434782605,
"loss": 9.6224,
"step": 480
},
{
"epoch": 0.8695652173913043,
"grad_norm": 3.321411609649658,
"learning_rate": 0.0004347826086956522,
"loss": 9.524,
"step": 500
},
{
"epoch": 0.9043478260869565,
"grad_norm": 3.3886823654174805,
"learning_rate": 0.0004521739130434783,
"loss": 9.384,
"step": 520
},
{
"epoch": 0.9391304347826087,
"grad_norm": 3.4735491275787354,
"learning_rate": 0.00046956521739130436,
"loss": 9.1767,
"step": 540
},
{
"epoch": 0.9739130434782609,
"grad_norm": 3.416966676712036,
"learning_rate": 0.00048695652173913045,
"loss": 9.047,
"step": 560
},
{
"epoch": 1.0,
"eval_accuracy": 0.43039677202420984,
"eval_loss": 8.366157531738281,
"eval_runtime": 42.3364,
"eval_samples_per_second": 35.123,
"eval_steps_per_second": 35.123,
"step": 575
},
{
"epoch": 1.008695652173913,
"grad_norm": 3.446899890899658,
"learning_rate": 0.0004995169082125604,
"loss": 8.8835,
"step": 580
},
{
"epoch": 1.0434782608695652,
"grad_norm": 3.5842247009277344,
"learning_rate": 0.0004975845410628019,
"loss": 8.6436,
"step": 600
},
{
"epoch": 1.0782608695652174,
"grad_norm": 3.5029306411743164,
"learning_rate": 0.0004956521739130435,
"loss": 8.4775,
"step": 620
},
{
"epoch": 1.1130434782608696,
"grad_norm": 3.5451033115386963,
"learning_rate": 0.0004937198067632851,
"loss": 8.322,
"step": 640
},
{
"epoch": 1.1478260869565218,
"grad_norm": 3.5502634048461914,
"learning_rate": 0.0004917874396135266,
"loss": 8.1264,
"step": 660
},
{
"epoch": 1.182608695652174,
"grad_norm": 3.607395648956299,
"learning_rate": 0.0004898550724637681,
"loss": 7.9905,
"step": 680
},
{
"epoch": 1.2173913043478262,
"grad_norm": 3.6438565254211426,
"learning_rate": 0.0004879227053140097,
"loss": 7.8252,
"step": 700
},
{
"epoch": 1.2521739130434781,
"grad_norm": 3.656705141067505,
"learning_rate": 0.0004859903381642512,
"loss": 7.7737,
"step": 720
},
{
"epoch": 1.2869565217391306,
"grad_norm": 3.7424328327178955,
"learning_rate": 0.0004840579710144928,
"loss": 7.5822,
"step": 740
},
{
"epoch": 1.3217391304347825,
"grad_norm": 3.673156261444092,
"learning_rate": 0.0004821256038647343,
"loss": 7.4563,
"step": 760
},
{
"epoch": 1.3565217391304347,
"grad_norm": 3.6774067878723145,
"learning_rate": 0.0004801932367149758,
"loss": 7.3379,
"step": 780
},
{
"epoch": 1.391304347826087,
"grad_norm": 3.811283826828003,
"learning_rate": 0.0004782608695652174,
"loss": 7.1559,
"step": 800
},
{
"epoch": 1.4260869565217391,
"grad_norm": 3.7899839878082275,
"learning_rate": 0.00047632850241545894,
"loss": 7.0834,
"step": 820
},
{
"epoch": 1.4608695652173913,
"grad_norm": 3.583247423171997,
"learning_rate": 0.00047439613526570047,
"loss": 6.9172,
"step": 840
},
{
"epoch": 1.4956521739130435,
"grad_norm": 3.8192331790924072,
"learning_rate": 0.00047246376811594206,
"loss": 6.7251,
"step": 860
},
{
"epoch": 1.5304347826086957,
"grad_norm": 3.8098299503326416,
"learning_rate": 0.0004705314009661836,
"loss": 6.7871,
"step": 880
},
{
"epoch": 1.5652173913043477,
"grad_norm": 3.7341325283050537,
"learning_rate": 0.0004685990338164252,
"loss": 6.6103,
"step": 900
},
{
"epoch": 1.6,
"grad_norm": 3.9190495014190674,
"learning_rate": 0.00046666666666666666,
"loss": 6.4507,
"step": 920
},
{
"epoch": 1.634782608695652,
"grad_norm": 3.9456422328948975,
"learning_rate": 0.0004647342995169082,
"loss": 6.3619,
"step": 940
},
{
"epoch": 1.6695652173913045,
"grad_norm": 3.899134874343872,
"learning_rate": 0.0004628019323671498,
"loss": 6.2957,
"step": 960
},
{
"epoch": 1.7043478260869565,
"grad_norm": 3.878810167312622,
"learning_rate": 0.0004608695652173913,
"loss": 6.1362,
"step": 980
},
{
"epoch": 1.7391304347826086,
"grad_norm": 3.9270784854888916,
"learning_rate": 0.00045893719806763285,
"loss": 5.9814,
"step": 1000
},
{
"epoch": 1.7739130434782608,
"grad_norm": 3.8247644901275635,
"learning_rate": 0.00045700483091787444,
"loss": 5.9095,
"step": 1020
},
{
"epoch": 1.808695652173913,
"grad_norm": 3.8870134353637695,
"learning_rate": 0.000455072463768116,
"loss": 5.7793,
"step": 1040
},
{
"epoch": 1.8434782608695652,
"grad_norm": 3.9533441066741943,
"learning_rate": 0.00045314009661835745,
"loss": 5.7754,
"step": 1060
},
{
"epoch": 1.8782608695652174,
"grad_norm": 3.9928998947143555,
"learning_rate": 0.00045120772946859904,
"loss": 5.5886,
"step": 1080
},
{
"epoch": 1.9130434782608696,
"grad_norm": 4.030064582824707,
"learning_rate": 0.0004492753623188406,
"loss": 5.5482,
"step": 1100
},
{
"epoch": 1.9478260869565216,
"grad_norm": 3.961806297302246,
"learning_rate": 0.0004473429951690821,
"loss": 5.4807,
"step": 1120
},
{
"epoch": 1.982608695652174,
"grad_norm": 4.003119945526123,
"learning_rate": 0.0004454106280193237,
"loss": 5.3508,
"step": 1140
},
{
"epoch": 2.0,
"eval_accuracy": 0.8190988567585743,
"eval_loss": 4.025164604187012,
"eval_runtime": 42.7144,
"eval_samples_per_second": 34.813,
"eval_steps_per_second": 34.813,
"step": 1150
},
{
"epoch": 2.017391304347826,
"grad_norm": 3.958116292953491,
"learning_rate": 0.00044347826086956523,
"loss": 5.1229,
"step": 1160
},
{
"epoch": 2.0521739130434784,
"grad_norm": 3.864279270172119,
"learning_rate": 0.00044154589371980677,
"loss": 4.8146,
"step": 1180
},
{
"epoch": 2.0869565217391304,
"grad_norm": 4.045077323913574,
"learning_rate": 0.0004396135265700483,
"loss": 4.8843,
"step": 1200
},
{
"epoch": 2.121739130434783,
"grad_norm": 4.061978816986084,
"learning_rate": 0.00043768115942028983,
"loss": 4.8078,
"step": 1220
},
{
"epoch": 2.1565217391304348,
"grad_norm": 4.040159225463867,
"learning_rate": 0.0004357487922705314,
"loss": 4.6812,
"step": 1240
},
{
"epoch": 2.1913043478260867,
"grad_norm": 4.234623908996582,
"learning_rate": 0.00043381642512077296,
"loss": 4.6701,
"step": 1260
},
{
"epoch": 2.226086956521739,
"grad_norm": 4.030038356781006,
"learning_rate": 0.0004318840579710145,
"loss": 4.6221,
"step": 1280
},
{
"epoch": 2.260869565217391,
"grad_norm": 3.9954497814178467,
"learning_rate": 0.0004299516908212561,
"loss": 4.5647,
"step": 1300
},
{
"epoch": 2.2956521739130435,
"grad_norm": 4.188636779785156,
"learning_rate": 0.0004280193236714976,
"loss": 4.4502,
"step": 1320
},
{
"epoch": 2.3304347826086955,
"grad_norm": 4.185456275939941,
"learning_rate": 0.00042608695652173915,
"loss": 4.359,
"step": 1340
},
{
"epoch": 2.365217391304348,
"grad_norm": 4.123263359069824,
"learning_rate": 0.0004241545893719807,
"loss": 4.2863,
"step": 1360
},
{
"epoch": 2.4,
"grad_norm": 4.194387435913086,
"learning_rate": 0.0004222222222222222,
"loss": 4.3354,
"step": 1380
},
{
"epoch": 2.4347826086956523,
"grad_norm": 4.065763473510742,
"learning_rate": 0.00042028985507246375,
"loss": 4.2176,
"step": 1400
},
{
"epoch": 2.4695652173913043,
"grad_norm": 4.120363712310791,
"learning_rate": 0.00041835748792270534,
"loss": 4.0597,
"step": 1420
},
{
"epoch": 2.5043478260869563,
"grad_norm": 4.3197174072265625,
"learning_rate": 0.00041642512077294687,
"loss": 4.028,
"step": 1440
},
{
"epoch": 2.5391304347826087,
"grad_norm": 4.2683610916137695,
"learning_rate": 0.0004144927536231884,
"loss": 3.9833,
"step": 1460
},
{
"epoch": 2.573913043478261,
"grad_norm": 4.15448522567749,
"learning_rate": 0.00041256038647343,
"loss": 4.0065,
"step": 1480
},
{
"epoch": 2.608695652173913,
"grad_norm": 4.348177433013916,
"learning_rate": 0.0004106280193236715,
"loss": 3.8134,
"step": 1500
},
{
"epoch": 2.643478260869565,
"grad_norm": 4.100021839141846,
"learning_rate": 0.00040869565217391306,
"loss": 3.8548,
"step": 1520
},
{
"epoch": 2.6782608695652175,
"grad_norm": 4.344174385070801,
"learning_rate": 0.0004067632850241546,
"loss": 3.7814,
"step": 1540
},
{
"epoch": 2.7130434782608694,
"grad_norm": 4.240079402923584,
"learning_rate": 0.00040483091787439613,
"loss": 3.7578,
"step": 1560
},
{
"epoch": 2.747826086956522,
"grad_norm": 4.468689918518066,
"learning_rate": 0.0004028985507246377,
"loss": 3.7331,
"step": 1580
},
{
"epoch": 2.782608695652174,
"grad_norm": 4.28464937210083,
"learning_rate": 0.00040096618357487925,
"loss": 3.6396,
"step": 1600
},
{
"epoch": 2.8173913043478263,
"grad_norm": 4.166805744171143,
"learning_rate": 0.0003990338164251208,
"loss": 3.5799,
"step": 1620
},
{
"epoch": 2.8521739130434782,
"grad_norm": 4.237683296203613,
"learning_rate": 0.0003971014492753624,
"loss": 3.4734,
"step": 1640
},
{
"epoch": 2.8869565217391306,
"grad_norm": 4.153097152709961,
"learning_rate": 0.00039516908212560385,
"loss": 3.5183,
"step": 1660
},
{
"epoch": 2.9217391304347826,
"grad_norm": 4.2313947677612305,
"learning_rate": 0.0003932367149758454,
"loss": 3.3963,
"step": 1680
},
{
"epoch": 2.9565217391304346,
"grad_norm": 3.992475748062134,
"learning_rate": 0.000391304347826087,
"loss": 3.3081,
"step": 1700
},
{
"epoch": 2.991304347826087,
"grad_norm": 4.4731059074401855,
"learning_rate": 0.0003893719806763285,
"loss": 3.3124,
"step": 1720
},
{
"epoch": 3.0,
"eval_accuracy": 0.9260255548083389,
"eval_loss": 2.1082653999328613,
"eval_runtime": 22.1676,
"eval_samples_per_second": 67.08,
"eval_steps_per_second": 67.08,
"step": 1725
},
{
"epoch": 3.026086956521739,
"grad_norm": 4.272000312805176,
"learning_rate": 0.00038743961352657004,
"loss": 3.1247,
"step": 1740
},
{
"epoch": 3.0608695652173914,
"grad_norm": 4.102330207824707,
"learning_rate": 0.00038550724637681163,
"loss": 3.1064,
"step": 1760
},
{
"epoch": 3.0956521739130434,
"grad_norm": 4.381846904754639,
"learning_rate": 0.00038357487922705317,
"loss": 2.9371,
"step": 1780
},
{
"epoch": 3.130434782608696,
"grad_norm": 4.1588921546936035,
"learning_rate": 0.00038164251207729465,
"loss": 2.9355,
"step": 1800
},
{
"epoch": 3.1652173913043478,
"grad_norm": 4.279609203338623,
"learning_rate": 0.00037971014492753623,
"loss": 2.8545,
"step": 1820
},
{
"epoch": 3.2,
"grad_norm": 4.240756988525391,
"learning_rate": 0.00037777777777777777,
"loss": 2.8096,
"step": 1840
},
{
"epoch": 3.234782608695652,
"grad_norm": 4.11091947555542,
"learning_rate": 0.00037584541062801936,
"loss": 2.8138,
"step": 1860
},
{
"epoch": 3.269565217391304,
"grad_norm": 4.078794479370117,
"learning_rate": 0.0003739130434782609,
"loss": 2.7417,
"step": 1880
},
{
"epoch": 3.3043478260869565,
"grad_norm": 4.368116855621338,
"learning_rate": 0.0003719806763285024,
"loss": 2.7937,
"step": 1900
},
{
"epoch": 3.3391304347826085,
"grad_norm": 4.044319152832031,
"learning_rate": 0.000370048309178744,
"loss": 2.7361,
"step": 1920
},
{
"epoch": 3.373913043478261,
"grad_norm": 4.314040184020996,
"learning_rate": 0.0003681159420289855,
"loss": 2.7054,
"step": 1940
},
{
"epoch": 3.408695652173913,
"grad_norm": 4.185855388641357,
"learning_rate": 0.000366183574879227,
"loss": 2.6682,
"step": 1960
},
{
"epoch": 3.4434782608695653,
"grad_norm": 4.433622360229492,
"learning_rate": 0.0003642512077294686,
"loss": 2.6644,
"step": 1980
},
{
"epoch": 3.4782608695652173,
"grad_norm": 4.048947811126709,
"learning_rate": 0.00036231884057971015,
"loss": 2.618,
"step": 2000
},
{
"epoch": 3.5130434782608697,
"grad_norm": 4.145406246185303,
"learning_rate": 0.0003603864734299517,
"loss": 2.5982,
"step": 2020
},
{
"epoch": 3.5478260869565217,
"grad_norm": 4.2812910079956055,
"learning_rate": 0.00035845410628019327,
"loss": 2.6138,
"step": 2040
},
{
"epoch": 3.5826086956521737,
"grad_norm": 4.400162220001221,
"learning_rate": 0.0003565217391304348,
"loss": 2.5039,
"step": 2060
},
{
"epoch": 3.617391304347826,
"grad_norm": 4.217800617218018,
"learning_rate": 0.0003545893719806763,
"loss": 2.5249,
"step": 2080
},
{
"epoch": 3.6521739130434785,
"grad_norm": 4.076215744018555,
"learning_rate": 0.0003526570048309179,
"loss": 2.4547,
"step": 2100
},
{
"epoch": 3.6869565217391305,
"grad_norm": 4.139514446258545,
"learning_rate": 0.0003507246376811594,
"loss": 2.4315,
"step": 2120
},
{
"epoch": 3.7217391304347824,
"grad_norm": 4.118022918701172,
"learning_rate": 0.00034879227053140094,
"loss": 2.3836,
"step": 2140
},
{
"epoch": 3.756521739130435,
"grad_norm": 4.137601852416992,
"learning_rate": 0.00034685990338164253,
"loss": 2.3284,
"step": 2160
},
{
"epoch": 3.791304347826087,
"grad_norm": 4.023979663848877,
"learning_rate": 0.00034492753623188406,
"loss": 2.3095,
"step": 2180
},
{
"epoch": 3.8260869565217392,
"grad_norm": 4.042725086212158,
"learning_rate": 0.00034299516908212565,
"loss": 2.305,
"step": 2200
},
{
"epoch": 3.860869565217391,
"grad_norm": 4.265875339508057,
"learning_rate": 0.0003410628019323672,
"loss": 2.3237,
"step": 2220
},
{
"epoch": 3.8956521739130436,
"grad_norm": 4.205041408538818,
"learning_rate": 0.00033913043478260867,
"loss": 2.335,
"step": 2240
},
{
"epoch": 3.9304347826086956,
"grad_norm": 4.1344709396362305,
"learning_rate": 0.00033719806763285025,
"loss": 2.2341,
"step": 2260
},
{
"epoch": 3.965217391304348,
"grad_norm": 4.247790813446045,
"learning_rate": 0.0003352657004830918,
"loss": 2.251,
"step": 2280
},
{
"epoch": 4.0,
"grad_norm": 4.859626770019531,
"learning_rate": 0.0003333333333333333,
"loss": 2.3212,
"step": 2300
},
{
"epoch": 4.0,
"eval_accuracy": 0.9435104236718225,
"eval_loss": 1.2223739624023438,
"eval_runtime": 14.8513,
"eval_samples_per_second": 100.126,
"eval_steps_per_second": 100.126,
"step": 2300
},
{
"epoch": 4.034782608695652,
"grad_norm": 4.098020553588867,
"learning_rate": 0.0003314009661835749,
"loss": 1.9133,
"step": 2320
},
{
"epoch": 4.069565217391304,
"grad_norm": 4.198029041290283,
"learning_rate": 0.00032946859903381644,
"loss": 1.9814,
"step": 2340
},
{
"epoch": 4.104347826086957,
"grad_norm": 3.960844039916992,
"learning_rate": 0.000327536231884058,
"loss": 1.9505,
"step": 2360
},
{
"epoch": 4.139130434782609,
"grad_norm": 4.0190300941467285,
"learning_rate": 0.0003256038647342995,
"loss": 1.8815,
"step": 2380
},
{
"epoch": 4.173913043478261,
"grad_norm": 4.040708541870117,
"learning_rate": 0.00032367149758454105,
"loss": 1.8365,
"step": 2400
},
{
"epoch": 4.208695652173913,
"grad_norm": 4.077364444732666,
"learning_rate": 0.0003217391304347826,
"loss": 1.84,
"step": 2420
},
{
"epoch": 4.243478260869566,
"grad_norm": 4.267309188842773,
"learning_rate": 0.0003199033816425121,
"loss": 1.8864,
"step": 2440
},
{
"epoch": 4.278260869565218,
"grad_norm": 3.978663921356201,
"learning_rate": 0.00031797101449275363,
"loss": 1.9015,
"step": 2460
},
{
"epoch": 4.3130434782608695,
"grad_norm": 4.089256763458252,
"learning_rate": 0.0003160386473429952,
"loss": 1.8388,
"step": 2480
},
{
"epoch": 4.3478260869565215,
"grad_norm": 3.9317057132720947,
"learning_rate": 0.0003141062801932367,
"loss": 1.7845,
"step": 2500
},
{
"epoch": 4.3826086956521735,
"grad_norm": 3.9738080501556396,
"learning_rate": 0.00031217391304347823,
"loss": 1.7725,
"step": 2520
},
{
"epoch": 4.417391304347826,
"grad_norm": 4.232215881347656,
"learning_rate": 0.0003102415458937198,
"loss": 1.852,
"step": 2540
},
{
"epoch": 4.452173913043478,
"grad_norm": 4.050131797790527,
"learning_rate": 0.00030830917874396136,
"loss": 1.8234,
"step": 2560
},
{
"epoch": 4.48695652173913,
"grad_norm": 4.217935085296631,
"learning_rate": 0.0003063768115942029,
"loss": 1.8148,
"step": 2580
},
{
"epoch": 4.521739130434782,
"grad_norm": 3.9807074069976807,
"learning_rate": 0.0003044444444444445,
"loss": 1.7134,
"step": 2600
},
{
"epoch": 4.556521739130435,
"grad_norm": 4.05940580368042,
"learning_rate": 0.000302512077294686,
"loss": 1.6752,
"step": 2620
},
{
"epoch": 4.591304347826087,
"grad_norm": 4.454566955566406,
"learning_rate": 0.00030057971014492755,
"loss": 1.8413,
"step": 2640
},
{
"epoch": 4.626086956521739,
"grad_norm": 4.144088268280029,
"learning_rate": 0.0002986473429951691,
"loss": 1.7948,
"step": 2660
},
{
"epoch": 4.660869565217391,
"grad_norm": 3.940176010131836,
"learning_rate": 0.0002967149758454106,
"loss": 1.7468,
"step": 2680
},
{
"epoch": 4.695652173913043,
"grad_norm": 4.198675632476807,
"learning_rate": 0.0002948792270531401,
"loss": 1.709,
"step": 2700
},
{
"epoch": 4.730434782608696,
"grad_norm": 3.976001501083374,
"learning_rate": 0.00029294685990338167,
"loss": 1.6506,
"step": 2720
},
{
"epoch": 4.765217391304348,
"grad_norm": 4.033059120178223,
"learning_rate": 0.0002910144927536232,
"loss": 1.7042,
"step": 2740
},
{
"epoch": 4.8,
"grad_norm": 4.062041759490967,
"learning_rate": 0.0002890821256038648,
"loss": 1.6795,
"step": 2760
},
{
"epoch": 4.834782608695652,
"grad_norm": 3.988589286804199,
"learning_rate": 0.00028714975845410627,
"loss": 1.7029,
"step": 2780
},
{
"epoch": 4.869565217391305,
"grad_norm": 4.16325044631958,
"learning_rate": 0.0002852173913043478,
"loss": 1.6641,
"step": 2800
},
{
"epoch": 4.904347826086957,
"grad_norm": 4.323537349700928,
"learning_rate": 0.0002832850241545894,
"loss": 1.6953,
"step": 2820
},
{
"epoch": 4.939130434782609,
"grad_norm": 3.8293144702911377,
"learning_rate": 0.0002813526570048309,
"loss": 1.5863,
"step": 2840
},
{
"epoch": 4.973913043478261,
"grad_norm": 3.8955535888671875,
"learning_rate": 0.00027942028985507246,
"loss": 1.6276,
"step": 2860
},
{
"epoch": 5.0,
"eval_accuracy": 0.9677202420981843,
"eval_loss": 0.8229038715362549,
"eval_runtime": 88.6744,
"eval_samples_per_second": 16.769,
"eval_steps_per_second": 16.769,
"step": 2875
},
{
"epoch": 5.008695652173913,
"grad_norm": 3.8480091094970703,
"learning_rate": 0.00027748792270531405,
"loss": 1.5701,
"step": 2880
},
{
"epoch": 5.043478260869565,
"grad_norm": 3.679872512817383,
"learning_rate": 0.0002755555555555556,
"loss": 1.3786,
"step": 2900
},
{
"epoch": 5.078260869565217,
"grad_norm": 4.13381290435791,
"learning_rate": 0.00027362318840579706,
"loss": 1.3563,
"step": 2920
},
{
"epoch": 5.113043478260869,
"grad_norm": 3.7467329502105713,
"learning_rate": 0.00027169082125603865,
"loss": 1.3588,
"step": 2940
},
{
"epoch": 5.147826086956521,
"grad_norm": 3.5837419033050537,
"learning_rate": 0.0002698550724637681,
"loss": 1.3782,
"step": 2960
},
{
"epoch": 5.182608695652174,
"grad_norm": 4.077097415924072,
"learning_rate": 0.00026792270531400964,
"loss": 1.3969,
"step": 2980
},
{
"epoch": 5.217391304347826,
"grad_norm": 3.5995211601257324,
"learning_rate": 0.00026599033816425123,
"loss": 1.3346,
"step": 3000
},
{
"epoch": 5.252173913043478,
"grad_norm": 3.714010000228882,
"learning_rate": 0.00026405797101449277,
"loss": 1.3772,
"step": 3020
},
{
"epoch": 5.28695652173913,
"grad_norm": 3.807094097137451,
"learning_rate": 0.00026231884057971016,
"loss": 1.3452,
"step": 3040
},
{
"epoch": 5.321739130434783,
"grad_norm": 4.012477397918701,
"learning_rate": 0.0002603864734299517,
"loss": 1.3161,
"step": 3060
},
{
"epoch": 5.356521739130435,
"grad_norm": 3.850520372390747,
"learning_rate": 0.0002584541062801932,
"loss": 1.3146,
"step": 3080
},
{
"epoch": 5.391304347826087,
"grad_norm": NaN,
"learning_rate": 0.00025661835748792274,
"loss": 1.3057,
"step": 3100
},
{
"epoch": 5.426086956521739,
"grad_norm": 3.697744607925415,
"learning_rate": 0.0002546859903381643,
"loss": 1.2619,
"step": 3120
},
{
"epoch": 5.460869565217392,
"grad_norm": 4.125018119812012,
"learning_rate": 0.00025275362318840576,
"loss": 1.3436,
"step": 3140
},
{
"epoch": 5.495652173913044,
"grad_norm": 4.1491899490356445,
"learning_rate": 0.00025082125603864735,
"loss": 1.3289,
"step": 3160
},
{
"epoch": 5.530434782608696,
"grad_norm": 3.9294846057891846,
"learning_rate": 0.0002488888888888889,
"loss": 1.218,
"step": 3180
},
{
"epoch": 5.565217391304348,
"grad_norm": 3.9030706882476807,
"learning_rate": 0.00024695652173913047,
"loss": 1.3219,
"step": 3200
},
{
"epoch": 5.6,
"grad_norm": 4.124849319458008,
"learning_rate": 0.000245024154589372,
"loss": 1.2694,
"step": 3220
},
{
"epoch": 5.6347826086956525,
"grad_norm": 4.1668500900268555,
"learning_rate": 0.0002432850241545894,
"loss": 1.2379,
"step": 3240
},
{
"epoch": 5.6695652173913045,
"grad_norm": 4.098198890686035,
"learning_rate": 0.00024135265700483093,
"loss": 1.2892,
"step": 3260
},
{
"epoch": 5.7043478260869565,
"grad_norm": 3.690241813659668,
"learning_rate": 0.00023942028985507246,
"loss": 1.2742,
"step": 3280
},
{
"epoch": 5.739130434782608,
"grad_norm": 3.978963613510132,
"learning_rate": 0.00023748792270531402,
"loss": 1.1755,
"step": 3300
},
{
"epoch": 5.773913043478261,
"grad_norm": 3.7397215366363525,
"learning_rate": 0.00023574879227053139,
"loss": 1.2256,
"step": 3320
},
{
"epoch": 5.808695652173913,
"grad_norm": 3.9201064109802246,
"learning_rate": 0.00023391304347826088,
"loss": 1.238,
"step": 3340
},
{
"epoch": 5.843478260869565,
"grad_norm": 3.725389242172241,
"learning_rate": 0.0002319806763285024,
"loss": 1.1706,
"step": 3360
},
{
"epoch": 5.878260869565217,
"grad_norm": 3.5844123363494873,
"learning_rate": 0.00023004830917874397,
"loss": 1.1644,
"step": 3380
},
{
"epoch": 5.913043478260869,
"grad_norm": 3.79936146736145,
"learning_rate": 0.00022821256038647343,
"loss": 1.2256,
"step": 3400
},
{
"epoch": 5.947826086956522,
"grad_norm": 3.5947725772857666,
"learning_rate": 0.00022628019323671497,
"loss": 1.2488,
"step": 3420
},
{
"epoch": 5.982608695652174,
"grad_norm": NaN,
"learning_rate": 0.00022444444444444446,
"loss": 1.1418,
"step": 3440
},
{
"epoch": 6.0,
"eval_accuracy": 0.9757901815736382,
"eval_loss": 0.5840117335319519,
"eval_runtime": 97.2696,
"eval_samples_per_second": 15.287,
"eval_steps_per_second": 15.287,
"step": 3450
},
{
"epoch": 6.017391304347826,
"grad_norm": 3.5959298610687256,
"learning_rate": 0.00022260869565217392,
"loss": 1.1254,
"step": 3460
},
{
"epoch": 6.052173913043478,
"grad_norm": 3.9623775482177734,
"learning_rate": 0.00022067632850241545,
"loss": 1.0343,
"step": 3480
},
{
"epoch": 6.086956521739131,
"grad_norm": 3.735102415084839,
"learning_rate": 0.00021874396135265702,
"loss": 1.0348,
"step": 3500
},
{
"epoch": 6.121739130434783,
"grad_norm": 3.4255013465881348,
"learning_rate": 0.00021681159420289855,
"loss": 0.9796,
"step": 3520
},
{
"epoch": 6.156521739130435,
"grad_norm": 3.981841564178467,
"learning_rate": 0.00021497584541062804,
"loss": 0.9865,
"step": 3540
},
{
"epoch": 6.191304347826087,
"grad_norm": 3.9057116508483887,
"learning_rate": 0.00021314009661835748,
"loss": 1.0054,
"step": 3560
},
{
"epoch": 6.226086956521739,
"grad_norm": 3.626560688018799,
"learning_rate": 0.00021120772946859904,
"loss": 1.0012,
"step": 3580
},
{
"epoch": 6.260869565217392,
"grad_norm": 3.687683582305908,
"learning_rate": 0.0002093719806763285,
"loss": 1.0129,
"step": 3600
},
{
"epoch": 6.2956521739130435,
"grad_norm": 3.8632826805114746,
"learning_rate": 0.00020763285024154592,
"loss": 0.9333,
"step": 3620
},
{
"epoch": 6.3304347826086955,
"grad_norm": 4.089422702789307,
"learning_rate": 0.0002058937198067633,
"loss": 1.0259,
"step": 3640
},
{
"epoch": 6.3652173913043475,
"grad_norm": 4.261268615722656,
"learning_rate": 0.00020415458937198067,
"loss": 1.0184,
"step": 3660
},
{
"epoch": 6.4,
"grad_norm": 2.3901586532592773,
"learning_rate": 0.0002026086956521739,
"loss": 1.0293,
"step": 3680
},
{
"epoch": 6.434782608695652,
"grad_norm": 2.233633518218994,
"learning_rate": 0.00020067632850241546,
"loss": 1.0026,
"step": 3700
},
{
"epoch": 6.469565217391304,
"grad_norm": 2.049773693084717,
"learning_rate": 0.00019893719806763285,
"loss": 1.0426,
"step": 3720
},
{
"epoch": 6.504347826086956,
"grad_norm": 2.21939754486084,
"learning_rate": 0.0001970048309178744,
"loss": 1.0324,
"step": 3740
},
{
"epoch": 6.539130434782608,
"grad_norm": 2.2138895988464355,
"learning_rate": 0.00019516908212560387,
"loss": 1.0666,
"step": 3760
},
{
"epoch": 6.573913043478261,
"grad_norm": 1.9186855554580688,
"learning_rate": 0.0001932367149758454,
"loss": 1.0724,
"step": 3780
},
{
"epoch": 6.608695652173913,
"grad_norm": 1.302451729774475,
"learning_rate": 0.00019159420289855073,
"loss": 1.0867,
"step": 3800
},
{
"epoch": 6.643478260869565,
"grad_norm": 1.1770459413528442,
"learning_rate": 0.00018975845410628022,
"loss": 1.0659,
"step": 3820
},
{
"epoch": 6.678260869565217,
"grad_norm": 0.2651650309562683,
"learning_rate": 0.0001881159420289855,
"loss": 1.0494,
"step": 3840
},
{
"epoch": 6.71304347826087,
"grad_norm": 0.0,
"learning_rate": 0.0001867632850241546,
"loss": 1.0464,
"step": 3860
},
{
"epoch": 6.747826086956522,
"grad_norm": 0.0,
"learning_rate": 0.000185024154589372,
"loss": 1.0457,
"step": 3880
},
{
"epoch": 6.782608695652174,
"grad_norm": 0.0,
"learning_rate": 0.00018328502415458937,
"loss": 0.9815,
"step": 3900
},
{
"epoch": 6.817391304347826,
"grad_norm": 0.0,
"learning_rate": 0.0001816425120772947,
"loss": 1.0094,
"step": 3920
},
{
"epoch": 6.852173913043478,
"grad_norm": NaN,
"learning_rate": 0.00018028985507246377,
"loss": 1.0023,
"step": 3940
},
{
"epoch": 6.886956521739131,
"grad_norm": 0.0,
"learning_rate": 0.00017893719806763288,
"loss": 1.0278,
"step": 3960
},
{
"epoch": 6.921739130434783,
"grad_norm": 0.0,
"learning_rate": 0.0001771014492753623,
"loss": 1.0123,
"step": 3980
},
{
"epoch": 6.956521739130435,
"grad_norm": 0.0,
"learning_rate": 0.00017565217391304346,
"loss": 1.0774,
"step": 4000
},
{
"epoch": 6.9913043478260875,
"grad_norm": 0.0,
"learning_rate": 0.00017391304347826088,
"loss": 1.0484,
"step": 4020
},
{
"epoch": 7.0,
"eval_accuracy": 0.9737726967047747,
"eval_loss": 0.5780686736106873,
"eval_runtime": 118.8154,
"eval_samples_per_second": 12.515,
"eval_steps_per_second": 12.515,
"step": 4025
},
{
"epoch": 7.026086956521739,
"grad_norm": 0.0,
"learning_rate": 0.0001723671497584541,
"loss": 0.9799,
"step": 4040
},
{
"epoch": 7.060869565217391,
"grad_norm": 0.0,
"learning_rate": 0.00017091787439613525,
"loss": 0.9588,
"step": 4060
},
{
"epoch": 7.095652173913043,
"grad_norm": NaN,
"learning_rate": 0.00016966183574879226,
"loss": 0.9421,
"step": 4080
},
{
"epoch": 7.130434782608695,
"grad_norm": 0.0,
"learning_rate": 0.00016782608695652175,
"loss": 0.9551,
"step": 4100
},
{
"epoch": 7.165217391304348,
"grad_norm": 0.0,
"learning_rate": 0.00016618357487922704,
"loss": 0.9622,
"step": 4120
},
{
"epoch": 7.2,
"grad_norm": 0.0,
"learning_rate": 0.00016444444444444446,
"loss": 0.9712,
"step": 4140
},
{
"epoch": 7.234782608695652,
"grad_norm": 0.0,
"learning_rate": 0.00016299516908212561,
"loss": 0.9834,
"step": 4160
},
{
"epoch": 7.269565217391304,
"grad_norm": NaN,
"learning_rate": 0.00016135265700483093,
"loss": 0.9968,
"step": 4180
},
{
"epoch": 7.304347826086957,
"grad_norm": 0.0,
"learning_rate": 0.00015961352657004833,
"loss": 0.956,
"step": 4200
},
{
"epoch": 7.339130434782609,
"grad_norm": 0.0,
"learning_rate": 0.00015806763285024155,
"loss": 0.8981,
"step": 4220
},
{
"epoch": 7.373913043478261,
"grad_norm": 0.0,
"learning_rate": 0.00015642512077294684,
"loss": 0.9515,
"step": 4240
},
{
"epoch": 7.408695652173913,
"grad_norm": 0.0,
"learning_rate": 0.0001548792270531401,
"loss": 0.9535,
"step": 4260
},
{
"epoch": 7.443478260869565,
"grad_norm": NaN,
"learning_rate": 0.00015333333333333334,
"loss": 0.9646,
"step": 4280
},
{
"epoch": 7.478260869565218,
"grad_norm": 0.0,
"learning_rate": 0.00015140096618357487,
"loss": 0.9821,
"step": 4300
},
{
"epoch": 7.51304347826087,
"grad_norm": 0.0,
"learning_rate": 0.00015014492753623188,
"loss": 0.9259,
"step": 4320
},
{
"epoch": 7.547826086956522,
"grad_norm": 0.0,
"learning_rate": 0.00014869565217391303,
"loss": 0.9494,
"step": 4340
},
{
"epoch": 7.582608695652174,
"grad_norm": 0.0,
"learning_rate": 0.00014714975845410628,
"loss": 0.9305,
"step": 4360
},
{
"epoch": 7.6173913043478265,
"grad_norm": 0.0,
"learning_rate": 0.0001455072463768116,
"loss": 0.8889,
"step": 4380
},
{
"epoch": 7.6521739130434785,
"grad_norm": 0.0,
"learning_rate": 0.00014396135265700482,
"loss": 0.9524,
"step": 4400
},
{
"epoch": 7.6869565217391305,
"grad_norm": 0.0,
"learning_rate": 0.00014231884057971014,
"loss": 0.9065,
"step": 4420
},
{
"epoch": 7.721739130434782,
"grad_norm": 0.0,
"learning_rate": 0.00014048309178743963,
"loss": 0.9153,
"step": 4440
},
{
"epoch": 7.756521739130434,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.6675,
"step": 4460
},
{
"epoch": 7.791304347826087,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4480
},
{
"epoch": 7.826086956521739,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4500
},
{
"epoch": 7.860869565217391,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4520
},
{
"epoch": 7.895652173913043,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4540
},
{
"epoch": 7.930434782608696,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4560
},
{
"epoch": 7.965217391304348,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4580
},
{
"epoch": 8.0,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4600
},
{
"epoch": 8.0,
"eval_accuracy": 0.0006724949562878278,
"eval_loss": NaN,
"eval_runtime": 129.6238,
"eval_samples_per_second": 11.472,
"eval_steps_per_second": 11.472,
"step": 4600
},
{
"epoch": 8.034782608695652,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4620
},
{
"epoch": 8.069565217391304,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4640
},
{
"epoch": 8.104347826086956,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4660
},
{
"epoch": 8.139130434782608,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4680
},
{
"epoch": 8.173913043478262,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4700
},
{
"epoch": 8.208695652173914,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4720
},
{
"epoch": 8.243478260869566,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4740
},
{
"epoch": 8.278260869565218,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4760
},
{
"epoch": 8.31304347826087,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4780
},
{
"epoch": 8.347826086956522,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4800
},
{
"epoch": 8.382608695652173,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4820
},
{
"epoch": 8.417391304347825,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4840
},
{
"epoch": 8.452173913043477,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4860
},
{
"epoch": 8.486956521739131,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4880
},
{
"epoch": 8.521739130434783,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4900
},
{
"epoch": 8.556521739130435,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4920
},
{
"epoch": 8.591304347826087,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4940
},
{
"epoch": 8.626086956521739,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4960
},
{
"epoch": 8.660869565217391,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 4980
},
{
"epoch": 8.695652173913043,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5000
},
{
"epoch": 8.730434782608695,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5020
},
{
"epoch": 8.765217391304347,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5040
},
{
"epoch": 8.8,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5060
},
{
"epoch": 8.834782608695653,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5080
},
{
"epoch": 8.869565217391305,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5100
},
{
"epoch": 8.904347826086957,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5120
},
{
"epoch": 8.939130434782609,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5140
},
{
"epoch": 8.97391304347826,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5160
},
{
"epoch": 9.0,
"eval_accuracy": 0.0006724949562878278,
"eval_loss": NaN,
"eval_runtime": 117.1288,
"eval_samples_per_second": 12.695,
"eval_steps_per_second": 12.695,
"step": 5175
},
{
"epoch": 9.008695652173913,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5180
},
{
"epoch": 9.043478260869565,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5200
},
{
"epoch": 9.078260869565218,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5220
},
{
"epoch": 9.11304347826087,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5240
},
{
"epoch": 9.147826086956522,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5260
},
{
"epoch": 9.182608695652174,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5280
},
{
"epoch": 9.217391304347826,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5300
},
{
"epoch": 9.252173913043478,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5320
},
{
"epoch": 9.28695652173913,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5340
},
{
"epoch": 9.321739130434782,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5360
},
{
"epoch": 9.356521739130434,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5380
},
{
"epoch": 9.391304347826088,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5400
},
{
"epoch": 9.42608695652174,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5420
},
{
"epoch": 9.460869565217392,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5440
},
{
"epoch": 9.495652173913044,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5460
},
{
"epoch": 9.530434782608696,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5480
},
{
"epoch": 9.565217391304348,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5500
},
{
"epoch": 9.6,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5520
},
{
"epoch": 9.634782608695652,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5540
},
{
"epoch": 9.669565217391304,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5560
},
{
"epoch": 9.704347826086957,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5580
},
{
"epoch": 9.73913043478261,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5600
},
{
"epoch": 9.773913043478261,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5620
},
{
"epoch": 9.808695652173913,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5640
},
{
"epoch": 9.843478260869565,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5660
},
{
"epoch": 9.878260869565217,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5680
},
{
"epoch": 9.91304347826087,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5700
},
{
"epoch": 9.947826086956521,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5720
},
{
"epoch": 9.982608695652173,
"grad_norm": NaN,
"learning_rate": 0.0001403864734299517,
"loss": 0.0,
"step": 5740
},
{
"epoch": 10.0,
"eval_accuracy": 0.0006724949562878278,
"eval_loss": NaN,
"eval_runtime": 103.3199,
"eval_samples_per_second": 14.392,
"eval_steps_per_second": 14.392,
"step": 5750
},
{
"epoch": 10.0,
"step": 5750,
"total_flos": 2.7398100529152e+18,
"train_loss": 2.9414075751926587,
"train_runtime": 59857.6179,
"train_samples_per_second": 24.584,
"train_steps_per_second": 0.096
}
],
"logging_steps": 20,
"max_steps": 5750,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7398100529152e+18,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}
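
A minimal sketch of how this state file could be inspected offline, assuming it has been downloaded locally as "trainer_state.json" (the filename is illustrative). It prints the best checkpoint recorded above and the per-epoch eval metrics from log_history; Python's json module parses the NaN tokens the Trainer emits, so the collapsed epochs 8-10 show up as NaN eval_loss.

# sketch: summarize a Hugging Face Trainer trainer_state.json
import json
import math

with open("trainer_state.json") as f:
    state = json.load(f)  # NaN values in the file become float("nan")

print(f"best checkpoint: {state['best_model_checkpoint']}")
print(f"best metric (eval_accuracy): {state['best_metric']:.4f}")

# log_history mixes training logs (with "loss") and eval logs (with "eval_*" keys);
# keep only the eval entries, one per epoch.
for entry in state["log_history"]:
    if "eval_accuracy" in entry:
        loss = entry.get("eval_loss", float("nan"))
        loss_str = "NaN" if math.isnan(loss) else f"{loss:.4f}"
        print(f"epoch {entry['epoch']:>4}: "
              f"eval_accuracy={entry['eval_accuracy']:.4f}, eval_loss={loss_str}")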