{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.90207715133531,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02967359050445104,
"grad_norm": 1.9033336639404297,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2317,
"step": 10
},
{
"epoch": 0.05934718100890208,
"grad_norm": 1.0404284000396729,
"learning_rate": 4.000000000000001e-06,
"loss": 0.1977,
"step": 20
},
{
"epoch": 0.08902077151335312,
"grad_norm": 0.6579734683036804,
"learning_rate": 6e-06,
"loss": 0.1451,
"step": 30
},
{
"epoch": 0.11869436201780416,
"grad_norm": 0.33155006170272827,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0959,
"step": 40
},
{
"epoch": 0.14836795252225518,
"grad_norm": 0.5317391753196716,
"learning_rate": 1e-05,
"loss": 0.0828,
"step": 50
},
{
"epoch": 0.17804154302670624,
"grad_norm": 0.45179909467697144,
"learning_rate": 1.2e-05,
"loss": 0.0814,
"step": 60
},
{
"epoch": 0.20771513353115728,
"grad_norm": 0.2707938849925995,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.0562,
"step": 70
},
{
"epoch": 0.23738872403560832,
"grad_norm": 0.22402559220790863,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0594,
"step": 80
},
{
"epoch": 0.26706231454005935,
"grad_norm": 0.16533811390399933,
"learning_rate": 1.8e-05,
"loss": 0.0529,
"step": 90
},
{
"epoch": 0.29673590504451036,
"grad_norm": 0.222530797123909,
"learning_rate": 2e-05,
"loss": 0.0522,
"step": 100
},
{
"epoch": 0.3264094955489614,
"grad_norm": 0.1894129067659378,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0514,
"step": 110
},
{
"epoch": 0.3560830860534125,
"grad_norm": 0.20559543371200562,
"learning_rate": 2.4e-05,
"loss": 0.0462,
"step": 120
},
{
"epoch": 0.3857566765578635,
"grad_norm": 0.157830610871315,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.0471,
"step": 130
},
{
"epoch": 0.41543026706231456,
"grad_norm": 0.14663924276828766,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.0449,
"step": 140
},
{
"epoch": 0.44510385756676557,
"grad_norm": 0.14772620797157288,
"learning_rate": 3e-05,
"loss": 0.0424,
"step": 150
},
{
"epoch": 0.47477744807121663,
"grad_norm": 0.16058433055877686,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.0424,
"step": 160
},
{
"epoch": 0.5044510385756676,
"grad_norm": 0.15857172012329102,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.041,
"step": 170
},
{
"epoch": 0.5341246290801187,
"grad_norm": 0.17435680329799652,
"learning_rate": 3.6e-05,
"loss": 0.0408,
"step": 180
},
{
"epoch": 0.5637982195845698,
"grad_norm": 0.1439993977546692,
"learning_rate": 3.8e-05,
"loss": 0.0352,
"step": 190
},
{
"epoch": 0.5934718100890207,
"grad_norm": 0.15629075467586517,
"learning_rate": 4e-05,
"loss": 0.0383,
"step": 200
},
{
"epoch": 0.6231454005934718,
"grad_norm": 0.1610369235277176,
"learning_rate": 4.2e-05,
"loss": 0.0392,
"step": 210
},
{
"epoch": 0.6528189910979229,
"grad_norm": 0.17589861154556274,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0374,
"step": 220
},
{
"epoch": 0.6824925816023739,
"grad_norm": 0.19186066091060638,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0358,
"step": 230
},
{
"epoch": 0.712166172106825,
"grad_norm": 0.1579175740480423,
"learning_rate": 4.8e-05,
"loss": 0.0357,
"step": 240
},
{
"epoch": 0.7418397626112759,
"grad_norm": 0.17220136523246765,
"learning_rate": 5e-05,
"loss": 0.0334,
"step": 250
},
{
"epoch": 0.771513353115727,
"grad_norm": 0.18591266870498657,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.0315,
"step": 260
},
{
"epoch": 0.8011869436201781,
"grad_norm": 0.2341579794883728,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.0375,
"step": 270
},
{
"epoch": 0.8308605341246291,
"grad_norm": 0.15227168798446655,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.031,
"step": 280
},
{
"epoch": 0.8605341246290801,
"grad_norm": 0.1876339167356491,
"learning_rate": 5.8e-05,
"loss": 0.0371,
"step": 290
},
{
"epoch": 0.8902077151335311,
"grad_norm": 0.1789393573999405,
"learning_rate": 6e-05,
"loss": 0.0313,
"step": 300
},
{
"epoch": 0.9198813056379822,
"grad_norm": 0.1678636074066162,
"learning_rate": 6.2e-05,
"loss": 0.0349,
"step": 310
},
{
"epoch": 0.9495548961424333,
"grad_norm": 0.17457032203674316,
"learning_rate": 6.400000000000001e-05,
"loss": 0.0296,
"step": 320
},
{
"epoch": 0.9792284866468842,
"grad_norm": 0.14290577173233032,
"learning_rate": 6.6e-05,
"loss": 0.0308,
"step": 330
},
{
"epoch": 1.0089020771513353,
"grad_norm": 0.23601128160953522,
"learning_rate": 6.800000000000001e-05,
"loss": 0.0298,
"step": 340
},
{
"epoch": 1.0385756676557865,
"grad_norm": 0.14039042592048645,
"learning_rate": 7e-05,
"loss": 0.0262,
"step": 350
},
{
"epoch": 1.0682492581602374,
"grad_norm": 0.1804966777563095,
"learning_rate": 7.2e-05,
"loss": 0.0284,
"step": 360
},
{
"epoch": 1.0979228486646884,
"grad_norm": 0.22986947000026703,
"learning_rate": 7.4e-05,
"loss": 0.0308,
"step": 370
},
{
"epoch": 1.1275964391691395,
"grad_norm": 0.20188020169734955,
"learning_rate": 7.6e-05,
"loss": 0.0261,
"step": 380
},
{
"epoch": 1.1572700296735905,
"grad_norm": 0.14067409932613373,
"learning_rate": 7.800000000000001e-05,
"loss": 0.028,
"step": 390
},
{
"epoch": 1.1869436201780414,
"grad_norm": 0.16516339778900146,
"learning_rate": 8e-05,
"loss": 0.0247,
"step": 400
},
{
"epoch": 1.2166172106824926,
"grad_norm": 0.19918474555015564,
"learning_rate": 8.2e-05,
"loss": 0.0301,
"step": 410
},
{
"epoch": 1.2462908011869436,
"grad_norm": 0.1878385990858078,
"learning_rate": 8.4e-05,
"loss": 0.0251,
"step": 420
},
{
"epoch": 1.2759643916913945,
"grad_norm": 0.20107118785381317,
"learning_rate": 8.6e-05,
"loss": 0.0279,
"step": 430
},
{
"epoch": 1.3056379821958457,
"grad_norm": 0.24616649746894836,
"learning_rate": 8.800000000000001e-05,
"loss": 0.0259,
"step": 440
},
{
"epoch": 1.3353115727002967,
"grad_norm": 0.19029636681079865,
"learning_rate": 9e-05,
"loss": 0.0262,
"step": 450
},
{
"epoch": 1.3649851632047478,
"grad_norm": 0.194508358836174,
"learning_rate": 9.200000000000001e-05,
"loss": 0.0275,
"step": 460
},
{
"epoch": 1.3946587537091988,
"grad_norm": 0.20826251804828644,
"learning_rate": 9.4e-05,
"loss": 0.0289,
"step": 470
},
{
"epoch": 1.4243323442136497,
"grad_norm": 0.13222843408584595,
"learning_rate": 9.6e-05,
"loss": 0.0249,
"step": 480
},
{
"epoch": 1.454005934718101,
"grad_norm": 0.13967235386371613,
"learning_rate": 9.8e-05,
"loss": 0.0231,
"step": 490
},
{
"epoch": 1.4836795252225519,
"grad_norm": 0.21556402742862701,
"learning_rate": 0.0001,
"loss": 0.0232,
"step": 500
},
{
"epoch": 1.513353115727003,
"grad_norm": 0.2407234013080597,
"learning_rate": 9.999972660400536e-05,
"loss": 0.025,
"step": 510
},
{
"epoch": 1.543026706231454,
"grad_norm": 0.1544090360403061,
"learning_rate": 9.999890641901125e-05,
"loss": 0.0224,
"step": 520
},
{
"epoch": 1.572700296735905,
"grad_norm": 0.1930345594882965,
"learning_rate": 9.999753945398704e-05,
"loss": 0.0244,
"step": 530
},
{
"epoch": 1.6023738872403561,
"grad_norm": 0.2288358211517334,
"learning_rate": 9.99956257238817e-05,
"loss": 0.0223,
"step": 540
},
{
"epoch": 1.632047477744807,
"grad_norm": 0.2028588354587555,
"learning_rate": 9.999316524962345e-05,
"loss": 0.022,
"step": 550
},
{
"epoch": 1.6617210682492582,
"grad_norm": 0.17989283800125122,
"learning_rate": 9.999015805811965e-05,
"loss": 0.0201,
"step": 560
},
{
"epoch": 1.6913946587537092,
"grad_norm": 0.20576386153697968,
"learning_rate": 9.998660418225645e-05,
"loss": 0.0211,
"step": 570
},
{
"epoch": 1.7210682492581602,
"grad_norm": 0.1263924539089203,
"learning_rate": 9.998250366089848e-05,
"loss": 0.0205,
"step": 580
},
{
"epoch": 1.7507418397626113,
"grad_norm": 0.23239193856716156,
"learning_rate": 9.997785653888835e-05,
"loss": 0.0223,
"step": 590
},
{
"epoch": 1.7804154302670623,
"grad_norm": 0.15964201092720032,
"learning_rate": 9.997266286704631e-05,
"loss": 0.0236,
"step": 600
},
{
"epoch": 1.8100890207715135,
"grad_norm": 0.1998339742422104,
"learning_rate": 9.996692270216947e-05,
"loss": 0.0254,
"step": 610
},
{
"epoch": 1.8397626112759644,
"grad_norm": 0.25945162773132324,
"learning_rate": 9.996063610703137e-05,
"loss": 0.0207,
"step": 620
},
{
"epoch": 1.8694362017804154,
"grad_norm": 0.1687825620174408,
"learning_rate": 9.995380315038119e-05,
"loss": 0.0169,
"step": 630
},
{
"epoch": 1.8991097922848663,
"grad_norm": 0.13454881310462952,
"learning_rate": 9.994642390694308e-05,
"loss": 0.0185,
"step": 640
},
{
"epoch": 1.9287833827893175,
"grad_norm": 0.1880808174610138,
"learning_rate": 9.993849845741524e-05,
"loss": 0.0199,
"step": 650
},
{
"epoch": 1.9584569732937687,
"grad_norm": 0.17725640535354614,
"learning_rate": 9.993002688846913e-05,
"loss": 0.019,
"step": 660
},
{
"epoch": 1.9881305637982196,
"grad_norm": 0.17680853605270386,
"learning_rate": 9.992100929274846e-05,
"loss": 0.0257,
"step": 670
},
{
"epoch": 2.0178041543026706,
"grad_norm": 0.18133607506752014,
"learning_rate": 9.991144576886823e-05,
"loss": 0.0222,
"step": 680
},
{
"epoch": 2.0474777448071215,
"grad_norm": 0.21174193918704987,
"learning_rate": 9.990133642141359e-05,
"loss": 0.0192,
"step": 690
},
{
"epoch": 2.077151335311573,
"grad_norm": 0.15766288340091705,
"learning_rate": 9.989068136093873e-05,
"loss": 0.0199,
"step": 700
},
{
"epoch": 2.106824925816024,
"grad_norm": 0.19250448048114777,
"learning_rate": 9.987948070396571e-05,
"loss": 0.0252,
"step": 710
},
{
"epoch": 2.136498516320475,
"grad_norm": 0.1412709653377533,
"learning_rate": 9.986773457298311e-05,
"loss": 0.0186,
"step": 720
},
{
"epoch": 2.166172106824926,
"grad_norm": 0.2136259824037552,
"learning_rate": 9.985544309644475e-05,
"loss": 0.0204,
"step": 730
},
{
"epoch": 2.1958456973293767,
"grad_norm": 0.10815251618623734,
"learning_rate": 9.984260640876821e-05,
"loss": 0.0153,
"step": 740
},
{
"epoch": 2.2255192878338277,
"grad_norm": 0.14663298428058624,
"learning_rate": 9.98292246503335e-05,
"loss": 0.0189,
"step": 750
},
{
"epoch": 2.255192878338279,
"grad_norm": 0.18529076874256134,
"learning_rate": 9.981529796748134e-05,
"loss": 0.0155,
"step": 760
},
{
"epoch": 2.28486646884273,
"grad_norm": 0.2349974811077118,
"learning_rate": 9.980082651251175e-05,
"loss": 0.0172,
"step": 770
},
{
"epoch": 2.314540059347181,
"grad_norm": 0.10778886079788208,
"learning_rate": 9.97858104436822e-05,
"loss": 0.0161,
"step": 780
},
{
"epoch": 2.344213649851632,
"grad_norm": 0.15675969421863556,
"learning_rate": 9.977024992520602e-05,
"loss": 0.0165,
"step": 790
},
{
"epoch": 2.373887240356083,
"grad_norm": 0.23468513786792755,
"learning_rate": 9.975414512725057e-05,
"loss": 0.0196,
"step": 800
},
{
"epoch": 2.4035608308605343,
"grad_norm": 0.1332869678735733,
"learning_rate": 9.973749622593534e-05,
"loss": 0.0193,
"step": 810
},
{
"epoch": 2.4332344213649852,
"grad_norm": 0.1406887024641037,
"learning_rate": 9.972030340333001e-05,
"loss": 0.0186,
"step": 820
},
{
"epoch": 2.462908011869436,
"grad_norm": 0.11544730514287949,
"learning_rate": 9.970256684745258e-05,
"loss": 0.0195,
"step": 830
},
{
"epoch": 2.492581602373887,
"grad_norm": 0.19476240873336792,
"learning_rate": 9.968428675226714e-05,
"loss": 0.0171,
"step": 840
},
{
"epoch": 2.5222551928783385,
"grad_norm": 0.22309833765029907,
"learning_rate": 9.966546331768191e-05,
"loss": 0.0223,
"step": 850
},
{
"epoch": 2.551928783382789,
"grad_norm": 0.2214643657207489,
"learning_rate": 9.964609674954696e-05,
"loss": 0.019,
"step": 860
},
{
"epoch": 2.5816023738872405,
"grad_norm": 0.2298765480518341,
"learning_rate": 9.962618725965196e-05,
"loss": 0.0178,
"step": 870
},
{
"epoch": 2.6112759643916914,
"grad_norm": 0.1542595475912094,
"learning_rate": 9.96057350657239e-05,
"loss": 0.0152,
"step": 880
},
{
"epoch": 2.6409495548961424,
"grad_norm": 0.12994691729545593,
"learning_rate": 9.95847403914247e-05,
"loss": 0.0153,
"step": 890
},
{
"epoch": 2.6706231454005933,
"grad_norm": 0.1726643294095993,
"learning_rate": 9.956320346634876e-05,
"loss": 0.0219,
"step": 900
},
{
"epoch": 2.7002967359050443,
"grad_norm": 0.18292242288589478,
"learning_rate": 9.954112452602045e-05,
"loss": 0.0137,
"step": 910
},
{
"epoch": 2.7299703264094957,
"grad_norm": 0.19749417901039124,
"learning_rate": 9.95185038118915e-05,
"loss": 0.0179,
"step": 920
},
{
"epoch": 2.7596439169139466,
"grad_norm": 0.17808304727077484,
"learning_rate": 9.949534157133844e-05,
"loss": 0.0155,
"step": 930
},
{
"epoch": 2.7893175074183976,
"grad_norm": 0.13589969277381897,
"learning_rate": 9.94716380576598e-05,
"loss": 0.0143,
"step": 940
},
{
"epoch": 2.8189910979228485,
"grad_norm": 0.17047159373760223,
"learning_rate": 9.944739353007344e-05,
"loss": 0.0211,
"step": 950
},
{
"epoch": 2.8486646884272995,
"grad_norm": 0.15535619854927063,
"learning_rate": 9.942260825371358e-05,
"loss": 0.0132,
"step": 960
},
{
"epoch": 2.878338278931751,
"grad_norm": 0.1609240472316742,
"learning_rate": 9.939728249962807e-05,
"loss": 0.0145,
"step": 970
},
{
"epoch": 2.908011869436202,
"grad_norm": 0.20487530529499054,
"learning_rate": 9.937141654477528e-05,
"loss": 0.0172,
"step": 980
},
{
"epoch": 2.9376854599406528,
"grad_norm": 0.12371553480625153,
"learning_rate": 9.934501067202117e-05,
"loss": 0.0191,
"step": 990
},
{
"epoch": 2.9673590504451037,
"grad_norm": 0.15513016283512115,
"learning_rate": 9.931806517013612e-05,
"loss": 0.0158,
"step": 1000
},
{
"epoch": 2.9970326409495547,
"grad_norm": 0.1723584234714508,
"learning_rate": 9.929058033379181e-05,
"loss": 0.0139,
"step": 1010
},
{
"epoch": 3.026706231454006,
"grad_norm": 0.1482209414243698,
"learning_rate": 9.926255646355804e-05,
"loss": 0.0148,
"step": 1020
},
{
"epoch": 3.056379821958457,
"grad_norm": 0.15559454262256622,
"learning_rate": 9.923399386589933e-05,
"loss": 0.0147,
"step": 1030
},
{
"epoch": 3.086053412462908,
"grad_norm": 0.22350917756557465,
"learning_rate": 9.92048928531717e-05,
"loss": 0.0157,
"step": 1040
},
{
"epoch": 3.115727002967359,
"grad_norm": 0.1581738144159317,
"learning_rate": 9.917525374361912e-05,
"loss": 0.0133,
"step": 1050
},
{
"epoch": 3.14540059347181,
"grad_norm": 0.20087914168834686,
"learning_rate": 9.914507686137019e-05,
"loss": 0.0208,
"step": 1060
},
{
"epoch": 3.1750741839762613,
"grad_norm": 0.16003265976905823,
"learning_rate": 9.911436253643445e-05,
"loss": 0.0168,
"step": 1070
},
{
"epoch": 3.2047477744807122,
"grad_norm": 0.15235169231891632,
"learning_rate": 9.90831111046988e-05,
"loss": 0.0168,
"step": 1080
},
{
"epoch": 3.234421364985163,
"grad_norm": 0.15660665929317474,
"learning_rate": 9.905132290792394e-05,
"loss": 0.0106,
"step": 1090
},
{
"epoch": 3.264094955489614,
"grad_norm": 0.16853424906730652,
"learning_rate": 9.901899829374047e-05,
"loss": 0.0149,
"step": 1100
},
{
"epoch": 3.293768545994065,
"grad_norm": 0.1335846185684204,
"learning_rate": 9.89861376156452e-05,
"loss": 0.0168,
"step": 1110
},
{
"epoch": 3.3234421364985165,
"grad_norm": 0.20238997042179108,
"learning_rate": 9.895274123299723e-05,
"loss": 0.0154,
"step": 1120
},
{
"epoch": 3.3531157270029674,
"grad_norm": 0.22216491401195526,
"learning_rate": 9.891880951101407e-05,
"loss": 0.019,
"step": 1130
},
{
"epoch": 3.3827893175074184,
"grad_norm": 0.2017626017332077,
"learning_rate": 9.888434282076758e-05,
"loss": 0.017,
"step": 1140
},
{
"epoch": 3.4124629080118694,
"grad_norm": 0.18049117922782898,
"learning_rate": 9.884934153917997e-05,
"loss": 0.0163,
"step": 1150
},
{
"epoch": 3.4421364985163203,
"grad_norm": 0.28145721554756165,
"learning_rate": 9.881380604901964e-05,
"loss": 0.0166,
"step": 1160
},
{
"epoch": 3.4718100890207717,
"grad_norm": 0.2356300801038742,
"learning_rate": 9.877773673889701e-05,
"loss": 0.0155,
"step": 1170
},
{
"epoch": 3.5014836795252227,
"grad_norm": 0.24113395810127258,
"learning_rate": 9.87411340032603e-05,
"loss": 0.0156,
"step": 1180
},
{
"epoch": 3.5311572700296736,
"grad_norm": 0.18665863573551178,
"learning_rate": 9.870399824239117e-05,
"loss": 0.0159,
"step": 1190
},
{
"epoch": 3.5608308605341246,
"grad_norm": 0.16171567142009735,
"learning_rate": 9.86663298624003e-05,
"loss": 0.0167,
"step": 1200
},
{
"epoch": 3.5905044510385755,
"grad_norm": 0.17315839231014252,
"learning_rate": 9.862812927522309e-05,
"loss": 0.017,
"step": 1210
},
{
"epoch": 3.620178041543027,
"grad_norm": 0.14727933704853058,
"learning_rate": 9.858939689861506e-05,
"loss": 0.0132,
"step": 1220
},
{
"epoch": 3.649851632047478,
"grad_norm": 0.1552547812461853,
"learning_rate": 9.855013315614725e-05,
"loss": 0.0175,
"step": 1230
},
{
"epoch": 3.679525222551929,
"grad_norm": 0.1715100109577179,
"learning_rate": 9.851033847720166e-05,
"loss": 0.0139,
"step": 1240
},
{
"epoch": 3.7091988130563798,
"grad_norm": 0.16414763033390045,
"learning_rate": 9.847001329696653e-05,
"loss": 0.0189,
"step": 1250
},
{
"epoch": 3.7388724035608307,
"grad_norm": 0.1251063048839569,
"learning_rate": 9.842915805643155e-05,
"loss": 0.0145,
"step": 1260
},
{
"epoch": 3.768545994065282,
"grad_norm": 0.17011059820652008,
"learning_rate": 9.838777320238312e-05,
"loss": 0.0161,
"step": 1270
},
{
"epoch": 3.798219584569733,
"grad_norm": 0.14429537951946259,
"learning_rate": 9.834585918739936e-05,
"loss": 0.0159,
"step": 1280
},
{
"epoch": 3.827893175074184,
"grad_norm": 0.138567715883255,
"learning_rate": 9.830341646984521e-05,
"loss": 0.0175,
"step": 1290
},
{
"epoch": 3.857566765578635,
"grad_norm": 0.08295896649360657,
"learning_rate": 9.826044551386744e-05,
"loss": 0.0145,
"step": 1300
},
{
"epoch": 3.887240356083086,
"grad_norm": 0.0911448523402214,
"learning_rate": 9.821694678938953e-05,
"loss": 0.0134,
"step": 1310
},
{
"epoch": 3.9169139465875373,
"grad_norm": 0.14157798886299133,
"learning_rate": 9.817292077210659e-05,
"loss": 0.0158,
"step": 1320
},
{
"epoch": 3.9465875370919883,
"grad_norm": 0.17415288090705872,
"learning_rate": 9.812836794348004e-05,
"loss": 0.0125,
"step": 1330
},
{
"epoch": 3.9762611275964392,
"grad_norm": 0.22007249295711517,
"learning_rate": 9.808328879073251e-05,
"loss": 0.0153,
"step": 1340
},
{
"epoch": 4.005934718100891,
"grad_norm": 0.144961416721344,
"learning_rate": 9.803768380684242e-05,
"loss": 0.0117,
"step": 1350
},
{
"epoch": 4.035608308605341,
"grad_norm": 0.14304885268211365,
"learning_rate": 9.799155349053851e-05,
"loss": 0.0138,
"step": 1360
},
{
"epoch": 4.0652818991097925,
"grad_norm": 0.18843571841716766,
"learning_rate": 9.794489834629455e-05,
"loss": 0.0106,
"step": 1370
},
{
"epoch": 4.094955489614243,
"grad_norm": 0.16858817636966705,
"learning_rate": 9.789771888432375e-05,
"loss": 0.0138,
"step": 1380
},
{
"epoch": 4.1246290801186944,
"grad_norm": 0.19177594780921936,
"learning_rate": 9.785001562057309e-05,
"loss": 0.0114,
"step": 1390
},
{
"epoch": 4.154302670623146,
"grad_norm": 0.20158767700195312,
"learning_rate": 9.780178907671789e-05,
"loss": 0.0146,
"step": 1400
},
{
"epoch": 4.183976261127596,
"grad_norm": 0.17675232887268066,
"learning_rate": 9.775303978015585e-05,
"loss": 0.0116,
"step": 1410
},
{
"epoch": 4.213649851632048,
"grad_norm": 0.20077385008335114,
"learning_rate": 9.77037682640015e-05,
"loss": 0.0172,
"step": 1420
},
{
"epoch": 4.243323442136498,
"grad_norm": 0.17185665667057037,
"learning_rate": 9.765397506708023e-05,
"loss": 0.0138,
"step": 1430
},
{
"epoch": 4.27299703264095,
"grad_norm": 0.1641971468925476,
"learning_rate": 9.760366073392246e-05,
"loss": 0.0145,
"step": 1440
},
{
"epoch": 4.302670623145401,
"grad_norm": 0.13757102191448212,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0158,
"step": 1450
},
{
"epoch": 4.332344213649852,
"grad_norm": 0.18012432754039764,
"learning_rate": 9.750147086550844e-05,
"loss": 0.0139,
"step": 1460
},
{
"epoch": 4.362017804154303,
"grad_norm": 0.1639927178621292,
"learning_rate": 9.744959644778422e-05,
"loss": 0.0142,
"step": 1470
},
{
"epoch": 4.3916913946587535,
"grad_norm": 0.17392724752426147,
"learning_rate": 9.739720312887535e-05,
"loss": 0.018,
"step": 1480
},
{
"epoch": 4.421364985163205,
"grad_norm": 0.15744361281394958,
"learning_rate": 9.734429148174675e-05,
"loss": 0.0128,
"step": 1490
},
{
"epoch": 4.451038575667655,
"grad_norm": 0.12954673171043396,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0188,
"step": 1500
},
{
"epoch": 4.480712166172107,
"grad_norm": 0.16809141635894775,
"learning_rate": 9.723691552302562e-05,
"loss": 0.0144,
"step": 1510
},
{
"epoch": 4.510385756676558,
"grad_norm": 0.1523902416229248,
"learning_rate": 9.718245238567939e-05,
"loss": 0.0145,
"step": 1520
},
{
"epoch": 4.540059347181009,
"grad_norm": 0.26666077971458435,
"learning_rate": 9.712747326859315e-05,
"loss": 0.015,
"step": 1530
},
{
"epoch": 4.56973293768546,
"grad_norm": 0.193909153342247,
"learning_rate": 9.707197877300974e-05,
"loss": 0.0167,
"step": 1540
},
{
"epoch": 4.5994065281899115,
"grad_norm": 0.17812030017375946,
"learning_rate": 9.701596950580806e-05,
"loss": 0.0157,
"step": 1550
},
{
"epoch": 4.629080118694362,
"grad_norm": 0.26170411705970764,
"learning_rate": 9.695944607949649e-05,
"loss": 0.0139,
"step": 1560
},
{
"epoch": 4.658753709198813,
"grad_norm": 0.14579689502716064,
"learning_rate": 9.690240911220618e-05,
"loss": 0.013,
"step": 1570
},
{
"epoch": 4.688427299703264,
"grad_norm": 0.17479683458805084,
"learning_rate": 9.684485922768422e-05,
"loss": 0.014,
"step": 1580
},
{
"epoch": 4.718100890207715,
"grad_norm": 0.15623094141483307,
"learning_rate": 9.6786797055287e-05,
"loss": 0.0151,
"step": 1590
},
{
"epoch": 4.747774480712166,
"grad_norm": 0.1983657032251358,
"learning_rate": 9.672822322997305e-05,
"loss": 0.0138,
"step": 1600
},
{
"epoch": 4.777448071216617,
"grad_norm": 0.13670101761817932,
"learning_rate": 9.66691383922964e-05,
"loss": 0.0114,
"step": 1610
},
{
"epoch": 4.807121661721069,
"grad_norm": 0.13849210739135742,
"learning_rate": 9.660954318839933e-05,
"loss": 0.0105,
"step": 1620
},
{
"epoch": 4.836795252225519,
"grad_norm": 0.13254909217357635,
"learning_rate": 9.654943827000548e-05,
"loss": 0.0113,
"step": 1630
},
{
"epoch": 4.8664688427299705,
"grad_norm": 0.12954489886760712,
"learning_rate": 9.648882429441257e-05,
"loss": 0.0133,
"step": 1640
},
{
"epoch": 4.896142433234421,
"grad_norm": 0.13290920853614807,
"learning_rate": 9.642770192448536e-05,
"loss": 0.0156,
"step": 1650
},
{
"epoch": 4.925816023738872,
"grad_norm": 0.13683238625526428,
"learning_rate": 9.636607182864827e-05,
"loss": 0.0096,
"step": 1660
},
{
"epoch": 4.955489614243324,
"grad_norm": 0.07962514460086823,
"learning_rate": 9.630393468087818e-05,
"loss": 0.0128,
"step": 1670
},
{
"epoch": 4.985163204747774,
"grad_norm": 0.15497808158397675,
"learning_rate": 9.624129116069694e-05,
"loss": 0.014,
"step": 1680
},
{
"epoch": 5.014836795252226,
"grad_norm": 0.1419367492198944,
"learning_rate": 9.617814195316411e-05,
"loss": 0.0132,
"step": 1690
},
{
"epoch": 5.044510385756676,
"grad_norm": 0.2201174795627594,
"learning_rate": 9.611448774886924e-05,
"loss": 0.0133,
"step": 1700
},
{
"epoch": 5.074183976261128,
"grad_norm": 0.16477946937084198,
"learning_rate": 9.605032924392457e-05,
"loss": 0.0131,
"step": 1710
},
{
"epoch": 5.103857566765579,
"grad_norm": 0.19834354519844055,
"learning_rate": 9.598566713995718e-05,
"loss": 0.0154,
"step": 1720
},
{
"epoch": 5.1335311572700295,
"grad_norm": 0.22880475223064423,
"learning_rate": 9.59205021441015e-05,
"loss": 0.014,
"step": 1730
},
{
"epoch": 5.163204747774481,
"grad_norm": 0.16253937780857086,
"learning_rate": 9.58548349689915e-05,
"loss": 0.0122,
"step": 1740
},
{
"epoch": 5.192878338278931,
"grad_norm": 0.22121521830558777,
"learning_rate": 9.578866633275288e-05,
"loss": 0.015,
"step": 1750
},
{
"epoch": 5.222551928783383,
"grad_norm": 0.182882621884346,
"learning_rate": 9.572199695899522e-05,
"loss": 0.0163,
"step": 1760
},
{
"epoch": 5.252225519287834,
"grad_norm": 0.17466451227664948,
"learning_rate": 9.565482757680415e-05,
"loss": 0.0145,
"step": 1770
},
{
"epoch": 5.281899109792285,
"grad_norm": 0.12321746349334717,
"learning_rate": 9.558715892073323e-05,
"loss": 0.0127,
"step": 1780
},
{
"epoch": 5.311572700296736,
"grad_norm": 0.1869288980960846,
"learning_rate": 9.551899173079607e-05,
"loss": 0.0185,
"step": 1790
},
{
"epoch": 5.341246290801187,
"grad_norm": 0.1715238392353058,
"learning_rate": 9.545032675245813e-05,
"loss": 0.0136,
"step": 1800
},
{
"epoch": 5.370919881305638,
"grad_norm": 0.20112700760364532,
"learning_rate": 9.538116473662861e-05,
"loss": 0.013,
"step": 1810
},
{
"epoch": 5.400593471810089,
"grad_norm": 0.13722355663776398,
"learning_rate": 9.531150643965223e-05,
"loss": 0.0112,
"step": 1820
},
{
"epoch": 5.43026706231454,
"grad_norm": 0.2131308764219284,
"learning_rate": 9.524135262330098e-05,
"loss": 0.012,
"step": 1830
},
{
"epoch": 5.459940652818991,
"grad_norm": 0.13212528824806213,
"learning_rate": 9.517070405476575e-05,
"loss": 0.0112,
"step": 1840
},
{
"epoch": 5.489614243323442,
"grad_norm": 0.1769159734249115,
"learning_rate": 9.509956150664796e-05,
"loss": 0.0122,
"step": 1850
},
{
"epoch": 5.519287833827893,
"grad_norm": 0.15795955061912537,
"learning_rate": 9.502792575695112e-05,
"loss": 0.0152,
"step": 1860
},
{
"epoch": 5.548961424332344,
"grad_norm": 0.1461247205734253,
"learning_rate": 9.49557975890723e-05,
"loss": 0.0138,
"step": 1870
},
{
"epoch": 5.578635014836795,
"grad_norm": 0.12148546427488327,
"learning_rate": 9.488317779179361e-05,
"loss": 0.0105,
"step": 1880
},
{
"epoch": 5.6083086053412465,
"grad_norm": 0.17779065668582916,
"learning_rate": 9.481006715927351e-05,
"loss": 0.0107,
"step": 1890
},
{
"epoch": 5.637982195845697,
"grad_norm": 0.17717322707176208,
"learning_rate": 9.473646649103818e-05,
"loss": 0.0158,
"step": 1900
},
{
"epoch": 5.667655786350148,
"grad_norm": 0.19820888340473175,
"learning_rate": 9.46623765919727e-05,
"loss": 0.0137,
"step": 1910
},
{
"epoch": 5.697329376854599,
"grad_norm": 0.15721198916435242,
"learning_rate": 9.458779827231237e-05,
"loss": 0.0119,
"step": 1920
},
{
"epoch": 5.72700296735905,
"grad_norm": 0.17158320546150208,
"learning_rate": 9.451273234763371e-05,
"loss": 0.0133,
"step": 1930
},
{
"epoch": 5.756676557863502,
"grad_norm": 0.15745575726032257,
"learning_rate": 9.443717963884569e-05,
"loss": 0.0129,
"step": 1940
},
{
"epoch": 5.786350148367952,
"grad_norm": 0.17795485258102417,
"learning_rate": 9.43611409721806e-05,
"loss": 0.013,
"step": 1950
},
{
"epoch": 5.816023738872404,
"grad_norm": 0.18350425362586975,
"learning_rate": 9.428461717918511e-05,
"loss": 0.0103,
"step": 1960
},
{
"epoch": 5.845697329376854,
"grad_norm": 0.1675357222557068,
"learning_rate": 9.420760909671118e-05,
"loss": 0.0149,
"step": 1970
},
{
"epoch": 5.8753709198813056,
"grad_norm": 0.1395285427570343,
"learning_rate": 9.413011756690685e-05,
"loss": 0.0174,
"step": 1980
},
{
"epoch": 5.905044510385757,
"grad_norm": 0.18694210052490234,
"learning_rate": 9.405214343720707e-05,
"loss": 0.011,
"step": 1990
},
{
"epoch": 5.9347181008902075,
"grad_norm": 0.13509497046470642,
"learning_rate": 9.397368756032445e-05,
"loss": 0.0105,
"step": 2000
},
{
"epoch": 5.964391691394659,
"grad_norm": 0.1707238107919693,
"learning_rate": 9.389475079423988e-05,
"loss": 0.0099,
"step": 2010
},
{
"epoch": 5.994065281899109,
"grad_norm": 0.18797723948955536,
"learning_rate": 9.381533400219318e-05,
"loss": 0.0144,
"step": 2020
},
{
"epoch": 6.023738872403561,
"grad_norm": 0.15263915061950684,
"learning_rate": 9.373543805267368e-05,
"loss": 0.0145,
"step": 2030
},
{
"epoch": 6.053412462908012,
"grad_norm": 0.10786967724561691,
"learning_rate": 9.365506381941066e-05,
"loss": 0.0167,
"step": 2040
},
{
"epoch": 6.083086053412463,
"grad_norm": 0.10059456527233124,
"learning_rate": 9.357421218136386e-05,
"loss": 0.0127,
"step": 2050
},
{
"epoch": 6.112759643916914,
"grad_norm": 0.12447630614042282,
"learning_rate": 9.349288402271388e-05,
"loss": 0.0109,
"step": 2060
},
{
"epoch": 6.142433234421365,
"grad_norm": 0.14649389684200287,
"learning_rate": 9.341108023285238e-05,
"loss": 0.0115,
"step": 2070
},
{
"epoch": 6.172106824925816,
"grad_norm": 0.09072308987379074,
"learning_rate": 9.332880170637252e-05,
"loss": 0.0097,
"step": 2080
},
{
"epoch": 6.201780415430267,
"grad_norm": 0.1700124889612198,
"learning_rate": 9.32460493430591e-05,
"loss": 0.0107,
"step": 2090
},
{
"epoch": 6.231454005934718,
"grad_norm": 0.13447318971157074,
"learning_rate": 9.316282404787871e-05,
"loss": 0.0104,
"step": 2100
},
{
"epoch": 6.261127596439169,
"grad_norm": 0.14388670027256012,
"learning_rate": 9.30791267309698e-05,
"loss": 0.0101,
"step": 2110
},
{
"epoch": 6.29080118694362,
"grad_norm": 0.14514364302158356,
"learning_rate": 9.299495830763286e-05,
"loss": 0.0096,
"step": 2120
},
{
"epoch": 6.320474777448071,
"grad_norm": 0.09969841688871384,
"learning_rate": 9.291031969832026e-05,
"loss": 0.0115,
"step": 2130
},
{
"epoch": 6.350148367952523,
"grad_norm": 0.14247213304042816,
"learning_rate": 9.282521182862629e-05,
"loss": 0.0115,
"step": 2140
},
{
"epoch": 6.379821958456973,
"grad_norm": 0.13603922724723816,
"learning_rate": 9.273963562927695e-05,
"loss": 0.0147,
"step": 2150
},
{
"epoch": 6.4094955489614245,
"grad_norm": 0.23838986456394196,
"learning_rate": 9.265359203611987e-05,
"loss": 0.0115,
"step": 2160
},
{
"epoch": 6.439169139465875,
"grad_norm": 0.11037889868021011,
"learning_rate": 9.256708199011401e-05,
"loss": 0.0096,
"step": 2170
},
{
"epoch": 6.468842729970326,
"grad_norm": 0.15438543260097504,
"learning_rate": 9.248010643731935e-05,
"loss": 0.0126,
"step": 2180
},
{
"epoch": 6.498516320474778,
"grad_norm": 0.12337090075016022,
"learning_rate": 9.239266632888659e-05,
"loss": 0.0093,
"step": 2190
},
{
"epoch": 6.528189910979228,
"grad_norm": 0.09916039556264877,
"learning_rate": 9.230476262104677e-05,
"loss": 0.0114,
"step": 2200
},
{
"epoch": 6.55786350148368,
"grad_norm": 0.1173950806260109,
"learning_rate": 9.221639627510076e-05,
"loss": 0.0095,
"step": 2210
},
{
"epoch": 6.58753709198813,
"grad_norm": 0.14159461855888367,
"learning_rate": 9.212756825740873e-05,
"loss": 0.0096,
"step": 2220
},
{
"epoch": 6.617210682492582,
"grad_norm": 0.14629167318344116,
"learning_rate": 9.20382795393797e-05,
"loss": 0.0118,
"step": 2230
},
{
"epoch": 6.646884272997033,
"grad_norm": 0.11691708117723465,
"learning_rate": 9.194853109746074e-05,
"loss": 0.0095,
"step": 2240
},
{
"epoch": 6.6765578635014835,
"grad_norm": 0.12816114723682404,
"learning_rate": 9.185832391312644e-05,
"loss": 0.0133,
"step": 2250
},
{
"epoch": 6.706231454005935,
"grad_norm": 0.11063099652528763,
"learning_rate": 9.176765897286813e-05,
"loss": 0.0125,
"step": 2260
},
{
"epoch": 6.735905044510385,
"grad_norm": 0.12592542171478271,
"learning_rate": 9.167653726818305e-05,
"loss": 0.0097,
"step": 2270
},
{
"epoch": 6.765578635014837,
"grad_norm": 0.20816679298877716,
"learning_rate": 9.158495979556358e-05,
"loss": 0.0127,
"step": 2280
},
{
"epoch": 6.795252225519288,
"grad_norm": 0.13589587807655334,
"learning_rate": 9.14929275564863e-05,
"loss": 0.0112,
"step": 2290
},
{
"epoch": 6.824925816023739,
"grad_norm": 0.17974646389484406,
"learning_rate": 9.140044155740101e-05,
"loss": 0.0088,
"step": 2300
},
{
"epoch": 6.85459940652819,
"grad_norm": 0.18915049731731415,
"learning_rate": 9.130750280971978e-05,
"loss": 0.0156,
"step": 2310
},
{
"epoch": 6.884272997032641,
"grad_norm": 0.14018063247203827,
"learning_rate": 9.121411232980588e-05,
"loss": 0.0098,
"step": 2320
},
{
"epoch": 6.913946587537092,
"grad_norm": 0.13840338587760925,
"learning_rate": 9.112027113896262e-05,
"loss": 0.017,
"step": 2330
},
{
"epoch": 6.943620178041543,
"grad_norm": 0.11696403473615646,
"learning_rate": 9.102598026342222e-05,
"loss": 0.0099,
"step": 2340
},
{
"epoch": 6.973293768545994,
"grad_norm": 0.13574601709842682,
"learning_rate": 9.093124073433463e-05,
"loss": 0.014,
"step": 2350
},
{
"epoch": 7.002967359050445,
"grad_norm": 0.10100409388542175,
"learning_rate": 9.083605358775612e-05,
"loss": 0.0103,
"step": 2360
},
{
"epoch": 7.032640949554896,
"grad_norm": 0.16500301659107208,
"learning_rate": 9.074041986463808e-05,
"loss": 0.0109,
"step": 2370
},
{
"epoch": 7.062314540059347,
"grad_norm": 0.18439586460590363,
"learning_rate": 9.064434061081562e-05,
"loss": 0.0095,
"step": 2380
},
{
"epoch": 7.091988130563799,
"grad_norm": 0.12944291532039642,
"learning_rate": 9.0547816876996e-05,
"loss": 0.0123,
"step": 2390
},
{
"epoch": 7.121661721068249,
"grad_norm": 0.1536119282245636,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0125,
"step": 2400
},
{
"epoch": 7.1513353115727005,
"grad_norm": 0.1650673747062683,
"learning_rate": 9.035344019648702e-05,
"loss": 0.0092,
"step": 2410
},
{
"epoch": 7.181008902077151,
"grad_norm": 0.12277387827634811,
"learning_rate": 9.025558937546988e-05,
"loss": 0.0102,
"step": 2420
},
{
"epoch": 7.210682492581602,
"grad_norm": 0.15055687725543976,
"learning_rate": 9.015729832577681e-05,
"loss": 0.0094,
"step": 2430
},
{
"epoch": 7.240356083086054,
"grad_norm": 0.16825971007347107,
"learning_rate": 9.005856812230304e-05,
"loss": 0.0116,
"step": 2440
},
{
"epoch": 7.270029673590504,
"grad_norm": 0.10691312700510025,
"learning_rate": 8.995939984474624e-05,
"loss": 0.0095,
"step": 2450
},
{
"epoch": 7.299703264094956,
"grad_norm": 0.14602239429950714,
"learning_rate": 8.98597945775948e-05,
"loss": 0.0124,
"step": 2460
},
{
"epoch": 7.329376854599406,
"grad_norm": 0.13875631988048553,
"learning_rate": 8.975975341011596e-05,
"loss": 0.0106,
"step": 2470
},
{
"epoch": 7.359050445103858,
"grad_norm": 0.12208565324544907,
"learning_rate": 8.965927743634391e-05,
"loss": 0.0108,
"step": 2480
},
{
"epoch": 7.388724035608309,
"grad_norm": 0.11230789124965668,
"learning_rate": 8.955836775506776e-05,
"loss": 0.0081,
"step": 2490
},
{
"epoch": 7.4183976261127595,
"grad_norm": 0.13064904510974884,
"learning_rate": 8.945702546981969e-05,
"loss": 0.0122,
"step": 2500
},
{
"epoch": 7.448071216617211,
"grad_norm": 0.16824467480182648,
"learning_rate": 8.935525168886262e-05,
"loss": 0.0112,
"step": 2510
},
{
"epoch": 7.4777448071216615,
"grad_norm": 0.11342830210924149,
"learning_rate": 8.92530475251784e-05,
"loss": 0.0103,
"step": 2520
},
{
"epoch": 7.507418397626113,
"grad_norm": 0.15296466648578644,
"learning_rate": 8.91504140964553e-05,
"loss": 0.0085,
"step": 2530
},
{
"epoch": 7.537091988130564,
"grad_norm": 0.16064001619815826,
"learning_rate": 8.90473525250761e-05,
"loss": 0.0114,
"step": 2540
},
{
"epoch": 7.566765578635015,
"grad_norm": 0.10076630860567093,
"learning_rate": 8.894386393810563e-05,
"loss": 0.0144,
"step": 2550
},
{
"epoch": 7.596439169139466,
"grad_norm": 0.15510450303554535,
"learning_rate": 8.883994946727849e-05,
"loss": 0.0112,
"step": 2560
},
{
"epoch": 7.626112759643917,
"grad_norm": 0.21251456439495087,
"learning_rate": 8.873561024898668e-05,
"loss": 0.0106,
"step": 2570
},
{
"epoch": 7.655786350148368,
"grad_norm": 0.17526623606681824,
"learning_rate": 8.863084742426719e-05,
"loss": 0.0113,
"step": 2580
},
{
"epoch": 7.6854599406528195,
"grad_norm": 0.12284035235643387,
"learning_rate": 8.852566213878947e-05,
"loss": 0.0114,
"step": 2590
},
{
"epoch": 7.71513353115727,
"grad_norm": 0.12916874885559082,
"learning_rate": 8.842005554284296e-05,
"loss": 0.0099,
"step": 2600
},
{
"epoch": 7.744807121661721,
"grad_norm": 0.17493458092212677,
"learning_rate": 8.831402879132446e-05,
"loss": 0.0092,
"step": 2610
},
{
"epoch": 7.774480712166172,
"grad_norm": 0.12995202839374542,
"learning_rate": 8.820758304372557e-05,
"loss": 0.0104,
"step": 2620
},
{
"epoch": 7.804154302670623,
"grad_norm": 0.08063960820436478,
"learning_rate": 8.810071946411989e-05,
"loss": 0.0087,
"step": 2630
},
{
"epoch": 7.833827893175075,
"grad_norm": 0.10341209173202515,
"learning_rate": 8.799343922115044e-05,
"loss": 0.0077,
"step": 2640
},
{
"epoch": 7.863501483679525,
"grad_norm": 0.108217254281044,
"learning_rate": 8.788574348801675e-05,
"loss": 0.0117,
"step": 2650
},
{
"epoch": 7.893175074183977,
"grad_norm": 0.1359342336654663,
"learning_rate": 8.77776334424621e-05,
"loss": 0.0157,
"step": 2660
},
{
"epoch": 7.922848664688427,
"grad_norm": 0.13467204570770264,
"learning_rate": 8.766911026676064e-05,
"loss": 0.011,
"step": 2670
},
{
"epoch": 7.9525222551928785,
"grad_norm": 0.1321392059326172,
"learning_rate": 8.756017514770443e-05,
"loss": 0.0136,
"step": 2680
},
{
"epoch": 7.98219584569733,
"grad_norm": 0.16591744124889374,
"learning_rate": 8.745082927659047e-05,
"loss": 0.0093,
"step": 2690
},
{
"epoch": 8.011869436201781,
"grad_norm": 0.14482906460762024,
"learning_rate": 8.73410738492077e-05,
"loss": 0.012,
"step": 2700
},
{
"epoch": 8.041543026706231,
"grad_norm": 0.12772271037101746,
"learning_rate": 8.723091006582389e-05,
"loss": 0.0118,
"step": 2710
},
{
"epoch": 8.071216617210682,
"grad_norm": 0.12283479422330856,
"learning_rate": 8.71203391311725e-05,
"loss": 0.0104,
"step": 2720
},
{
"epoch": 8.100890207715134,
"grad_norm": 0.11549960821866989,
"learning_rate": 8.700936225443959e-05,
"loss": 0.0101,
"step": 2730
},
{
"epoch": 8.130563798219585,
"grad_norm": 0.12340424209833145,
"learning_rate": 8.689798064925049e-05,
"loss": 0.0108,
"step": 2740
},
{
"epoch": 8.160237388724036,
"grad_norm": 0.13828441500663757,
"learning_rate": 8.678619553365659e-05,
"loss": 0.0084,
"step": 2750
},
{
"epoch": 8.189910979228486,
"grad_norm": 0.176404669880867,
"learning_rate": 8.6674008130122e-05,
"loss": 0.0093,
"step": 2760
},
{
"epoch": 8.219584569732937,
"grad_norm": 0.12829335033893585,
"learning_rate": 8.656141966551019e-05,
"loss": 0.0095,
"step": 2770
},
{
"epoch": 8.249258160237389,
"grad_norm": 0.15259785950183868,
"learning_rate": 8.644843137107059e-05,
"loss": 0.01,
"step": 2780
},
{
"epoch": 8.27893175074184,
"grad_norm": 0.17509141564369202,
"learning_rate": 8.633504448242505e-05,
"loss": 0.0086,
"step": 2790
},
{
"epoch": 8.308605341246292,
"grad_norm": 0.18248887360095978,
"learning_rate": 8.622126023955446e-05,
"loss": 0.0098,
"step": 2800
},
{
"epoch": 8.338278931750741,
"grad_norm": 0.13852570950984955,
"learning_rate": 8.610707988678503e-05,
"loss": 0.0109,
"step": 2810
},
{
"epoch": 8.367952522255193,
"grad_norm": 0.15752212703227997,
"learning_rate": 8.599250467277483e-05,
"loss": 0.0088,
"step": 2820
},
{
"epoch": 8.397626112759644,
"grad_norm": 0.08893997222185135,
"learning_rate": 8.587753585050004e-05,
"loss": 0.0096,
"step": 2830
},
{
"epoch": 8.427299703264095,
"grad_norm": 0.1134849488735199,
"learning_rate": 8.576217467724128e-05,
"loss": 0.0105,
"step": 2840
},
{
"epoch": 8.456973293768545,
"grad_norm": 0.18662723898887634,
"learning_rate": 8.564642241456986e-05,
"loss": 0.0102,
"step": 2850
},
{
"epoch": 8.486646884272997,
"grad_norm": 0.07733399420976639,
"learning_rate": 8.553028032833397e-05,
"loss": 0.0104,
"step": 2860
},
{
"epoch": 8.516320474777448,
"grad_norm": 0.13568611443042755,
"learning_rate": 8.541374968864487e-05,
"loss": 0.0086,
"step": 2870
},
{
"epoch": 8.5459940652819,
"grad_norm": 0.07520133256912231,
"learning_rate": 8.529683176986295e-05,
"loss": 0.0084,
"step": 2880
},
{
"epoch": 8.57566765578635,
"grad_norm": 0.06504914909601212,
"learning_rate": 8.517952785058385e-05,
"loss": 0.0086,
"step": 2890
},
{
"epoch": 8.605341246290802,
"grad_norm": 0.15347328782081604,
"learning_rate": 8.506183921362443e-05,
"loss": 0.0097,
"step": 2900
},
{
"epoch": 8.635014836795252,
"grad_norm": 0.15778006613254547,
"learning_rate": 8.494376714600878e-05,
"loss": 0.0157,
"step": 2910
},
{
"epoch": 8.664688427299703,
"grad_norm": 0.15846551954746246,
"learning_rate": 8.482531293895412e-05,
"loss": 0.0093,
"step": 2920
},
{
"epoch": 8.694362017804155,
"grad_norm": 0.19215027987957,
"learning_rate": 8.470647788785665e-05,
"loss": 0.0101,
"step": 2930
},
{
"epoch": 8.724035608308606,
"grad_norm": 0.14507389068603516,
"learning_rate": 8.458726329227747e-05,
"loss": 0.0102,
"step": 2940
},
{
"epoch": 8.753709198813056,
"grad_norm": 0.15245003998279572,
"learning_rate": 8.44676704559283e-05,
"loss": 0.009,
"step": 2950
},
{
"epoch": 8.783382789317507,
"grad_norm": 0.16085323691368103,
"learning_rate": 8.434770068665723e-05,
"loss": 0.0122,
"step": 2960
},
{
"epoch": 8.813056379821958,
"grad_norm": 0.12772567570209503,
"learning_rate": 8.422735529643444e-05,
"loss": 0.012,
"step": 2970
},
{
"epoch": 8.84272997032641,
"grad_norm": 0.12985776364803314,
"learning_rate": 8.410663560133784e-05,
"loss": 0.0085,
"step": 2980
},
{
"epoch": 8.872403560830861,
"grad_norm": 0.13308796286582947,
"learning_rate": 8.398554292153866e-05,
"loss": 0.0076,
"step": 2990
},
{
"epoch": 8.90207715133531,
"grad_norm": 0.18778546154499054,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0116,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.375898115552051e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}