{
"best_metric": 0.4010973274707794,
"best_model_checkpoint": "./results/checkpoint-3756",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 7512,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026624068157614484,
"grad_norm": 2.180868148803711,
"learning_rate": 4.9955626553070645e-06,
"loss": 1.101,
"step": 10
},
{
"epoch": 0.005324813631522897,
"grad_norm": 20.548362731933594,
"learning_rate": 4.991125310614129e-06,
"loss": 1.1047,
"step": 20
},
{
"epoch": 0.007987220447284345,
"grad_norm": 2.2688851356506348,
"learning_rate": 4.986687965921193e-06,
"loss": 1.1001,
"step": 30
},
{
"epoch": 0.010649627263045794,
"grad_norm": 1.3329694271087646,
"learning_rate": 4.982250621228258e-06,
"loss": 1.0965,
"step": 40
},
{
"epoch": 0.013312034078807242,
"grad_norm": 3.3356387615203857,
"learning_rate": 4.977813276535322e-06,
"loss": 1.0884,
"step": 50
},
{
"epoch": 0.01597444089456869,
"grad_norm": 3.10524582862854,
"learning_rate": 4.973375931842386e-06,
"loss": 1.0968,
"step": 60
},
{
"epoch": 0.01863684771033014,
"grad_norm": 3.935608386993408,
"learning_rate": 4.96893858714945e-06,
"loss": 1.1149,
"step": 70
},
{
"epoch": 0.021299254526091587,
"grad_norm": 4.374054908752441,
"learning_rate": 4.964501242456514e-06,
"loss": 1.0865,
"step": 80
},
{
"epoch": 0.023961661341853034,
"grad_norm": 2.463519811630249,
"learning_rate": 4.960063897763578e-06,
"loss": 1.093,
"step": 90
},
{
"epoch": 0.026624068157614485,
"grad_norm": 3.622931718826294,
"learning_rate": 4.955626553070643e-06,
"loss": 1.076,
"step": 100
},
{
"epoch": 0.029286474973375932,
"grad_norm": 4.1382036209106445,
"learning_rate": 4.951189208377707e-06,
"loss": 1.0658,
"step": 110
},
{
"epoch": 0.03194888178913738,
"grad_norm": 5.006036758422852,
"learning_rate": 4.946751863684771e-06,
"loss": 1.0998,
"step": 120
},
{
"epoch": 0.03461128860489883,
"grad_norm": 2.9711780548095703,
"learning_rate": 4.942314518991836e-06,
"loss": 1.0782,
"step": 130
},
{
"epoch": 0.03727369542066028,
"grad_norm": 3.5013747215270996,
"learning_rate": 4.9378771742989e-06,
"loss": 1.1043,
"step": 140
},
{
"epoch": 0.039936102236421724,
"grad_norm": 3.3159549236297607,
"learning_rate": 4.9334398296059644e-06,
"loss": 1.0854,
"step": 150
},
{
"epoch": 0.042598509052183174,
"grad_norm": 3.0417370796203613,
"learning_rate": 4.9290024849130285e-06,
"loss": 1.0969,
"step": 160
},
{
"epoch": 0.045260915867944625,
"grad_norm": 4.1927995681762695,
"learning_rate": 4.924565140220093e-06,
"loss": 1.064,
"step": 170
},
{
"epoch": 0.04792332268370607,
"grad_norm": 6.60683536529541,
"learning_rate": 4.920127795527157e-06,
"loss": 1.0867,
"step": 180
},
{
"epoch": 0.05058572949946752,
"grad_norm": 5.03337287902832,
"learning_rate": 4.915690450834222e-06,
"loss": 1.0379,
"step": 190
},
{
"epoch": 0.05324813631522897,
"grad_norm": 20.25901985168457,
"learning_rate": 4.911253106141286e-06,
"loss": 1.0063,
"step": 200
},
{
"epoch": 0.05591054313099041,
"grad_norm": 16.945417404174805,
"learning_rate": 4.90681576144835e-06,
"loss": 1.0068,
"step": 210
},
{
"epoch": 0.058572949946751864,
"grad_norm": 16.699810028076172,
"learning_rate": 4.902378416755414e-06,
"loss": 1.0226,
"step": 220
},
{
"epoch": 0.061235356762513314,
"grad_norm": 17.65724754333496,
"learning_rate": 4.897941072062478e-06,
"loss": 1.0968,
"step": 230
},
{
"epoch": 0.06389776357827476,
"grad_norm": 16.281299591064453,
"learning_rate": 4.893503727369542e-06,
"loss": 1.0512,
"step": 240
},
{
"epoch": 0.06656017039403621,
"grad_norm": 9.094976425170898,
"learning_rate": 4.889066382676606e-06,
"loss": 1.0198,
"step": 250
},
{
"epoch": 0.06922257720979766,
"grad_norm": 14.096275329589844,
"learning_rate": 4.884629037983671e-06,
"loss": 0.9963,
"step": 260
},
{
"epoch": 0.07188498402555911,
"grad_norm": 9.90117073059082,
"learning_rate": 4.880191693290735e-06,
"loss": 0.9744,
"step": 270
},
{
"epoch": 0.07454739084132056,
"grad_norm": 32.20856475830078,
"learning_rate": 4.875754348597799e-06,
"loss": 0.9397,
"step": 280
},
{
"epoch": 0.077209797657082,
"grad_norm": 24.709117889404297,
"learning_rate": 4.8713170039048635e-06,
"loss": 0.9559,
"step": 290
},
{
"epoch": 0.07987220447284345,
"grad_norm": 34.5052490234375,
"learning_rate": 4.866879659211928e-06,
"loss": 1.0161,
"step": 300
},
{
"epoch": 0.0825346112886049,
"grad_norm": 24.2115535736084,
"learning_rate": 4.862442314518992e-06,
"loss": 0.9021,
"step": 310
},
{
"epoch": 0.08519701810436635,
"grad_norm": 21.005727767944336,
"learning_rate": 4.858004969826057e-06,
"loss": 0.8696,
"step": 320
},
{
"epoch": 0.0878594249201278,
"grad_norm": 15.019360542297363,
"learning_rate": 4.853567625133121e-06,
"loss": 0.9507,
"step": 330
},
{
"epoch": 0.09052183173588925,
"grad_norm": 16.113773345947266,
"learning_rate": 4.849130280440185e-06,
"loss": 0.9447,
"step": 340
},
{
"epoch": 0.09318423855165069,
"grad_norm": 14.285008430480957,
"learning_rate": 4.84469293574725e-06,
"loss": 0.9346,
"step": 350
},
{
"epoch": 0.09584664536741214,
"grad_norm": 23.993881225585938,
"learning_rate": 4.840255591054314e-06,
"loss": 0.8827,
"step": 360
},
{
"epoch": 0.09850905218317359,
"grad_norm": 17.099952697753906,
"learning_rate": 4.835818246361378e-06,
"loss": 0.8679,
"step": 370
},
{
"epoch": 0.10117145899893504,
"grad_norm": 15.823094367980957,
"learning_rate": 4.831380901668442e-06,
"loss": 0.9071,
"step": 380
},
{
"epoch": 0.10383386581469649,
"grad_norm": 23.060998916625977,
"learning_rate": 4.826943556975506e-06,
"loss": 0.8569,
"step": 390
},
{
"epoch": 0.10649627263045794,
"grad_norm": 47.94389724731445,
"learning_rate": 4.82250621228257e-06,
"loss": 0.8119,
"step": 400
},
{
"epoch": 0.10915867944621938,
"grad_norm": 26.12384796142578,
"learning_rate": 4.818068867589634e-06,
"loss": 0.7319,
"step": 410
},
{
"epoch": 0.11182108626198083,
"grad_norm": 13.601881980895996,
"learning_rate": 4.813631522896699e-06,
"loss": 0.6861,
"step": 420
},
{
"epoch": 0.11448349307774228,
"grad_norm": 24.602535247802734,
"learning_rate": 4.809194178203763e-06,
"loss": 0.8515,
"step": 430
},
{
"epoch": 0.11714589989350373,
"grad_norm": 27.71798324584961,
"learning_rate": 4.8047568335108275e-06,
"loss": 0.9217,
"step": 440
},
{
"epoch": 0.11980830670926518,
"grad_norm": 14.012025833129883,
"learning_rate": 4.800319488817892e-06,
"loss": 0.708,
"step": 450
},
{
"epoch": 0.12247071352502663,
"grad_norm": 13.043439865112305,
"learning_rate": 4.795882144124956e-06,
"loss": 0.861,
"step": 460
},
{
"epoch": 0.12513312034078808,
"grad_norm": 21.09777069091797,
"learning_rate": 4.79144479943202e-06,
"loss": 0.6439,
"step": 470
},
{
"epoch": 0.12779552715654952,
"grad_norm": 33.561073303222656,
"learning_rate": 4.787007454739085e-06,
"loss": 0.8182,
"step": 480
},
{
"epoch": 0.13045793397231098,
"grad_norm": 47.77071762084961,
"learning_rate": 4.782570110046149e-06,
"loss": 0.6582,
"step": 490
},
{
"epoch": 0.13312034078807242,
"grad_norm": 26.4221248626709,
"learning_rate": 4.778132765353213e-06,
"loss": 0.626,
"step": 500
},
{
"epoch": 0.13578274760383385,
"grad_norm": 25.370988845825195,
"learning_rate": 4.773695420660277e-06,
"loss": 0.7418,
"step": 510
},
{
"epoch": 0.13844515441959532,
"grad_norm": 21.20345115661621,
"learning_rate": 4.769258075967342e-06,
"loss": 0.8102,
"step": 520
},
{
"epoch": 0.14110756123535675,
"grad_norm": 31.8303279876709,
"learning_rate": 4.764820731274406e-06,
"loss": 0.674,
"step": 530
},
{
"epoch": 0.14376996805111822,
"grad_norm": 44.16394805908203,
"learning_rate": 4.76038338658147e-06,
"loss": 0.8384,
"step": 540
},
{
"epoch": 0.14643237486687966,
"grad_norm": 23.01811981201172,
"learning_rate": 4.755946041888534e-06,
"loss": 0.7557,
"step": 550
},
{
"epoch": 0.14909478168264112,
"grad_norm": 57.59928512573242,
"learning_rate": 4.751508697195598e-06,
"loss": 0.7821,
"step": 560
},
{
"epoch": 0.15175718849840256,
"grad_norm": 26.099145889282227,
"learning_rate": 4.747071352502663e-06,
"loss": 0.8087,
"step": 570
},
{
"epoch": 0.154419595314164,
"grad_norm": 17.113813400268555,
"learning_rate": 4.7426340078097274e-06,
"loss": 0.7424,
"step": 580
},
{
"epoch": 0.15708200212992546,
"grad_norm": 17.72032356262207,
"learning_rate": 4.7381966631167915e-06,
"loss": 0.7406,
"step": 590
},
{
"epoch": 0.1597444089456869,
"grad_norm": 20.37584114074707,
"learning_rate": 4.733759318423856e-06,
"loss": 0.6136,
"step": 600
},
{
"epoch": 0.16240681576144836,
"grad_norm": 14.854445457458496,
"learning_rate": 4.72932197373092e-06,
"loss": 0.7863,
"step": 610
},
{
"epoch": 0.1650692225772098,
"grad_norm": 32.50448989868164,
"learning_rate": 4.724884629037984e-06,
"loss": 0.7957,
"step": 620
},
{
"epoch": 0.16773162939297126,
"grad_norm": 29.92447853088379,
"learning_rate": 4.720447284345048e-06,
"loss": 0.6076,
"step": 630
},
{
"epoch": 0.1703940362087327,
"grad_norm": 12.294814109802246,
"learning_rate": 4.716009939652113e-06,
"loss": 0.7537,
"step": 640
},
{
"epoch": 0.17305644302449413,
"grad_norm": 28.16832160949707,
"learning_rate": 4.711572594959177e-06,
"loss": 0.6871,
"step": 650
},
{
"epoch": 0.1757188498402556,
"grad_norm": 90.14241027832031,
"learning_rate": 4.707135250266241e-06,
"loss": 0.5651,
"step": 660
},
{
"epoch": 0.17838125665601703,
"grad_norm": 48.91092300415039,
"learning_rate": 4.702697905573305e-06,
"loss": 0.8134,
"step": 670
},
{
"epoch": 0.1810436634717785,
"grad_norm": 27.665481567382812,
"learning_rate": 4.698260560880369e-06,
"loss": 0.8326,
"step": 680
},
{
"epoch": 0.18370607028753994,
"grad_norm": 33.83591842651367,
"learning_rate": 4.693823216187433e-06,
"loss": 0.7091,
"step": 690
},
{
"epoch": 0.18636847710330137,
"grad_norm": 25.907155990600586,
"learning_rate": 4.6893858714944975e-06,
"loss": 0.7269,
"step": 700
},
{
"epoch": 0.18903088391906284,
"grad_norm": 19.19723129272461,
"learning_rate": 4.684948526801562e-06,
"loss": 0.6516,
"step": 710
},
{
"epoch": 0.19169329073482427,
"grad_norm": 20.81375503540039,
"learning_rate": 4.6805111821086265e-06,
"loss": 0.6718,
"step": 720
},
{
"epoch": 0.19435569755058574,
"grad_norm": 49.074684143066406,
"learning_rate": 4.6760738374156914e-06,
"loss": 0.7673,
"step": 730
},
{
"epoch": 0.19701810436634717,
"grad_norm": 31.557104110717773,
"learning_rate": 4.6716364927227555e-06,
"loss": 0.6633,
"step": 740
},
{
"epoch": 0.19968051118210864,
"grad_norm": 33.61763000488281,
"learning_rate": 4.66719914802982e-06,
"loss": 0.6651,
"step": 750
},
{
"epoch": 0.20234291799787008,
"grad_norm": 29.612642288208008,
"learning_rate": 4.662761803336884e-06,
"loss": 0.7033,
"step": 760
},
{
"epoch": 0.2050053248136315,
"grad_norm": 35.30272674560547,
"learning_rate": 4.658324458643948e-06,
"loss": 0.6713,
"step": 770
},
{
"epoch": 0.20766773162939298,
"grad_norm": 56.90960693359375,
"learning_rate": 4.653887113951012e-06,
"loss": 0.5338,
"step": 780
},
{
"epoch": 0.2103301384451544,
"grad_norm": 42.410377502441406,
"learning_rate": 4.649449769258076e-06,
"loss": 0.6004,
"step": 790
},
{
"epoch": 0.21299254526091588,
"grad_norm": 25.92689323425293,
"learning_rate": 4.645012424565141e-06,
"loss": 0.7435,
"step": 800
},
{
"epoch": 0.21565495207667731,
"grad_norm": 21.87204933166504,
"learning_rate": 4.640575079872205e-06,
"loss": 0.6701,
"step": 810
},
{
"epoch": 0.21831735889243875,
"grad_norm": 29.98390007019043,
"learning_rate": 4.636137735179269e-06,
"loss": 0.5621,
"step": 820
},
{
"epoch": 0.22097976570820022,
"grad_norm": 45.0776481628418,
"learning_rate": 4.631700390486333e-06,
"loss": 0.7574,
"step": 830
},
{
"epoch": 0.22364217252396165,
"grad_norm": 22.044301986694336,
"learning_rate": 4.627263045793397e-06,
"loss": 0.5998,
"step": 840
},
{
"epoch": 0.22630457933972312,
"grad_norm": 19.954084396362305,
"learning_rate": 4.6228257011004615e-06,
"loss": 0.6442,
"step": 850
},
{
"epoch": 0.22896698615548455,
"grad_norm": 46.29410171508789,
"learning_rate": 4.618388356407526e-06,
"loss": 0.5442,
"step": 860
},
{
"epoch": 0.23162939297124602,
"grad_norm": 46.635860443115234,
"learning_rate": 4.6139510117145905e-06,
"loss": 0.4968,
"step": 870
},
{
"epoch": 0.23429179978700745,
"grad_norm": 61.50525665283203,
"learning_rate": 4.609513667021655e-06,
"loss": 0.6694,
"step": 880
},
{
"epoch": 0.2369542066027689,
"grad_norm": 22.6109619140625,
"learning_rate": 4.605076322328719e-06,
"loss": 0.4651,
"step": 890
},
{
"epoch": 0.23961661341853036,
"grad_norm": 35.642433166503906,
"learning_rate": 4.600638977635783e-06,
"loss": 0.4204,
"step": 900
},
{
"epoch": 0.2422790202342918,
"grad_norm": 40.22004699707031,
"learning_rate": 4.596201632942847e-06,
"loss": 0.5079,
"step": 910
},
{
"epoch": 0.24494142705005326,
"grad_norm": 54.28346633911133,
"learning_rate": 4.591764288249911e-06,
"loss": 0.6034,
"step": 920
},
{
"epoch": 0.2476038338658147,
"grad_norm": 14.682561874389648,
"learning_rate": 4.587326943556976e-06,
"loss": 0.6961,
"step": 930
},
{
"epoch": 0.25026624068157616,
"grad_norm": 46.71699142456055,
"learning_rate": 4.58288959886404e-06,
"loss": 0.5714,
"step": 940
},
{
"epoch": 0.25292864749733757,
"grad_norm": 19.711076736450195,
"learning_rate": 4.578452254171105e-06,
"loss": 0.5371,
"step": 950
},
{
"epoch": 0.25559105431309903,
"grad_norm": 44.26546096801758,
"learning_rate": 4.574014909478169e-06,
"loss": 0.6029,
"step": 960
},
{
"epoch": 0.2582534611288605,
"grad_norm": 30.732563018798828,
"learning_rate": 4.569577564785233e-06,
"loss": 0.6433,
"step": 970
},
{
"epoch": 0.26091586794462196,
"grad_norm": 36.73834991455078,
"learning_rate": 4.565140220092297e-06,
"loss": 0.5183,
"step": 980
},
{
"epoch": 0.26357827476038337,
"grad_norm": 33.021827697753906,
"learning_rate": 4.560702875399361e-06,
"loss": 0.7053,
"step": 990
},
{
"epoch": 0.26624068157614483,
"grad_norm": 15.916735649108887,
"learning_rate": 4.5562655307064255e-06,
"loss": 0.5576,
"step": 1000
},
{
"epoch": 0.2689030883919063,
"grad_norm": 33.52116775512695,
"learning_rate": 4.55182818601349e-06,
"loss": 0.5954,
"step": 1010
},
{
"epoch": 0.2715654952076677,
"grad_norm": 71.4487533569336,
"learning_rate": 4.5473908413205545e-06,
"loss": 0.6136,
"step": 1020
},
{
"epoch": 0.27422790202342917,
"grad_norm": 51.97545623779297,
"learning_rate": 4.542953496627619e-06,
"loss": 0.6643,
"step": 1030
},
{
"epoch": 0.27689030883919064,
"grad_norm": 16.853572845458984,
"learning_rate": 4.538516151934683e-06,
"loss": 0.4944,
"step": 1040
},
{
"epoch": 0.2795527156549521,
"grad_norm": 54.41709518432617,
"learning_rate": 4.534078807241747e-06,
"loss": 0.671,
"step": 1050
},
{
"epoch": 0.2822151224707135,
"grad_norm": 27.009441375732422,
"learning_rate": 4.529641462548811e-06,
"loss": 0.3649,
"step": 1060
},
{
"epoch": 0.284877529286475,
"grad_norm": 58.78076171875,
"learning_rate": 4.525204117855875e-06,
"loss": 0.6091,
"step": 1070
},
{
"epoch": 0.28753993610223644,
"grad_norm": 44.915767669677734,
"learning_rate": 4.52076677316294e-06,
"loss": 0.5505,
"step": 1080
},
{
"epoch": 0.29020234291799785,
"grad_norm": 40.530338287353516,
"learning_rate": 4.516329428470004e-06,
"loss": 0.5749,
"step": 1090
},
{
"epoch": 0.2928647497337593,
"grad_norm": 100.83206939697266,
"learning_rate": 4.511892083777068e-06,
"loss": 0.7549,
"step": 1100
},
{
"epoch": 0.2955271565495208,
"grad_norm": 20.176633834838867,
"learning_rate": 4.507454739084132e-06,
"loss": 0.4467,
"step": 1110
},
{
"epoch": 0.29818956336528224,
"grad_norm": 6.653196811676025,
"learning_rate": 4.503017394391196e-06,
"loss": 0.5694,
"step": 1120
},
{
"epoch": 0.30085197018104365,
"grad_norm": 30.77353858947754,
"learning_rate": 4.4985800496982605e-06,
"loss": 0.6137,
"step": 1130
},
{
"epoch": 0.3035143769968051,
"grad_norm": 47.246639251708984,
"learning_rate": 4.494142705005325e-06,
"loss": 0.7237,
"step": 1140
},
{
"epoch": 0.3061767838125666,
"grad_norm": 19.615482330322266,
"learning_rate": 4.4897053603123895e-06,
"loss": 0.5938,
"step": 1150
},
{
"epoch": 0.308839190628328,
"grad_norm": 43.58363342285156,
"learning_rate": 4.485268015619454e-06,
"loss": 0.5076,
"step": 1160
},
{
"epoch": 0.31150159744408945,
"grad_norm": 19.3497257232666,
"learning_rate": 4.4808306709265185e-06,
"loss": 0.5314,
"step": 1170
},
{
"epoch": 0.3141640042598509,
"grad_norm": 34.415191650390625,
"learning_rate": 4.476393326233583e-06,
"loss": 0.6132,
"step": 1180
},
{
"epoch": 0.3168264110756124,
"grad_norm": 26.902507781982422,
"learning_rate": 4.471955981540647e-06,
"loss": 0.5762,
"step": 1190
},
{
"epoch": 0.3194888178913738,
"grad_norm": 12.590169906616211,
"learning_rate": 4.467518636847711e-06,
"loss": 0.5765,
"step": 1200
},
{
"epoch": 0.32215122470713525,
"grad_norm": 60.077022552490234,
"learning_rate": 4.463081292154775e-06,
"loss": 0.56,
"step": 1210
},
{
"epoch": 0.3248136315228967,
"grad_norm": 38.976280212402344,
"learning_rate": 4.458643947461839e-06,
"loss": 0.6197,
"step": 1220
},
{
"epoch": 0.3274760383386581,
"grad_norm": 98.09929656982422,
"learning_rate": 4.454206602768903e-06,
"loss": 0.5782,
"step": 1230
},
{
"epoch": 0.3301384451544196,
"grad_norm": 38.21907424926758,
"learning_rate": 4.449769258075968e-06,
"loss": 0.5094,
"step": 1240
},
{
"epoch": 0.33280085197018106,
"grad_norm": 33.70249557495117,
"learning_rate": 4.445331913383032e-06,
"loss": 0.5482,
"step": 1250
},
{
"epoch": 0.3354632587859425,
"grad_norm": 19.78879737854004,
"learning_rate": 4.440894568690096e-06,
"loss": 0.5706,
"step": 1260
},
{
"epoch": 0.33812566560170393,
"grad_norm": 52.534610748291016,
"learning_rate": 4.43645722399716e-06,
"loss": 0.6004,
"step": 1270
},
{
"epoch": 0.3407880724174654,
"grad_norm": 27.925262451171875,
"learning_rate": 4.4320198793042245e-06,
"loss": 0.5816,
"step": 1280
},
{
"epoch": 0.34345047923322686,
"grad_norm": 69.1347885131836,
"learning_rate": 4.4275825346112886e-06,
"loss": 0.6185,
"step": 1290
},
{
"epoch": 0.34611288604898827,
"grad_norm": 31.98657989501953,
"learning_rate": 4.423145189918353e-06,
"loss": 0.6491,
"step": 1300
},
{
"epoch": 0.34877529286474973,
"grad_norm": 16.68319320678711,
"learning_rate": 4.418707845225418e-06,
"loss": 0.4388,
"step": 1310
},
{
"epoch": 0.3514376996805112,
"grad_norm": 39.13215255737305,
"learning_rate": 4.414270500532482e-06,
"loss": 0.6739,
"step": 1320
},
{
"epoch": 0.3541001064962726,
"grad_norm": 36.07078552246094,
"learning_rate": 4.409833155839546e-06,
"loss": 0.5749,
"step": 1330
},
{
"epoch": 0.35676251331203407,
"grad_norm": 55.29554748535156,
"learning_rate": 4.40539581114661e-06,
"loss": 0.598,
"step": 1340
},
{
"epoch": 0.35942492012779553,
"grad_norm": 38.93369674682617,
"learning_rate": 4.400958466453675e-06,
"loss": 0.6119,
"step": 1350
},
{
"epoch": 0.362087326943557,
"grad_norm": 35.16259002685547,
"learning_rate": 4.396521121760739e-06,
"loss": 0.462,
"step": 1360
},
{
"epoch": 0.3647497337593184,
"grad_norm": 51.75934982299805,
"learning_rate": 4.392083777067803e-06,
"loss": 0.5884,
"step": 1370
},
{
"epoch": 0.36741214057507987,
"grad_norm": 24.179126739501953,
"learning_rate": 4.387646432374867e-06,
"loss": 0.5186,
"step": 1380
},
{
"epoch": 0.37007454739084134,
"grad_norm": 16.37703514099121,
"learning_rate": 4.383209087681931e-06,
"loss": 0.4809,
"step": 1390
},
{
"epoch": 0.37273695420660274,
"grad_norm": 60.02155303955078,
"learning_rate": 4.378771742988996e-06,
"loss": 0.5701,
"step": 1400
},
{
"epoch": 0.3753993610223642,
"grad_norm": 32.391414642333984,
"learning_rate": 4.37433439829606e-06,
"loss": 0.5422,
"step": 1410
},
{
"epoch": 0.3780617678381257,
"grad_norm": 32.528743743896484,
"learning_rate": 4.369897053603124e-06,
"loss": 0.4741,
"step": 1420
},
{
"epoch": 0.38072417465388714,
"grad_norm": 33.44760513305664,
"learning_rate": 4.3654597089101885e-06,
"loss": 0.4124,
"step": 1430
},
{
"epoch": 0.38338658146964855,
"grad_norm": 30.302766799926758,
"learning_rate": 4.361022364217253e-06,
"loss": 0.409,
"step": 1440
},
{
"epoch": 0.38604898828541,
"grad_norm": 46.419212341308594,
"learning_rate": 4.356585019524317e-06,
"loss": 0.4647,
"step": 1450
},
{
"epoch": 0.3887113951011715,
"grad_norm": 43.90689468383789,
"learning_rate": 4.352147674831382e-06,
"loss": 0.6913,
"step": 1460
},
{
"epoch": 0.3913738019169329,
"grad_norm": 6.017773151397705,
"learning_rate": 4.347710330138446e-06,
"loss": 0.5771,
"step": 1470
},
{
"epoch": 0.39403620873269435,
"grad_norm": 14.032795906066895,
"learning_rate": 4.34327298544551e-06,
"loss": 0.601,
"step": 1480
},
{
"epoch": 0.3966986155484558,
"grad_norm": 20.390214920043945,
"learning_rate": 4.338835640752574e-06,
"loss": 0.4624,
"step": 1490
},
{
"epoch": 0.3993610223642173,
"grad_norm": 58.51810836791992,
"learning_rate": 4.334398296059638e-06,
"loss": 0.5423,
"step": 1500
},
{
"epoch": 0.4020234291799787,
"grad_norm": 56.609100341796875,
"learning_rate": 4.329960951366702e-06,
"loss": 0.5219,
"step": 1510
},
{
"epoch": 0.40468583599574015,
"grad_norm": 65.66415405273438,
"learning_rate": 4.325523606673766e-06,
"loss": 0.5343,
"step": 1520
},
{
"epoch": 0.4073482428115016,
"grad_norm": 32.781795501708984,
"learning_rate": 4.321086261980831e-06,
"loss": 0.4656,
"step": 1530
},
{
"epoch": 0.410010649627263,
"grad_norm": 34.07284927368164,
"learning_rate": 4.316648917287895e-06,
"loss": 0.5398,
"step": 1540
},
{
"epoch": 0.4126730564430245,
"grad_norm": 34.39366912841797,
"learning_rate": 4.312211572594959e-06,
"loss": 0.5753,
"step": 1550
},
{
"epoch": 0.41533546325878595,
"grad_norm": 11.670945167541504,
"learning_rate": 4.307774227902024e-06,
"loss": 0.3905,
"step": 1560
},
{
"epoch": 0.4179978700745474,
"grad_norm": 48.68906784057617,
"learning_rate": 4.303336883209088e-06,
"loss": 0.662,
"step": 1570
},
{
"epoch": 0.4206602768903088,
"grad_norm": 16.71038818359375,
"learning_rate": 4.2988995385161525e-06,
"loss": 0.4915,
"step": 1580
},
{
"epoch": 0.4233226837060703,
"grad_norm": 24.46503448486328,
"learning_rate": 4.294462193823217e-06,
"loss": 0.6037,
"step": 1590
},
{
"epoch": 0.42598509052183176,
"grad_norm": 42.81535720825195,
"learning_rate": 4.290024849130281e-06,
"loss": 0.5119,
"step": 1600
},
{
"epoch": 0.42864749733759316,
"grad_norm": 33.23320007324219,
"learning_rate": 4.285587504437345e-06,
"loss": 0.384,
"step": 1610
},
{
"epoch": 0.43130990415335463,
"grad_norm": 36.269622802734375,
"learning_rate": 4.28115015974441e-06,
"loss": 0.561,
"step": 1620
},
{
"epoch": 0.4339723109691161,
"grad_norm": 93.0941162109375,
"learning_rate": 4.276712815051474e-06,
"loss": 0.4301,
"step": 1630
},
{
"epoch": 0.4366347177848775,
"grad_norm": 51.127262115478516,
"learning_rate": 4.272275470358538e-06,
"loss": 0.4352,
"step": 1640
},
{
"epoch": 0.43929712460063897,
"grad_norm": 35.915279388427734,
"learning_rate": 4.267838125665602e-06,
"loss": 0.518,
"step": 1650
},
{
"epoch": 0.44195953141640043,
"grad_norm": 31.928068161010742,
"learning_rate": 4.263400780972666e-06,
"loss": 0.586,
"step": 1660
},
{
"epoch": 0.4446219382321619,
"grad_norm": 43.50398635864258,
"learning_rate": 4.25896343627973e-06,
"loss": 0.4898,
"step": 1670
},
{
"epoch": 0.4472843450479233,
"grad_norm": 16.387245178222656,
"learning_rate": 4.254526091586794e-06,
"loss": 0.6626,
"step": 1680
},
{
"epoch": 0.44994675186368477,
"grad_norm": 25.265329360961914,
"learning_rate": 4.250088746893859e-06,
"loss": 0.4814,
"step": 1690
},
{
"epoch": 0.45260915867944623,
"grad_norm": 59.30329895019531,
"learning_rate": 4.245651402200923e-06,
"loss": 0.4817,
"step": 1700
},
{
"epoch": 0.45527156549520764,
"grad_norm": 36.43183135986328,
"learning_rate": 4.2412140575079875e-06,
"loss": 0.5139,
"step": 1710
},
{
"epoch": 0.4579339723109691,
"grad_norm": 35.189395904541016,
"learning_rate": 4.2367767128150516e-06,
"loss": 0.6202,
"step": 1720
},
{
"epoch": 0.46059637912673057,
"grad_norm": 36.17230987548828,
"learning_rate": 4.232339368122116e-06,
"loss": 0.5062,
"step": 1730
},
{
"epoch": 0.46325878594249204,
"grad_norm": 29.44896697998047,
"learning_rate": 4.22790202342918e-06,
"loss": 0.5371,
"step": 1740
},
{
"epoch": 0.46592119275825344,
"grad_norm": 29.84225082397461,
"learning_rate": 4.223464678736245e-06,
"loss": 0.4877,
"step": 1750
},
{
"epoch": 0.4685835995740149,
"grad_norm": 67.50408935546875,
"learning_rate": 4.219027334043309e-06,
"loss": 0.368,
"step": 1760
},
{
"epoch": 0.4712460063897764,
"grad_norm": 35.0632209777832,
"learning_rate": 4.214589989350373e-06,
"loss": 0.6171,
"step": 1770
},
{
"epoch": 0.4739084132055378,
"grad_norm": 67.43580627441406,
"learning_rate": 4.210152644657438e-06,
"loss": 0.4611,
"step": 1780
},
{
"epoch": 0.47657082002129925,
"grad_norm": 14.031696319580078,
"learning_rate": 4.205715299964502e-06,
"loss": 0.3923,
"step": 1790
},
{
"epoch": 0.4792332268370607,
"grad_norm": 34.448787689208984,
"learning_rate": 4.201277955271566e-06,
"loss": 0.5624,
"step": 1800
},
{
"epoch": 0.4818956336528222,
"grad_norm": 22.780643463134766,
"learning_rate": 4.19684061057863e-06,
"loss": 0.5367,
"step": 1810
},
{
"epoch": 0.4845580404685836,
"grad_norm": 23.203998565673828,
"learning_rate": 4.192403265885694e-06,
"loss": 0.5435,
"step": 1820
},
{
"epoch": 0.48722044728434505,
"grad_norm": 29.038433074951172,
"learning_rate": 4.187965921192758e-06,
"loss": 0.429,
"step": 1830
},
{
"epoch": 0.4898828541001065,
"grad_norm": 46.48039245605469,
"learning_rate": 4.183528576499823e-06,
"loss": 0.6275,
"step": 1840
},
{
"epoch": 0.4925452609158679,
"grad_norm": 25.627628326416016,
"learning_rate": 4.179091231806887e-06,
"loss": 0.5386,
"step": 1850
},
{
"epoch": 0.4952076677316294,
"grad_norm": 16.85994529724121,
"learning_rate": 4.1746538871139515e-06,
"loss": 0.5058,
"step": 1860
},
{
"epoch": 0.49787007454739085,
"grad_norm": 25.239809036254883,
"learning_rate": 4.170216542421016e-06,
"loss": 0.2934,
"step": 1870
},
{
"epoch": 0.5005324813631523,
"grad_norm": 26.714866638183594,
"learning_rate": 4.16577919772808e-06,
"loss": 0.3887,
"step": 1880
},
{
"epoch": 0.5031948881789138,
"grad_norm": 29.640674591064453,
"learning_rate": 4.161341853035144e-06,
"loss": 0.5879,
"step": 1890
},
{
"epoch": 0.5058572949946751,
"grad_norm": 18.185070037841797,
"learning_rate": 4.156904508342208e-06,
"loss": 0.4893,
"step": 1900
},
{
"epoch": 0.5085197018104366,
"grad_norm": 35.684303283691406,
"learning_rate": 4.152467163649273e-06,
"loss": 0.4306,
"step": 1910
},
{
"epoch": 0.5111821086261981,
"grad_norm": 40.58506393432617,
"learning_rate": 4.148029818956337e-06,
"loss": 0.7325,
"step": 1920
},
{
"epoch": 0.5138445154419595,
"grad_norm": 17.87203025817871,
"learning_rate": 4.143592474263401e-06,
"loss": 0.5623,
"step": 1930
},
{
"epoch": 0.516506922257721,
"grad_norm": 71.33222961425781,
"learning_rate": 4.139155129570465e-06,
"loss": 0.6075,
"step": 1940
},
{
"epoch": 0.5191693290734825,
"grad_norm": 26.079328536987305,
"learning_rate": 4.134717784877529e-06,
"loss": 0.6177,
"step": 1950
},
{
"epoch": 0.5218317358892439,
"grad_norm": 25.3037166595459,
"learning_rate": 4.130280440184593e-06,
"loss": 0.4658,
"step": 1960
},
{
"epoch": 0.5244941427050053,
"grad_norm": 46.988792419433594,
"learning_rate": 4.125843095491658e-06,
"loss": 0.5115,
"step": 1970
},
{
"epoch": 0.5271565495207667,
"grad_norm": 36.285133361816406,
"learning_rate": 4.121405750798722e-06,
"loss": 0.5386,
"step": 1980
},
{
"epoch": 0.5298189563365282,
"grad_norm": 37.87679672241211,
"learning_rate": 4.1169684061057865e-06,
"loss": 0.4324,
"step": 1990
},
{
"epoch": 0.5324813631522897,
"grad_norm": 25.39067840576172,
"learning_rate": 4.112531061412851e-06,
"loss": 0.6276,
"step": 2000
},
{
"epoch": 0.5351437699680511,
"grad_norm": 20.298128128051758,
"learning_rate": 4.1080937167199155e-06,
"loss": 0.4346,
"step": 2010
},
{
"epoch": 0.5378061767838126,
"grad_norm": 20.097126007080078,
"learning_rate": 4.10365637202698e-06,
"loss": 0.5676,
"step": 2020
},
{
"epoch": 0.5404685835995741,
"grad_norm": 30.568199157714844,
"learning_rate": 4.099219027334044e-06,
"loss": 0.513,
"step": 2030
},
{
"epoch": 0.5431309904153354,
"grad_norm": 24.185678482055664,
"learning_rate": 4.094781682641108e-06,
"loss": 0.5528,
"step": 2040
},
{
"epoch": 0.5457933972310969,
"grad_norm": 15.934721946716309,
"learning_rate": 4.090344337948172e-06,
"loss": 0.4815,
"step": 2050
},
{
"epoch": 0.5484558040468583,
"grad_norm": 41.82506561279297,
"learning_rate": 4.085906993255237e-06,
"loss": 0.5723,
"step": 2060
},
{
"epoch": 0.5511182108626198,
"grad_norm": 36.954341888427734,
"learning_rate": 4.081469648562301e-06,
"loss": 0.4566,
"step": 2070
},
{
"epoch": 0.5537806176783813,
"grad_norm": 49.04082489013672,
"learning_rate": 4.077032303869365e-06,
"loss": 0.6509,
"step": 2080
},
{
"epoch": 0.5564430244941427,
"grad_norm": 17.157665252685547,
"learning_rate": 4.072594959176429e-06,
"loss": 0.5015,
"step": 2090
},
{
"epoch": 0.5591054313099042,
"grad_norm": 21.952617645263672,
"learning_rate": 4.068157614483493e-06,
"loss": 0.5426,
"step": 2100
},
{
"epoch": 0.5617678381256656,
"grad_norm": 23.275299072265625,
"learning_rate": 4.063720269790557e-06,
"loss": 0.4363,
"step": 2110
},
{
"epoch": 0.564430244941427,
"grad_norm": 16.434040069580078,
"learning_rate": 4.0592829250976214e-06,
"loss": 0.475,
"step": 2120
},
{
"epoch": 0.5670926517571885,
"grad_norm": 37.54704666137695,
"learning_rate": 4.054845580404686e-06,
"loss": 0.5098,
"step": 2130
},
{
"epoch": 0.56975505857295,
"grad_norm": 9.03038501739502,
"learning_rate": 4.0504082357117505e-06,
"loss": 0.4563,
"step": 2140
},
{
"epoch": 0.5724174653887114,
"grad_norm": 87.7586669921875,
"learning_rate": 4.0459708910188146e-06,
"loss": 0.513,
"step": 2150
},
{
"epoch": 0.5750798722044729,
"grad_norm": 29.9242000579834,
"learning_rate": 4.041533546325879e-06,
"loss": 0.5034,
"step": 2160
},
{
"epoch": 0.5777422790202343,
"grad_norm": 52.17824172973633,
"learning_rate": 4.037096201632943e-06,
"loss": 0.5702,
"step": 2170
},
{
"epoch": 0.5804046858359957,
"grad_norm": 38.12932586669922,
"learning_rate": 4.032658856940008e-06,
"loss": 0.3469,
"step": 2180
},
{
"epoch": 0.5830670926517572,
"grad_norm": 39.25871276855469,
"learning_rate": 4.028221512247072e-06,
"loss": 0.4787,
"step": 2190
},
{
"epoch": 0.5857294994675186,
"grad_norm": 22.107851028442383,
"learning_rate": 4.023784167554136e-06,
"loss": 0.4713,
"step": 2200
},
{
"epoch": 0.5883919062832801,
"grad_norm": 44.968650817871094,
"learning_rate": 4.0193468228612e-06,
"loss": 0.4283,
"step": 2210
},
{
"epoch": 0.5910543130990416,
"grad_norm": 29.94749641418457,
"learning_rate": 4.014909478168265e-06,
"loss": 0.4444,
"step": 2220
},
{
"epoch": 0.593716719914803,
"grad_norm": 27.584457397460938,
"learning_rate": 4.010472133475329e-06,
"loss": 0.3339,
"step": 2230
},
{
"epoch": 0.5963791267305645,
"grad_norm": 42.6247673034668,
"learning_rate": 4.006034788782393e-06,
"loss": 0.5293,
"step": 2240
},
{
"epoch": 0.5990415335463258,
"grad_norm": 52.9679069519043,
"learning_rate": 4.001597444089457e-06,
"loss": 0.4374,
"step": 2250
},
{
"epoch": 0.6017039403620873,
"grad_norm": 26.44418716430664,
"learning_rate": 3.997160099396521e-06,
"loss": 0.4075,
"step": 2260
},
{
"epoch": 0.6043663471778488,
"grad_norm": 30.389057159423828,
"learning_rate": 3.9927227547035854e-06,
"loss": 0.6161,
"step": 2270
},
{
"epoch": 0.6070287539936102,
"grad_norm": 56.16935348510742,
"learning_rate": 3.9882854100106495e-06,
"loss": 0.6018,
"step": 2280
},
{
"epoch": 0.6096911608093717,
"grad_norm": 29.757858276367188,
"learning_rate": 3.9838480653177145e-06,
"loss": 0.4651,
"step": 2290
},
{
"epoch": 0.6123535676251332,
"grad_norm": 69.78173828125,
"learning_rate": 3.979410720624779e-06,
"loss": 0.5459,
"step": 2300
},
{
"epoch": 0.6150159744408946,
"grad_norm": 11.894743919372559,
"learning_rate": 3.974973375931843e-06,
"loss": 0.5168,
"step": 2310
},
{
"epoch": 0.617678381256656,
"grad_norm": 48.09218215942383,
"learning_rate": 3.970536031238907e-06,
"loss": 0.4355,
"step": 2320
},
{
"epoch": 0.6203407880724174,
"grad_norm": 44.30209732055664,
"learning_rate": 3.966098686545971e-06,
"loss": 0.4098,
"step": 2330
},
{
"epoch": 0.6230031948881789,
"grad_norm": 21.9451961517334,
"learning_rate": 3.961661341853035e-06,
"loss": 0.7032,
"step": 2340
},
{
"epoch": 0.6256656017039404,
"grad_norm": 24.848848342895508,
"learning_rate": 3.9572239971601e-06,
"loss": 0.458,
"step": 2350
},
{
"epoch": 0.6283280085197018,
"grad_norm": 34.55842971801758,
"learning_rate": 3.952786652467164e-06,
"loss": 0.5756,
"step": 2360
},
{
"epoch": 0.6309904153354633,
"grad_norm": 33.10586166381836,
"learning_rate": 3.948349307774228e-06,
"loss": 0.3295,
"step": 2370
},
{
"epoch": 0.6336528221512248,
"grad_norm": 18.658554077148438,
"learning_rate": 3.943911963081292e-06,
"loss": 0.3606,
"step": 2380
},
{
"epoch": 0.6363152289669861,
"grad_norm": 49.64012145996094,
"learning_rate": 3.939474618388357e-06,
"loss": 0.3052,
"step": 2390
},
{
"epoch": 0.6389776357827476,
"grad_norm": 16.022010803222656,
"learning_rate": 3.935037273695421e-06,
"loss": 0.2831,
"step": 2400
},
{
"epoch": 0.641640042598509,
"grad_norm": 56.64069366455078,
"learning_rate": 3.930599929002485e-06,
"loss": 0.5239,
"step": 2410
},
{
"epoch": 0.6443024494142705,
"grad_norm": 57.587303161621094,
"learning_rate": 3.9261625843095495e-06,
"loss": 0.4597,
"step": 2420
},
{
"epoch": 0.646964856230032,
"grad_norm": 42.01318359375,
"learning_rate": 3.9217252396166136e-06,
"loss": 0.6487,
"step": 2430
},
{
"epoch": 0.6496272630457934,
"grad_norm": 104.44071960449219,
"learning_rate": 3.9172878949236785e-06,
"loss": 0.6415,
"step": 2440
},
{
"epoch": 0.6522896698615549,
"grad_norm": 16.419157028198242,
"learning_rate": 3.912850550230743e-06,
"loss": 0.4841,
"step": 2450
},
{
"epoch": 0.6549520766773163,
"grad_norm": 38.34169387817383,
"learning_rate": 3.908413205537807e-06,
"loss": 0.66,
"step": 2460
},
{
"epoch": 0.6576144834930777,
"grad_norm": 14.542204856872559,
"learning_rate": 3.903975860844871e-06,
"loss": 0.5374,
"step": 2470
},
{
"epoch": 0.6602768903088392,
"grad_norm": 19.804458618164062,
"learning_rate": 3.899538516151935e-06,
"loss": 0.4279,
"step": 2480
},
{
"epoch": 0.6629392971246006,
"grad_norm": 42.396751403808594,
"learning_rate": 3.895101171458999e-06,
"loss": 0.4125,
"step": 2490
},
{
"epoch": 0.6656017039403621,
"grad_norm": 46.004024505615234,
"learning_rate": 3.890663826766063e-06,
"loss": 0.5548,
"step": 2500
},
{
"epoch": 0.6682641107561236,
"grad_norm": 57.96288299560547,
"learning_rate": 3.886226482073128e-06,
"loss": 0.4561,
"step": 2510
},
{
"epoch": 0.670926517571885,
"grad_norm": 24.315519332885742,
"learning_rate": 3.881789137380192e-06,
"loss": 0.4999,
"step": 2520
},
{
"epoch": 0.6735889243876464,
"grad_norm": 13.40101146697998,
"learning_rate": 3.877351792687256e-06,
"loss": 0.5261,
"step": 2530
},
{
"epoch": 0.6762513312034079,
"grad_norm": 37.81547164916992,
"learning_rate": 3.87291444799432e-06,
"loss": 0.4626,
"step": 2540
},
{
"epoch": 0.6789137380191693,
"grad_norm": 39.67838668823242,
"learning_rate": 3.8684771033013844e-06,
"loss": 0.5874,
"step": 2550
},
{
"epoch": 0.6815761448349308,
"grad_norm": 23.241886138916016,
"learning_rate": 3.8640397586084485e-06,
"loss": 0.5641,
"step": 2560
},
{
"epoch": 0.6842385516506923,
"grad_norm": 25.078184127807617,
"learning_rate": 3.859602413915513e-06,
"loss": 0.6331,
"step": 2570
},
{
"epoch": 0.6869009584664537,
"grad_norm": 20.75146484375,
"learning_rate": 3.855165069222578e-06,
"loss": 0.4448,
"step": 2580
},
{
"epoch": 0.6895633652822151,
"grad_norm": 12.216752052307129,
"learning_rate": 3.850727724529642e-06,
"loss": 0.3436,
"step": 2590
},
{
"epoch": 0.6922257720979765,
"grad_norm": 41.08386993408203,
"learning_rate": 3.846290379836707e-06,
"loss": 0.6558,
"step": 2600
},
{
"epoch": 0.694888178913738,
"grad_norm": 10.4852933883667,
"learning_rate": 3.841853035143771e-06,
"loss": 0.2737,
"step": 2610
},
{
"epoch": 0.6975505857294995,
"grad_norm": 28.873003005981445,
"learning_rate": 3.837415690450835e-06,
"loss": 0.3726,
"step": 2620
},
{
"epoch": 0.7002129925452609,
"grad_norm": 39.70866775512695,
"learning_rate": 3.832978345757899e-06,
"loss": 0.5008,
"step": 2630
},
{
"epoch": 0.7028753993610224,
"grad_norm": 31.29554557800293,
"learning_rate": 3.828541001064963e-06,
"loss": 0.4126,
"step": 2640
},
{
"epoch": 0.7055378061767839,
"grad_norm": 6.893424987792969,
"learning_rate": 3.824103656372027e-06,
"loss": 0.4141,
"step": 2650
},
{
"epoch": 0.7082002129925452,
"grad_norm": 74.72613525390625,
"learning_rate": 3.819666311679091e-06,
"loss": 0.6189,
"step": 2660
},
{
"epoch": 0.7108626198083067,
"grad_norm": 76.01870727539062,
"learning_rate": 3.815228966986156e-06,
"loss": 0.5298,
"step": 2670
},
{
"epoch": 0.7135250266240681,
"grad_norm": 17.088764190673828,
"learning_rate": 3.8107916222932203e-06,
"loss": 0.3905,
"step": 2680
},
{
"epoch": 0.7161874334398296,
"grad_norm": 46.16141128540039,
"learning_rate": 3.8063542776002844e-06,
"loss": 0.5484,
"step": 2690
},
{
"epoch": 0.7188498402555911,
"grad_norm": 16.93437385559082,
"learning_rate": 3.8019169329073485e-06,
"loss": 0.4768,
"step": 2700
},
{
"epoch": 0.7215122470713525,
"grad_norm": 41.89393615722656,
"learning_rate": 3.7974795882144125e-06,
"loss": 0.5061,
"step": 2710
},
{
"epoch": 0.724174653887114,
"grad_norm": 33.18333435058594,
"learning_rate": 3.793042243521477e-06,
"loss": 0.3526,
"step": 2720
},
{
"epoch": 0.7268370607028753,
"grad_norm": 10.398252487182617,
"learning_rate": 3.788604898828541e-06,
"loss": 0.6246,
"step": 2730
},
{
"epoch": 0.7294994675186368,
"grad_norm": 28.568174362182617,
"learning_rate": 3.7841675541356053e-06,
"loss": 0.5033,
"step": 2740
},
{
"epoch": 0.7321618743343983,
"grad_norm": 53.6268196105957,
"learning_rate": 3.77973020944267e-06,
"loss": 0.3727,
"step": 2750
},
{
"epoch": 0.7348242811501597,
"grad_norm": 55.8770637512207,
"learning_rate": 3.775292864749734e-06,
"loss": 0.3693,
"step": 2760
},
{
"epoch": 0.7374866879659212,
"grad_norm": 39.622161865234375,
"learning_rate": 3.770855520056798e-06,
"loss": 0.2361,
"step": 2770
},
{
"epoch": 0.7401490947816827,
"grad_norm": 1.1328420639038086,
"learning_rate": 3.7664181753638625e-06,
"loss": 0.3064,
"step": 2780
},
{
"epoch": 0.7428115015974441,
"grad_norm": 49.34321212768555,
"learning_rate": 3.7619808306709266e-06,
"loss": 0.4954,
"step": 2790
},
{
"epoch": 0.7454739084132055,
"grad_norm": 32.354949951171875,
"learning_rate": 3.7575434859779916e-06,
"loss": 0.482,
"step": 2800
},
{
"epoch": 0.748136315228967,
"grad_norm": 48.409175872802734,
"learning_rate": 3.7531061412850556e-06,
"loss": 0.5572,
"step": 2810
},
{
"epoch": 0.7507987220447284,
"grad_norm": 26.978425979614258,
"learning_rate": 3.7486687965921197e-06,
"loss": 0.4575,
"step": 2820
},
{
"epoch": 0.7534611288604899,
"grad_norm": 41.84669494628906,
"learning_rate": 3.744231451899184e-06,
"loss": 0.5782,
"step": 2830
},
{
"epoch": 0.7561235356762513,
"grad_norm": 39.736915588378906,
"learning_rate": 3.7397941072062484e-06,
"loss": 0.4928,
"step": 2840
},
{
"epoch": 0.7587859424920128,
"grad_norm": 26.33810806274414,
"learning_rate": 3.7353567625133125e-06,
"loss": 0.4704,
"step": 2850
},
{
"epoch": 0.7614483493077743,
"grad_norm": 25.78823471069336,
"learning_rate": 3.7309194178203766e-06,
"loss": 0.5336,
"step": 2860
},
{
"epoch": 0.7641107561235356,
"grad_norm": 50.200721740722656,
"learning_rate": 3.726482073127441e-06,
"loss": 0.3914,
"step": 2870
},
{
"epoch": 0.7667731629392971,
"grad_norm": 35.294254302978516,
"learning_rate": 3.722044728434505e-06,
"loss": 0.5317,
"step": 2880
},
{
"epoch": 0.7694355697550586,
"grad_norm": 45.99223327636719,
"learning_rate": 3.7176073837415693e-06,
"loss": 0.4422,
"step": 2890
},
{
"epoch": 0.77209797657082,
"grad_norm": 23.060827255249023,
"learning_rate": 3.713170039048634e-06,
"loss": 0.5555,
"step": 2900
},
{
"epoch": 0.7747603833865815,
"grad_norm": 52.92953872680664,
"learning_rate": 3.708732694355698e-06,
"loss": 0.3346,
"step": 2910
},
{
"epoch": 0.777422790202343,
"grad_norm": 48.301513671875,
"learning_rate": 3.704295349662762e-06,
"loss": 0.4427,
"step": 2920
},
{
"epoch": 0.7800851970181044,
"grad_norm": 48.90705108642578,
"learning_rate": 3.699858004969826e-06,
"loss": 0.4144,
"step": 2930
},
{
"epoch": 0.7827476038338658,
"grad_norm": 50.751583099365234,
"learning_rate": 3.6954206602768906e-06,
"loss": 0.4262,
"step": 2940
},
{
"epoch": 0.7854100106496272,
"grad_norm": 36.761234283447266,
"learning_rate": 3.6909833155839547e-06,
"loss": 0.4766,
"step": 2950
},
{
"epoch": 0.7880724174653887,
"grad_norm": 39.654090881347656,
"learning_rate": 3.686545970891019e-06,
"loss": 0.462,
"step": 2960
},
{
"epoch": 0.7907348242811502,
"grad_norm": 40.53966522216797,
"learning_rate": 3.6821086261980833e-06,
"loss": 0.4302,
"step": 2970
},
{
"epoch": 0.7933972310969116,
"grad_norm": 39.34804153442383,
"learning_rate": 3.6776712815051474e-06,
"loss": 0.3504,
"step": 2980
},
{
"epoch": 0.7960596379126731,
"grad_norm": 39.78946304321289,
"learning_rate": 3.6732339368122115e-06,
"loss": 0.3504,
"step": 2990
},
{
"epoch": 0.7987220447284346,
"grad_norm": 53.113887786865234,
"learning_rate": 3.668796592119276e-06,
"loss": 0.4809,
"step": 3000
},
{
"epoch": 0.8013844515441959,
"grad_norm": 46.49972915649414,
"learning_rate": 3.6643592474263406e-06,
"loss": 0.5108,
"step": 3010
},
{
"epoch": 0.8040468583599574,
"grad_norm": 4.3874921798706055,
"learning_rate": 3.6599219027334047e-06,
"loss": 0.5426,
"step": 3020
},
{
"epoch": 0.8067092651757188,
"grad_norm": 35.56306457519531,
"learning_rate": 3.655484558040469e-06,
"loss": 0.3684,
"step": 3030
},
{
"epoch": 0.8093716719914803,
"grad_norm": 54.54863739013672,
"learning_rate": 3.6510472133475333e-06,
"loss": 0.5309,
"step": 3040
},
{
"epoch": 0.8120340788072418,
"grad_norm": 39.69523620605469,
"learning_rate": 3.6466098686545974e-06,
"loss": 0.3355,
"step": 3050
},
{
"epoch": 0.8146964856230032,
"grad_norm": 49.38870620727539,
"learning_rate": 3.642172523961662e-06,
"loss": 0.5092,
"step": 3060
},
{
"epoch": 0.8173588924387647,
"grad_norm": 34.680328369140625,
"learning_rate": 3.637735179268726e-06,
"loss": 0.3772,
"step": 3070
},
{
"epoch": 0.820021299254526,
"grad_norm": 16.585277557373047,
"learning_rate": 3.63329783457579e-06,
"loss": 0.4852,
"step": 3080
},
{
"epoch": 0.8226837060702875,
"grad_norm": 35.107913970947266,
"learning_rate": 3.6288604898828546e-06,
"loss": 0.598,
"step": 3090
},
{
"epoch": 0.825346112886049,
"grad_norm": 34.67172622680664,
"learning_rate": 3.6244231451899187e-06,
"loss": 0.4372,
"step": 3100
},
{
"epoch": 0.8280085197018104,
"grad_norm": 69.40287017822266,
"learning_rate": 3.619985800496983e-06,
"loss": 0.5542,
"step": 3110
},
{
"epoch": 0.8306709265175719,
"grad_norm": 26.95096206665039,
"learning_rate": 3.615548455804047e-06,
"loss": 0.4781,
"step": 3120
},
{
"epoch": 0.8333333333333334,
"grad_norm": 28.22862434387207,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.5491,
"step": 3130
},
{
"epoch": 0.8359957401490948,
"grad_norm": 46.3458366394043,
"learning_rate": 3.6066737664181756e-06,
"loss": 0.5018,
"step": 3140
},
{
"epoch": 0.8386581469648562,
"grad_norm": 69.2251968383789,
"learning_rate": 3.6022364217252397e-06,
"loss": 0.5572,
"step": 3150
},
{
"epoch": 0.8413205537806177,
"grad_norm": 23.1965274810791,
"learning_rate": 3.597799077032304e-06,
"loss": 0.2832,
"step": 3160
},
{
"epoch": 0.8439829605963791,
"grad_norm": 64.3228988647461,
"learning_rate": 3.5933617323393683e-06,
"loss": 0.4113,
"step": 3170
},
{
"epoch": 0.8466453674121406,
"grad_norm": 19.056867599487305,
"learning_rate": 3.5889243876464324e-06,
"loss": 0.2894,
"step": 3180
},
{
"epoch": 0.849307774227902,
"grad_norm": 16.736753463745117,
"learning_rate": 3.584487042953497e-06,
"loss": 0.4683,
"step": 3190
},
{
"epoch": 0.8519701810436635,
"grad_norm": 21.99988555908203,
"learning_rate": 3.580049698260561e-06,
"loss": 0.4359,
"step": 3200
},
{
"epoch": 0.854632587859425,
"grad_norm": 66.83743286132812,
"learning_rate": 3.575612353567625e-06,
"loss": 0.4584,
"step": 3210
},
{
"epoch": 0.8572949946751863,
"grad_norm": 136.4222869873047,
"learning_rate": 3.57117500887469e-06,
"loss": 0.4841,
"step": 3220
},
{
"epoch": 0.8599574014909478,
"grad_norm": 48.702186584472656,
"learning_rate": 3.566737664181754e-06,
"loss": 0.5208,
"step": 3230
},
{
"epoch": 0.8626198083067093,
"grad_norm": 17.272945404052734,
"learning_rate": 3.5623003194888182e-06,
"loss": 0.3549,
"step": 3240
},
{
"epoch": 0.8652822151224707,
"grad_norm": 55.59714126586914,
"learning_rate": 3.5578629747958828e-06,
"loss": 0.397,
"step": 3250
},
{
"epoch": 0.8679446219382322,
"grad_norm": 24.468503952026367,
"learning_rate": 3.553425630102947e-06,
"loss": 0.5792,
"step": 3260
},
{
"epoch": 0.8706070287539937,
"grad_norm": 20.15557289123535,
"learning_rate": 3.548988285410011e-06,
"loss": 0.4701,
"step": 3270
},
{
"epoch": 0.873269435569755,
"grad_norm": 3.6037347316741943,
"learning_rate": 3.5445509407170755e-06,
"loss": 0.4938,
"step": 3280
},
{
"epoch": 0.8759318423855165,
"grad_norm": 36.691715240478516,
"learning_rate": 3.5401135960241396e-06,
"loss": 0.4292,
"step": 3290
},
{
"epoch": 0.8785942492012779,
"grad_norm": 38.12320327758789,
"learning_rate": 3.5356762513312037e-06,
"loss": 0.5849,
"step": 3300
},
{
"epoch": 0.8812566560170394,
"grad_norm": 31.022741317749023,
"learning_rate": 3.5312389066382678e-06,
"loss": 0.2545,
"step": 3310
},
{
"epoch": 0.8839190628328009,
"grad_norm": 24.874244689941406,
"learning_rate": 3.5268015619453323e-06,
"loss": 0.4597,
"step": 3320
},
{
"epoch": 0.8865814696485623,
"grad_norm": 33.82793426513672,
"learning_rate": 3.5223642172523964e-06,
"loss": 0.4707,
"step": 3330
},
{
"epoch": 0.8892438764643238,
"grad_norm": 25.76340103149414,
"learning_rate": 3.5179268725594605e-06,
"loss": 0.3651,
"step": 3340
},
{
"epoch": 0.8919062832800851,
"grad_norm": 36.18081283569336,
"learning_rate": 3.513489527866525e-06,
"loss": 0.4041,
"step": 3350
},
{
"epoch": 0.8945686900958466,
"grad_norm": 43.22450256347656,
"learning_rate": 3.509052183173589e-06,
"loss": 0.4119,
"step": 3360
},
{
"epoch": 0.8972310969116081,
"grad_norm": 25.002256393432617,
"learning_rate": 3.504614838480653e-06,
"loss": 0.4748,
"step": 3370
},
{
"epoch": 0.8998935037273695,
"grad_norm": 3.837367534637451,
"learning_rate": 3.5001774937877177e-06,
"loss": 0.352,
"step": 3380
},
{
"epoch": 0.902555910543131,
"grad_norm": 33.43516159057617,
"learning_rate": 3.495740149094782e-06,
"loss": 0.2963,
"step": 3390
},
{
"epoch": 0.9052183173588925,
"grad_norm": 46.475669860839844,
"learning_rate": 3.491302804401846e-06,
"loss": 0.4292,
"step": 3400
},
{
"epoch": 0.9078807241746539,
"grad_norm": 8.306092262268066,
"learning_rate": 3.48686545970891e-06,
"loss": 0.527,
"step": 3410
},
{
"epoch": 0.9105431309904153,
"grad_norm": 52.78518295288086,
"learning_rate": 3.482428115015975e-06,
"loss": 0.5575,
"step": 3420
},
{
"epoch": 0.9132055378061767,
"grad_norm": 39.20545959472656,
"learning_rate": 3.477990770323039e-06,
"loss": 0.5689,
"step": 3430
},
{
"epoch": 0.9158679446219382,
"grad_norm": 19.458885192871094,
"learning_rate": 3.4735534256301036e-06,
"loss": 0.4101,
"step": 3440
},
{
"epoch": 0.9185303514376997,
"grad_norm": 55.984554290771484,
"learning_rate": 3.4691160809371677e-06,
"loss": 0.4009,
"step": 3450
},
{
"epoch": 0.9211927582534611,
"grad_norm": 23.385419845581055,
"learning_rate": 3.4646787362442318e-06,
"loss": 0.4437,
"step": 3460
},
{
"epoch": 0.9238551650692226,
"grad_norm": 18.056705474853516,
"learning_rate": 3.4602413915512963e-06,
"loss": 0.4475,
"step": 3470
},
{
"epoch": 0.9265175718849841,
"grad_norm": 41.02918243408203,
"learning_rate": 3.4558040468583604e-06,
"loss": 0.4416,
"step": 3480
},
{
"epoch": 0.9291799787007454,
"grad_norm": 13.734370231628418,
"learning_rate": 3.4513667021654245e-06,
"loss": 0.4073,
"step": 3490
},
{
"epoch": 0.9318423855165069,
"grad_norm": 40.19343948364258,
"learning_rate": 3.4469293574724886e-06,
"loss": 0.429,
"step": 3500
},
{
"epoch": 0.9345047923322684,
"grad_norm": 15.645760536193848,
"learning_rate": 3.442492012779553e-06,
"loss": 0.5276,
"step": 3510
},
{
"epoch": 0.9371671991480298,
"grad_norm": 49.22931671142578,
"learning_rate": 3.4380546680866172e-06,
"loss": 0.5557,
"step": 3520
},
{
"epoch": 0.9398296059637913,
"grad_norm": 20.097721099853516,
"learning_rate": 3.4336173233936813e-06,
"loss": 0.4071,
"step": 3530
},
{
"epoch": 0.9424920127795527,
"grad_norm": 33.681278228759766,
"learning_rate": 3.429179978700746e-06,
"loss": 0.448,
"step": 3540
},
{
"epoch": 0.9451544195953142,
"grad_norm": 19.279861450195312,
"learning_rate": 3.42474263400781e-06,
"loss": 0.5137,
"step": 3550
},
{
"epoch": 0.9478168264110756,
"grad_norm": 17.787982940673828,
"learning_rate": 3.420305289314874e-06,
"loss": 0.4027,
"step": 3560
},
{
"epoch": 0.950479233226837,
"grad_norm": 46.412715911865234,
"learning_rate": 3.4158679446219386e-06,
"loss": 0.6265,
"step": 3570
},
{
"epoch": 0.9531416400425985,
"grad_norm": 30.365901947021484,
"learning_rate": 3.4114305999290027e-06,
"loss": 0.5602,
"step": 3580
},
{
"epoch": 0.95580404685836,
"grad_norm": 41.739784240722656,
"learning_rate": 3.4069932552360668e-06,
"loss": 0.4984,
"step": 3590
},
{
"epoch": 0.9584664536741214,
"grad_norm": 8.452544212341309,
"learning_rate": 3.402555910543131e-06,
"loss": 0.4311,
"step": 3600
},
{
"epoch": 0.9611288604898829,
"grad_norm": 14.705548286437988,
"learning_rate": 3.3981185658501954e-06,
"loss": 0.4545,
"step": 3610
},
{
"epoch": 0.9637912673056444,
"grad_norm": 24.49713897705078,
"learning_rate": 3.3936812211572595e-06,
"loss": 0.3465,
"step": 3620
},
{
"epoch": 0.9664536741214057,
"grad_norm": 35.99775314331055,
"learning_rate": 3.3892438764643244e-06,
"loss": 0.4129,
"step": 3630
},
{
"epoch": 0.9691160809371672,
"grad_norm": 56.255348205566406,
"learning_rate": 3.3848065317713885e-06,
"loss": 0.4973,
"step": 3640
},
{
"epoch": 0.9717784877529286,
"grad_norm": 38.99000930786133,
"learning_rate": 3.3803691870784526e-06,
"loss": 0.286,
"step": 3650
},
{
"epoch": 0.9744408945686901,
"grad_norm": 29.624326705932617,
"learning_rate": 3.375931842385517e-06,
"loss": 0.5008,
"step": 3660
},
{
"epoch": 0.9771033013844516,
"grad_norm": 50.39006042480469,
"learning_rate": 3.3714944976925812e-06,
"loss": 0.4797,
"step": 3670
},
{
"epoch": 0.979765708200213,
"grad_norm": 48.023006439208984,
"learning_rate": 3.3670571529996453e-06,
"loss": 0.5441,
"step": 3680
},
{
"epoch": 0.9824281150159745,
"grad_norm": 32.83159637451172,
"learning_rate": 3.36261980830671e-06,
"loss": 0.4854,
"step": 3690
},
{
"epoch": 0.9850905218317358,
"grad_norm": 57.853755950927734,
"learning_rate": 3.358182463613774e-06,
"loss": 0.5452,
"step": 3700
},
{
"epoch": 0.9877529286474973,
"grad_norm": 24.01287841796875,
"learning_rate": 3.353745118920838e-06,
"loss": 0.3642,
"step": 3710
},
{
"epoch": 0.9904153354632588,
"grad_norm": 15.74479866027832,
"learning_rate": 3.349307774227902e-06,
"loss": 0.4246,
"step": 3720
},
{
"epoch": 0.9930777422790202,
"grad_norm": 38.5820198059082,
"learning_rate": 3.3448704295349667e-06,
"loss": 0.4976,
"step": 3730
},
{
"epoch": 0.9957401490947817,
"grad_norm": 23.191646575927734,
"learning_rate": 3.3404330848420308e-06,
"loss": 0.4632,
"step": 3740
},
{
"epoch": 0.9984025559105432,
"grad_norm": 20.22792625427246,
"learning_rate": 3.335995740149095e-06,
"loss": 0.3077,
"step": 3750
},
{
"epoch": 1.0,
"eval_loss": 0.4010973274707794,
"eval_runtime": 391.327,
"eval_samples_per_second": 9.598,
"eval_steps_per_second": 1.201,
"step": 3756
},
{
"epoch": 1.0010649627263046,
"grad_norm": 24.37016487121582,
"learning_rate": 3.3315583954561594e-06,
"loss": 0.4895,
"step": 3760
},
{
"epoch": 1.003727369542066,
"grad_norm": 53.734981536865234,
"learning_rate": 3.3271210507632235e-06,
"loss": 0.331,
"step": 3770
},
{
"epoch": 1.0063897763578276,
"grad_norm": 53.42403793334961,
"learning_rate": 3.3226837060702876e-06,
"loss": 0.2463,
"step": 3780
},
{
"epoch": 1.009052183173589,
"grad_norm": 51.97494125366211,
"learning_rate": 3.318246361377352e-06,
"loss": 0.4091,
"step": 3790
},
{
"epoch": 1.0117145899893503,
"grad_norm": 28.96375274658203,
"learning_rate": 3.313809016684416e-06,
"loss": 0.2977,
"step": 3800
},
{
"epoch": 1.0143769968051117,
"grad_norm": 48.554866790771484,
"learning_rate": 3.3093716719914803e-06,
"loss": 0.4292,
"step": 3810
},
{
"epoch": 1.0170394036208732,
"grad_norm": 39.90666198730469,
"learning_rate": 3.3049343272985444e-06,
"loss": 0.3101,
"step": 3820
},
{
"epoch": 1.0197018104366347,
"grad_norm": 58.565547943115234,
"learning_rate": 3.300496982605609e-06,
"loss": 0.3791,
"step": 3830
},
{
"epoch": 1.0223642172523961,
"grad_norm": 53.34403991699219,
"learning_rate": 3.2960596379126734e-06,
"loss": 0.4418,
"step": 3840
},
{
"epoch": 1.0250266240681576,
"grad_norm": 73.85496520996094,
"learning_rate": 3.291622293219738e-06,
"loss": 0.3763,
"step": 3850
},
{
"epoch": 1.027689030883919,
"grad_norm": 46.47140884399414,
"learning_rate": 3.287184948526802e-06,
"loss": 0.6791,
"step": 3860
},
{
"epoch": 1.0303514376996805,
"grad_norm": 35.71063995361328,
"learning_rate": 3.282747603833866e-06,
"loss": 0.5107,
"step": 3870
},
{
"epoch": 1.033013844515442,
"grad_norm": 20.168060302734375,
"learning_rate": 3.2783102591409307e-06,
"loss": 0.3675,
"step": 3880
},
{
"epoch": 1.0356762513312034,
"grad_norm": 27.247339248657227,
"learning_rate": 3.2738729144479948e-06,
"loss": 0.3953,
"step": 3890
},
{
"epoch": 1.038338658146965,
"grad_norm": 94.36700439453125,
"learning_rate": 3.269435569755059e-06,
"loss": 0.4062,
"step": 3900
},
{
"epoch": 1.0410010649627264,
"grad_norm": 35.7130012512207,
"learning_rate": 3.264998225062123e-06,
"loss": 0.5071,
"step": 3910
},
{
"epoch": 1.0436634717784878,
"grad_norm": 35.89786911010742,
"learning_rate": 3.2605608803691875e-06,
"loss": 0.3961,
"step": 3920
},
{
"epoch": 1.0463258785942493,
"grad_norm": 38.03929901123047,
"learning_rate": 3.2561235356762516e-06,
"loss": 0.4472,
"step": 3930
},
{
"epoch": 1.0489882854100105,
"grad_norm": 30.750192642211914,
"learning_rate": 3.2516861909833157e-06,
"loss": 0.3322,
"step": 3940
},
{
"epoch": 1.051650692225772,
"grad_norm": 19.738998413085938,
"learning_rate": 3.2472488462903802e-06,
"loss": 0.4264,
"step": 3950
},
{
"epoch": 1.0543130990415335,
"grad_norm": 55.47446823120117,
"learning_rate": 3.2428115015974443e-06,
"loss": 0.471,
"step": 3960
},
{
"epoch": 1.056975505857295,
"grad_norm": 49.76363754272461,
"learning_rate": 3.2383741569045084e-06,
"loss": 0.4537,
"step": 3970
},
{
"epoch": 1.0596379126730564,
"grad_norm": 80.31526947021484,
"learning_rate": 3.233936812211573e-06,
"loss": 0.3749,
"step": 3980
},
{
"epoch": 1.0623003194888179,
"grad_norm": 53.54792022705078,
"learning_rate": 3.229499467518637e-06,
"loss": 0.2977,
"step": 3990
},
{
"epoch": 1.0649627263045793,
"grad_norm": 82.70904541015625,
"learning_rate": 3.225062122825701e-06,
"loss": 0.4078,
"step": 4000
},
{
"epoch": 1.0676251331203408,
"grad_norm": 29.12423324584961,
"learning_rate": 3.2206247781327652e-06,
"loss": 0.4449,
"step": 4010
},
{
"epoch": 1.0702875399361023,
"grad_norm": 58.45504379272461,
"learning_rate": 3.2161874334398298e-06,
"loss": 0.5086,
"step": 4020
},
{
"epoch": 1.0729499467518637,
"grad_norm": 21.932579040527344,
"learning_rate": 3.211750088746894e-06,
"loss": 0.245,
"step": 4030
},
{
"epoch": 1.0756123535676252,
"grad_norm": 12.67576789855957,
"learning_rate": 3.207312744053959e-06,
"loss": 0.3531,
"step": 4040
},
{
"epoch": 1.0782747603833867,
"grad_norm": 3.022346258163452,
"learning_rate": 3.202875399361023e-06,
"loss": 0.3535,
"step": 4050
},
{
"epoch": 1.0809371671991481,
"grad_norm": 50.03273391723633,
"learning_rate": 3.198438054668087e-06,
"loss": 0.4069,
"step": 4060
},
{
"epoch": 1.0835995740149094,
"grad_norm": 23.05666160583496,
"learning_rate": 3.1940007099751515e-06,
"loss": 0.4915,
"step": 4070
},
{
"epoch": 1.0862619808306708,
"grad_norm": 49.80161666870117,
"learning_rate": 3.1895633652822156e-06,
"loss": 0.3975,
"step": 4080
},
{
"epoch": 1.0889243876464323,
"grad_norm": 45.57204055786133,
"learning_rate": 3.1851260205892797e-06,
"loss": 0.3995,
"step": 4090
},
{
"epoch": 1.0915867944621938,
"grad_norm": 29.443382263183594,
"learning_rate": 3.180688675896344e-06,
"loss": 0.4173,
"step": 4100
},
{
"epoch": 1.0942492012779552,
"grad_norm": 20.2729434967041,
"learning_rate": 3.1762513312034083e-06,
"loss": 0.3886,
"step": 4110
},
{
"epoch": 1.0969116080937167,
"grad_norm": 60.771549224853516,
"learning_rate": 3.1718139865104724e-06,
"loss": 0.3734,
"step": 4120
},
{
"epoch": 1.0995740149094781,
"grad_norm": 44.5765495300293,
"learning_rate": 3.1673766418175365e-06,
"loss": 0.4133,
"step": 4130
},
{
"epoch": 1.1022364217252396,
"grad_norm": 83.75230407714844,
"learning_rate": 3.162939297124601e-06,
"loss": 0.5197,
"step": 4140
},
{
"epoch": 1.104898828541001,
"grad_norm": 32.55630874633789,
"learning_rate": 3.158501952431665e-06,
"loss": 0.2907,
"step": 4150
},
{
"epoch": 1.1075612353567625,
"grad_norm": 7.132206916809082,
"learning_rate": 3.1540646077387292e-06,
"loss": 0.4218,
"step": 4160
},
{
"epoch": 1.110223642172524,
"grad_norm": 85.82096862792969,
"learning_rate": 3.1496272630457938e-06,
"loss": 0.3169,
"step": 4170
},
{
"epoch": 1.1128860489882855,
"grad_norm": 44.675106048583984,
"learning_rate": 3.145189918352858e-06,
"loss": 0.3032,
"step": 4180
},
{
"epoch": 1.115548455804047,
"grad_norm": 16.726261138916016,
"learning_rate": 3.140752573659922e-06,
"loss": 0.421,
"step": 4190
},
{
"epoch": 1.1182108626198084,
"grad_norm": 25.914119720458984,
"learning_rate": 3.136315228966986e-06,
"loss": 0.3087,
"step": 4200
},
{
"epoch": 1.1208732694355699,
"grad_norm": 33.73661804199219,
"learning_rate": 3.1318778842740506e-06,
"loss": 0.389,
"step": 4210
},
{
"epoch": 1.123535676251331,
"grad_norm": 70.91206359863281,
"learning_rate": 3.1274405395811147e-06,
"loss": 0.3949,
"step": 4220
},
{
"epoch": 1.1261980830670926,
"grad_norm": 24.275022506713867,
"learning_rate": 3.1230031948881788e-06,
"loss": 0.384,
"step": 4230
},
{
"epoch": 1.128860489882854,
"grad_norm": 18.02881622314453,
"learning_rate": 3.1185658501952433e-06,
"loss": 0.3381,
"step": 4240
},
{
"epoch": 1.1315228966986155,
"grad_norm": 17.50286865234375,
"learning_rate": 3.114128505502308e-06,
"loss": 0.3784,
"step": 4250
},
{
"epoch": 1.134185303514377,
"grad_norm": 58.238224029541016,
"learning_rate": 3.1096911608093723e-06,
"loss": 0.5182,
"step": 4260
},
{
"epoch": 1.1368477103301384,
"grad_norm": 36.494564056396484,
"learning_rate": 3.1052538161164364e-06,
"loss": 0.3208,
"step": 4270
},
{
"epoch": 1.1395101171459,
"grad_norm": 5.250776767730713,
"learning_rate": 3.1008164714235005e-06,
"loss": 0.386,
"step": 4280
},
{
"epoch": 1.1421725239616614,
"grad_norm": 12.714916229248047,
"learning_rate": 3.0963791267305646e-06,
"loss": 0.5409,
"step": 4290
},
{
"epoch": 1.1448349307774228,
"grad_norm": 19.4248046875,
"learning_rate": 3.091941782037629e-06,
"loss": 0.4072,
"step": 4300
},
{
"epoch": 1.1474973375931843,
"grad_norm": 63.79132843017578,
"learning_rate": 3.0875044373446933e-06,
"loss": 0.3945,
"step": 4310
},
{
"epoch": 1.1501597444089458,
"grad_norm": 45.68960189819336,
"learning_rate": 3.0830670926517574e-06,
"loss": 0.4618,
"step": 4320
},
{
"epoch": 1.1528221512247072,
"grad_norm": 32.63360595703125,
"learning_rate": 3.078629747958822e-06,
"loss": 0.3013,
"step": 4330
},
{
"epoch": 1.1554845580404687,
"grad_norm": 60.17452621459961,
"learning_rate": 3.074192403265886e-06,
"loss": 0.3819,
"step": 4340
},
{
"epoch": 1.15814696485623,
"grad_norm": 45.550968170166016,
"learning_rate": 3.06975505857295e-06,
"loss": 0.3872,
"step": 4350
},
{
"epoch": 1.1608093716719914,
"grad_norm": 43.52312088012695,
"learning_rate": 3.0653177138800146e-06,
"loss": 0.4543,
"step": 4360
},
{
"epoch": 1.1634717784877529,
"grad_norm": 32.9071159362793,
"learning_rate": 3.0608803691870787e-06,
"loss": 0.4621,
"step": 4370
},
{
"epoch": 1.1661341853035143,
"grad_norm": 18.797021865844727,
"learning_rate": 3.056443024494143e-06,
"loss": 0.2536,
"step": 4380
},
{
"epoch": 1.1687965921192758,
"grad_norm": 29.822704315185547,
"learning_rate": 3.052005679801207e-06,
"loss": 0.4395,
"step": 4390
},
{
"epoch": 1.1714589989350372,
"grad_norm": 42.53751754760742,
"learning_rate": 3.0475683351082714e-06,
"loss": 0.3256,
"step": 4400
},
{
"epoch": 1.1741214057507987,
"grad_norm": 19.648696899414062,
"learning_rate": 3.0431309904153355e-06,
"loss": 0.2939,
"step": 4410
},
{
"epoch": 1.1767838125665602,
"grad_norm": 35.35326385498047,
"learning_rate": 3.0386936457223996e-06,
"loss": 0.3974,
"step": 4420
},
{
"epoch": 1.1794462193823216,
"grad_norm": 14.91307258605957,
"learning_rate": 3.034256301029464e-06,
"loss": 0.3006,
"step": 4430
},
{
"epoch": 1.182108626198083,
"grad_norm": 17.40831756591797,
"learning_rate": 3.0298189563365282e-06,
"loss": 0.3188,
"step": 4440
},
{
"epoch": 1.1847710330138446,
"grad_norm": 37.886653900146484,
"learning_rate": 3.0253816116435923e-06,
"loss": 0.4533,
"step": 4450
},
{
"epoch": 1.187433439829606,
"grad_norm": 49.2622184753418,
"learning_rate": 3.0209442669506573e-06,
"loss": 0.3325,
"step": 4460
},
{
"epoch": 1.1900958466453675,
"grad_norm": 34.96827697753906,
"learning_rate": 3.0165069222577214e-06,
"loss": 0.3037,
"step": 4470
},
{
"epoch": 1.192758253461129,
"grad_norm": 21.063220977783203,
"learning_rate": 3.0120695775647855e-06,
"loss": 0.3211,
"step": 4480
},
{
"epoch": 1.1954206602768904,
"grad_norm": 59.07707214355469,
"learning_rate": 3.00763223287185e-06,
"loss": 0.5351,
"step": 4490
},
{
"epoch": 1.1980830670926517,
"grad_norm": 108.94097900390625,
"learning_rate": 3.003194888178914e-06,
"loss": 0.2543,
"step": 4500
},
{
"epoch": 1.2007454739084131,
"grad_norm": 72.97836303710938,
"learning_rate": 2.998757543485978e-06,
"loss": 0.5126,
"step": 4510
},
{
"epoch": 1.2034078807241746,
"grad_norm": 38.91378402709961,
"learning_rate": 2.9943201987930427e-06,
"loss": 0.2882,
"step": 4520
},
{
"epoch": 1.206070287539936,
"grad_norm": 76.03022003173828,
"learning_rate": 2.989882854100107e-06,
"loss": 0.5089,
"step": 4530
},
{
"epoch": 1.2087326943556975,
"grad_norm": 8.401103973388672,
"learning_rate": 2.985445509407171e-06,
"loss": 0.3408,
"step": 4540
},
{
"epoch": 1.211395101171459,
"grad_norm": 11.905583381652832,
"learning_rate": 2.9810081647142354e-06,
"loss": 0.3178,
"step": 4550
},
{
"epoch": 1.2140575079872205,
"grad_norm": 5.930868625640869,
"learning_rate": 2.9765708200212995e-06,
"loss": 0.5802,
"step": 4560
},
{
"epoch": 1.216719914802982,
"grad_norm": 51.42832565307617,
"learning_rate": 2.9721334753283636e-06,
"loss": 0.4175,
"step": 4570
},
{
"epoch": 1.2193823216187434,
"grad_norm": 71.15629577636719,
"learning_rate": 2.967696130635428e-06,
"loss": 0.4067,
"step": 4580
},
{
"epoch": 1.2220447284345048,
"grad_norm": 31.330825805664062,
"learning_rate": 2.9632587859424922e-06,
"loss": 0.4262,
"step": 4590
},
{
"epoch": 1.2247071352502663,
"grad_norm": 48.36967086791992,
"learning_rate": 2.9588214412495563e-06,
"loss": 0.3853,
"step": 4600
},
{
"epoch": 1.2273695420660278,
"grad_norm": 23.711320877075195,
"learning_rate": 2.9543840965566204e-06,
"loss": 0.3924,
"step": 4610
},
{
"epoch": 1.230031948881789,
"grad_norm": 45.54877471923828,
"learning_rate": 2.949946751863685e-06,
"loss": 0.363,
"step": 4620
},
{
"epoch": 1.2326943556975505,
"grad_norm": 34.75027084350586,
"learning_rate": 2.945509407170749e-06,
"loss": 0.2775,
"step": 4630
},
{
"epoch": 1.235356762513312,
"grad_norm": 92.7309341430664,
"learning_rate": 2.941072062477813e-06,
"loss": 0.4013,
"step": 4640
},
{
"epoch": 1.2380191693290734,
"grad_norm": 6.587616920471191,
"learning_rate": 2.9366347177848777e-06,
"loss": 0.3302,
"step": 4650
},
{
"epoch": 1.2406815761448349,
"grad_norm": 55.009342193603516,
"learning_rate": 2.9321973730919418e-06,
"loss": 0.5266,
"step": 4660
},
{
"epoch": 1.2433439829605963,
"grad_norm": 19.210548400878906,
"learning_rate": 2.9277600283990067e-06,
"loss": 0.2891,
"step": 4670
},
{
"epoch": 1.2460063897763578,
"grad_norm": 71.92720794677734,
"learning_rate": 2.923322683706071e-06,
"loss": 0.3926,
"step": 4680
},
{
"epoch": 1.2486687965921193,
"grad_norm": 29.53013038635254,
"learning_rate": 2.918885339013135e-06,
"loss": 0.3793,
"step": 4690
},
{
"epoch": 1.2513312034078807,
"grad_norm": 66.53036499023438,
"learning_rate": 2.914447994320199e-06,
"loss": 0.4606,
"step": 4700
},
{
"epoch": 1.2539936102236422,
"grad_norm": 62.39105987548828,
"learning_rate": 2.9100106496272635e-06,
"loss": 0.4766,
"step": 4710
},
{
"epoch": 1.2566560170394037,
"grad_norm": 1.6803525686264038,
"learning_rate": 2.9055733049343276e-06,
"loss": 0.4049,
"step": 4720
},
{
"epoch": 1.2593184238551651,
"grad_norm": 24.308237075805664,
"learning_rate": 2.9011359602413917e-06,
"loss": 0.5353,
"step": 4730
},
{
"epoch": 1.2619808306709266,
"grad_norm": 61.788597106933594,
"learning_rate": 2.8966986155484563e-06,
"loss": 0.4769,
"step": 4740
},
{
"epoch": 1.264643237486688,
"grad_norm": 60.700103759765625,
"learning_rate": 2.8922612708555204e-06,
"loss": 0.4012,
"step": 4750
},
{
"epoch": 1.2673056443024495,
"grad_norm": 50.121177673339844,
"learning_rate": 2.8878239261625845e-06,
"loss": 0.2974,
"step": 4760
},
{
"epoch": 1.269968051118211,
"grad_norm": 56.65994644165039,
"learning_rate": 2.883386581469649e-06,
"loss": 0.5166,
"step": 4770
},
{
"epoch": 1.2726304579339724,
"grad_norm": 47.86198425292969,
"learning_rate": 2.878949236776713e-06,
"loss": 0.3719,
"step": 4780
},
{
"epoch": 1.2752928647497337,
"grad_norm": 17.415645599365234,
"learning_rate": 2.874511892083777e-06,
"loss": 0.2031,
"step": 4790
},
{
"epoch": 1.2779552715654952,
"grad_norm": 24.741268157958984,
"learning_rate": 2.8700745473908413e-06,
"loss": 0.3492,
"step": 4800
},
{
"epoch": 1.2806176783812566,
"grad_norm": 41.743743896484375,
"learning_rate": 2.865637202697906e-06,
"loss": 0.3182,
"step": 4810
},
{
"epoch": 1.283280085197018,
"grad_norm": 39.02400207519531,
"learning_rate": 2.86119985800497e-06,
"loss": 0.3966,
"step": 4820
},
{
"epoch": 1.2859424920127795,
"grad_norm": 26.04469871520996,
"learning_rate": 2.856762513312034e-06,
"loss": 0.6672,
"step": 4830
},
{
"epoch": 1.288604898828541,
"grad_norm": 88.40325927734375,
"learning_rate": 2.8523251686190985e-06,
"loss": 0.4201,
"step": 4840
},
{
"epoch": 1.2912673056443025,
"grad_norm": 21.478443145751953,
"learning_rate": 2.8478878239261626e-06,
"loss": 0.5281,
"step": 4850
},
{
"epoch": 1.293929712460064,
"grad_norm": 66.9277114868164,
"learning_rate": 2.8434504792332267e-06,
"loss": 0.4107,
"step": 4860
},
{
"epoch": 1.2965921192758254,
"grad_norm": 15.34659481048584,
"learning_rate": 2.8390131345402917e-06,
"loss": 0.3604,
"step": 4870
},
{
"epoch": 1.2992545260915869,
"grad_norm": 37.452537536621094,
"learning_rate": 2.8345757898473558e-06,
"loss": 0.2633,
"step": 4880
},
{
"epoch": 1.3019169329073481,
"grad_norm": 33.15935134887695,
"learning_rate": 2.83013844515442e-06,
"loss": 0.4409,
"step": 4890
},
{
"epoch": 1.3045793397231096,
"grad_norm": 49.37697982788086,
"learning_rate": 2.8257011004614844e-06,
"loss": 0.4413,
"step": 4900
},
{
"epoch": 1.307241746538871,
"grad_norm": 3.118769884109497,
"learning_rate": 2.8212637557685485e-06,
"loss": 0.304,
"step": 4910
},
{
"epoch": 1.3099041533546325,
"grad_norm": 20.165117263793945,
"learning_rate": 2.8168264110756126e-06,
"loss": 0.4082,
"step": 4920
},
{
"epoch": 1.312566560170394,
"grad_norm": 52.98624038696289,
"learning_rate": 2.812389066382677e-06,
"loss": 0.2349,
"step": 4930
},
{
"epoch": 1.3152289669861554,
"grad_norm": 20.26984977722168,
"learning_rate": 2.807951721689741e-06,
"loss": 0.3856,
"step": 4940
},
{
"epoch": 1.317891373801917,
"grad_norm": 17.35883903503418,
"learning_rate": 2.8035143769968053e-06,
"loss": 0.4064,
"step": 4950
},
{
"epoch": 1.3205537806176784,
"grad_norm": 55.4739990234375,
"learning_rate": 2.79907703230387e-06,
"loss": 0.4243,
"step": 4960
},
{
"epoch": 1.3232161874334398,
"grad_norm": 57.40946578979492,
"learning_rate": 2.794639687610934e-06,
"loss": 0.4577,
"step": 4970
},
{
"epoch": 1.3258785942492013,
"grad_norm": 53.673954010009766,
"learning_rate": 2.790202342917998e-06,
"loss": 0.3182,
"step": 4980
},
{
"epoch": 1.3285410010649628,
"grad_norm": 37.89504623413086,
"learning_rate": 2.785764998225062e-06,
"loss": 0.2015,
"step": 4990
},
{
"epoch": 1.3312034078807242,
"grad_norm": 58.81607437133789,
"learning_rate": 2.7813276535321266e-06,
"loss": 0.3429,
"step": 5000
},
{
"epoch": 1.3338658146964857,
"grad_norm": 61.05684280395508,
"learning_rate": 2.7768903088391907e-06,
"loss": 0.4401,
"step": 5010
},
{
"epoch": 1.3365282215122471,
"grad_norm": 38.03738784790039,
"learning_rate": 2.772452964146255e-06,
"loss": 0.3691,
"step": 5020
},
{
"epoch": 1.3391906283280086,
"grad_norm": 23.519515991210938,
"learning_rate": 2.7680156194533194e-06,
"loss": 0.4365,
"step": 5030
},
{
"epoch": 1.34185303514377,
"grad_norm": 5.398426532745361,
"learning_rate": 2.7635782747603834e-06,
"loss": 0.3614,
"step": 5040
},
{
"epoch": 1.3445154419595315,
"grad_norm": 94.55555725097656,
"learning_rate": 2.7591409300674475e-06,
"loss": 0.3434,
"step": 5050
},
{
"epoch": 1.3471778487752928,
"grad_norm": 13.663138389587402,
"learning_rate": 2.754703585374512e-06,
"loss": 0.3295,
"step": 5060
},
{
"epoch": 1.3498402555910542,
"grad_norm": 18.806346893310547,
"learning_rate": 2.750266240681576e-06,
"loss": 0.3202,
"step": 5070
},
{
"epoch": 1.3525026624068157,
"grad_norm": 50.802101135253906,
"learning_rate": 2.7458288959886407e-06,
"loss": 0.4979,
"step": 5080
},
{
"epoch": 1.3551650692225772,
"grad_norm": 42.24506378173828,
"learning_rate": 2.741391551295705e-06,
"loss": 0.4015,
"step": 5090
},
{
"epoch": 1.3578274760383386,
"grad_norm": 9.65519905090332,
"learning_rate": 2.7369542066027693e-06,
"loss": 0.2794,
"step": 5100
},
{
"epoch": 1.3604898828541,
"grad_norm": 23.062803268432617,
"learning_rate": 2.7325168619098334e-06,
"loss": 0.3299,
"step": 5110
},
{
"epoch": 1.3631522896698616,
"grad_norm": 42.74171447753906,
"learning_rate": 2.728079517216898e-06,
"loss": 0.3362,
"step": 5120
},
{
"epoch": 1.365814696485623,
"grad_norm": 1.1335097551345825,
"learning_rate": 2.723642172523962e-06,
"loss": 0.416,
"step": 5130
},
{
"epoch": 1.3684771033013845,
"grad_norm": 44.670101165771484,
"learning_rate": 2.719204827831026e-06,
"loss": 0.4816,
"step": 5140
},
{
"epoch": 1.371139510117146,
"grad_norm": 18.282821655273438,
"learning_rate": 2.7147674831380906e-06,
"loss": 0.3793,
"step": 5150
},
{
"epoch": 1.3738019169329074,
"grad_norm": 38.96583557128906,
"learning_rate": 2.7103301384451547e-06,
"loss": 0.4056,
"step": 5160
},
{
"epoch": 1.3764643237486687,
"grad_norm": 73.3528823852539,
"learning_rate": 2.705892793752219e-06,
"loss": 0.4056,
"step": 5170
},
{
"epoch": 1.3791267305644301,
"grad_norm": 12.709755897521973,
"learning_rate": 2.701455449059283e-06,
"loss": 0.3328,
"step": 5180
},
{
"epoch": 1.3817891373801916,
"grad_norm": 42.40044021606445,
"learning_rate": 2.6970181043663475e-06,
"loss": 0.45,
"step": 5190
},
{
"epoch": 1.384451544195953,
"grad_norm": 30.970508575439453,
"learning_rate": 2.6925807596734116e-06,
"loss": 0.3236,
"step": 5200
},
{
"epoch": 1.3871139510117145,
"grad_norm": 46.15846633911133,
"learning_rate": 2.6881434149804757e-06,
"loss": 0.3149,
"step": 5210
},
{
"epoch": 1.389776357827476,
"grad_norm": 26.226940155029297,
"learning_rate": 2.68370607028754e-06,
"loss": 0.4892,
"step": 5220
},
{
"epoch": 1.3924387646432375,
"grad_norm": 19.278440475463867,
"learning_rate": 2.6792687255946043e-06,
"loss": 0.2178,
"step": 5230
},
{
"epoch": 1.395101171458999,
"grad_norm": 44.71614456176758,
"learning_rate": 2.6748313809016684e-06,
"loss": 0.3859,
"step": 5240
},
{
"epoch": 1.3977635782747604,
"grad_norm": 24.44068717956543,
"learning_rate": 2.670394036208733e-06,
"loss": 0.3681,
"step": 5250
},
{
"epoch": 1.4004259850905219,
"grad_norm": 53.29160690307617,
"learning_rate": 2.665956691515797e-06,
"loss": 0.3337,
"step": 5260
},
{
"epoch": 1.4030883919062833,
"grad_norm": 48.54678726196289,
"learning_rate": 2.661519346822861e-06,
"loss": 0.3785,
"step": 5270
},
{
"epoch": 1.4057507987220448,
"grad_norm": 26.18408966064453,
"learning_rate": 2.657082002129925e-06,
"loss": 0.2488,
"step": 5280
},
{
"epoch": 1.4084132055378062,
"grad_norm": 64.42898559570312,
"learning_rate": 2.65264465743699e-06,
"loss": 0.2565,
"step": 5290
},
{
"epoch": 1.4110756123535677,
"grad_norm": 74.4629898071289,
"learning_rate": 2.6482073127440542e-06,
"loss": 0.495,
"step": 5300
},
{
"epoch": 1.4137380191693292,
"grad_norm": 62.5654411315918,
"learning_rate": 2.6437699680511188e-06,
"loss": 0.5005,
"step": 5310
},
{
"epoch": 1.4164004259850906,
"grad_norm": 1.9694045782089233,
"learning_rate": 2.639332623358183e-06,
"loss": 0.4535,
"step": 5320
},
{
"epoch": 1.419062832800852,
"grad_norm": 55.817508697509766,
"learning_rate": 2.634895278665247e-06,
"loss": 0.2567,
"step": 5330
},
{
"epoch": 1.4217252396166133,
"grad_norm": 28.193262100219727,
"learning_rate": 2.6304579339723115e-06,
"loss": 0.2602,
"step": 5340
},
{
"epoch": 1.4243876464323748,
"grad_norm": 21.148727416992188,
"learning_rate": 2.6260205892793756e-06,
"loss": 0.2867,
"step": 5350
},
{
"epoch": 1.4270500532481363,
"grad_norm": 1.262163519859314,
"learning_rate": 2.6215832445864397e-06,
"loss": 0.2097,
"step": 5360
},
{
"epoch": 1.4297124600638977,
"grad_norm": 8.631014823913574,
"learning_rate": 2.6171458998935038e-06,
"loss": 0.2318,
"step": 5370
},
{
"epoch": 1.4323748668796592,
"grad_norm": 72.81652069091797,
"learning_rate": 2.6127085552005683e-06,
"loss": 0.3175,
"step": 5380
},
{
"epoch": 1.4350372736954207,
"grad_norm": 38.914085388183594,
"learning_rate": 2.6082712105076324e-06,
"loss": 0.302,
"step": 5390
},
{
"epoch": 1.4376996805111821,
"grad_norm": 67.58287048339844,
"learning_rate": 2.6038338658146965e-06,
"loss": 0.5973,
"step": 5400
},
{
"epoch": 1.4403620873269436,
"grad_norm": 148.8922576904297,
"learning_rate": 2.599396521121761e-06,
"loss": 0.4405,
"step": 5410
},
{
"epoch": 1.443024494142705,
"grad_norm": 34.74929428100586,
"learning_rate": 2.594959176428825e-06,
"loss": 0.3936,
"step": 5420
},
{
"epoch": 1.4456869009584665,
"grad_norm": 14.037424087524414,
"learning_rate": 2.590521831735889e-06,
"loss": 0.2189,
"step": 5430
},
{
"epoch": 1.4483493077742278,
"grad_norm": 5.392988681793213,
"learning_rate": 2.5860844870429537e-06,
"loss": 0.3137,
"step": 5440
},
{
"epoch": 1.4510117145899892,
"grad_norm": 54.35594177246094,
"learning_rate": 2.581647142350018e-06,
"loss": 0.5976,
"step": 5450
},
{
"epoch": 1.4536741214057507,
"grad_norm": 10.110573768615723,
"learning_rate": 2.577209797657082e-06,
"loss": 0.4212,
"step": 5460
},
{
"epoch": 1.4563365282215122,
"grad_norm": 97.60591125488281,
"learning_rate": 2.572772452964146e-06,
"loss": 0.5271,
"step": 5470
},
{
"epoch": 1.4589989350372736,
"grad_norm": 78.51748657226562,
"learning_rate": 2.5683351082712105e-06,
"loss": 0.3833,
"step": 5480
},
{
"epoch": 1.461661341853035,
"grad_norm": 18.49489974975586,
"learning_rate": 2.563897763578275e-06,
"loss": 0.4865,
"step": 5490
},
{
"epoch": 1.4643237486687966,
"grad_norm": 65.34386444091797,
"learning_rate": 2.5594604188853396e-06,
"loss": 0.1895,
"step": 5500
},
{
"epoch": 1.466986155484558,
"grad_norm": 14.89499568939209,
"learning_rate": 2.5550230741924037e-06,
"loss": 0.3546,
"step": 5510
},
{
"epoch": 1.4696485623003195,
"grad_norm": 67.33487701416016,
"learning_rate": 2.550585729499468e-06,
"loss": 0.2224,
"step": 5520
},
{
"epoch": 1.472310969116081,
"grad_norm": 59.60548400878906,
"learning_rate": 2.5461483848065323e-06,
"loss": 0.333,
"step": 5530
},
{
"epoch": 1.4749733759318424,
"grad_norm": 60.33387756347656,
"learning_rate": 2.5417110401135964e-06,
"loss": 0.5795,
"step": 5540
},
{
"epoch": 1.4776357827476039,
"grad_norm": 22.29596710205078,
"learning_rate": 2.5372736954206605e-06,
"loss": 0.4306,
"step": 5550
},
{
"epoch": 1.4802981895633653,
"grad_norm": 29.321380615234375,
"learning_rate": 2.532836350727725e-06,
"loss": 0.4097,
"step": 5560
},
{
"epoch": 1.4829605963791268,
"grad_norm": 0.48686346411705017,
"learning_rate": 2.528399006034789e-06,
"loss": 0.2729,
"step": 5570
},
{
"epoch": 1.4856230031948883,
"grad_norm": 57.16348648071289,
"learning_rate": 2.5239616613418532e-06,
"loss": 0.5437,
"step": 5580
},
{
"epoch": 1.4882854100106497,
"grad_norm": 67.26814270019531,
"learning_rate": 2.5195243166489173e-06,
"loss": 0.3588,
"step": 5590
},
{
"epoch": 1.4909478168264112,
"grad_norm": 26.68613052368164,
"learning_rate": 2.515086971955982e-06,
"loss": 0.3797,
"step": 5600
},
{
"epoch": 1.4936102236421724,
"grad_norm": 15.83231258392334,
"learning_rate": 2.510649627263046e-06,
"loss": 0.3654,
"step": 5610
},
{
"epoch": 1.496272630457934,
"grad_norm": 91.63920593261719,
"learning_rate": 2.50621228257011e-06,
"loss": 0.3257,
"step": 5620
},
{
"epoch": 1.4989350372736954,
"grad_norm": 38.38882827758789,
"learning_rate": 2.5017749378771746e-06,
"loss": 0.3135,
"step": 5630
},
{
"epoch": 1.5015974440894568,
"grad_norm": 6.710525989532471,
"learning_rate": 2.4973375931842387e-06,
"loss": 0.3344,
"step": 5640
},
{
"epoch": 1.5042598509052183,
"grad_norm": 31.802005767822266,
"learning_rate": 2.492900248491303e-06,
"loss": 0.4441,
"step": 5650
},
{
"epoch": 1.5069222577209798,
"grad_norm": 62.77358627319336,
"learning_rate": 2.4884629037983673e-06,
"loss": 0.7027,
"step": 5660
},
{
"epoch": 1.5095846645367412,
"grad_norm": 3.811572551727295,
"learning_rate": 2.4840255591054314e-06,
"loss": 0.4285,
"step": 5670
},
{
"epoch": 1.5122470713525027,
"grad_norm": 47.65203857421875,
"learning_rate": 2.479588214412496e-06,
"loss": 0.4392,
"step": 5680
},
{
"epoch": 1.5149094781682642,
"grad_norm": 13.71064567565918,
"learning_rate": 2.47515086971956e-06,
"loss": 0.5708,
"step": 5690
},
{
"epoch": 1.5175718849840254,
"grad_norm": 116.10169982910156,
"learning_rate": 2.470713525026624e-06,
"loss": 0.7059,
"step": 5700
},
{
"epoch": 1.5202342917997869,
"grad_norm": 24.96187973022461,
"learning_rate": 2.4662761803336886e-06,
"loss": 0.4132,
"step": 5710
},
{
"epoch": 1.5228966986155483,
"grad_norm": 6.301574230194092,
"learning_rate": 2.4618388356407527e-06,
"loss": 0.3916,
"step": 5720
},
{
"epoch": 1.5255591054313098,
"grad_norm": 66.5113754272461,
"learning_rate": 2.457401490947817e-06,
"loss": 0.1712,
"step": 5730
},
{
"epoch": 1.5282215122470713,
"grad_norm": 1.1590064764022827,
"learning_rate": 2.4529641462548813e-06,
"loss": 0.4512,
"step": 5740
},
{
"epoch": 1.5308839190628327,
"grad_norm": 84.05433654785156,
"learning_rate": 2.448526801561946e-06,
"loss": 0.3576,
"step": 5750
},
{
"epoch": 1.5335463258785942,
"grad_norm": 12.33926773071289,
"learning_rate": 2.44408945686901e-06,
"loss": 0.4766,
"step": 5760
},
{
"epoch": 1.5362087326943556,
"grad_norm": 10.728660583496094,
"learning_rate": 2.439652112176074e-06,
"loss": 0.5107,
"step": 5770
},
{
"epoch": 1.5388711395101171,
"grad_norm": 79.01044464111328,
"learning_rate": 2.435214767483138e-06,
"loss": 0.4151,
"step": 5780
},
{
"epoch": 1.5415335463258786,
"grad_norm": 43.35115432739258,
"learning_rate": 2.4307774227902027e-06,
"loss": 0.2835,
"step": 5790
},
{
"epoch": 1.54419595314164,
"grad_norm": 47.672882080078125,
"learning_rate": 2.4263400780972668e-06,
"loss": 0.2557,
"step": 5800
},
{
"epoch": 1.5468583599574015,
"grad_norm": 47.75846862792969,
"learning_rate": 2.421902733404331e-06,
"loss": 0.3627,
"step": 5810
},
{
"epoch": 1.549520766773163,
"grad_norm": 71.76559448242188,
"learning_rate": 2.4174653887113954e-06,
"loss": 0.6354,
"step": 5820
},
{
"epoch": 1.5521831735889244,
"grad_norm": 45.219818115234375,
"learning_rate": 2.4130280440184595e-06,
"loss": 0.4252,
"step": 5830
},
{
"epoch": 1.554845580404686,
"grad_norm": 60.581390380859375,
"learning_rate": 2.4085906993255236e-06,
"loss": 0.4574,
"step": 5840
},
{
"epoch": 1.5575079872204474,
"grad_norm": 50.423282623291016,
"learning_rate": 2.404153354632588e-06,
"loss": 0.3814,
"step": 5850
},
{
"epoch": 1.5601703940362088,
"grad_norm": 48.1116828918457,
"learning_rate": 2.3997160099396522e-06,
"loss": 0.4046,
"step": 5860
},
{
"epoch": 1.5628328008519703,
"grad_norm": 60.15108108520508,
"learning_rate": 2.3952786652467167e-06,
"loss": 0.3961,
"step": 5870
},
{
"epoch": 1.5654952076677318,
"grad_norm": 57.97117233276367,
"learning_rate": 2.390841320553781e-06,
"loss": 0.3043,
"step": 5880
},
{
"epoch": 1.5681576144834932,
"grad_norm": 67.44349670410156,
"learning_rate": 2.386403975860845e-06,
"loss": 0.4291,
"step": 5890
},
{
"epoch": 1.5708200212992547,
"grad_norm": 16.61199951171875,
"learning_rate": 2.3819666311679095e-06,
"loss": 0.586,
"step": 5900
},
{
"epoch": 1.573482428115016,
"grad_norm": 72.90435791015625,
"learning_rate": 2.3775292864749736e-06,
"loss": 0.3939,
"step": 5910
},
{
"epoch": 1.5761448349307774,
"grad_norm": 13.542293548583984,
"learning_rate": 2.3730919417820377e-06,
"loss": 0.4731,
"step": 5920
},
{
"epoch": 1.5788072417465389,
"grad_norm": 79.02220153808594,
"learning_rate": 2.368654597089102e-06,
"loss": 0.3008,
"step": 5930
},
{
"epoch": 1.5814696485623003,
"grad_norm": 8.137725830078125,
"learning_rate": 2.3642172523961663e-06,
"loss": 0.3233,
"step": 5940
},
{
"epoch": 1.5841320553780618,
"grad_norm": 44.669395446777344,
"learning_rate": 2.3597799077032304e-06,
"loss": 0.3562,
"step": 5950
},
{
"epoch": 1.5867944621938233,
"grad_norm": 2.6869125366210938,
"learning_rate": 2.355342563010295e-06,
"loss": 0.3489,
"step": 5960
},
{
"epoch": 1.5894568690095847,
"grad_norm": 41.042076110839844,
"learning_rate": 2.350905218317359e-06,
"loss": 0.5328,
"step": 5970
},
{
"epoch": 1.592119275825346,
"grad_norm": 1.4201816320419312,
"learning_rate": 2.3464678736244235e-06,
"loss": 0.4808,
"step": 5980
},
{
"epoch": 1.5947816826411074,
"grad_norm": 89.9078598022461,
"learning_rate": 2.3420305289314876e-06,
"loss": 0.4753,
"step": 5990
},
{
"epoch": 1.5974440894568689,
"grad_norm": 63.90070343017578,
"learning_rate": 2.3375931842385517e-06,
"loss": 0.5337,
"step": 6000
},
{
"epoch": 1.6001064962726304,
"grad_norm": 26.923553466796875,
"learning_rate": 2.3331558395456162e-06,
"loss": 0.4904,
"step": 6010
},
{
"epoch": 1.6027689030883918,
"grad_norm": 55.201473236083984,
"learning_rate": 2.3287184948526803e-06,
"loss": 0.4771,
"step": 6020
},
{
"epoch": 1.6054313099041533,
"grad_norm": 28.148975372314453,
"learning_rate": 2.3242811501597444e-06,
"loss": 0.3157,
"step": 6030
},
{
"epoch": 1.6080937167199147,
"grad_norm": 54.23725509643555,
"learning_rate": 2.319843805466809e-06,
"loss": 0.3538,
"step": 6040
},
{
"epoch": 1.6107561235356762,
"grad_norm": 35.689388275146484,
"learning_rate": 2.315406460773873e-06,
"loss": 0.4193,
"step": 6050
},
{
"epoch": 1.6134185303514377,
"grad_norm": 93.01923370361328,
"learning_rate": 2.3109691160809376e-06,
"loss": 0.4713,
"step": 6060
},
{
"epoch": 1.6160809371671991,
"grad_norm": 71.91993713378906,
"learning_rate": 2.3065317713880017e-06,
"loss": 0.3949,
"step": 6070
},
{
"epoch": 1.6187433439829606,
"grad_norm": 134.8788299560547,
"learning_rate": 2.3020944266950658e-06,
"loss": 0.3017,
"step": 6080
},
{
"epoch": 1.621405750798722,
"grad_norm": 18.717525482177734,
"learning_rate": 2.2976570820021303e-06,
"loss": 0.3954,
"step": 6090
},
{
"epoch": 1.6240681576144835,
"grad_norm": 58.103179931640625,
"learning_rate": 2.2932197373091944e-06,
"loss": 0.397,
"step": 6100
},
{
"epoch": 1.626730564430245,
"grad_norm": 37.77821350097656,
"learning_rate": 2.2887823926162585e-06,
"loss": 0.29,
"step": 6110
},
{
"epoch": 1.6293929712460065,
"grad_norm": 46.78099822998047,
"learning_rate": 2.284345047923323e-06,
"loss": 0.33,
"step": 6120
},
{
"epoch": 1.632055378061768,
"grad_norm": 40.923824310302734,
"learning_rate": 2.279907703230387e-06,
"loss": 0.3925,
"step": 6130
},
{
"epoch": 1.6347177848775294,
"grad_norm": 65.53398895263672,
"learning_rate": 2.275470358537451e-06,
"loss": 0.3028,
"step": 6140
},
{
"epoch": 1.6373801916932909,
"grad_norm": 7.815479278564453,
"learning_rate": 2.2710330138445157e-06,
"loss": 0.4098,
"step": 6150
},
{
"epoch": 1.6400425985090523,
"grad_norm": 32.87179946899414,
"learning_rate": 2.26659566915158e-06,
"loss": 0.1592,
"step": 6160
},
{
"epoch": 1.6427050053248138,
"grad_norm": 41.67914962768555,
"learning_rate": 2.2621583244586443e-06,
"loss": 0.4901,
"step": 6170
},
{
"epoch": 1.645367412140575,
"grad_norm": 131.9647674560547,
"learning_rate": 2.2577209797657084e-06,
"loss": 0.3775,
"step": 6180
},
{
"epoch": 1.6480298189563365,
"grad_norm": 5.2909159660339355,
"learning_rate": 2.2532836350727725e-06,
"loss": 0.3421,
"step": 6190
},
{
"epoch": 1.650692225772098,
"grad_norm": 2.528217077255249,
"learning_rate": 2.248846290379837e-06,
"loss": 0.5184,
"step": 6200
},
{
"epoch": 1.6533546325878594,
"grad_norm": 200.53085327148438,
"learning_rate": 2.244408945686901e-06,
"loss": 0.4295,
"step": 6210
},
{
"epoch": 1.6560170394036209,
"grad_norm": 87.9400405883789,
"learning_rate": 2.2399716009939653e-06,
"loss": 0.4585,
"step": 6220
},
{
"epoch": 1.6586794462193823,
"grad_norm": 67.28507995605469,
"learning_rate": 2.2355342563010298e-06,
"loss": 0.3509,
"step": 6230
},
{
"epoch": 1.6613418530351438,
"grad_norm": 44.125972747802734,
"learning_rate": 2.231096911608094e-06,
"loss": 0.5828,
"step": 6240
},
{
"epoch": 1.664004259850905,
"grad_norm": 8.366595268249512,
"learning_rate": 2.226659566915158e-06,
"loss": 0.2661,
"step": 6250
},
{
"epoch": 1.6666666666666665,
"grad_norm": 17.8718318939209,
"learning_rate": 2.222222222222222e-06,
"loss": 0.2754,
"step": 6260
},
{
"epoch": 1.669329073482428,
"grad_norm": 25.51759147644043,
"learning_rate": 2.2177848775292866e-06,
"loss": 0.3202,
"step": 6270
},
{
"epoch": 1.6719914802981894,
"grad_norm": 116.83739471435547,
"learning_rate": 2.213347532836351e-06,
"loss": 0.3134,
"step": 6280
},
{
"epoch": 1.674653887113951,
"grad_norm": 55.138797760009766,
"learning_rate": 2.2089101881434152e-06,
"loss": 0.3301,
"step": 6290
},
{
"epoch": 1.6773162939297124,
"grad_norm": 62.49728775024414,
"learning_rate": 2.2044728434504793e-06,
"loss": 0.4146,
"step": 6300
},
{
"epoch": 1.6799787007454738,
"grad_norm": 39.887271881103516,
"learning_rate": 2.200035498757544e-06,
"loss": 0.3644,
"step": 6310
},
{
"epoch": 1.6826411075612353,
"grad_norm": 51.77403259277344,
"learning_rate": 2.195598154064608e-06,
"loss": 0.2316,
"step": 6320
},
{
"epoch": 1.6853035143769968,
"grad_norm": 29.805233001708984,
"learning_rate": 2.191160809371672e-06,
"loss": 0.2236,
"step": 6330
},
{
"epoch": 1.6879659211927582,
"grad_norm": 110.7286376953125,
"learning_rate": 2.1867234646787366e-06,
"loss": 0.3946,
"step": 6340
},
{
"epoch": 1.6906283280085197,
"grad_norm": 7.167260646820068,
"learning_rate": 2.1822861199858007e-06,
"loss": 0.397,
"step": 6350
},
{
"epoch": 1.6932907348242812,
"grad_norm": 35.95744705200195,
"learning_rate": 2.1778487752928648e-06,
"loss": 0.2771,
"step": 6360
},
{
"epoch": 1.6959531416400426,
"grad_norm": 79.4239273071289,
"learning_rate": 2.1734114305999293e-06,
"loss": 0.5002,
"step": 6370
},
{
"epoch": 1.698615548455804,
"grad_norm": 25.363061904907227,
"learning_rate": 2.1689740859069934e-06,
"loss": 0.438,
"step": 6380
},
{
"epoch": 1.7012779552715656,
"grad_norm": 40.0933837890625,
"learning_rate": 2.164536741214058e-06,
"loss": 0.3968,
"step": 6390
},
{
"epoch": 1.703940362087327,
"grad_norm": 54.05282974243164,
"learning_rate": 2.160099396521122e-06,
"loss": 0.408,
"step": 6400
},
{
"epoch": 1.7066027689030885,
"grad_norm": 38.28696823120117,
"learning_rate": 2.155662051828186e-06,
"loss": 0.4197,
"step": 6410
},
{
"epoch": 1.70926517571885,
"grad_norm": 21.76958465576172,
"learning_rate": 2.1512247071352506e-06,
"loss": 0.3579,
"step": 6420
},
{
"epoch": 1.7119275825346114,
"grad_norm": 47.16743087768555,
"learning_rate": 2.1467873624423147e-06,
"loss": 0.4702,
"step": 6430
},
{
"epoch": 1.7145899893503729,
"grad_norm": 49.87248229980469,
"learning_rate": 2.142350017749379e-06,
"loss": 0.377,
"step": 6440
},
{
"epoch": 1.7172523961661343,
"grad_norm": 6.606771469116211,
"learning_rate": 2.1379126730564433e-06,
"loss": 0.4398,
"step": 6450
},
{
"epoch": 1.7199148029818956,
"grad_norm": 47.57036209106445,
"learning_rate": 2.1334753283635074e-06,
"loss": 0.5081,
"step": 6460
},
{
"epoch": 1.722577209797657,
"grad_norm": 43.59548568725586,
"learning_rate": 2.1290379836705715e-06,
"loss": 0.2404,
"step": 6470
},
{
"epoch": 1.7252396166134185,
"grad_norm": 76.43495178222656,
"learning_rate": 2.124600638977636e-06,
"loss": 0.349,
"step": 6480
},
{
"epoch": 1.72790202342918,
"grad_norm": 9.13902473449707,
"learning_rate": 2.1201632942847e-06,
"loss": 0.3139,
"step": 6490
},
{
"epoch": 1.7305644302449414,
"grad_norm": 6.037651538848877,
"learning_rate": 2.1157259495917647e-06,
"loss": 0.3026,
"step": 6500
},
{
"epoch": 1.733226837060703,
"grad_norm": 25.021121978759766,
"learning_rate": 2.1112886048988288e-06,
"loss": 0.3791,
"step": 6510
},
{
"epoch": 1.7358892438764644,
"grad_norm": 7.0810065269470215,
"learning_rate": 2.106851260205893e-06,
"loss": 0.3124,
"step": 6520
},
{
"epoch": 1.7385516506922256,
"grad_norm": 71.46830749511719,
"learning_rate": 2.1024139155129574e-06,
"loss": 0.3059,
"step": 6530
},
{
"epoch": 1.741214057507987,
"grad_norm": 35.76477813720703,
"learning_rate": 2.0979765708200215e-06,
"loss": 0.2401,
"step": 6540
},
{
"epoch": 1.7438764643237485,
"grad_norm": 64.08877563476562,
"learning_rate": 2.0935392261270856e-06,
"loss": 0.4158,
"step": 6550
},
{
"epoch": 1.74653887113951,
"grad_norm": 0.7380656599998474,
"learning_rate": 2.0891018814341497e-06,
"loss": 0.2728,
"step": 6560
},
{
"epoch": 1.7492012779552715,
"grad_norm": 20.05269432067871,
"learning_rate": 2.084664536741214e-06,
"loss": 0.3754,
"step": 6570
},
{
"epoch": 1.751863684771033,
"grad_norm": 8.07951831817627,
"learning_rate": 2.0802271920482787e-06,
"loss": 0.2884,
"step": 6580
},
{
"epoch": 1.7545260915867944,
"grad_norm": 33.27769470214844,
"learning_rate": 2.075789847355343e-06,
"loss": 0.5066,
"step": 6590
},
{
"epoch": 1.7571884984025559,
"grad_norm": 54.88376998901367,
"learning_rate": 2.071352502662407e-06,
"loss": 0.2625,
"step": 6600
},
{
"epoch": 1.7598509052183173,
"grad_norm": 83.93997192382812,
"learning_rate": 2.0669151579694714e-06,
"loss": 0.2864,
"step": 6610
},
{
"epoch": 1.7625133120340788,
"grad_norm": 28.251733779907227,
"learning_rate": 2.0624778132765355e-06,
"loss": 0.243,
"step": 6620
},
{
"epoch": 1.7651757188498403,
"grad_norm": 11.35056209564209,
"learning_rate": 2.0580404685835996e-06,
"loss": 0.3056,
"step": 6630
},
{
"epoch": 1.7678381256656017,
"grad_norm": 39.22128677368164,
"learning_rate": 2.053603123890664e-06,
"loss": 0.3891,
"step": 6640
},
{
"epoch": 1.7705005324813632,
"grad_norm": 24.096765518188477,
"learning_rate": 2.0491657791977283e-06,
"loss": 0.4854,
"step": 6650
},
{
"epoch": 1.7731629392971247,
"grad_norm": 42.6016731262207,
"learning_rate": 2.0447284345047924e-06,
"loss": 0.3963,
"step": 6660
},
{
"epoch": 1.7758253461128861,
"grad_norm": 30.725244522094727,
"learning_rate": 2.0402910898118565e-06,
"loss": 0.3167,
"step": 6670
},
{
"epoch": 1.7784877529286476,
"grad_norm": 34.02895736694336,
"learning_rate": 2.035853745118921e-06,
"loss": 0.325,
"step": 6680
},
{
"epoch": 1.781150159744409,
"grad_norm": 19.365659713745117,
"learning_rate": 2.0314164004259855e-06,
"loss": 0.3661,
"step": 6690
},
{
"epoch": 1.7838125665601705,
"grad_norm": 61.94493865966797,
"learning_rate": 2.0269790557330496e-06,
"loss": 0.515,
"step": 6700
},
{
"epoch": 1.786474973375932,
"grad_norm": 85.5712890625,
"learning_rate": 2.0225417110401137e-06,
"loss": 0.4211,
"step": 6710
},
{
"epoch": 1.7891373801916934,
"grad_norm": 43.722721099853516,
"learning_rate": 2.0181043663471782e-06,
"loss": 0.3279,
"step": 6720
},
{
"epoch": 1.791799787007455,
"grad_norm": 90.56026458740234,
"learning_rate": 2.0136670216542423e-06,
"loss": 0.2575,
"step": 6730
},
{
"epoch": 1.7944621938232161,
"grad_norm": 8.048587799072266,
"learning_rate": 2.0092296769613064e-06,
"loss": 0.4455,
"step": 6740
},
{
"epoch": 1.7971246006389776,
"grad_norm": 41.22207260131836,
"learning_rate": 2.0047923322683705e-06,
"loss": 0.3261,
"step": 6750
},
{
"epoch": 1.799787007454739,
"grad_norm": 29.55959701538086,
"learning_rate": 2.000354987575435e-06,
"loss": 0.4426,
"step": 6760
},
{
"epoch": 1.8024494142705005,
"grad_norm": 45.04960632324219,
"learning_rate": 1.995917642882499e-06,
"loss": 0.4013,
"step": 6770
},
{
"epoch": 1.805111821086262,
"grad_norm": 63.23904037475586,
"learning_rate": 1.9914802981895632e-06,
"loss": 0.4687,
"step": 6780
},
{
"epoch": 1.8077742279020235,
"grad_norm": 49.02014923095703,
"learning_rate": 1.9870429534966278e-06,
"loss": 0.465,
"step": 6790
},
{
"epoch": 1.810436634717785,
"grad_norm": 29.04024314880371,
"learning_rate": 1.9826056088036923e-06,
"loss": 0.2825,
"step": 6800
},
{
"epoch": 1.8130990415335462,
"grad_norm": 97.53395080566406,
"learning_rate": 1.9781682641107564e-06,
"loss": 0.4703,
"step": 6810
},
{
"epoch": 1.8157614483493076,
"grad_norm": 39.52357482910156,
"learning_rate": 1.9737309194178205e-06,
"loss": 0.3025,
"step": 6820
},
{
"epoch": 1.818423855165069,
"grad_norm": 43.74717712402344,
"learning_rate": 1.969293574724885e-06,
"loss": 0.4479,
"step": 6830
},
{
"epoch": 1.8210862619808306,
"grad_norm": 51.20553970336914,
"learning_rate": 1.964856230031949e-06,
"loss": 0.2219,
"step": 6840
},
{
"epoch": 1.823748668796592,
"grad_norm": 43.11589050292969,
"learning_rate": 1.960418885339013e-06,
"loss": 0.4179,
"step": 6850
},
{
"epoch": 1.8264110756123535,
"grad_norm": 15.990487098693848,
"learning_rate": 1.9559815406460773e-06,
"loss": 0.4402,
"step": 6860
},
{
"epoch": 1.829073482428115,
"grad_norm": 79.59407806396484,
"learning_rate": 1.951544195953142e-06,
"loss": 0.2956,
"step": 6870
},
{
"epoch": 1.8317358892438764,
"grad_norm": 79.91435241699219,
"learning_rate": 1.947106851260206e-06,
"loss": 0.2556,
"step": 6880
},
{
"epoch": 1.834398296059638,
"grad_norm": 36.0528678894043,
"learning_rate": 1.9426695065672704e-06,
"loss": 0.3791,
"step": 6890
},
{
"epoch": 1.8370607028753994,
"grad_norm": 53.18592834472656,
"learning_rate": 1.9382321618743345e-06,
"loss": 0.5045,
"step": 6900
},
{
"epoch": 1.8397231096911608,
"grad_norm": 28.174659729003906,
"learning_rate": 1.933794817181399e-06,
"loss": 0.251,
"step": 6910
},
{
"epoch": 1.8423855165069223,
"grad_norm": 91.26044464111328,
"learning_rate": 1.929357472488463e-06,
"loss": 0.3903,
"step": 6920
},
{
"epoch": 1.8450479233226837,
"grad_norm": 52.09424591064453,
"learning_rate": 1.9249201277955272e-06,
"loss": 0.2075,
"step": 6930
},
{
"epoch": 1.8477103301384452,
"grad_norm": 46.893714904785156,
"learning_rate": 1.9204827831025918e-06,
"loss": 0.3913,
"step": 6940
},
{
"epoch": 1.8503727369542067,
"grad_norm": 85.90911102294922,
"learning_rate": 1.916045438409656e-06,
"loss": 0.1606,
"step": 6950
},
{
"epoch": 1.8530351437699681,
"grad_norm": 2.6444146633148193,
"learning_rate": 1.91160809371672e-06,
"loss": 0.2813,
"step": 6960
},
{
"epoch": 1.8556975505857296,
"grad_norm": 0.4849480092525482,
"learning_rate": 1.9071707490237843e-06,
"loss": 0.3612,
"step": 6970
},
{
"epoch": 1.858359957401491,
"grad_norm": 81.86653137207031,
"learning_rate": 1.9027334043308484e-06,
"loss": 0.3383,
"step": 6980
},
{
"epoch": 1.8610223642172525,
"grad_norm": 70.43437194824219,
"learning_rate": 1.8982960596379127e-06,
"loss": 0.5331,
"step": 6990
},
{
"epoch": 1.863684771033014,
"grad_norm": 93.09194946289062,
"learning_rate": 1.8938587149449772e-06,
"loss": 0.3872,
"step": 7000
},
{
"epoch": 1.8663471778487752,
"grad_norm": 1.219547152519226,
"learning_rate": 1.8894213702520415e-06,
"loss": 0.4318,
"step": 7010
},
{
"epoch": 1.8690095846645367,
"grad_norm": 77.21363067626953,
"learning_rate": 1.8849840255591056e-06,
"loss": 0.4622,
"step": 7020
},
{
"epoch": 1.8716719914802982,
"grad_norm": 14.271868705749512,
"learning_rate": 1.88054668086617e-06,
"loss": 0.2822,
"step": 7030
},
{
"epoch": 1.8743343982960596,
"grad_norm": 44.3095703125,
"learning_rate": 1.876109336173234e-06,
"loss": 0.4507,
"step": 7040
},
{
"epoch": 1.876996805111821,
"grad_norm": 59.71763610839844,
"learning_rate": 1.8716719914802983e-06,
"loss": 0.3678,
"step": 7050
},
{
"epoch": 1.8796592119275826,
"grad_norm": 84.87511444091797,
"learning_rate": 1.8672346467873626e-06,
"loss": 0.531,
"step": 7060
},
{
"epoch": 1.882321618743344,
"grad_norm": 29.216161727905273,
"learning_rate": 1.8627973020944267e-06,
"loss": 0.2502,
"step": 7070
},
{
"epoch": 1.8849840255591053,
"grad_norm": 25.902713775634766,
"learning_rate": 1.858359957401491e-06,
"loss": 0.34,
"step": 7080
},
{
"epoch": 1.8876464323748667,
"grad_norm": 76.64691162109375,
"learning_rate": 1.8539226127085551e-06,
"loss": 0.3078,
"step": 7090
},
{
"epoch": 1.8903088391906282,
"grad_norm": 50.414451599121094,
"learning_rate": 1.8494852680156197e-06,
"loss": 0.2755,
"step": 7100
},
{
"epoch": 1.8929712460063897,
"grad_norm": 41.31875228881836,
"learning_rate": 1.845047923322684e-06,
"loss": 0.3238,
"step": 7110
},
{
"epoch": 1.8956336528221511,
"grad_norm": 4.913923263549805,
"learning_rate": 1.840610578629748e-06,
"loss": 0.3134,
"step": 7120
},
{
"epoch": 1.8982960596379126,
"grad_norm": 64.60235595703125,
"learning_rate": 1.8361732339368124e-06,
"loss": 0.362,
"step": 7130
},
{
"epoch": 1.900958466453674,
"grad_norm": 32.756526947021484,
"learning_rate": 1.8317358892438767e-06,
"loss": 0.2105,
"step": 7140
},
{
"epoch": 1.9036208732694355,
"grad_norm": 42.97222900390625,
"learning_rate": 1.8272985445509408e-06,
"loss": 0.5158,
"step": 7150
},
{
"epoch": 1.906283280085197,
"grad_norm": 66.98497009277344,
"learning_rate": 1.8228611998580051e-06,
"loss": 0.4249,
"step": 7160
},
{
"epoch": 1.9089456869009584,
"grad_norm": 52.125022888183594,
"learning_rate": 1.8184238551650692e-06,
"loss": 0.3424,
"step": 7170
},
{
"epoch": 1.91160809371672,
"grad_norm": 40.963932037353516,
"learning_rate": 1.8139865104721335e-06,
"loss": 0.2219,
"step": 7180
},
{
"epoch": 1.9142705005324814,
"grad_norm": 26.110610961914062,
"learning_rate": 1.8095491657791978e-06,
"loss": 0.4898,
"step": 7190
},
{
"epoch": 1.9169329073482428,
"grad_norm": 62.95829391479492,
"learning_rate": 1.8051118210862623e-06,
"loss": 0.4794,
"step": 7200
},
{
"epoch": 1.9195953141640043,
"grad_norm": 69.13304901123047,
"learning_rate": 1.8006744763933264e-06,
"loss": 0.4656,
"step": 7210
},
{
"epoch": 1.9222577209797658,
"grad_norm": 82.6933364868164,
"learning_rate": 1.7962371317003908e-06,
"loss": 0.5496,
"step": 7220
},
{
"epoch": 1.9249201277955272,
"grad_norm": 37.5575065612793,
"learning_rate": 1.7917997870074549e-06,
"loss": 0.2939,
"step": 7230
},
{
"epoch": 1.9275825346112887,
"grad_norm": 49.96902847290039,
"learning_rate": 1.7873624423145192e-06,
"loss": 0.3707,
"step": 7240
},
{
"epoch": 1.9302449414270502,
"grad_norm": 79.97505950927734,
"learning_rate": 1.7829250976215835e-06,
"loss": 0.2701,
"step": 7250
},
{
"epoch": 1.9329073482428116,
"grad_norm": 57.69758605957031,
"learning_rate": 1.7784877529286476e-06,
"loss": 0.4132,
"step": 7260
},
{
"epoch": 1.935569755058573,
"grad_norm": 0.8374597430229187,
"learning_rate": 1.7740504082357119e-06,
"loss": 0.2273,
"step": 7270
},
{
"epoch": 1.9382321618743346,
"grad_norm": 13.859219551086426,
"learning_rate": 1.769613063542776e-06,
"loss": 0.329,
"step": 7280
},
{
"epoch": 1.9408945686900958,
"grad_norm": 56.059635162353516,
"learning_rate": 1.7651757188498403e-06,
"loss": 0.3534,
"step": 7290
},
{
"epoch": 1.9435569755058573,
"grad_norm": 48.91998291015625,
"learning_rate": 1.7607383741569046e-06,
"loss": 0.461,
"step": 7300
},
{
"epoch": 1.9462193823216187,
"grad_norm": 128.54881286621094,
"learning_rate": 1.7563010294639691e-06,
"loss": 0.5919,
"step": 7310
},
{
"epoch": 1.9488817891373802,
"grad_norm": 59.10694122314453,
"learning_rate": 1.7518636847710332e-06,
"loss": 0.3616,
"step": 7320
},
{
"epoch": 1.9515441959531417,
"grad_norm": 49.39072036743164,
"learning_rate": 1.7474263400780975e-06,
"loss": 0.4074,
"step": 7330
},
{
"epoch": 1.9542066027689031,
"grad_norm": 21.689308166503906,
"learning_rate": 1.7429889953851616e-06,
"loss": 0.3047,
"step": 7340
},
{
"epoch": 1.9568690095846646,
"grad_norm": 97.16775512695312,
"learning_rate": 1.738551650692226e-06,
"loss": 0.2323,
"step": 7350
},
{
"epoch": 1.9595314164004258,
"grad_norm": 39.6245231628418,
"learning_rate": 1.7341143059992903e-06,
"loss": 0.3336,
"step": 7360
},
{
"epoch": 1.9621938232161873,
"grad_norm": 81.37751770019531,
"learning_rate": 1.7296769613063543e-06,
"loss": 0.3158,
"step": 7370
},
{
"epoch": 1.9648562300319488,
"grad_norm": 8.210901260375977,
"learning_rate": 1.7252396166134187e-06,
"loss": 0.3438,
"step": 7380
},
{
"epoch": 1.9675186368477102,
"grad_norm": 60.9276123046875,
"learning_rate": 1.7208022719204828e-06,
"loss": 0.4774,
"step": 7390
},
{
"epoch": 1.9701810436634717,
"grad_norm": 36.85103225708008,
"learning_rate": 1.716364927227547e-06,
"loss": 0.2424,
"step": 7400
},
{
"epoch": 1.9728434504792332,
"grad_norm": 1.969171404838562,
"learning_rate": 1.7119275825346116e-06,
"loss": 0.2775,
"step": 7410
},
{
"epoch": 1.9755058572949946,
"grad_norm": 20.650897979736328,
"learning_rate": 1.7074902378416757e-06,
"loss": 0.271,
"step": 7420
},
{
"epoch": 1.978168264110756,
"grad_norm": 45.896629333496094,
"learning_rate": 1.70305289314874e-06,
"loss": 0.4138,
"step": 7430
},
{
"epoch": 1.9808306709265175,
"grad_norm": 32.696041107177734,
"learning_rate": 1.6986155484558043e-06,
"loss": 0.3364,
"step": 7440
},
{
"epoch": 1.983493077742279,
"grad_norm": 22.776897430419922,
"learning_rate": 1.6941782037628684e-06,
"loss": 0.5304,
"step": 7450
},
{
"epoch": 1.9861554845580405,
"grad_norm": 1.038737416267395,
"learning_rate": 1.6897408590699327e-06,
"loss": 0.2505,
"step": 7460
},
{
"epoch": 1.988817891373802,
"grad_norm": 58.575130462646484,
"learning_rate": 1.6853035143769968e-06,
"loss": 0.544,
"step": 7470
},
{
"epoch": 1.9914802981895634,
"grad_norm": 73.77245330810547,
"learning_rate": 1.6808661696840611e-06,
"loss": 0.3231,
"step": 7480
},
{
"epoch": 1.9941427050053249,
"grad_norm": 45.046348571777344,
"learning_rate": 1.6764288249911254e-06,
"loss": 0.31,
"step": 7490
},
{
"epoch": 1.9968051118210863,
"grad_norm": 28.5229549407959,
"learning_rate": 1.6719914802981895e-06,
"loss": 0.3738,
"step": 7500
},
{
"epoch": 1.9994675186368478,
"grad_norm": 65.60608673095703,
"learning_rate": 1.667554135605254e-06,
"loss": 0.3686,
"step": 7510
},
{
"epoch": 2.0,
"eval_loss": 0.47438761591911316,
"eval_runtime": 403.5495,
"eval_samples_per_second": 9.307,
"eval_steps_per_second": 1.165,
"step": 7512
}
],
"logging_steps": 10,
"max_steps": 11268,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5809432817604608e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}