|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7212405337179949, |
|
"eval_steps": 200, |
|
"global_step": 6000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006010337780983291, |
|
"grad_norm": 0.9436860084533691, |
|
"learning_rate": 3.0048076923076927e-06, |
|
"loss": 4.4875, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0012020675561966582, |
|
"grad_norm": 0.49743083119392395, |
|
"learning_rate": 6.0096153846153855e-06, |
|
"loss": 4.2438, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0018031013342949874, |
|
"grad_norm": 0.7305557131767273, |
|
"learning_rate": 9.014423076923076e-06, |
|
"loss": 4.7438, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0024041351123933164, |
|
"grad_norm": 1.2842742204666138, |
|
"learning_rate": 1.2019230769230771e-05, |
|
"loss": 4.3125, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0030051688904916456, |
|
"grad_norm": 1.0128940343856812, |
|
"learning_rate": 1.5024038461538462e-05, |
|
"loss": 4.2969, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.003606202668589975, |
|
"grad_norm": 1.6097222566604614, |
|
"learning_rate": 1.8028846153846152e-05, |
|
"loss": 4.3625, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.004207236446688304, |
|
"grad_norm": 0.7380394339561462, |
|
"learning_rate": 2.103365384615385e-05, |
|
"loss": 3.6562, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.004808270224786633, |
|
"grad_norm": 2.499553918838501, |
|
"learning_rate": 2.4038461538461542e-05, |
|
"loss": 3.9656, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005409304002884962, |
|
"grad_norm": 0.9382426142692566, |
|
"learning_rate": 2.704326923076923e-05, |
|
"loss": 3.7906, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.006010337780983291, |
|
"grad_norm": 0.4448552429676056, |
|
"learning_rate": 3.0048076923076925e-05, |
|
"loss": 3.4531, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00661137155908162, |
|
"grad_norm": 0.6187996864318848, |
|
"learning_rate": 3.3052884615384615e-05, |
|
"loss": 3.0406, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.00721240533717995, |
|
"grad_norm": 0.4894959032535553, |
|
"learning_rate": 3.6057692307692304e-05, |
|
"loss": 2.9844, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.007813439115278278, |
|
"grad_norm": 0.523160994052887, |
|
"learning_rate": 3.90625e-05, |
|
"loss": 2.825, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.008414472893376608, |
|
"grad_norm": 0.41818058490753174, |
|
"learning_rate": 4.20673076923077e-05, |
|
"loss": 2.5125, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.009015506671474938, |
|
"grad_norm": 0.37683457136154175, |
|
"learning_rate": 4.507211538461539e-05, |
|
"loss": 2.6844, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.009616540449573266, |
|
"grad_norm": 0.428375780582428, |
|
"learning_rate": 4.8076923076923084e-05, |
|
"loss": 2.6719, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.010217574227671595, |
|
"grad_norm": 0.3897765576839447, |
|
"learning_rate": 5.108173076923077e-05, |
|
"loss": 2.2531, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.010818608005769925, |
|
"grad_norm": 0.2265370637178421, |
|
"learning_rate": 5.408653846153846e-05, |
|
"loss": 2.2844, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.011419641783868253, |
|
"grad_norm": 0.2113611400127411, |
|
"learning_rate": 5.709134615384615e-05, |
|
"loss": 2.1922, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.012020675561966582, |
|
"grad_norm": 0.1886824667453766, |
|
"learning_rate": 6.009615384615385e-05, |
|
"loss": 2.3516, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.012621709340064912, |
|
"grad_norm": 0.25855502486228943, |
|
"learning_rate": 6.310096153846154e-05, |
|
"loss": 2.4, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01322274311816324, |
|
"grad_norm": 0.22833962738513947, |
|
"learning_rate": 6.610576923076923e-05, |
|
"loss": 2.2844, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01382377689626157, |
|
"grad_norm": 0.30784738063812256, |
|
"learning_rate": 6.911057692307693e-05, |
|
"loss": 2.2016, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0144248106743599, |
|
"grad_norm": 0.3998744487762451, |
|
"learning_rate": 7.211538461538461e-05, |
|
"loss": 2.4125, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.015025844452458229, |
|
"grad_norm": 0.24773858487606049, |
|
"learning_rate": 7.512019230769231e-05, |
|
"loss": 2.4156, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.015626878230556557, |
|
"grad_norm": 0.26020580530166626, |
|
"learning_rate": 7.8125e-05, |
|
"loss": 2.0094, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.016227912008654886, |
|
"grad_norm": 0.25112366676330566, |
|
"learning_rate": 8.112980769230769e-05, |
|
"loss": 2.5969, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.016828945786753216, |
|
"grad_norm": 0.3155271112918854, |
|
"learning_rate": 8.41346153846154e-05, |
|
"loss": 1.9844, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.017429979564851546, |
|
"grad_norm": 0.2684473693370819, |
|
"learning_rate": 8.713942307692307e-05, |
|
"loss": 2.3594, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.018031013342949875, |
|
"grad_norm": 0.19519321620464325, |
|
"learning_rate": 9.014423076923077e-05, |
|
"loss": 2.1906, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0186320471210482, |
|
"grad_norm": 0.29595857858657837, |
|
"learning_rate": 9.314903846153846e-05, |
|
"loss": 2.4844, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.01923308089914653, |
|
"grad_norm": 0.21725840866565704, |
|
"learning_rate": 9.615384615384617e-05, |
|
"loss": 1.9969, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01983411467724486, |
|
"grad_norm": 0.250431627035141, |
|
"learning_rate": 9.915865384615384e-05, |
|
"loss": 2.1469, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02043514845534319, |
|
"grad_norm": 0.22979402542114258, |
|
"learning_rate": 0.00010216346153846153, |
|
"loss": 2.0891, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.02103618223344152, |
|
"grad_norm": 0.29841649532318115, |
|
"learning_rate": 0.00010516826923076924, |
|
"loss": 2.0891, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.02163721601153985, |
|
"grad_norm": 0.3121524155139923, |
|
"learning_rate": 0.00010817307692307693, |
|
"loss": 2.2938, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02223824978963818, |
|
"grad_norm": 0.25094497203826904, |
|
"learning_rate": 0.00011117788461538462, |
|
"loss": 2.0672, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.022839283567736506, |
|
"grad_norm": 0.32229083776474, |
|
"learning_rate": 0.0001141826923076923, |
|
"loss": 1.9781, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.023440317345834835, |
|
"grad_norm": 0.30247944593429565, |
|
"learning_rate": 0.0001171875, |
|
"loss": 2.4469, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.024041351123933165, |
|
"grad_norm": 0.3992522358894348, |
|
"learning_rate": 0.0001201923076923077, |
|
"loss": 2.1609, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.024041351123933165, |
|
"eval_loss": 2.7308592796325684, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.262, |
|
"eval_samples_per_second": 4.537, |
|
"eval_steps_per_second": 1.134, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.024642384902031494, |
|
"grad_norm": 0.28425589203834534, |
|
"learning_rate": 0.0001231971153846154, |
|
"loss": 2.625, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.025243418680129824, |
|
"grad_norm": 0.31964734196662903, |
|
"learning_rate": 0.00012620192307692308, |
|
"loss": 1.9328, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.025844452458228154, |
|
"grad_norm": 0.37272173166275024, |
|
"learning_rate": 0.00012920673076923078, |
|
"loss": 2.1641, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.02644548623632648, |
|
"grad_norm": 0.32725071907043457, |
|
"learning_rate": 0.00013221153846153846, |
|
"loss": 2.225, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02704652001442481, |
|
"grad_norm": 0.25303465127944946, |
|
"learning_rate": 0.00013521634615384616, |
|
"loss": 2.2375, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.02764755379252314, |
|
"grad_norm": 0.4098326861858368, |
|
"learning_rate": 0.00013822115384615386, |
|
"loss": 2.0203, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.02824858757062147, |
|
"grad_norm": 0.3435593545436859, |
|
"learning_rate": 0.00014122596153846154, |
|
"loss": 2.3016, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.0288496213487198, |
|
"grad_norm": 0.4556426703929901, |
|
"learning_rate": 0.00014423076923076922, |
|
"loss": 2.2969, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.029450655126818128, |
|
"grad_norm": 0.39692002534866333, |
|
"learning_rate": 0.00014723557692307692, |
|
"loss": 2.3031, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.030051688904916458, |
|
"grad_norm": 0.31686538457870483, |
|
"learning_rate": 0.00015024038461538462, |
|
"loss": 2.2984, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.030652722683014784, |
|
"grad_norm": 0.30815422534942627, |
|
"learning_rate": 0.00015324519230769233, |
|
"loss": 2.1734, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.031253756461113114, |
|
"grad_norm": 0.3927950859069824, |
|
"learning_rate": 0.00015625, |
|
"loss": 2.0031, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03185479023921144, |
|
"grad_norm": 0.3010413944721222, |
|
"learning_rate": 0.00015925480769230768, |
|
"loss": 2.1875, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.03245582401730977, |
|
"grad_norm": 0.39929866790771484, |
|
"learning_rate": 0.00016225961538461538, |
|
"loss": 2.2266, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0330568577954081, |
|
"grad_norm": 0.3709786832332611, |
|
"learning_rate": 0.00016526442307692309, |
|
"loss": 2.2344, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.03365789157350643, |
|
"grad_norm": 0.38551804423332214, |
|
"learning_rate": 0.0001682692307692308, |
|
"loss": 2.0391, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03425892535160476, |
|
"grad_norm": 0.3497028350830078, |
|
"learning_rate": 0.00017127403846153847, |
|
"loss": 2.1328, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.03485995912970309, |
|
"grad_norm": 0.22066070139408112, |
|
"learning_rate": 0.00017427884615384614, |
|
"loss": 1.9891, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03546099290780142, |
|
"grad_norm": 0.3861188590526581, |
|
"learning_rate": 0.00017728365384615385, |
|
"loss": 2.0266, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.03606202668589975, |
|
"grad_norm": 0.43038997054100037, |
|
"learning_rate": 0.00018028846153846155, |
|
"loss": 2.2062, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03666306046399807, |
|
"grad_norm": 0.4089072644710541, |
|
"learning_rate": 0.00018329326923076922, |
|
"loss": 2.2016, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.0372640942420964, |
|
"grad_norm": 0.40281516313552856, |
|
"learning_rate": 0.00018629807692307693, |
|
"loss": 2.2578, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03786512802019473, |
|
"grad_norm": 0.33316513895988464, |
|
"learning_rate": 0.0001893028846153846, |
|
"loss": 2.1844, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.03846616179829306, |
|
"grad_norm": 0.4020228087902069, |
|
"learning_rate": 0.00019230769230769233, |
|
"loss": 2.2109, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03906719557639139, |
|
"grad_norm": 0.36403888463974, |
|
"learning_rate": 0.0001953125, |
|
"loss": 2.0063, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.03966822935448972, |
|
"grad_norm": 0.4289080500602722, |
|
"learning_rate": 0.0001983173076923077, |
|
"loss": 2.1641, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.04026926313258805, |
|
"grad_norm": 0.3827407658100128, |
|
"learning_rate": 0.0002013221153846154, |
|
"loss": 2.4125, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.04087029691068638, |
|
"grad_norm": 0.28297996520996094, |
|
"learning_rate": 0.00020432692307692307, |
|
"loss": 2.2047, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.04147133068878471, |
|
"grad_norm": 0.3654349744319916, |
|
"learning_rate": 0.0002073317307692308, |
|
"loss": 2.0344, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.04207236446688304, |
|
"grad_norm": 0.44768983125686646, |
|
"learning_rate": 0.00021033653846153847, |
|
"loss": 2.0469, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.04267339824498137, |
|
"grad_norm": 0.36050865054130554, |
|
"learning_rate": 0.00021334134615384615, |
|
"loss": 1.8203, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.0432744320230797, |
|
"grad_norm": 0.41343504190444946, |
|
"learning_rate": 0.00021634615384615385, |
|
"loss": 1.9031, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04387546580117803, |
|
"grad_norm": 0.33549779653549194, |
|
"learning_rate": 0.00021935096153846153, |
|
"loss": 1.9859, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.04447649957927636, |
|
"grad_norm": 0.39200559258461, |
|
"learning_rate": 0.00022235576923076923, |
|
"loss": 2.0672, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.04507753335737468, |
|
"grad_norm": 0.5816010236740112, |
|
"learning_rate": 0.00022536057692307694, |
|
"loss": 2.1625, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.04567856713547301, |
|
"grad_norm": 0.4004225432872772, |
|
"learning_rate": 0.0002283653846153846, |
|
"loss": 2.1297, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04627960091357134, |
|
"grad_norm": 0.3329584300518036, |
|
"learning_rate": 0.00023137019230769232, |
|
"loss": 1.8969, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.04688063469166967, |
|
"grad_norm": 0.3800398111343384, |
|
"learning_rate": 0.000234375, |
|
"loss": 1.875, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.047481668469768, |
|
"grad_norm": 0.5345351696014404, |
|
"learning_rate": 0.0002373798076923077, |
|
"loss": 2.0641, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.04808270224786633, |
|
"grad_norm": 0.31537583470344543, |
|
"learning_rate": 0.0002403846153846154, |
|
"loss": 2.0828, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04808270224786633, |
|
"eval_loss": 2.657031297683716, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2197, |
|
"eval_samples_per_second": 4.543, |
|
"eval_steps_per_second": 1.136, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04868373602596466, |
|
"grad_norm": 0.3651765286922455, |
|
"learning_rate": 0.00024338942307692307, |
|
"loss": 2.2188, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.04928476980406299, |
|
"grad_norm": 0.42044126987457275, |
|
"learning_rate": 0.0002463942307692308, |
|
"loss": 1.9625, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.04988580358216132, |
|
"grad_norm": 0.3405047357082367, |
|
"learning_rate": 0.00024939903846153845, |
|
"loss": 2.1203, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.05048683736025965, |
|
"grad_norm": 0.5022028088569641, |
|
"learning_rate": 0.00025240384615384616, |
|
"loss": 1.7672, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.05108787113835798, |
|
"grad_norm": 0.31208300590515137, |
|
"learning_rate": 0.00025540865384615386, |
|
"loss": 1.9266, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.05168890491645631, |
|
"grad_norm": 0.39399516582489014, |
|
"learning_rate": 0.00025841346153846156, |
|
"loss": 1.8828, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05228993869455464, |
|
"grad_norm": 0.42515093088150024, |
|
"learning_rate": 0.0002614182692307692, |
|
"loss": 1.7656, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.05289097247265296, |
|
"grad_norm": 0.3947089910507202, |
|
"learning_rate": 0.0002644230769230769, |
|
"loss": 2.0484, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.05349200625075129, |
|
"grad_norm": 0.6280628442764282, |
|
"learning_rate": 0.0002674278846153846, |
|
"loss": 2.1422, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.05409304002884962, |
|
"grad_norm": 0.3639807105064392, |
|
"learning_rate": 0.0002704326923076923, |
|
"loss": 1.9781, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05469407380694795, |
|
"grad_norm": 0.3984295427799225, |
|
"learning_rate": 0.0002734375, |
|
"loss": 2.2359, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.05529510758504628, |
|
"grad_norm": 0.33954715728759766, |
|
"learning_rate": 0.00027644230769230773, |
|
"loss": 2.3547, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.05589614136314461, |
|
"grad_norm": 0.4361511468887329, |
|
"learning_rate": 0.0002794471153846154, |
|
"loss": 2.0859, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.05649717514124294, |
|
"grad_norm": 0.471563458442688, |
|
"learning_rate": 0.0002824519230769231, |
|
"loss": 2.1703, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.05709820891934127, |
|
"grad_norm": 0.2517772614955902, |
|
"learning_rate": 0.0002854567307692308, |
|
"loss": 2.0281, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.0576992426974396, |
|
"grad_norm": 0.3190082907676697, |
|
"learning_rate": 0.00028846153846153843, |
|
"loss": 2.0234, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.05830027647553793, |
|
"grad_norm": 0.37972012162208557, |
|
"learning_rate": 0.00029146634615384614, |
|
"loss": 2.15, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.058901310253636256, |
|
"grad_norm": 0.37980136275291443, |
|
"learning_rate": 0.00029447115384615384, |
|
"loss": 2.1219, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.059502344031734586, |
|
"grad_norm": 0.32648953795433044, |
|
"learning_rate": 0.00029747596153846154, |
|
"loss": 1.9703, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.060103377809832916, |
|
"grad_norm": 0.28836116194725037, |
|
"learning_rate": 0.00030048076923076925, |
|
"loss": 2.0406, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.060704411587931245, |
|
"grad_norm": 0.2953934967517853, |
|
"learning_rate": 0.00030348557692307695, |
|
"loss": 2.2156, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.06130544536602957, |
|
"grad_norm": 0.4778139889240265, |
|
"learning_rate": 0.00030649038461538465, |
|
"loss": 2.0672, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0619064791441279, |
|
"grad_norm": 0.27339640259742737, |
|
"learning_rate": 0.0003094951923076923, |
|
"loss": 1.8953, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.06250751292222623, |
|
"grad_norm": 0.3127667009830475, |
|
"learning_rate": 0.0003125, |
|
"loss": 2.0859, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.06310854670032456, |
|
"grad_norm": 0.2676738500595093, |
|
"learning_rate": 0.0003155048076923077, |
|
"loss": 1.9656, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.06370958047842289, |
|
"grad_norm": 0.3519584834575653, |
|
"learning_rate": 0.00031850961538461536, |
|
"loss": 2.0828, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.06431061425652122, |
|
"grad_norm": 0.38000714778900146, |
|
"learning_rate": 0.00032151442307692306, |
|
"loss": 1.8656, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.06491164803461955, |
|
"grad_norm": 0.5076779127120972, |
|
"learning_rate": 0.00032451923076923077, |
|
"loss": 1.8938, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.06551268181271787, |
|
"grad_norm": 0.3919801414012909, |
|
"learning_rate": 0.00032752403846153847, |
|
"loss": 2.1203, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.0661137155908162, |
|
"grad_norm": 0.3263305425643921, |
|
"learning_rate": 0.00033052884615384617, |
|
"loss": 2.0344, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.06671474936891453, |
|
"grad_norm": 0.4196506440639496, |
|
"learning_rate": 0.0003335336538461539, |
|
"loss": 2.1078, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.06731578314701286, |
|
"grad_norm": 0.3997637927532196, |
|
"learning_rate": 0.0003365384615384616, |
|
"loss": 1.8844, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.06791681692511119, |
|
"grad_norm": 0.39547184109687805, |
|
"learning_rate": 0.00033954326923076923, |
|
"loss": 1.9406, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.06851785070320952, |
|
"grad_norm": 0.36170271039009094, |
|
"learning_rate": 0.00034254807692307693, |
|
"loss": 2.2469, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.06911888448130785, |
|
"grad_norm": 0.3041069507598877, |
|
"learning_rate": 0.00034555288461538463, |
|
"loss": 1.7734, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.06971991825940618, |
|
"grad_norm": 0.31936579942703247, |
|
"learning_rate": 0.0003485576923076923, |
|
"loss": 2.1141, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0703209520375045, |
|
"grad_norm": 0.5643404722213745, |
|
"learning_rate": 0.0003515625, |
|
"loss": 1.9, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.07092198581560284, |
|
"grad_norm": 0.43453335762023926, |
|
"learning_rate": 0.0003545673076923077, |
|
"loss": 1.5203, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.07152301959370116, |
|
"grad_norm": 0.28918376564979553, |
|
"learning_rate": 0.0003575721153846154, |
|
"loss": 1.9859, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.0721240533717995, |
|
"grad_norm": 0.4441574215888977, |
|
"learning_rate": 0.0003605769230769231, |
|
"loss": 1.7422, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0721240533717995, |
|
"eval_loss": 2.598828077316284, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2182, |
|
"eval_samples_per_second": 4.543, |
|
"eval_steps_per_second": 1.136, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07272508714989782, |
|
"grad_norm": 0.47054970264434814, |
|
"learning_rate": 0.0003635817307692308, |
|
"loss": 1.975, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.07332612092799615, |
|
"grad_norm": 0.4808221459388733, |
|
"learning_rate": 0.00036658653846153845, |
|
"loss": 2.0969, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.07392715470609448, |
|
"grad_norm": 0.40260159969329834, |
|
"learning_rate": 0.00036959134615384615, |
|
"loss": 2.0781, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.0745281884841928, |
|
"grad_norm": 0.3565881848335266, |
|
"learning_rate": 0.00037259615384615386, |
|
"loss": 1.8562, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.07512922226229114, |
|
"grad_norm": 0.41623455286026, |
|
"learning_rate": 0.00037560096153846156, |
|
"loss": 1.9688, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.07573025604038947, |
|
"grad_norm": 0.442056804895401, |
|
"learning_rate": 0.0003786057692307692, |
|
"loss": 2.0531, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0763312898184878, |
|
"grad_norm": 0.5474425554275513, |
|
"learning_rate": 0.0003816105769230769, |
|
"loss": 2.1391, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.07693232359658612, |
|
"grad_norm": 0.29002273082733154, |
|
"learning_rate": 0.00038461538461538467, |
|
"loss": 1.6906, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.07753335737468446, |
|
"grad_norm": 0.30469194054603577, |
|
"learning_rate": 0.0003876201923076923, |
|
"loss": 1.6859, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.07813439115278278, |
|
"grad_norm": 0.3932645618915558, |
|
"learning_rate": 0.000390625, |
|
"loss": 1.8328, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.07873542493088112, |
|
"grad_norm": 0.4049251079559326, |
|
"learning_rate": 0.0003936298076923077, |
|
"loss": 1.8672, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.07933645870897944, |
|
"grad_norm": 0.4889291524887085, |
|
"learning_rate": 0.0003966346153846154, |
|
"loss": 2.0531, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.07993749248707778, |
|
"grad_norm": 0.38475117087364197, |
|
"learning_rate": 0.0003996394230769231, |
|
"loss": 1.8422, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.0805385262651761, |
|
"grad_norm": 0.34599217772483826, |
|
"learning_rate": 0.0004026442307692308, |
|
"loss": 1.8391, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.08113956004327443, |
|
"grad_norm": 0.39600178599357605, |
|
"learning_rate": 0.00040564903846153843, |
|
"loss": 1.8484, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.08174059382137276, |
|
"grad_norm": 0.3293285071849823, |
|
"learning_rate": 0.00040865384615384613, |
|
"loss": 1.6656, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.08234162759947108, |
|
"grad_norm": 0.37310031056404114, |
|
"learning_rate": 0.00041165865384615384, |
|
"loss": 1.9609, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.08294266137756942, |
|
"grad_norm": 0.41512343287467957, |
|
"learning_rate": 0.0004146634615384616, |
|
"loss": 1.9937, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.08354369515566774, |
|
"grad_norm": 0.47950249910354614, |
|
"learning_rate": 0.00041766826923076924, |
|
"loss": 1.9109, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.08414472893376608, |
|
"grad_norm": 0.4324653744697571, |
|
"learning_rate": 0.00042067307692307695, |
|
"loss": 1.9953, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0847457627118644, |
|
"grad_norm": 0.3693973422050476, |
|
"learning_rate": 0.00042367788461538465, |
|
"loss": 1.9016, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.08534679648996274, |
|
"grad_norm": 0.33113107085227966, |
|
"learning_rate": 0.0004266826923076923, |
|
"loss": 2.2266, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.08594783026806106, |
|
"grad_norm": 0.5808571577072144, |
|
"learning_rate": 0.0004296875, |
|
"loss": 1.5063, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.0865488640461594, |
|
"grad_norm": 0.3792312443256378, |
|
"learning_rate": 0.0004326923076923077, |
|
"loss": 1.8016, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.08714989782425772, |
|
"grad_norm": 0.43698450922966003, |
|
"learning_rate": 0.00043569711538461535, |
|
"loss": 1.7219, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.08775093160235606, |
|
"grad_norm": 0.43264222145080566, |
|
"learning_rate": 0.00043870192307692306, |
|
"loss": 1.7234, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.08835196538045438, |
|
"grad_norm": 0.5246540307998657, |
|
"learning_rate": 0.0004417067307692308, |
|
"loss": 1.7531, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.08895299915855272, |
|
"grad_norm": 0.2953200936317444, |
|
"learning_rate": 0.00044471153846153846, |
|
"loss": 1.9438, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.08955403293665104, |
|
"grad_norm": 0.39238616824150085, |
|
"learning_rate": 0.00044771634615384617, |
|
"loss": 1.7172, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.09015506671474936, |
|
"grad_norm": 0.4887576401233673, |
|
"learning_rate": 0.00045072115384615387, |
|
"loss": 1.9594, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.0907561004928477, |
|
"grad_norm": 0.391634076833725, |
|
"learning_rate": 0.0004537259615384616, |
|
"loss": 1.8406, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.09135713427094602, |
|
"grad_norm": 0.4006985127925873, |
|
"learning_rate": 0.0004567307692307692, |
|
"loss": 1.7984, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.09195816804904436, |
|
"grad_norm": 0.3601657748222351, |
|
"learning_rate": 0.0004597355769230769, |
|
"loss": 1.9844, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.09255920182714268, |
|
"grad_norm": 0.5057326555252075, |
|
"learning_rate": 0.00046274038461538463, |
|
"loss": 1.6703, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.09316023560524102, |
|
"grad_norm": 0.5787122845649719, |
|
"learning_rate": 0.0004657451923076923, |
|
"loss": 1.8984, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.09376126938333934, |
|
"grad_norm": 0.4849441945552826, |
|
"learning_rate": 0.00046875, |
|
"loss": 1.8672, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.09436230316143768, |
|
"grad_norm": 0.44167378544807434, |
|
"learning_rate": 0.00047175480769230774, |
|
"loss": 1.6422, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.094963336939536, |
|
"grad_norm": 0.6295076608657837, |
|
"learning_rate": 0.0004747596153846154, |
|
"loss": 1.6875, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.09556437071763434, |
|
"grad_norm": 0.4804101586341858, |
|
"learning_rate": 0.0004777644230769231, |
|
"loss": 1.8203, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.09616540449573266, |
|
"grad_norm": 0.4898495674133301, |
|
"learning_rate": 0.0004807692307692308, |
|
"loss": 1.9891, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.09616540449573266, |
|
"eval_loss": 2.535351514816284, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1754, |
|
"eval_samples_per_second": 4.549, |
|
"eval_steps_per_second": 1.137, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.096766438273831, |
|
"grad_norm": 0.43688085675239563, |
|
"learning_rate": 0.00048377403846153845, |
|
"loss": 1.7234, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.09736747205192932, |
|
"grad_norm": 0.5891087651252747, |
|
"learning_rate": 0.00048677884615384615, |
|
"loss": 1.7969, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.09796850583002764, |
|
"grad_norm": 0.5140319466590881, |
|
"learning_rate": 0.0004897836538461539, |
|
"loss": 2.0719, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.09856953960812598, |
|
"grad_norm": 0.40886375308036804, |
|
"learning_rate": 0.0004927884615384616, |
|
"loss": 2.0891, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.0991705733862243, |
|
"grad_norm": 0.3513309955596924, |
|
"learning_rate": 0.0004957932692307692, |
|
"loss": 1.8453, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.09977160716432264, |
|
"grad_norm": 0.5530559420585632, |
|
"learning_rate": 0.0004987980769230769, |
|
"loss": 1.675, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.10037264094242096, |
|
"grad_norm": 0.4348265528678894, |
|
"learning_rate": 0.0004999999983630302, |
|
"loss": 1.7891, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.1009736747205193, |
|
"grad_norm": 0.5396342277526855, |
|
"learning_rate": 0.0004999999883593255, |
|
"loss": 1.9047, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.10157470849861762, |
|
"grad_norm": 0.5154384970664978, |
|
"learning_rate": 0.0004999999692613442, |
|
"loss": 1.8844, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.10217574227671596, |
|
"grad_norm": 0.29072120785713196, |
|
"learning_rate": 0.0004999999410690872, |
|
"loss": 1.6531, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.10277677605481428, |
|
"grad_norm": 0.4125816822052002, |
|
"learning_rate": 0.0004999999037825552, |
|
"loss": 1.9031, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.10337780983291261, |
|
"grad_norm": 0.34915369749069214, |
|
"learning_rate": 0.0004999998574017497, |
|
"loss": 1.8609, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.10397884361101094, |
|
"grad_norm": 0.3622804284095764, |
|
"learning_rate": 0.0004999998019266724, |
|
"loss": 1.7484, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.10457987738910927, |
|
"grad_norm": 0.36787149310112, |
|
"learning_rate": 0.0004999997373573254, |
|
"loss": 1.7812, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1051809111672076, |
|
"grad_norm": 0.4469545781612396, |
|
"learning_rate": 0.0004999996636937108, |
|
"loss": 1.5484, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.10578194494530592, |
|
"grad_norm": 0.30026400089263916, |
|
"learning_rate": 0.0004999995809358316, |
|
"loss": 1.6703, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"grad_norm": 0.4870736002922058, |
|
"learning_rate": 0.0004999994890836904, |
|
"loss": 1.7547, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.10698401250150258, |
|
"grad_norm": 0.6516287326812744, |
|
"learning_rate": 0.000499999388137291, |
|
"loss": 1.7891, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.10758504627960092, |
|
"grad_norm": 0.2974604368209839, |
|
"learning_rate": 0.0004999992780966368, |
|
"loss": 1.8359, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.10818608005769924, |
|
"grad_norm": 0.3521243929862976, |
|
"learning_rate": 0.0004999991589617318, |
|
"loss": 1.9141, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.10878711383579757, |
|
"grad_norm": 0.38353726267814636, |
|
"learning_rate": 0.0004999990307325803, |
|
"loss": 1.775, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.1093881476138959, |
|
"grad_norm": 0.46048542857170105, |
|
"learning_rate": 0.0004999988934091872, |
|
"loss": 1.7297, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.10998918139199423, |
|
"grad_norm": 0.4313719570636749, |
|
"learning_rate": 0.0004999987469915573, |
|
"loss": 1.2891, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.11059021517009256, |
|
"grad_norm": 0.5933486223220825, |
|
"learning_rate": 0.0004999985914796961, |
|
"loss": 1.6938, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.1111912489481909, |
|
"grad_norm": 0.5271236300468445, |
|
"learning_rate": 0.000499998426873609, |
|
"loss": 1.8, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.11179228272628922, |
|
"grad_norm": 0.3807511031627655, |
|
"learning_rate": 0.0004999982531733022, |
|
"loss": 1.3086, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.11239331650438755, |
|
"grad_norm": 0.4684934914112091, |
|
"learning_rate": 0.0004999980703787819, |
|
"loss": 1.4875, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.11299435028248588, |
|
"grad_norm": 0.5648980140686035, |
|
"learning_rate": 0.0004999978784900549, |
|
"loss": 1.6578, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.1135953840605842, |
|
"grad_norm": 0.4021349549293518, |
|
"learning_rate": 0.0004999976775071278, |
|
"loss": 1.8266, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.11419641783868253, |
|
"grad_norm": 0.3722395598888397, |
|
"learning_rate": 0.0004999974674300084, |
|
"loss": 1.8969, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.11479745161678086, |
|
"grad_norm": 0.407781720161438, |
|
"learning_rate": 0.000499997248258704, |
|
"loss": 1.6562, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.1153984853948792, |
|
"grad_norm": 0.44156748056411743, |
|
"learning_rate": 0.0004999970199932229, |
|
"loss": 2.0688, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.11599951917297752, |
|
"grad_norm": 0.40020808577537537, |
|
"learning_rate": 0.000499996782633573, |
|
"loss": 1.5047, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.11660055295107585, |
|
"grad_norm": 0.38710176944732666, |
|
"learning_rate": 0.0004999965361797633, |
|
"loss": 1.7367, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.11720158672917418, |
|
"grad_norm": 0.344836562871933, |
|
"learning_rate": 0.0004999962806318025, |
|
"loss": 1.7828, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.11780262050727251, |
|
"grad_norm": 0.3811284899711609, |
|
"learning_rate": 0.0004999960159897, |
|
"loss": 1.7766, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.11840365428537084, |
|
"grad_norm": 0.5141933560371399, |
|
"learning_rate": 0.0004999957422534654, |
|
"loss": 1.75, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.11900468806346917, |
|
"grad_norm": 0.37530529499053955, |
|
"learning_rate": 0.0004999954594231088, |
|
"loss": 2.0922, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.1196057218415675, |
|
"grad_norm": 0.41129302978515625, |
|
"learning_rate": 0.0004999951674986401, |
|
"loss": 1.5781, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.12020675561966583, |
|
"grad_norm": 0.3869934380054474, |
|
"learning_rate": 0.0004999948664800704, |
|
"loss": 1.7422, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12020675561966583, |
|
"eval_loss": 2.4908204078674316, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1997, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 1.136, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12080778939776415, |
|
"grad_norm": 0.36643335223197937, |
|
"learning_rate": 0.0004999945563674105, |
|
"loss": 1.6797, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.12140882317586249, |
|
"grad_norm": 0.45910894870758057, |
|
"learning_rate": 0.0004999942371606714, |
|
"loss": 1.7063, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.12200985695396081, |
|
"grad_norm": 0.350729763507843, |
|
"learning_rate": 0.0004999939088598652, |
|
"loss": 1.6344, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.12261089073205914, |
|
"grad_norm": 0.46493440866470337, |
|
"learning_rate": 0.0004999935714650034, |
|
"loss": 1.9641, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.12321192451015747, |
|
"grad_norm": 0.42726650834083557, |
|
"learning_rate": 0.0004999932249760984, |
|
"loss": 1.7094, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.1238129582882558, |
|
"grad_norm": 0.28014904260635376, |
|
"learning_rate": 0.000499992869393163, |
|
"loss": 1.8516, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.12441399206635413, |
|
"grad_norm": 0.4522114098072052, |
|
"learning_rate": 0.0004999925047162099, |
|
"loss": 1.3961, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.12501502584445245, |
|
"grad_norm": 0.46475955843925476, |
|
"learning_rate": 0.0004999921309452526, |
|
"loss": 1.4062, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.1256160596225508, |
|
"grad_norm": 0.44490954279899597, |
|
"learning_rate": 0.0004999917480803044, |
|
"loss": 1.6719, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.12621709340064913, |
|
"grad_norm": 0.40904587507247925, |
|
"learning_rate": 0.0004999913561213793, |
|
"loss": 1.7734, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.12681812717874744, |
|
"grad_norm": 0.36412525177001953, |
|
"learning_rate": 0.0004999909550684918, |
|
"loss": 1.2594, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.12741916095684577, |
|
"grad_norm": 0.7560976147651672, |
|
"learning_rate": 0.0004999905449216563, |
|
"loss": 1.6047, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.1280201947349441, |
|
"grad_norm": 0.5383388996124268, |
|
"learning_rate": 0.0004999901256808878, |
|
"loss": 1.6016, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.12862122851304245, |
|
"grad_norm": 0.5255587100982666, |
|
"learning_rate": 0.0004999896973462012, |
|
"loss": 1.7828, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.12922226229114075, |
|
"grad_norm": 0.4830612242221832, |
|
"learning_rate": 0.0004999892599176127, |
|
"loss": 1.8781, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.1298232960692391, |
|
"grad_norm": 0.3687385618686676, |
|
"learning_rate": 0.0004999888133951377, |
|
"loss": 1.4797, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.13042432984733743, |
|
"grad_norm": 0.3518010675907135, |
|
"learning_rate": 0.0004999883577787927, |
|
"loss": 1.7234, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.13102536362543574, |
|
"grad_norm": 0.4522668719291687, |
|
"learning_rate": 0.0004999878930685943, |
|
"loss": 1.675, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.13162639740353407, |
|
"grad_norm": 0.3153088390827179, |
|
"learning_rate": 0.0004999874192645592, |
|
"loss": 1.7328, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.1322274311816324, |
|
"grad_norm": 0.4520825147628784, |
|
"learning_rate": 0.0004999869363667048, |
|
"loss": 1.925, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.13282846495973075, |
|
"grad_norm": 0.3040079176425934, |
|
"learning_rate": 0.0004999864443750486, |
|
"loss": 1.6922, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.13342949873782906, |
|
"grad_norm": 0.5198135375976562, |
|
"learning_rate": 0.0004999859432896084, |
|
"loss": 1.6562, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.1340305325159274, |
|
"grad_norm": 0.30772989988327026, |
|
"learning_rate": 0.0004999854331104028, |
|
"loss": 1.8078, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.13463156629402573, |
|
"grad_norm": 0.39027324318885803, |
|
"learning_rate": 0.0004999849138374498, |
|
"loss": 1.625, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.13523260007212407, |
|
"grad_norm": 0.4438004195690155, |
|
"learning_rate": 0.0004999843854707688, |
|
"loss": 1.5414, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.13583363385022237, |
|
"grad_norm": 0.4966782033443451, |
|
"learning_rate": 0.0004999838480103787, |
|
"loss": 1.4836, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.1364346676283207, |
|
"grad_norm": 0.5602577328681946, |
|
"learning_rate": 0.0004999833014562992, |
|
"loss": 1.3961, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.13703570140641905, |
|
"grad_norm": 0.5276179909706116, |
|
"learning_rate": 0.0004999827458085502, |
|
"loss": 1.8422, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.13763673518451738, |
|
"grad_norm": 0.4706065058708191, |
|
"learning_rate": 0.0004999821810671518, |
|
"loss": 1.7109, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.1382377689626157, |
|
"grad_norm": 0.38341307640075684, |
|
"learning_rate": 0.0004999816072321245, |
|
"loss": 1.8859, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.13883880274071403, |
|
"grad_norm": 0.5754373073577881, |
|
"learning_rate": 0.0004999810243034894, |
|
"loss": 1.8, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.13943983651881237, |
|
"grad_norm": 0.5003094673156738, |
|
"learning_rate": 0.0004999804322812676, |
|
"loss": 1.6766, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.14004087029691067, |
|
"grad_norm": 0.31239280104637146, |
|
"learning_rate": 0.0004999798311654805, |
|
"loss": 1.775, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.140641904075009, |
|
"grad_norm": 0.3998953700065613, |
|
"learning_rate": 0.0004999792209561501, |
|
"loss": 1.7516, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.14124293785310735, |
|
"grad_norm": 0.3099336624145508, |
|
"learning_rate": 0.0004999786016532986, |
|
"loss": 1.8422, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.14184397163120568, |
|
"grad_norm": 0.48160257935523987, |
|
"learning_rate": 0.0004999779732569485, |
|
"loss": 1.6062, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.142445005409304, |
|
"grad_norm": 0.5494711399078369, |
|
"learning_rate": 0.0004999773357671227, |
|
"loss": 1.5906, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.14304603918740233, |
|
"grad_norm": 0.7721512913703918, |
|
"learning_rate": 0.0004999766891838444, |
|
"loss": 1.7734, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.14364707296550067, |
|
"grad_norm": 0.5135265588760376, |
|
"learning_rate": 0.000499976033507137, |
|
"loss": 1.4812, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.144248106743599, |
|
"grad_norm": 0.7913392186164856, |
|
"learning_rate": 0.0004999753687370245, |
|
"loss": 1.5484, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.144248106743599, |
|
"eval_loss": 2.5082030296325684, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2256, |
|
"eval_samples_per_second": 4.542, |
|
"eval_steps_per_second": 1.136, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1448491405216973, |
|
"grad_norm": 0.6069223880767822, |
|
"learning_rate": 0.0004999746948735308, |
|
"loss": 1.4484, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.14545017429979565, |
|
"grad_norm": 0.4137849807739258, |
|
"learning_rate": 0.0004999740119166809, |
|
"loss": 1.6719, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.14605120807789398, |
|
"grad_norm": 0.7047042846679688, |
|
"learning_rate": 0.0004999733198664992, |
|
"loss": 1.5312, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.1466522418559923, |
|
"grad_norm": 0.5389900207519531, |
|
"learning_rate": 0.0004999726187230111, |
|
"loss": 1.4297, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.14725327563409063, |
|
"grad_norm": 0.5395992994308472, |
|
"learning_rate": 0.0004999719084862421, |
|
"loss": 1.6328, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.14785430941218897, |
|
"grad_norm": 0.43566471338272095, |
|
"learning_rate": 0.0004999711891562179, |
|
"loss": 1.7094, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.1484553431902873, |
|
"grad_norm": 0.3409474194049835, |
|
"learning_rate": 0.0004999704607329648, |
|
"loss": 1.6656, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.1490563769683856, |
|
"grad_norm": 0.5498088002204895, |
|
"learning_rate": 0.0004999697232165092, |
|
"loss": 1.6016, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.14965741074648395, |
|
"grad_norm": 0.567551851272583, |
|
"learning_rate": 0.000499968976606878, |
|
"loss": 1.6828, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.15025844452458229, |
|
"grad_norm": 0.4866923987865448, |
|
"learning_rate": 0.0004999682209040983, |
|
"loss": 1.6547, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.15085947830268062, |
|
"grad_norm": 0.3780736029148102, |
|
"learning_rate": 0.0004999674561081977, |
|
"loss": 1.6719, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.15146051208077893, |
|
"grad_norm": 0.3219822347164154, |
|
"learning_rate": 0.0004999666822192039, |
|
"loss": 1.4195, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.15206154585887727, |
|
"grad_norm": 0.3056913912296295, |
|
"learning_rate": 0.0004999658992371451, |
|
"loss": 1.7484, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.1526625796369756, |
|
"grad_norm": 0.4860096573829651, |
|
"learning_rate": 0.0004999651071620499, |
|
"loss": 1.6516, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.15326361341507394, |
|
"grad_norm": 0.4047755002975464, |
|
"learning_rate": 0.0004999643059939469, |
|
"loss": 1.6984, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.15386464719317225, |
|
"grad_norm": 0.27880361676216125, |
|
"learning_rate": 0.0004999634957328652, |
|
"loss": 1.8078, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.15446568097127059, |
|
"grad_norm": 0.4087715148925781, |
|
"learning_rate": 0.0004999626763788346, |
|
"loss": 1.6422, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.15506671474936892, |
|
"grad_norm": 0.556612491607666, |
|
"learning_rate": 0.0004999618479318847, |
|
"loss": 1.5359, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.15566774852746723, |
|
"grad_norm": 0.5415599346160889, |
|
"learning_rate": 0.0004999610103920457, |
|
"loss": 1.5641, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.15626878230556557, |
|
"grad_norm": 0.48660141229629517, |
|
"learning_rate": 0.0004999601637593479, |
|
"loss": 1.5, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1568698160836639, |
|
"grad_norm": 0.5874481797218323, |
|
"learning_rate": 0.0004999593080338224, |
|
"loss": 1.3844, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.15747084986176224, |
|
"grad_norm": 0.3727753460407257, |
|
"learning_rate": 0.0004999584432155, |
|
"loss": 1.8125, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.15807188363986055, |
|
"grad_norm": 0.35395169258117676, |
|
"learning_rate": 0.0004999575693044124, |
|
"loss": 1.3305, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.1586729174179589, |
|
"grad_norm": 0.7356476783752441, |
|
"learning_rate": 0.0004999566863005912, |
|
"loss": 1.7078, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.15927395119605722, |
|
"grad_norm": 0.4838991165161133, |
|
"learning_rate": 0.0004999557942040687, |
|
"loss": 1.5969, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.15987498497415556, |
|
"grad_norm": 0.4504292905330658, |
|
"learning_rate": 0.0004999548930148773, |
|
"loss": 1.4555, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.16047601875225387, |
|
"grad_norm": 0.5174041390419006, |
|
"learning_rate": 0.0004999539827330497, |
|
"loss": 1.5266, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.1610770525303522, |
|
"grad_norm": 0.42709511518478394, |
|
"learning_rate": 0.000499953063358619, |
|
"loss": 1.4344, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.16167808630845054, |
|
"grad_norm": 0.34575751423835754, |
|
"learning_rate": 0.0004999521348916189, |
|
"loss": 1.5219, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.16227912008654885, |
|
"grad_norm": 0.4409041404724121, |
|
"learning_rate": 0.0004999511973320829, |
|
"loss": 1.6172, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.1628801538646472, |
|
"grad_norm": 0.37874963879585266, |
|
"learning_rate": 0.0004999502506800452, |
|
"loss": 1.3156, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.16348118764274552, |
|
"grad_norm": 0.39675143361091614, |
|
"learning_rate": 0.0004999492949355401, |
|
"loss": 1.7672, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.16408222142084386, |
|
"grad_norm": 0.4887191951274872, |
|
"learning_rate": 0.0004999483300986027, |
|
"loss": 1.6578, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.16468325519894217, |
|
"grad_norm": 0.5052289366722107, |
|
"learning_rate": 0.000499947356169268, |
|
"loss": 1.5766, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.1652842889770405, |
|
"grad_norm": 0.3420865833759308, |
|
"learning_rate": 0.000499946373147571, |
|
"loss": 1.4281, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.16588532275513884, |
|
"grad_norm": 0.6112978458404541, |
|
"learning_rate": 0.0004999453810335479, |
|
"loss": 1.4234, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.16648635653323718, |
|
"grad_norm": 0.46144208312034607, |
|
"learning_rate": 0.0004999443798272348, |
|
"loss": 1.4609, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.1670873903113355, |
|
"grad_norm": 0.5132108926773071, |
|
"learning_rate": 0.000499943369528668, |
|
"loss": 1.5656, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.16768842408943382, |
|
"grad_norm": 0.5717546939849854, |
|
"learning_rate": 0.000499942350137884, |
|
"loss": 1.3617, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.16828945786753216, |
|
"grad_norm": 0.4766351580619812, |
|
"learning_rate": 0.0004999413216549203, |
|
"loss": 1.5016, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.16828945786753216, |
|
"eval_loss": 2.3990235328674316, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.19, |
|
"eval_samples_per_second": 4.547, |
|
"eval_steps_per_second": 1.137, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.1688904916456305, |
|
"grad_norm": 0.43601974844932556, |
|
"learning_rate": 0.0004999402840798142, |
|
"loss": 1.4156, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.1694915254237288, |
|
"grad_norm": 0.6105599403381348, |
|
"learning_rate": 0.0004999392374126034, |
|
"loss": 1.7, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.17009255920182714, |
|
"grad_norm": 0.4957026243209839, |
|
"learning_rate": 0.0004999381816533259, |
|
"loss": 1.7969, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.17069359297992548, |
|
"grad_norm": 0.44195666909217834, |
|
"learning_rate": 0.0004999371168020201, |
|
"loss": 1.4375, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.1712946267580238, |
|
"grad_norm": 0.45855048298835754, |
|
"learning_rate": 0.0004999360428587249, |
|
"loss": 1.6141, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.17189566053612212, |
|
"grad_norm": 0.6269901990890503, |
|
"learning_rate": 0.0004999349598234792, |
|
"loss": 1.3953, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.17249669431422046, |
|
"grad_norm": 0.3805680274963379, |
|
"learning_rate": 0.0004999338676963225, |
|
"loss": 1.6484, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.1730977280923188, |
|
"grad_norm": 0.6604627966880798, |
|
"learning_rate": 0.0004999327664772945, |
|
"loss": 1.5969, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.1736987618704171, |
|
"grad_norm": 0.4411623179912567, |
|
"learning_rate": 0.0004999316561664353, |
|
"loss": 1.2609, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.17429979564851544, |
|
"grad_norm": 0.5301747918128967, |
|
"learning_rate": 0.0004999305367637852, |
|
"loss": 1.6141, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.17490082942661378, |
|
"grad_norm": 0.5128594040870667, |
|
"learning_rate": 0.000499929408269385, |
|
"loss": 1.6187, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.17550186320471212, |
|
"grad_norm": 0.596217155456543, |
|
"learning_rate": 0.0004999282706832758, |
|
"loss": 1.4531, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.17610289698281043, |
|
"grad_norm": 0.45486292243003845, |
|
"learning_rate": 0.0004999271240054987, |
|
"loss": 1.4012, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.17670393076090876, |
|
"grad_norm": 0.6031058430671692, |
|
"learning_rate": 0.0004999259682360957, |
|
"loss": 1.6203, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.1773049645390071, |
|
"grad_norm": 0.4107096493244171, |
|
"learning_rate": 0.0004999248033751088, |
|
"loss": 1.7312, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.17790599831710543, |
|
"grad_norm": 0.46700888872146606, |
|
"learning_rate": 0.0004999236294225803, |
|
"loss": 1.5234, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.17850703209520374, |
|
"grad_norm": 0.7690737247467041, |
|
"learning_rate": 0.000499922446378553, |
|
"loss": 1.307, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.17910806587330208, |
|
"grad_norm": 0.5420579314231873, |
|
"learning_rate": 0.0004999212542430698, |
|
"loss": 1.6562, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.17970909965140042, |
|
"grad_norm": 0.4624311625957489, |
|
"learning_rate": 0.0004999200530161742, |
|
"loss": 1.3234, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.18031013342949873, |
|
"grad_norm": 0.4610016345977783, |
|
"learning_rate": 0.0004999188426979097, |
|
"loss": 1.5516, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.18091116720759706, |
|
"grad_norm": 0.5131213068962097, |
|
"learning_rate": 0.0004999176232883206, |
|
"loss": 1.5867, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.1815122009856954, |
|
"grad_norm": 0.5673689842224121, |
|
"learning_rate": 0.0004999163947874511, |
|
"loss": 1.5078, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.18211323476379374, |
|
"grad_norm": 0.7008316516876221, |
|
"learning_rate": 0.000499915157195346, |
|
"loss": 1.5562, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.18271426854189204, |
|
"grad_norm": 0.5652767419815063, |
|
"learning_rate": 0.00049991391051205, |
|
"loss": 1.4469, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.18331530231999038, |
|
"grad_norm": 0.5506184101104736, |
|
"learning_rate": 0.0004999126547376089, |
|
"loss": 1.4531, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.18391633609808872, |
|
"grad_norm": 0.5806117057800293, |
|
"learning_rate": 0.000499911389872068, |
|
"loss": 1.7594, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.18451736987618705, |
|
"grad_norm": 0.5860136151313782, |
|
"learning_rate": 0.0004999101159154736, |
|
"loss": 1.5562, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.18511840365428536, |
|
"grad_norm": 0.5575783252716064, |
|
"learning_rate": 0.000499908832867872, |
|
"loss": 1.6391, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.1857194374323837, |
|
"grad_norm": 0.3992920219898224, |
|
"learning_rate": 0.0004999075407293096, |
|
"loss": 1.3859, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.18632047121048204, |
|
"grad_norm": 0.8294938206672668, |
|
"learning_rate": 0.0004999062394998336, |
|
"loss": 1.25, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.18692150498858034, |
|
"grad_norm": 0.6560512185096741, |
|
"learning_rate": 0.0004999049291794915, |
|
"loss": 1.4453, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.18752253876667868, |
|
"grad_norm": 0.5583436489105225, |
|
"learning_rate": 0.0004999036097683307, |
|
"loss": 1.3969, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.18812357254477702, |
|
"grad_norm": 0.6256234645843506, |
|
"learning_rate": 0.0004999022812663993, |
|
"loss": 1.518, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.18872460632287535, |
|
"grad_norm": 0.5769176483154297, |
|
"learning_rate": 0.0004999009436737457, |
|
"loss": 1.6609, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.18932564010097366, |
|
"grad_norm": 0.6486324071884155, |
|
"learning_rate": 0.0004998995969904183, |
|
"loss": 1.3172, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.189926673879072, |
|
"grad_norm": 0.34935474395751953, |
|
"learning_rate": 0.0004998982412164663, |
|
"loss": 1.5562, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.19052770765717034, |
|
"grad_norm": 0.5806995630264282, |
|
"learning_rate": 0.000499896876351939, |
|
"loss": 1.6219, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.19112874143526867, |
|
"grad_norm": 0.6906558275222778, |
|
"learning_rate": 0.0004998955023968862, |
|
"loss": 1.5172, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.19172977521336698, |
|
"grad_norm": 0.49730750918388367, |
|
"learning_rate": 0.0004998941193513575, |
|
"loss": 1.6797, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.19233080899146532, |
|
"grad_norm": 0.5871158242225647, |
|
"learning_rate": 0.0004998927272154036, |
|
"loss": 1.6125, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.19233080899146532, |
|
"eval_loss": 2.360156297683716, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1647, |
|
"eval_samples_per_second": 4.55, |
|
"eval_steps_per_second": 1.138, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.19293184276956366, |
|
"grad_norm": 0.3994157910346985, |
|
"learning_rate": 0.000499891325989075, |
|
"loss": 1.3305, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.193532876547662, |
|
"grad_norm": 0.3497280180454254, |
|
"learning_rate": 0.0004998899156724224, |
|
"loss": 1.3531, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.1941339103257603, |
|
"grad_norm": 0.4835513234138489, |
|
"learning_rate": 0.0004998884962654976, |
|
"loss": 1.293, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.19473494410385864, |
|
"grad_norm": 0.4717245101928711, |
|
"learning_rate": 0.0004998870677683519, |
|
"loss": 1.3742, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.19533597788195697, |
|
"grad_norm": 0.3917827308177948, |
|
"learning_rate": 0.0004998856301810373, |
|
"loss": 1.5719, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.19593701166005528, |
|
"grad_norm": 0.4725429117679596, |
|
"learning_rate": 0.0004998841835036061, |
|
"loss": 1.3859, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.19653804543815362, |
|
"grad_norm": 0.4728795289993286, |
|
"learning_rate": 0.0004998827277361111, |
|
"loss": 1.4203, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.19713907921625196, |
|
"grad_norm": 0.6246328949928284, |
|
"learning_rate": 0.000499881262878605, |
|
"loss": 1.7719, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.1977401129943503, |
|
"grad_norm": 0.7019891738891602, |
|
"learning_rate": 0.0004998797889311413, |
|
"loss": 1.3781, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.1983411467724486, |
|
"grad_norm": 0.2940036654472351, |
|
"learning_rate": 0.0004998783058937735, |
|
"loss": 1.4148, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.19894218055054694, |
|
"grad_norm": 0.434410959482193, |
|
"learning_rate": 0.0004998768137665556, |
|
"loss": 1.6094, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.19954321432864527, |
|
"grad_norm": 0.5853382349014282, |
|
"learning_rate": 0.0004998753125495418, |
|
"loss": 1.4125, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.2001442481067436, |
|
"grad_norm": 0.5105974078178406, |
|
"learning_rate": 0.0004998738022427867, |
|
"loss": 1.3313, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.20074528188484192, |
|
"grad_norm": 0.4266336262226105, |
|
"learning_rate": 0.0004998722828463455, |
|
"loss": 1.5953, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.20134631566294026, |
|
"grad_norm": 0.4918626844882965, |
|
"learning_rate": 0.0004998707543602731, |
|
"loss": 1.8383, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.2019473494410386, |
|
"grad_norm": 0.4804850220680237, |
|
"learning_rate": 0.0004998692167846253, |
|
"loss": 1.1484, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.20254838321913693, |
|
"grad_norm": 0.5131824612617493, |
|
"learning_rate": 0.0004998676701194581, |
|
"loss": 1.7109, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.20314941699723524, |
|
"grad_norm": 0.4895535111427307, |
|
"learning_rate": 0.0004998661143648277, |
|
"loss": 1.7453, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.20375045077533357, |
|
"grad_norm": 0.4180288314819336, |
|
"learning_rate": 0.0004998645495207906, |
|
"loss": 1.0766, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.2043514845534319, |
|
"grad_norm": 0.4888496696949005, |
|
"learning_rate": 0.0004998629755874037, |
|
"loss": 1.5359, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.20495251833153022, |
|
"grad_norm": 0.666147768497467, |
|
"learning_rate": 0.0004998613925647245, |
|
"loss": 1.5609, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.20555355210962856, |
|
"grad_norm": 0.563382625579834, |
|
"learning_rate": 0.0004998598004528103, |
|
"loss": 1.4187, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.2061545858877269, |
|
"grad_norm": 0.619296669960022, |
|
"learning_rate": 0.0004998581992517192, |
|
"loss": 1.3367, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.20675561966582523, |
|
"grad_norm": 0.928014874458313, |
|
"learning_rate": 0.0004998565889615096, |
|
"loss": 1.4094, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.20735665344392354, |
|
"grad_norm": 0.4932372272014618, |
|
"learning_rate": 0.0004998549695822397, |
|
"loss": 1.3719, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.20795768722202188, |
|
"grad_norm": 0.6022034287452698, |
|
"learning_rate": 0.0004998533411139685, |
|
"loss": 1.5781, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.2085587210001202, |
|
"grad_norm": 0.41716283559799194, |
|
"learning_rate": 0.0004998517035567554, |
|
"loss": 1.1914, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.20915975477821855, |
|
"grad_norm": 0.4988159239292145, |
|
"learning_rate": 0.0004998500569106599, |
|
"loss": 1.475, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.20976078855631686, |
|
"grad_norm": 0.4242478907108307, |
|
"learning_rate": 0.0004998484011757419, |
|
"loss": 1.2859, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.2103618223344152, |
|
"grad_norm": 0.5382992625236511, |
|
"learning_rate": 0.0004998467363520617, |
|
"loss": 1.3687, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.21096285611251353, |
|
"grad_norm": 0.31303003430366516, |
|
"learning_rate": 0.0004998450624396797, |
|
"loss": 1.9281, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.21156388989061184, |
|
"grad_norm": 0.5793948173522949, |
|
"learning_rate": 0.0004998433794386569, |
|
"loss": 1.457, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.21216492366871018, |
|
"grad_norm": 0.48824676871299744, |
|
"learning_rate": 0.0004998416873490544, |
|
"loss": 1.5359, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 0.5384695529937744, |
|
"learning_rate": 0.000499839986170934, |
|
"loss": 1.4461, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.21336699122490685, |
|
"grad_norm": 0.5212387442588806, |
|
"learning_rate": 0.0004998382759043574, |
|
"loss": 1.2844, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.21396802500300516, |
|
"grad_norm": 0.5552918910980225, |
|
"learning_rate": 0.0004998365565493868, |
|
"loss": 1.5516, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.2145690587811035, |
|
"grad_norm": 0.5672168135643005, |
|
"learning_rate": 0.0004998348281060848, |
|
"loss": 1.6297, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.21517009255920183, |
|
"grad_norm": 0.620464026927948, |
|
"learning_rate": 0.0004998330905745143, |
|
"loss": 1.5047, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.21577112633730017, |
|
"grad_norm": 0.5900077819824219, |
|
"learning_rate": 0.0004998313439547384, |
|
"loss": 1.2367, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.21637216011539848, |
|
"grad_norm": 0.5305217504501343, |
|
"learning_rate": 0.0004998295882468209, |
|
"loss": 1.5906, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.21637216011539848, |
|
"eval_loss": 2.288867235183716, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2003, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 1.136, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2169731938934968, |
|
"grad_norm": 0.5836020112037659, |
|
"learning_rate": 0.0004998278234508253, |
|
"loss": 1.4891, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.21757422767159515, |
|
"grad_norm": 0.3793884813785553, |
|
"learning_rate": 0.0004998260495668161, |
|
"loss": 1.3328, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.21817526144969349, |
|
"grad_norm": 0.5394117832183838, |
|
"learning_rate": 0.0004998242665948577, |
|
"loss": 1.368, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.2187762952277918, |
|
"grad_norm": 0.39613473415374756, |
|
"learning_rate": 0.0004998224745350148, |
|
"loss": 1.2285, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.21937732900589013, |
|
"grad_norm": 0.543116569519043, |
|
"learning_rate": 0.0004998206733873529, |
|
"loss": 1.5078, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.21997836278398847, |
|
"grad_norm": 0.4901551306247711, |
|
"learning_rate": 0.0004998188631519375, |
|
"loss": 1.4516, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.22057939656208678, |
|
"grad_norm": 0.5067916512489319, |
|
"learning_rate": 0.0004998170438288342, |
|
"loss": 1.5719, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.2211804303401851, |
|
"grad_norm": 0.4343029856681824, |
|
"learning_rate": 0.0004998152154181093, |
|
"loss": 1.3766, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.22178146411828345, |
|
"grad_norm": 0.5296164155006409, |
|
"learning_rate": 0.0004998133779198293, |
|
"loss": 1.3625, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.2223824978963818, |
|
"grad_norm": 0.4429774284362793, |
|
"learning_rate": 0.0004998115313340611, |
|
"loss": 1.3891, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.2229835316744801, |
|
"grad_norm": 0.5772582292556763, |
|
"learning_rate": 0.0004998096756608719, |
|
"loss": 1.5437, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.22358456545257843, |
|
"grad_norm": 0.5951064825057983, |
|
"learning_rate": 0.0004998078109003291, |
|
"loss": 1.4672, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.22418559923067677, |
|
"grad_norm": 0.3261686861515045, |
|
"learning_rate": 0.0004998059370525006, |
|
"loss": 1.5063, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.2247866330087751, |
|
"grad_norm": 0.3098331689834595, |
|
"learning_rate": 0.0004998040541174545, |
|
"loss": 1.5094, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.22538766678687341, |
|
"grad_norm": 0.8590214252471924, |
|
"learning_rate": 0.0004998021620952593, |
|
"loss": 1.3977, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.22598870056497175, |
|
"grad_norm": 0.5078855752944946, |
|
"learning_rate": 0.0004998002609859839, |
|
"loss": 1.2789, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.2265897343430701, |
|
"grad_norm": 0.4515461027622223, |
|
"learning_rate": 0.0004997983507896976, |
|
"loss": 1.368, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.2271907681211684, |
|
"grad_norm": 0.4937264025211334, |
|
"learning_rate": 0.0004997964315064695, |
|
"loss": 1.1953, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.22779180189926673, |
|
"grad_norm": 0.6028769612312317, |
|
"learning_rate": 0.0004997945031363697, |
|
"loss": 1.4859, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.22839283567736507, |
|
"grad_norm": 0.4746128022670746, |
|
"learning_rate": 0.0004997925656794683, |
|
"loss": 1.6016, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.2289938694554634, |
|
"grad_norm": 0.519091010093689, |
|
"learning_rate": 0.0004997906191358358, |
|
"loss": 1.3906, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.22959490323356171, |
|
"grad_norm": 0.4584903419017792, |
|
"learning_rate": 0.0004997886635055429, |
|
"loss": 1.3258, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.23019593701166005, |
|
"grad_norm": 0.7446622252464294, |
|
"learning_rate": 0.0004997866987886608, |
|
"loss": 1.2141, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.2307969707897584, |
|
"grad_norm": 0.5405495166778564, |
|
"learning_rate": 0.0004997847249852609, |
|
"loss": 1.4359, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.23139800456785672, |
|
"grad_norm": 0.38187775015830994, |
|
"learning_rate": 0.0004997827420954152, |
|
"loss": 1.7219, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.23199903834595503, |
|
"grad_norm": 0.503364622592926, |
|
"learning_rate": 0.0004997807501191957, |
|
"loss": 1.3586, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.23260007212405337, |
|
"grad_norm": 0.43855008482933044, |
|
"learning_rate": 0.0004997787490566749, |
|
"loss": 1.7625, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.2332011059021517, |
|
"grad_norm": 0.4955185651779175, |
|
"learning_rate": 0.0004997767389079255, |
|
"loss": 1.2281, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.23380213968025004, |
|
"grad_norm": 0.7726651430130005, |
|
"learning_rate": 0.0004997747196730206, |
|
"loss": 1.5445, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.23440317345834835, |
|
"grad_norm": 0.38199684023857117, |
|
"learning_rate": 0.000499772691352034, |
|
"loss": 1.4203, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.2350042072364467, |
|
"grad_norm": 0.4838792383670807, |
|
"learning_rate": 0.000499770653945039, |
|
"loss": 1.2484, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.23560524101454502, |
|
"grad_norm": 0.43874993920326233, |
|
"learning_rate": 0.00049976860745211, |
|
"loss": 1.3594, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.23620627479264333, |
|
"grad_norm": 0.4992177188396454, |
|
"learning_rate": 0.0004997665518733215, |
|
"loss": 1.1977, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.23680730857074167, |
|
"grad_norm": 0.526907742023468, |
|
"learning_rate": 0.000499764487208748, |
|
"loss": 1.1609, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.23740834234884, |
|
"grad_norm": 0.599902868270874, |
|
"learning_rate": 0.000499762413458465, |
|
"loss": 1.4203, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.23800937612693834, |
|
"grad_norm": 0.42601317167282104, |
|
"learning_rate": 0.0004997603306225475, |
|
"loss": 1.1516, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.23861040990503665, |
|
"grad_norm": 0.3787403404712677, |
|
"learning_rate": 0.0004997582387010715, |
|
"loss": 1.3391, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.239211443683135, |
|
"grad_norm": 0.5586139559745789, |
|
"learning_rate": 0.0004997561376941131, |
|
"loss": 1.6656, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.23981247746123333, |
|
"grad_norm": 0.44761109352111816, |
|
"learning_rate": 0.0004997540276017487, |
|
"loss": 1.5828, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.24041351123933166, |
|
"grad_norm": 0.4291538894176483, |
|
"learning_rate": 0.000499751908424055, |
|
"loss": 1.4539, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.24041351123933166, |
|
"eval_loss": 2.2626953125, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2012, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 1.136, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.24101454501742997, |
|
"grad_norm": 0.46680477261543274, |
|
"learning_rate": 0.0004997497801611093, |
|
"loss": 1.2609, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.2416155787955283, |
|
"grad_norm": 0.42086416482925415, |
|
"learning_rate": 0.0004997476428129887, |
|
"loss": 1.1609, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.24221661257362664, |
|
"grad_norm": 0.7524279952049255, |
|
"learning_rate": 0.0004997454963797713, |
|
"loss": 1.0633, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.24281764635172498, |
|
"grad_norm": 0.43722498416900635, |
|
"learning_rate": 0.0004997433408615349, |
|
"loss": 1.2969, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.2434186801298233, |
|
"grad_norm": 0.2848932147026062, |
|
"learning_rate": 0.0004997411762583581, |
|
"loss": 1.2063, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.24401971390792163, |
|
"grad_norm": 0.4349381923675537, |
|
"learning_rate": 0.0004997390025703194, |
|
"loss": 1.3625, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.24462074768601996, |
|
"grad_norm": 0.4666562080383301, |
|
"learning_rate": 0.0004997368197974982, |
|
"loss": 1.4164, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.24522178146411827, |
|
"grad_norm": 0.5730391144752502, |
|
"learning_rate": 0.0004997346279399736, |
|
"loss": 1.1633, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.2458228152422166, |
|
"grad_norm": 0.5395126938819885, |
|
"learning_rate": 0.0004997324269978255, |
|
"loss": 1.2398, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.24642384902031494, |
|
"grad_norm": 0.3828608989715576, |
|
"learning_rate": 0.000499730216971134, |
|
"loss": 1.0594, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.24702488279841328, |
|
"grad_norm": 0.796903133392334, |
|
"learning_rate": 0.0004997279978599794, |
|
"loss": 1.3055, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.2476259165765116, |
|
"grad_norm": 0.35091638565063477, |
|
"learning_rate": 0.0004997257696644424, |
|
"loss": 1.2023, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.24822695035460993, |
|
"grad_norm": 0.46753543615341187, |
|
"learning_rate": 0.000499723532384604, |
|
"loss": 1.3203, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.24882798413270826, |
|
"grad_norm": 0.5231248736381531, |
|
"learning_rate": 0.0004997212860205459, |
|
"loss": 1.3438, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.2494290179108066, |
|
"grad_norm": 0.470639169216156, |
|
"learning_rate": 0.0004997190305723495, |
|
"loss": 1.3031, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.2500300516889049, |
|
"grad_norm": 0.4669177234172821, |
|
"learning_rate": 0.000499716766040097, |
|
"loss": 1.5188, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.25063108546700325, |
|
"grad_norm": 0.5113319754600525, |
|
"learning_rate": 0.0004997144924238706, |
|
"loss": 1.0992, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.2512321192451016, |
|
"grad_norm": 0.5395264625549316, |
|
"learning_rate": 0.0004997122097237533, |
|
"loss": 1.3281, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.2518331530231999, |
|
"grad_norm": 0.47676244378089905, |
|
"learning_rate": 0.0004997099179398279, |
|
"loss": 1.2898, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.25243418680129825, |
|
"grad_norm": 0.3385642468929291, |
|
"learning_rate": 0.0004997076170721778, |
|
"loss": 1.2078, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.25303522057939654, |
|
"grad_norm": 0.3868078887462616, |
|
"learning_rate": 0.0004997053071208868, |
|
"loss": 1.4563, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.2536362543574949, |
|
"grad_norm": 0.437321275472641, |
|
"learning_rate": 0.0004997029880860389, |
|
"loss": 1.3977, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.2542372881355932, |
|
"grad_norm": 0.6515981554985046, |
|
"learning_rate": 0.0004997006599677183, |
|
"loss": 1.2461, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.25483832191369155, |
|
"grad_norm": 0.3654949367046356, |
|
"learning_rate": 0.0004996983227660099, |
|
"loss": 1.4187, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.2554393556917899, |
|
"grad_norm": 0.5203860998153687, |
|
"learning_rate": 0.0004996959764809987, |
|
"loss": 1.4328, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.2560403894698882, |
|
"grad_norm": 0.5454062223434448, |
|
"learning_rate": 0.00049969362111277, |
|
"loss": 1.5125, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.25664142324798656, |
|
"grad_norm": 0.5460174679756165, |
|
"learning_rate": 0.0004996912566614094, |
|
"loss": 1.4344, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.2572424570260849, |
|
"grad_norm": 0.4798714816570282, |
|
"learning_rate": 0.000499688883127003, |
|
"loss": 1.1953, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.2578434908041832, |
|
"grad_norm": 0.679547905921936, |
|
"learning_rate": 0.0004996865005096372, |
|
"loss": 1.2688, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.2584445245822815, |
|
"grad_norm": 0.42334336042404175, |
|
"learning_rate": 0.0004996841088093985, |
|
"loss": 1.1516, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.25904555836037985, |
|
"grad_norm": 0.4171724021434784, |
|
"learning_rate": 0.000499681708026374, |
|
"loss": 1.0961, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.2596465921384782, |
|
"grad_norm": 0.6091195940971375, |
|
"learning_rate": 0.0004996792981606511, |
|
"loss": 1.0164, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.2602476259165765, |
|
"grad_norm": 0.7312507033348083, |
|
"learning_rate": 0.0004996768792123173, |
|
"loss": 1.3031, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.26084865969467486, |
|
"grad_norm": 0.8120207786560059, |
|
"learning_rate": 0.0004996744511814609, |
|
"loss": 1.1641, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.2614496934727732, |
|
"grad_norm": 0.4702399969100952, |
|
"learning_rate": 0.0004996720140681699, |
|
"loss": 1.2805, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.2620507272508715, |
|
"grad_norm": 0.45239925384521484, |
|
"learning_rate": 0.0004996695678725331, |
|
"loss": 1.5539, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.2626517610289698, |
|
"grad_norm": 0.6370692253112793, |
|
"learning_rate": 0.0004996671125946394, |
|
"loss": 1.2156, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.26325279480706815, |
|
"grad_norm": 0.6115698218345642, |
|
"learning_rate": 0.0004996646482345781, |
|
"loss": 1.2891, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.2638538285851665, |
|
"grad_norm": 0.611488401889801, |
|
"learning_rate": 0.0004996621747924391, |
|
"loss": 1.3023, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.2644548623632648, |
|
"grad_norm": 0.6977550983428955, |
|
"learning_rate": 0.0004996596922683122, |
|
"loss": 1.3555, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.2644548623632648, |
|
"eval_loss": 2.2613282203674316, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2156, |
|
"eval_samples_per_second": 4.543, |
|
"eval_steps_per_second": 1.136, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.26505589614136316, |
|
"grad_norm": 0.6270340085029602, |
|
"learning_rate": 0.0004996572006622876, |
|
"loss": 1.5938, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.2656569299194615, |
|
"grad_norm": 0.5670061707496643, |
|
"learning_rate": 0.0004996546999744561, |
|
"loss": 1.5016, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.26625796369755983, |
|
"grad_norm": 0.38163918256759644, |
|
"learning_rate": 0.0004996521902049086, |
|
"loss": 1.2812, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.2668589974756581, |
|
"grad_norm": 0.45828545093536377, |
|
"learning_rate": 0.0004996496713537365, |
|
"loss": 1.3023, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.26746003125375645, |
|
"grad_norm": 0.4318217933177948, |
|
"learning_rate": 0.0004996471434210312, |
|
"loss": 1.6039, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.2680610650318548, |
|
"grad_norm": 0.5099067091941833, |
|
"learning_rate": 0.0004996446064068848, |
|
"loss": 1.5562, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.2686620988099531, |
|
"grad_norm": 0.7253368496894836, |
|
"learning_rate": 0.0004996420603113897, |
|
"loss": 1.2523, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.26926313258805146, |
|
"grad_norm": 0.6101372838020325, |
|
"learning_rate": 0.0004996395051346384, |
|
"loss": 1.4125, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.2698641663661498, |
|
"grad_norm": 0.5073166489601135, |
|
"learning_rate": 0.0004996369408767238, |
|
"loss": 1.1109, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.27046520014424813, |
|
"grad_norm": 0.4978417456150055, |
|
"learning_rate": 0.0004996343675377393, |
|
"loss": 1.3438, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.2710662339223464, |
|
"grad_norm": 0.695686936378479, |
|
"learning_rate": 0.0004996317851177784, |
|
"loss": 1.0445, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.27166726770044475, |
|
"grad_norm": 0.5276048183441162, |
|
"learning_rate": 0.000499629193616935, |
|
"loss": 1.2703, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.2722683014785431, |
|
"grad_norm": 0.7686821222305298, |
|
"learning_rate": 0.0004996265930353036, |
|
"loss": 1.2656, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.2728693352566414, |
|
"grad_norm": 0.673497200012207, |
|
"learning_rate": 0.0004996239833729786, |
|
"loss": 1.4055, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.27347036903473976, |
|
"grad_norm": 0.4770069122314453, |
|
"learning_rate": 0.000499621364630055, |
|
"loss": 1.1227, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.2740714028128381, |
|
"grad_norm": 0.630565881729126, |
|
"learning_rate": 0.000499618736806628, |
|
"loss": 1.293, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.27467243659093643, |
|
"grad_norm": 0.5288265943527222, |
|
"learning_rate": 0.0004996160999027933, |
|
"loss": 1.5109, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.27527347036903477, |
|
"grad_norm": 0.35486194491386414, |
|
"learning_rate": 0.0004996134539186469, |
|
"loss": 1.5078, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.27587450414713305, |
|
"grad_norm": 0.5654587745666504, |
|
"learning_rate": 0.0004996107988542847, |
|
"loss": 1.625, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.2764755379252314, |
|
"grad_norm": 0.40694040060043335, |
|
"learning_rate": 0.0004996081347098037, |
|
"loss": 1.4531, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.2770765717033297, |
|
"grad_norm": 0.5765879154205322, |
|
"learning_rate": 0.0004996054614853005, |
|
"loss": 1.343, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.27767760548142806, |
|
"grad_norm": 0.49710384011268616, |
|
"learning_rate": 0.0004996027791808725, |
|
"loss": 1.3266, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.2782786392595264, |
|
"grad_norm": 0.5011634826660156, |
|
"learning_rate": 0.0004996000877966172, |
|
"loss": 1.3438, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.27887967303762473, |
|
"grad_norm": 0.6307665705680847, |
|
"learning_rate": 0.0004995973873326326, |
|
"loss": 1.5703, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.27948070681572307, |
|
"grad_norm": 0.46662095189094543, |
|
"learning_rate": 0.0004995946777890169, |
|
"loss": 1.4414, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.28008174059382135, |
|
"grad_norm": 0.49989181756973267, |
|
"learning_rate": 0.0004995919591658687, |
|
"loss": 1.3789, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.2806827743719197, |
|
"grad_norm": 0.4880094528198242, |
|
"learning_rate": 0.0004995892314632867, |
|
"loss": 1.2633, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.281283808150018, |
|
"grad_norm": 0.6314132213592529, |
|
"learning_rate": 0.0004995864946813703, |
|
"loss": 1.5539, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.28188484192811636, |
|
"grad_norm": 0.7073726654052734, |
|
"learning_rate": 0.0004995837488202191, |
|
"loss": 1.3766, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.2824858757062147, |
|
"grad_norm": 0.5559587478637695, |
|
"learning_rate": 0.0004995809938799329, |
|
"loss": 1.4875, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.28308690948431303, |
|
"grad_norm": 0.4955267906188965, |
|
"learning_rate": 0.0004995782298606119, |
|
"loss": 1.3156, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.28368794326241137, |
|
"grad_norm": 0.4989592432975769, |
|
"learning_rate": 0.0004995754567623567, |
|
"loss": 1.2484, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.28428897704050965, |
|
"grad_norm": 0.5886387228965759, |
|
"learning_rate": 0.0004995726745852681, |
|
"loss": 1.2344, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.284890010818608, |
|
"grad_norm": 0.5085893273353577, |
|
"learning_rate": 0.0004995698833294474, |
|
"loss": 1.407, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.2854910445967063, |
|
"grad_norm": 0.4706375002861023, |
|
"learning_rate": 0.000499567082994996, |
|
"loss": 1.3117, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.28609207837480466, |
|
"grad_norm": 0.5287367701530457, |
|
"learning_rate": 0.000499564273582016, |
|
"loss": 1.2594, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.286693112152903, |
|
"grad_norm": 0.5483081936836243, |
|
"learning_rate": 0.0004995614550906093, |
|
"loss": 1.1008, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.28729414593100133, |
|
"grad_norm": 0.8154200911521912, |
|
"learning_rate": 0.0004995586275208788, |
|
"loss": 1.5164, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.28789517970909967, |
|
"grad_norm": 0.837818443775177, |
|
"learning_rate": 0.000499555790872927, |
|
"loss": 1.2531, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.288496213487198, |
|
"grad_norm": 0.4989728033542633, |
|
"learning_rate": 0.0004995529451468574, |
|
"loss": 1.3719, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.288496213487198, |
|
"eval_loss": 2.189453125, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.202, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 1.136, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2890972472652963, |
|
"grad_norm": 0.5615506768226624, |
|
"learning_rate": 0.0004995500903427732, |
|
"loss": 1.1023, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.2896982810433946, |
|
"grad_norm": 0.7758134007453918, |
|
"learning_rate": 0.0004995472264607784, |
|
"loss": 1.2625, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.29029931482149296, |
|
"grad_norm": 0.6751444935798645, |
|
"learning_rate": 0.0004995443535009773, |
|
"loss": 1.5734, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.2909003485995913, |
|
"grad_norm": 0.5839786529541016, |
|
"learning_rate": 0.0004995414714634743, |
|
"loss": 1.3625, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.29150138237768963, |
|
"grad_norm": 0.5906524062156677, |
|
"learning_rate": 0.0004995385803483742, |
|
"loss": 1.0875, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.29210241615578797, |
|
"grad_norm": 0.7597156763076782, |
|
"learning_rate": 0.0004995356801557821, |
|
"loss": 1.4781, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.2927034499338863, |
|
"grad_norm": 0.5112520456314087, |
|
"learning_rate": 0.0004995327708858038, |
|
"loss": 1.2758, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.2933044837119846, |
|
"grad_norm": 0.44212523102760315, |
|
"learning_rate": 0.0004995298525385447, |
|
"loss": 1.5094, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.2939055174900829, |
|
"grad_norm": 0.43641284108161926, |
|
"learning_rate": 0.0004995269251141114, |
|
"loss": 1.1656, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.29450655126818126, |
|
"grad_norm": 0.4382478892803192, |
|
"learning_rate": 0.0004995239886126102, |
|
"loss": 1.5023, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.2951075850462796, |
|
"grad_norm": 0.6196442246437073, |
|
"learning_rate": 0.0004995210430341478, |
|
"loss": 1.2875, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.29570861882437793, |
|
"grad_norm": 0.6048389673233032, |
|
"learning_rate": 0.0004995180883788316, |
|
"loss": 0.9516, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.29630965260247627, |
|
"grad_norm": 0.5682608485221863, |
|
"learning_rate": 0.0004995151246467689, |
|
"loss": 1.3422, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.2969106863805746, |
|
"grad_norm": 0.5677405595779419, |
|
"learning_rate": 0.0004995121518380674, |
|
"loss": 1.5016, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.29751172015867294, |
|
"grad_norm": 0.48005715012550354, |
|
"learning_rate": 0.0004995091699528355, |
|
"loss": 1.3219, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.2981127539367712, |
|
"grad_norm": 0.48294246196746826, |
|
"learning_rate": 0.0004995061789911817, |
|
"loss": 1.2516, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.29871378771486956, |
|
"grad_norm": 0.7167287468910217, |
|
"learning_rate": 0.0004995031789532147, |
|
"loss": 1.3531, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.2993148214929679, |
|
"grad_norm": 0.5675193667411804, |
|
"learning_rate": 0.0004995001698390434, |
|
"loss": 1.3648, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.29991585527106623, |
|
"grad_norm": 0.5264390707015991, |
|
"learning_rate": 0.0004994971516487775, |
|
"loss": 1.1133, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.30051688904916457, |
|
"grad_norm": 0.5506901144981384, |
|
"learning_rate": 0.0004994941243825269, |
|
"loss": 1.1594, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3011179228272629, |
|
"grad_norm": 0.9272066950798035, |
|
"learning_rate": 0.0004994910880404015, |
|
"loss": 1.4906, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.30171895660536124, |
|
"grad_norm": 0.5853176712989807, |
|
"learning_rate": 0.0004994880426225119, |
|
"loss": 1.3508, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.3023199903834595, |
|
"grad_norm": 0.4796172082424164, |
|
"learning_rate": 0.0004994849881289687, |
|
"loss": 1.3484, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.30292102416155786, |
|
"grad_norm": 0.6331420540809631, |
|
"learning_rate": 0.0004994819245598833, |
|
"loss": 1.2188, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.3035220579396562, |
|
"grad_norm": 0.6519079208374023, |
|
"learning_rate": 0.000499478851915367, |
|
"loss": 1.3531, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.30412309171775453, |
|
"grad_norm": 0.6366649866104126, |
|
"learning_rate": 0.0004994757701955314, |
|
"loss": 1.1703, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.30472412549585287, |
|
"grad_norm": 0.5621868371963501, |
|
"learning_rate": 0.0004994726794004888, |
|
"loss": 1.0441, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.3053251592739512, |
|
"grad_norm": 0.6726334095001221, |
|
"learning_rate": 0.0004994695795303517, |
|
"loss": 1.2984, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.30592619305204954, |
|
"grad_norm": 0.5448851585388184, |
|
"learning_rate": 0.0004994664705852326, |
|
"loss": 0.8781, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.3065272268301479, |
|
"grad_norm": 0.6853761076927185, |
|
"learning_rate": 0.0004994633525652448, |
|
"loss": 1.6891, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.30712826060824616, |
|
"grad_norm": 0.5627267956733704, |
|
"learning_rate": 0.0004994602254705017, |
|
"loss": 1.368, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.3077292943863445, |
|
"grad_norm": 0.38999640941619873, |
|
"learning_rate": 0.0004994570893011171, |
|
"loss": 1.3789, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.30833032816444284, |
|
"grad_norm": 0.6671114563941956, |
|
"learning_rate": 0.000499453944057205, |
|
"loss": 1.4078, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.30893136194254117, |
|
"grad_norm": 0.5521063208580017, |
|
"learning_rate": 0.0004994507897388798, |
|
"loss": 1.5859, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.3095323957206395, |
|
"grad_norm": 0.6885313391685486, |
|
"learning_rate": 0.0004994476263462563, |
|
"loss": 1.2578, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.31013342949873784, |
|
"grad_norm": 0.45498156547546387, |
|
"learning_rate": 0.0004994444538794495, |
|
"loss": 1.3914, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.3107344632768362, |
|
"grad_norm": 0.5482655167579651, |
|
"learning_rate": 0.0004994412723385749, |
|
"loss": 1.3391, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.31133549705493446, |
|
"grad_norm": 0.5240392684936523, |
|
"learning_rate": 0.0004994380817237482, |
|
"loss": 1.25, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.3119365308330328, |
|
"grad_norm": 0.5129856467247009, |
|
"learning_rate": 0.0004994348820350854, |
|
"loss": 1.4406, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.31253756461113114, |
|
"grad_norm": 0.5252668261528015, |
|
"learning_rate": 0.000499431673272703, |
|
"loss": 1.0773, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.31253756461113114, |
|
"eval_loss": 2.1500000953674316, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1975, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.136, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.31313859838922947, |
|
"grad_norm": 0.6648097634315491, |
|
"learning_rate": 0.0004994284554367176, |
|
"loss": 1.1133, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.3137396321673278, |
|
"grad_norm": 0.6218547224998474, |
|
"learning_rate": 0.0004994252285272465, |
|
"loss": 1.2937, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.31434066594542615, |
|
"grad_norm": 0.6880519390106201, |
|
"learning_rate": 0.0004994219925444068, |
|
"loss": 1.8039, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.3149416997235245, |
|
"grad_norm": 0.6464706063270569, |
|
"learning_rate": 0.0004994187474883164, |
|
"loss": 1.5594, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.3155427335016228, |
|
"grad_norm": 0.7200093865394592, |
|
"learning_rate": 0.0004994154933590932, |
|
"loss": 1.0945, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.3161437672797211, |
|
"grad_norm": 0.6853864789009094, |
|
"learning_rate": 0.0004994122301568557, |
|
"loss": 1.268, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.31674480105781944, |
|
"grad_norm": 0.5081961750984192, |
|
"learning_rate": 0.0004994089578817226, |
|
"loss": 1.4062, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.3173458348359178, |
|
"grad_norm": 0.4750553071498871, |
|
"learning_rate": 0.0004994056765338129, |
|
"loss": 1.2828, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.3179468686140161, |
|
"grad_norm": 0.5867997407913208, |
|
"learning_rate": 0.0004994023861132459, |
|
"loss": 1.2484, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.31854790239211445, |
|
"grad_norm": 0.7348740696907043, |
|
"learning_rate": 0.0004993990866201414, |
|
"loss": 1.2258, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 0.5523998141288757, |
|
"learning_rate": 0.0004993957780546193, |
|
"loss": 1.2805, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.3197499699483111, |
|
"grad_norm": 0.5308116674423218, |
|
"learning_rate": 0.0004993924604168001, |
|
"loss": 1.3188, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.3203510037264094, |
|
"grad_norm": 0.40592867136001587, |
|
"learning_rate": 0.0004993891337068046, |
|
"loss": 1.2148, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.32095203750450774, |
|
"grad_norm": 0.6522583365440369, |
|
"learning_rate": 0.0004993857979247535, |
|
"loss": 1.175, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.3215530712826061, |
|
"grad_norm": 0.5981694459915161, |
|
"learning_rate": 0.0004993824530707682, |
|
"loss": 1.143, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.3221541050607044, |
|
"grad_norm": 0.6832042932510376, |
|
"learning_rate": 0.0004993790991449707, |
|
"loss": 1.2242, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.32275513883880275, |
|
"grad_norm": 0.6935708522796631, |
|
"learning_rate": 0.0004993757361474825, |
|
"loss": 0.9617, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.3233561726169011, |
|
"grad_norm": 0.5491186380386353, |
|
"learning_rate": 0.0004993723640784265, |
|
"loss": 1.3672, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.3239572063949994, |
|
"grad_norm": 0.4743538498878479, |
|
"learning_rate": 0.0004993689829379249, |
|
"loss": 1.1547, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.3245582401730977, |
|
"grad_norm": 0.641859769821167, |
|
"learning_rate": 0.0004993655927261008, |
|
"loss": 1.4078, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.32515927395119604, |
|
"grad_norm": 0.5002933144569397, |
|
"learning_rate": 0.0004993621934430778, |
|
"loss": 0.9492, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.3257603077292944, |
|
"grad_norm": 0.7241799831390381, |
|
"learning_rate": 0.0004993587850889793, |
|
"loss": 1.575, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.3263613415073927, |
|
"grad_norm": 0.5693483948707581, |
|
"learning_rate": 0.0004993553676639292, |
|
"loss": 0.9961, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.32696237528549105, |
|
"grad_norm": 0.43130815029144287, |
|
"learning_rate": 0.000499351941168052, |
|
"loss": 1.2984, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.3275634090635894, |
|
"grad_norm": 0.5054978728294373, |
|
"learning_rate": 0.0004993485056014724, |
|
"loss": 1.1375, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.3281644428416877, |
|
"grad_norm": 0.5581235289573669, |
|
"learning_rate": 0.0004993450609643152, |
|
"loss": 1.0164, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.32876547661978606, |
|
"grad_norm": 0.6733124256134033, |
|
"learning_rate": 0.0004993416072567059, |
|
"loss": 1.4078, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.32936651039788434, |
|
"grad_norm": 0.5003538727760315, |
|
"learning_rate": 0.0004993381444787699, |
|
"loss": 0.8742, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.3299675441759827, |
|
"grad_norm": 0.6292559504508972, |
|
"learning_rate": 0.0004993346726306333, |
|
"loss": 1.007, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.330568577954081, |
|
"grad_norm": 0.6760239005088806, |
|
"learning_rate": 0.0004993311917124224, |
|
"loss": 1.25, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.33116961173217935, |
|
"grad_norm": 0.6075654625892639, |
|
"learning_rate": 0.0004993277017242638, |
|
"loss": 1.5766, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.3317706455102777, |
|
"grad_norm": 0.5432557463645935, |
|
"learning_rate": 0.0004993242026662846, |
|
"loss": 1.0883, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.332371679288376, |
|
"grad_norm": 0.6972253918647766, |
|
"learning_rate": 0.0004993206945386118, |
|
"loss": 0.9992, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.33297271306647436, |
|
"grad_norm": 0.45837146043777466, |
|
"learning_rate": 0.0004993171773413731, |
|
"loss": 1.6766, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.33357374684457264, |
|
"grad_norm": 0.5207621455192566, |
|
"learning_rate": 0.0004993136510746966, |
|
"loss": 1.2578, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.334174780622671, |
|
"grad_norm": 0.7034028768539429, |
|
"learning_rate": 0.0004993101157387106, |
|
"loss": 1.3578, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.3347758144007693, |
|
"grad_norm": 0.544851541519165, |
|
"learning_rate": 0.0004993065713335434, |
|
"loss": 1.2836, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.33537684817886765, |
|
"grad_norm": 0.705143928527832, |
|
"learning_rate": 0.0004993030178593241, |
|
"loss": 1.4453, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.335977881956966, |
|
"grad_norm": 0.6619438529014587, |
|
"learning_rate": 0.0004992994553161823, |
|
"loss": 1.0547, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.3365789157350643, |
|
"grad_norm": 0.5982903242111206, |
|
"learning_rate": 0.000499295883704247, |
|
"loss": 1.4281, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.3365789157350643, |
|
"eval_loss": 2.189453125, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1965, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.136, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.33717994951316266, |
|
"grad_norm": 0.589056670665741, |
|
"learning_rate": 0.0004992923030236485, |
|
"loss": 1.3727, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.337780983291261, |
|
"grad_norm": 0.39378607273101807, |
|
"learning_rate": 0.000499288713274517, |
|
"loss": 1.1789, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.3383820170693593, |
|
"grad_norm": 0.5460519790649414, |
|
"learning_rate": 0.000499285114456983, |
|
"loss": 1.1719, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.3389830508474576, |
|
"grad_norm": 0.4953864812850952, |
|
"learning_rate": 0.0004992815065711774, |
|
"loss": 1.1672, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.33958408462555595, |
|
"grad_norm": 0.5705846548080444, |
|
"learning_rate": 0.0004992778896172317, |
|
"loss": 1.5328, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.3401851184036543, |
|
"grad_norm": 0.5687447190284729, |
|
"learning_rate": 0.0004992742635952771, |
|
"loss": 1.0063, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.3407861521817526, |
|
"grad_norm": 0.516343891620636, |
|
"learning_rate": 0.0004992706285054458, |
|
"loss": 1.1492, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.34138718595985096, |
|
"grad_norm": 0.6128392815589905, |
|
"learning_rate": 0.0004992669843478699, |
|
"loss": 1.325, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.3419882197379493, |
|
"grad_norm": 0.5104270577430725, |
|
"learning_rate": 0.000499263331122682, |
|
"loss": 1.0195, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.3425892535160476, |
|
"grad_norm": 0.38332250714302063, |
|
"learning_rate": 0.0004992596688300149, |
|
"loss": 1.302, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.3431902872941459, |
|
"grad_norm": 0.5674039125442505, |
|
"learning_rate": 0.000499255997470002, |
|
"loss": 1.2094, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.34379132107224425, |
|
"grad_norm": 0.7987366914749146, |
|
"learning_rate": 0.0004992523170427766, |
|
"loss": 1.2047, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.3443923548503426, |
|
"grad_norm": 0.45501282811164856, |
|
"learning_rate": 0.0004992486275484729, |
|
"loss": 1.1539, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.3449933886284409, |
|
"grad_norm": 0.5390669703483582, |
|
"learning_rate": 0.0004992449289872249, |
|
"loss": 1.2102, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.34559442240653926, |
|
"grad_norm": 0.6710581183433533, |
|
"learning_rate": 0.0004992412213591672, |
|
"loss": 1.4297, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.3461954561846376, |
|
"grad_norm": 0.6371570825576782, |
|
"learning_rate": 0.0004992375046644347, |
|
"loss": 1.0164, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.34679648996273593, |
|
"grad_norm": 0.49934741854667664, |
|
"learning_rate": 0.0004992337789031625, |
|
"loss": 1.1313, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.3473975237408342, |
|
"grad_norm": 0.41756120324134827, |
|
"learning_rate": 0.0004992300440754862, |
|
"loss": 1.1969, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.34799855751893255, |
|
"grad_norm": 0.8102174997329712, |
|
"learning_rate": 0.0004992263001815418, |
|
"loss": 1.2719, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.3485995912970309, |
|
"grad_norm": 0.45573851466178894, |
|
"learning_rate": 0.0004992225472214653, |
|
"loss": 1.1375, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.3492006250751292, |
|
"grad_norm": 0.5512142777442932, |
|
"learning_rate": 0.0004992187851953932, |
|
"loss": 1.4781, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.34980165885322756, |
|
"grad_norm": 0.6429489850997925, |
|
"learning_rate": 0.0004992150141034624, |
|
"loss": 1.3453, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.3504026926313259, |
|
"grad_norm": 0.6230481266975403, |
|
"learning_rate": 0.0004992112339458103, |
|
"loss": 1.2766, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.35100372640942423, |
|
"grad_norm": 0.6134311556816101, |
|
"learning_rate": 0.0004992074447225741, |
|
"loss": 1.0664, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.3516047601875225, |
|
"grad_norm": 0.5894711017608643, |
|
"learning_rate": 0.0004992036464338918, |
|
"loss": 0.9, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.35220579396562085, |
|
"grad_norm": 0.525947630405426, |
|
"learning_rate": 0.0004991998390799016, |
|
"loss": 1.4844, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.3528068277437192, |
|
"grad_norm": 0.5282221436500549, |
|
"learning_rate": 0.0004991960226607418, |
|
"loss": 1.0398, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.3534078615218175, |
|
"grad_norm": 0.5808281302452087, |
|
"learning_rate": 0.0004991921971765514, |
|
"loss": 0.943, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.35400889529991586, |
|
"grad_norm": 0.6163946390151978, |
|
"learning_rate": 0.0004991883626274696, |
|
"loss": 1.1086, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.3546099290780142, |
|
"grad_norm": 0.4154224991798401, |
|
"learning_rate": 0.0004991845190136357, |
|
"loss": 1.2703, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.35521096285611253, |
|
"grad_norm": 0.4030189514160156, |
|
"learning_rate": 0.0004991806663351897, |
|
"loss": 1.1086, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.35581199663421087, |
|
"grad_norm": 0.5927292704582214, |
|
"learning_rate": 0.0004991768045922718, |
|
"loss": 0.9758, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.35641303041230915, |
|
"grad_norm": 0.476971834897995, |
|
"learning_rate": 0.0004991729337850223, |
|
"loss": 1.525, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.3570140641904075, |
|
"grad_norm": 0.5584660768508911, |
|
"learning_rate": 0.000499169053913582, |
|
"loss": 1.2125, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.3576150979685058, |
|
"grad_norm": 0.5617804527282715, |
|
"learning_rate": 0.0004991651649780922, |
|
"loss": 1.3102, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.35821613174660416, |
|
"grad_norm": 0.3463181257247925, |
|
"learning_rate": 0.0004991612669786942, |
|
"loss": 1.4227, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.3588171655247025, |
|
"grad_norm": 0.5156741142272949, |
|
"learning_rate": 0.0004991573599155299, |
|
"loss": 1.3828, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.35941819930280083, |
|
"grad_norm": 0.6080055832862854, |
|
"learning_rate": 0.0004991534437887414, |
|
"loss": 1.0102, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.36001923308089917, |
|
"grad_norm": 0.5142369866371155, |
|
"learning_rate": 0.0004991495185984711, |
|
"loss": 1.3469, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.36062026685899745, |
|
"grad_norm": 0.4148232638835907, |
|
"learning_rate": 0.000499145584344862, |
|
"loss": 1.3102, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.36062026685899745, |
|
"eval_loss": 2.1435546875, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2124, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 1.136, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3612213006370958, |
|
"grad_norm": 0.43302541971206665, |
|
"learning_rate": 0.000499141641028057, |
|
"loss": 1.1625, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.3618223344151941, |
|
"grad_norm": 0.5702094435691833, |
|
"learning_rate": 0.0004991376886481996, |
|
"loss": 1.4937, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.36242336819329246, |
|
"grad_norm": 0.602148175239563, |
|
"learning_rate": 0.0004991337272054336, |
|
"loss": 1.4453, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.3630244019713908, |
|
"grad_norm": 0.7094736099243164, |
|
"learning_rate": 0.0004991297566999031, |
|
"loss": 1.1242, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.36362543574948913, |
|
"grad_norm": 1.0491294860839844, |
|
"learning_rate": 0.0004991257771317525, |
|
"loss": 1.2945, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.36422646952758747, |
|
"grad_norm": 0.5383880734443665, |
|
"learning_rate": 0.0004991217885011266, |
|
"loss": 1.4398, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.36482750330568575, |
|
"grad_norm": 0.5303685665130615, |
|
"learning_rate": 0.0004991177908081706, |
|
"loss": 1.357, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.3654285370837841, |
|
"grad_norm": 0.5658953785896301, |
|
"learning_rate": 0.0004991137840530297, |
|
"loss": 1.625, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.3660295708618824, |
|
"grad_norm": 0.6413328051567078, |
|
"learning_rate": 0.0004991097682358498, |
|
"loss": 1.0664, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.36663060463998076, |
|
"grad_norm": 0.40573734045028687, |
|
"learning_rate": 0.000499105743356777, |
|
"loss": 1.4219, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.3672316384180791, |
|
"grad_norm": 0.4775443375110626, |
|
"learning_rate": 0.0004991017094159576, |
|
"loss": 1.1383, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.36783267219617743, |
|
"grad_norm": 0.4478199779987335, |
|
"learning_rate": 0.0004990976664135384, |
|
"loss": 1.1906, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.36843370597427577, |
|
"grad_norm": 0.5353224277496338, |
|
"learning_rate": 0.0004990936143496664, |
|
"loss": 1.0695, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.3690347397523741, |
|
"grad_norm": 0.6140496730804443, |
|
"learning_rate": 0.0004990895532244893, |
|
"loss": 1.45, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.3696357735304724, |
|
"grad_norm": 0.674697995185852, |
|
"learning_rate": 0.0004990854830381545, |
|
"loss": 1.2617, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.3702368073085707, |
|
"grad_norm": 0.793539822101593, |
|
"learning_rate": 0.0004990814037908102, |
|
"loss": 1.0797, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.37083784108666906, |
|
"grad_norm": 0.4356972873210907, |
|
"learning_rate": 0.0004990773154826048, |
|
"loss": 1.257, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.3714388748647674, |
|
"grad_norm": 0.4007517099380493, |
|
"learning_rate": 0.000499073218113687, |
|
"loss": 1.6695, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.37203990864286574, |
|
"grad_norm": 0.7213647961616516, |
|
"learning_rate": 0.0004990691116842058, |
|
"loss": 1.1039, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.37264094242096407, |
|
"grad_norm": 0.5188817977905273, |
|
"learning_rate": 0.0004990649961943105, |
|
"loss": 1.3844, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.3732419761990624, |
|
"grad_norm": 0.5494900345802307, |
|
"learning_rate": 0.0004990608716441511, |
|
"loss": 1.3094, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.3738430099771607, |
|
"grad_norm": 0.6364961266517639, |
|
"learning_rate": 0.0004990567380338774, |
|
"loss": 1.432, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.374444043755259, |
|
"grad_norm": 0.7705929279327393, |
|
"learning_rate": 0.0004990525953636399, |
|
"loss": 1.3594, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.37504507753335736, |
|
"grad_norm": 0.6256239414215088, |
|
"learning_rate": 0.0004990484436335892, |
|
"loss": 1.4969, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.3756461113114557, |
|
"grad_norm": 0.8040322661399841, |
|
"learning_rate": 0.0004990442828438764, |
|
"loss": 1.3664, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.37624714508955404, |
|
"grad_norm": 0.7349644303321838, |
|
"learning_rate": 0.0004990401129946528, |
|
"loss": 1.3016, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.3768481788676524, |
|
"grad_norm": 0.7406263947486877, |
|
"learning_rate": 0.0004990359340860701, |
|
"loss": 1.2672, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.3774492126457507, |
|
"grad_norm": 0.6488270163536072, |
|
"learning_rate": 0.0004990317461182803, |
|
"loss": 1.5125, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.37805024642384905, |
|
"grad_norm": 0.450980544090271, |
|
"learning_rate": 0.0004990275490914358, |
|
"loss": 0.9531, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.3786512802019473, |
|
"grad_norm": 0.5992457866668701, |
|
"learning_rate": 0.0004990233430056892, |
|
"loss": 1.2563, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.37925231398004566, |
|
"grad_norm": 0.4323638677597046, |
|
"learning_rate": 0.0004990191278611936, |
|
"loss": 1.1438, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.379853347758144, |
|
"grad_norm": 0.9859302639961243, |
|
"learning_rate": 0.0004990149036581023, |
|
"loss": 1.2555, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.38045438153624234, |
|
"grad_norm": 0.8136280179023743, |
|
"learning_rate": 0.0004990106703965689, |
|
"loss": 1.3172, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.3810554153143407, |
|
"grad_norm": 0.5532881021499634, |
|
"learning_rate": 0.0004990064280767475, |
|
"loss": 0.9656, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.381656449092439, |
|
"grad_norm": 0.5996264219284058, |
|
"learning_rate": 0.0004990021766987923, |
|
"loss": 1.2688, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.38225748287053735, |
|
"grad_norm": 0.6474243402481079, |
|
"learning_rate": 0.0004989979162628582, |
|
"loss": 0.9461, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.3828585166486356, |
|
"grad_norm": 0.5693522691726685, |
|
"learning_rate": 0.0004989936467690998, |
|
"loss": 1.0906, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.38345955042673396, |
|
"grad_norm": 0.5393794775009155, |
|
"learning_rate": 0.0004989893682176727, |
|
"loss": 1.2445, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.3840605842048323, |
|
"grad_norm": 0.6597670316696167, |
|
"learning_rate": 0.0004989850806087325, |
|
"loss": 1.4289, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.38466161798293064, |
|
"grad_norm": 0.6367236971855164, |
|
"learning_rate": 0.0004989807839424352, |
|
"loss": 1.2266, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.38466161798293064, |
|
"eval_loss": 2.1474609375, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1942, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.137, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.385262651761029, |
|
"grad_norm": 0.5790736079216003, |
|
"learning_rate": 0.0004989764782189369, |
|
"loss": 1.2164, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.3858636855391273, |
|
"grad_norm": 0.502346932888031, |
|
"learning_rate": 0.0004989721634383943, |
|
"loss": 1.3391, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.38646471931722565, |
|
"grad_norm": 0.6391408443450928, |
|
"learning_rate": 0.0004989678396009645, |
|
"loss": 1.2711, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.387065753095324, |
|
"grad_norm": 0.40186697244644165, |
|
"learning_rate": 0.0004989635067068047, |
|
"loss": 1.1691, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.38766678687342226, |
|
"grad_norm": 0.5028790235519409, |
|
"learning_rate": 0.0004989591647560726, |
|
"loss": 1.1609, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.3882678206515206, |
|
"grad_norm": 0.4072652757167816, |
|
"learning_rate": 0.0004989548137489259, |
|
"loss": 1.5672, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.38886885442961894, |
|
"grad_norm": 0.4738612174987793, |
|
"learning_rate": 0.0004989504536855232, |
|
"loss": 1.3344, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.3894698882077173, |
|
"grad_norm": 0.9375470876693726, |
|
"learning_rate": 0.0004989460845660229, |
|
"loss": 1.0484, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.3900709219858156, |
|
"grad_norm": 0.4789920449256897, |
|
"learning_rate": 0.000498941706390584, |
|
"loss": 1.1109, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.39067195576391395, |
|
"grad_norm": 0.6214213967323303, |
|
"learning_rate": 0.0004989373191593658, |
|
"loss": 1.3516, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.3912729895420123, |
|
"grad_norm": 0.49041748046875, |
|
"learning_rate": 0.0004989329228725277, |
|
"loss": 1.2977, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.39187402332011056, |
|
"grad_norm": 0.5892441272735596, |
|
"learning_rate": 0.00049892851753023, |
|
"loss": 1.1109, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.3924750570982089, |
|
"grad_norm": 0.4488331973552704, |
|
"learning_rate": 0.0004989241031326326, |
|
"loss": 1.2734, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.39307609087630724, |
|
"grad_norm": 0.43196800351142883, |
|
"learning_rate": 0.0004989196796798963, |
|
"loss": 1.1059, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.3936771246544056, |
|
"grad_norm": 0.3739017844200134, |
|
"learning_rate": 0.0004989152471721819, |
|
"loss": 0.8492, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.3942781584325039, |
|
"grad_norm": 0.5097094178199768, |
|
"learning_rate": 0.0004989108056096505, |
|
"loss": 1.1836, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.39487919221060225, |
|
"grad_norm": 0.7751893401145935, |
|
"learning_rate": 0.000498906354992464, |
|
"loss": 1.443, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.3954802259887006, |
|
"grad_norm": 0.6941812634468079, |
|
"learning_rate": 0.0004989018953207841, |
|
"loss": 1.2094, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.3960812597667989, |
|
"grad_norm": 0.5261918902397156, |
|
"learning_rate": 0.0004988974265947731, |
|
"loss": 1.1297, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.3966822935448972, |
|
"grad_norm": 0.6817163228988647, |
|
"learning_rate": 0.0004988929488145934, |
|
"loss": 1.5297, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.39728332732299554, |
|
"grad_norm": 0.6343579888343811, |
|
"learning_rate": 0.0004988884619804082, |
|
"loss": 1.1766, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.3978843611010939, |
|
"grad_norm": 0.4660755693912506, |
|
"learning_rate": 0.0004988839660923805, |
|
"loss": 0.9953, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.3984853948791922, |
|
"grad_norm": 0.6850960850715637, |
|
"learning_rate": 0.0004988794611506738, |
|
"loss": 1.4023, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.39908642865729055, |
|
"grad_norm": 0.7170994877815247, |
|
"learning_rate": 0.0004988749471554521, |
|
"loss": 1.1391, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.3996874624353889, |
|
"grad_norm": 0.9181567430496216, |
|
"learning_rate": 0.0004988704241068795, |
|
"loss": 1.3047, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.4002884962134872, |
|
"grad_norm": 0.5317991971969604, |
|
"learning_rate": 0.0004988658920051207, |
|
"loss": 1.3273, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.4008895299915855, |
|
"grad_norm": 0.5464211702346802, |
|
"learning_rate": 0.0004988613508503405, |
|
"loss": 1.0758, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.40149056376968384, |
|
"grad_norm": 0.664720892906189, |
|
"learning_rate": 0.0004988568006427039, |
|
"loss": 1.1383, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.4020915975477822, |
|
"grad_norm": 0.5137344002723694, |
|
"learning_rate": 0.0004988522413823767, |
|
"loss": 0.8992, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.4026926313258805, |
|
"grad_norm": 0.6021727919578552, |
|
"learning_rate": 0.0004988476730695246, |
|
"loss": 1.0391, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.40329366510397885, |
|
"grad_norm": 0.5493384003639221, |
|
"learning_rate": 0.0004988430957043138, |
|
"loss": 1.2063, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.4038946988820772, |
|
"grad_norm": 0.44344305992126465, |
|
"learning_rate": 0.0004988385092869109, |
|
"loss": 1.1539, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.4044957326601755, |
|
"grad_norm": 0.5496264100074768, |
|
"learning_rate": 0.0004988339138174827, |
|
"loss": 1.0008, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.40509676643827386, |
|
"grad_norm": 0.5868191719055176, |
|
"learning_rate": 0.0004988293092961962, |
|
"loss": 1.0273, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.40569780021637214, |
|
"grad_norm": 0.47252243757247925, |
|
"learning_rate": 0.0004988246957232191, |
|
"loss": 1.1547, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.4062988339944705, |
|
"grad_norm": 0.5384260416030884, |
|
"learning_rate": 0.0004988200730987192, |
|
"loss": 0.9969, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.4068998677725688, |
|
"grad_norm": 0.5157658457756042, |
|
"learning_rate": 0.0004988154414228645, |
|
"loss": 1.218, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.40750090155066715, |
|
"grad_norm": 0.609667181968689, |
|
"learning_rate": 0.0004988108006958237, |
|
"loss": 1.2977, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.4081019353287655, |
|
"grad_norm": 0.6255223751068115, |
|
"learning_rate": 0.0004988061509177656, |
|
"loss": 1.0859, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.4087029691068638, |
|
"grad_norm": 0.48681220412254333, |
|
"learning_rate": 0.0004988014920888592, |
|
"loss": 1.1094, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.4087029691068638, |
|
"eval_loss": 2.1009764671325684, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1797, |
|
"eval_samples_per_second": 4.548, |
|
"eval_steps_per_second": 1.137, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.40930400288496216, |
|
"grad_norm": 0.7313075065612793, |
|
"learning_rate": 0.0004987968242092741, |
|
"loss": 1.4648, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.40990503666306044, |
|
"grad_norm": 0.3973584771156311, |
|
"learning_rate": 0.00049879214727918, |
|
"loss": 1.4195, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.4105060704411588, |
|
"grad_norm": 0.5780074596405029, |
|
"learning_rate": 0.0004987874612987471, |
|
"loss": 1.1383, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.4111071042192571, |
|
"grad_norm": 0.5280987620353699, |
|
"learning_rate": 0.0004987827662681459, |
|
"loss": 1.2125, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.41170813799735545, |
|
"grad_norm": 0.672178328037262, |
|
"learning_rate": 0.0004987780621875471, |
|
"loss": 1.2563, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.4123091717754538, |
|
"grad_norm": 0.5133812427520752, |
|
"learning_rate": 0.0004987733490571218, |
|
"loss": 1.2266, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.4129102055535521, |
|
"grad_norm": 0.7771773338317871, |
|
"learning_rate": 0.0004987686268770415, |
|
"loss": 1.2648, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.41351123933165046, |
|
"grad_norm": 0.555830180644989, |
|
"learning_rate": 0.0004987638956474781, |
|
"loss": 1.2445, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.41411227310974874, |
|
"grad_norm": 0.377913236618042, |
|
"learning_rate": 0.0004987591553686035, |
|
"loss": 1.0813, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.4147133068878471, |
|
"grad_norm": 0.6742091774940491, |
|
"learning_rate": 0.0004987544060405903, |
|
"loss": 1.1516, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.4153143406659454, |
|
"grad_norm": 0.6832530498504639, |
|
"learning_rate": 0.0004987496476636112, |
|
"loss": 0.9953, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.41591537444404375, |
|
"grad_norm": 0.7047104835510254, |
|
"learning_rate": 0.0004987448802378393, |
|
"loss": 1.2781, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.4165164082221421, |
|
"grad_norm": 0.5788468718528748, |
|
"learning_rate": 0.000498740103763448, |
|
"loss": 1.1266, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.4171174420002404, |
|
"grad_norm": 0.6551057696342468, |
|
"learning_rate": 0.0004987353182406111, |
|
"loss": 0.9711, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.41771847577833876, |
|
"grad_norm": 0.7851700782775879, |
|
"learning_rate": 0.0004987305236695027, |
|
"loss": 1.2258, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.4183195095564371, |
|
"grad_norm": 0.45742517709732056, |
|
"learning_rate": 0.000498725720050297, |
|
"loss": 1.1375, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.4189205433345354, |
|
"grad_norm": 0.5413776636123657, |
|
"learning_rate": 0.0004987209073831691, |
|
"loss": 1.3938, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.4195215771126337, |
|
"grad_norm": 0.5107463598251343, |
|
"learning_rate": 0.0004987160856682938, |
|
"loss": 1.0172, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.42012261089073205, |
|
"grad_norm": 0.6791651844978333, |
|
"learning_rate": 0.0004987112549058466, |
|
"loss": 1.2555, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.4207236446688304, |
|
"grad_norm": 0.71052086353302, |
|
"learning_rate": 0.0004987064150960033, |
|
"loss": 1.057, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4213246784469287, |
|
"grad_norm": 0.5631623268127441, |
|
"learning_rate": 0.0004987015662389398, |
|
"loss": 1.143, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.42192571222502706, |
|
"grad_norm": 0.7245007753372192, |
|
"learning_rate": 0.0004986967083348325, |
|
"loss": 1.0203, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.4225267460031254, |
|
"grad_norm": 0.5436373353004456, |
|
"learning_rate": 0.0004986918413838583, |
|
"loss": 1.0805, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.4231277797812237, |
|
"grad_norm": 0.4831235110759735, |
|
"learning_rate": 0.0004986869653861941, |
|
"loss": 1.2867, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.423728813559322, |
|
"grad_norm": 0.4958447813987732, |
|
"learning_rate": 0.0004986820803420172, |
|
"loss": 1.143, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.42432984733742035, |
|
"grad_norm": 0.7917611002922058, |
|
"learning_rate": 0.0004986771862515055, |
|
"loss": 1.2012, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.4249308811155187, |
|
"grad_norm": 0.6926260590553284, |
|
"learning_rate": 0.0004986722831148369, |
|
"loss": 1.3297, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 0.5297942757606506, |
|
"learning_rate": 0.0004986673709321898, |
|
"loss": 1.0195, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.42613294867171536, |
|
"grad_norm": 0.6709704399108887, |
|
"learning_rate": 0.0004986624497037429, |
|
"loss": 1.307, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.4267339824498137, |
|
"grad_norm": 0.586955189704895, |
|
"learning_rate": 0.0004986575194296752, |
|
"loss": 0.9141, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.42733501622791203, |
|
"grad_norm": 0.573215663433075, |
|
"learning_rate": 0.000498652580110166, |
|
"loss": 1.0344, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.4279360500060103, |
|
"grad_norm": 0.4756597578525543, |
|
"learning_rate": 0.0004986476317453951, |
|
"loss": 1.1664, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.42853708378410865, |
|
"grad_norm": 0.5231835246086121, |
|
"learning_rate": 0.0004986426743355425, |
|
"loss": 1.1281, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.429138117562207, |
|
"grad_norm": 0.4669915735721588, |
|
"learning_rate": 0.0004986377078807884, |
|
"loss": 1.2641, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.4297391513403053, |
|
"grad_norm": 0.49261239171028137, |
|
"learning_rate": 0.0004986327323813135, |
|
"loss": 1.1812, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.43034018511840366, |
|
"grad_norm": 0.5701055526733398, |
|
"learning_rate": 0.0004986277478372989, |
|
"loss": 0.943, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.430941218896502, |
|
"grad_norm": 0.8740291595458984, |
|
"learning_rate": 0.0004986227542489259, |
|
"loss": 1.4688, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.43154225267460034, |
|
"grad_norm": 0.5957376956939697, |
|
"learning_rate": 0.000498617751616376, |
|
"loss": 1.0031, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.4321432864526986, |
|
"grad_norm": 0.5455657243728638, |
|
"learning_rate": 0.0004986127399398315, |
|
"loss": 1.1375, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.43274432023079695, |
|
"grad_norm": 0.7837244868278503, |
|
"learning_rate": 0.0004986077192194743, |
|
"loss": 1.1492, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.43274432023079695, |
|
"eval_loss": 2.097851514816284, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2375, |
|
"eval_samples_per_second": 4.541, |
|
"eval_steps_per_second": 1.135, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.4333453540088953, |
|
"grad_norm": 0.6420156359672546, |
|
"learning_rate": 0.0004986026894554874, |
|
"loss": 1.6125, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.4339463877869936, |
|
"grad_norm": 0.37207821011543274, |
|
"learning_rate": 0.0004985976506480535, |
|
"loss": 1.0586, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.43454742156509196, |
|
"grad_norm": 0.4620397090911865, |
|
"learning_rate": 0.000498592602797356, |
|
"loss": 1.1328, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.4351484553431903, |
|
"grad_norm": 0.8707792162895203, |
|
"learning_rate": 0.0004985875459035786, |
|
"loss": 1.2578, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.43574948912128864, |
|
"grad_norm": 0.5785757303237915, |
|
"learning_rate": 0.0004985824799669052, |
|
"loss": 1.1313, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.43635052289938697, |
|
"grad_norm": 0.581447958946228, |
|
"learning_rate": 0.00049857740498752, |
|
"loss": 1.2219, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.43695155667748525, |
|
"grad_norm": 0.5232359170913696, |
|
"learning_rate": 0.0004985723209656078, |
|
"loss": 1.0891, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.4375525904555836, |
|
"grad_norm": 0.4474778175354004, |
|
"learning_rate": 0.0004985672279013534, |
|
"loss": 1.625, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.4381536242336819, |
|
"grad_norm": 0.7095141410827637, |
|
"learning_rate": 0.000498562125794942, |
|
"loss": 0.9273, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.43875465801178026, |
|
"grad_norm": 0.6456997394561768, |
|
"learning_rate": 0.0004985570146465593, |
|
"loss": 1.1453, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.4393556917898786, |
|
"grad_norm": 0.7230183482170105, |
|
"learning_rate": 0.0004985518944563914, |
|
"loss": 1.15, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.43995672556797694, |
|
"grad_norm": 0.6240507364273071, |
|
"learning_rate": 0.0004985467652246243, |
|
"loss": 1.0586, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.4405577593460753, |
|
"grad_norm": 1.0956453084945679, |
|
"learning_rate": 0.0004985416269514447, |
|
"loss": 1.3141, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.44115879312417355, |
|
"grad_norm": 0.4983053505420685, |
|
"learning_rate": 0.0004985364796370394, |
|
"loss": 1.0805, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.4417598269022719, |
|
"grad_norm": 0.5128179788589478, |
|
"learning_rate": 0.0004985313232815958, |
|
"loss": 0.9055, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.4423608606803702, |
|
"grad_norm": 0.535322368144989, |
|
"learning_rate": 0.0004985261578853014, |
|
"loss": 0.9563, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.44296189445846856, |
|
"grad_norm": 0.6636273264884949, |
|
"learning_rate": 0.000498520983448344, |
|
"loss": 1.2719, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.4435629282365669, |
|
"grad_norm": 0.7313540577888489, |
|
"learning_rate": 0.0004985157999709122, |
|
"loss": 1.5672, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.44416396201466524, |
|
"grad_norm": 0.45398956537246704, |
|
"learning_rate": 0.000498510607453194, |
|
"loss": 1.0688, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.4447649957927636, |
|
"grad_norm": 0.4940120577812195, |
|
"learning_rate": 0.0004985054058953788, |
|
"loss": 1.2211, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.4453660295708619, |
|
"grad_norm": 0.48061200976371765, |
|
"learning_rate": 0.0004985001952976556, |
|
"loss": 1.1328, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.4459670633489602, |
|
"grad_norm": 0.6256884336471558, |
|
"learning_rate": 0.0004984949756602139, |
|
"loss": 1.2969, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.4465680971270585, |
|
"grad_norm": 0.5207997560501099, |
|
"learning_rate": 0.0004984897469832437, |
|
"loss": 1.3641, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.44716913090515686, |
|
"grad_norm": 0.4868186414241791, |
|
"learning_rate": 0.000498484509266935, |
|
"loss": 0.8984, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.4477701646832552, |
|
"grad_norm": 0.5324142575263977, |
|
"learning_rate": 0.0004984792625114786, |
|
"loss": 1.1617, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.44837119846135354, |
|
"grad_norm": 0.5691832304000854, |
|
"learning_rate": 0.0004984740067170651, |
|
"loss": 1.2117, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.4489722322394519, |
|
"grad_norm": 0.419612318277359, |
|
"learning_rate": 0.0004984687418838859, |
|
"loss": 1.0063, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.4495732660175502, |
|
"grad_norm": 0.5638041496276855, |
|
"learning_rate": 0.0004984634680121325, |
|
"loss": 1.0805, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.4501742997956485, |
|
"grad_norm": 0.5611728429794312, |
|
"learning_rate": 0.0004984581851019966, |
|
"loss": 1.282, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.45077533357374683, |
|
"grad_norm": 0.6137074828147888, |
|
"learning_rate": 0.0004984528931536705, |
|
"loss": 1.2445, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.45137636735184516, |
|
"grad_norm": 0.4265820384025574, |
|
"learning_rate": 0.0004984475921673466, |
|
"loss": 0.9336, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.4519774011299435, |
|
"grad_norm": 0.6829708218574524, |
|
"learning_rate": 0.0004984422821432178, |
|
"loss": 1.4375, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.45257843490804184, |
|
"grad_norm": 0.41247573494911194, |
|
"learning_rate": 0.0004984369630814773, |
|
"loss": 1.3852, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.4531794686861402, |
|
"grad_norm": 0.5414575338363647, |
|
"learning_rate": 0.0004984316349823186, |
|
"loss": 1.0758, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.4537805024642385, |
|
"grad_norm": 0.5673078298568726, |
|
"learning_rate": 0.0004984262978459355, |
|
"loss": 0.9523, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.4543815362423368, |
|
"grad_norm": 0.5617050528526306, |
|
"learning_rate": 0.0004984209516725221, |
|
"loss": 1.0805, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.45498257002043513, |
|
"grad_norm": 0.5333116054534912, |
|
"learning_rate": 0.0004984155964622729, |
|
"loss": 1.0984, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.45558360379853347, |
|
"grad_norm": 0.6070541739463806, |
|
"learning_rate": 0.0004984102322153827, |
|
"loss": 1.2125, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.4561846375766318, |
|
"grad_norm": 0.6610611081123352, |
|
"learning_rate": 0.0004984048589320467, |
|
"loss": 1.1281, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.45678567135473014, |
|
"grad_norm": 0.6735252737998962, |
|
"learning_rate": 0.0004983994766124602, |
|
"loss": 1.0371, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.45678567135473014, |
|
"eval_loss": 2.081835985183716, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1969, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.136, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.4573867051328285, |
|
"grad_norm": 0.48256927728652954, |
|
"learning_rate": 0.0004983940852568193, |
|
"loss": 1.0188, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.4579877389109268, |
|
"grad_norm": 0.4252387583255768, |
|
"learning_rate": 0.0004983886848653197, |
|
"loss": 1.1875, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.45858877268902515, |
|
"grad_norm": 0.5931745767593384, |
|
"learning_rate": 0.0004983832754381582, |
|
"loss": 1.0766, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.45918980646712343, |
|
"grad_norm": 0.5438306927680969, |
|
"learning_rate": 0.0004983778569755315, |
|
"loss": 1.4563, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.45979084024522177, |
|
"grad_norm": 0.4355262219905853, |
|
"learning_rate": 0.0004983724294776366, |
|
"loss": 1.0938, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.4603918740233201, |
|
"grad_norm": 0.44057753682136536, |
|
"learning_rate": 0.0004983669929446711, |
|
"loss": 1.0375, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.46099290780141844, |
|
"grad_norm": 0.5248241424560547, |
|
"learning_rate": 0.0004983615473768326, |
|
"loss": 1.1828, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.4615939415795168, |
|
"grad_norm": 0.6102302074432373, |
|
"learning_rate": 0.0004983560927743193, |
|
"loss": 1.2516, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.4621949753576151, |
|
"grad_norm": 0.6916151642799377, |
|
"learning_rate": 0.0004983506291373295, |
|
"loss": 1.6047, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.46279600913571345, |
|
"grad_norm": 0.5055292844772339, |
|
"learning_rate": 0.0004983451564660622, |
|
"loss": 1.0172, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.46339704291381173, |
|
"grad_norm": 0.6418437957763672, |
|
"learning_rate": 0.0004983396747607161, |
|
"loss": 1.1805, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.46399807669191007, |
|
"grad_norm": 0.5011945962905884, |
|
"learning_rate": 0.000498334184021491, |
|
"loss": 1.0055, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.4645991104700084, |
|
"grad_norm": 0.5504122376441956, |
|
"learning_rate": 0.0004983286842485864, |
|
"loss": 0.9742, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.46520014424810674, |
|
"grad_norm": 0.3638380467891693, |
|
"learning_rate": 0.0004983231754422024, |
|
"loss": 0.9906, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.4658011780262051, |
|
"grad_norm": 0.5893705487251282, |
|
"learning_rate": 0.0004983176576025394, |
|
"loss": 1.0367, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.4664022118043034, |
|
"grad_norm": 0.5191811323165894, |
|
"learning_rate": 0.0004983121307297983, |
|
"loss": 1.4234, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.46700324558240175, |
|
"grad_norm": 0.5002449750900269, |
|
"learning_rate": 0.0004983065948241799, |
|
"loss": 1.1242, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.4676042793605001, |
|
"grad_norm": 0.5473368167877197, |
|
"learning_rate": 0.0004983010498858857, |
|
"loss": 1.1219, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.46820531313859837, |
|
"grad_norm": 0.5295068621635437, |
|
"learning_rate": 0.0004982954959151174, |
|
"loss": 1.4508, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.4688063469166967, |
|
"grad_norm": 0.8546839356422424, |
|
"learning_rate": 0.000498289932912077, |
|
"loss": 0.9664, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.46940738069479504, |
|
"grad_norm": 0.6534102559089661, |
|
"learning_rate": 0.000498284360876967, |
|
"loss": 1.4344, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.4700084144728934, |
|
"grad_norm": 0.4570360779762268, |
|
"learning_rate": 0.0004982787798099898, |
|
"loss": 0.9531, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.4706094482509917, |
|
"grad_norm": 0.5392407178878784, |
|
"learning_rate": 0.0004982731897113488, |
|
"loss": 1.243, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.47121048202909005, |
|
"grad_norm": 0.7176635265350342, |
|
"learning_rate": 0.0004982675905812469, |
|
"loss": 0.968, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.4718115158071884, |
|
"grad_norm": 0.5123677253723145, |
|
"learning_rate": 0.0004982619824198882, |
|
"loss": 1.4969, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.47241254958528667, |
|
"grad_norm": 0.5915489196777344, |
|
"learning_rate": 0.0004982563652274766, |
|
"loss": 1.3805, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.473013583363385, |
|
"grad_norm": 0.5139390230178833, |
|
"learning_rate": 0.0004982507390042163, |
|
"loss": 1.057, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.47361461714148334, |
|
"grad_norm": 0.6480752229690552, |
|
"learning_rate": 0.0004982451037503121, |
|
"loss": 1.0063, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.4742156509195817, |
|
"grad_norm": 0.6889259219169617, |
|
"learning_rate": 0.0004982394594659689, |
|
"loss": 1.4828, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.47481668469768, |
|
"grad_norm": 0.5907976627349854, |
|
"learning_rate": 0.0004982338061513921, |
|
"loss": 1.2148, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.47541771847577835, |
|
"grad_norm": 0.6065306067466736, |
|
"learning_rate": 0.0004982281438067874, |
|
"loss": 1.3336, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.4760187522538767, |
|
"grad_norm": 0.5058939456939697, |
|
"learning_rate": 0.0004982224724323606, |
|
"loss": 1.2148, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.476619786031975, |
|
"grad_norm": 0.633465588092804, |
|
"learning_rate": 0.0004982167920283181, |
|
"loss": 1.1977, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.4772208198100733, |
|
"grad_norm": 0.48402997851371765, |
|
"learning_rate": 0.0004982111025948666, |
|
"loss": 1.0758, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.47782185358817164, |
|
"grad_norm": 0.4740035831928253, |
|
"learning_rate": 0.000498205404132213, |
|
"loss": 0.9914, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.47842288736627, |
|
"grad_norm": 0.6403579711914062, |
|
"learning_rate": 0.0004981996966405646, |
|
"loss": 0.9035, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.4790239211443683, |
|
"grad_norm": 0.6459212303161621, |
|
"learning_rate": 0.000498193980120129, |
|
"loss": 1.1273, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.47962495492246665, |
|
"grad_norm": 0.5674965977668762, |
|
"learning_rate": 0.0004981882545711142, |
|
"loss": 1.3828, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.480225988700565, |
|
"grad_norm": 0.6159428954124451, |
|
"learning_rate": 0.0004981825199937285, |
|
"loss": 1.2344, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 0.4808270224786633, |
|
"grad_norm": 0.6476476788520813, |
|
"learning_rate": 0.0004981767763881803, |
|
"loss": 1.4625, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4808270224786633, |
|
"eval_loss": 2.0904297828674316, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1956, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.137, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4814280562567616, |
|
"grad_norm": 0.7665592432022095, |
|
"learning_rate": 0.0004981710237546789, |
|
"loss": 1.2703, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.48202909003485994, |
|
"grad_norm": 0.5057299733161926, |
|
"learning_rate": 0.0004981652620934333, |
|
"loss": 1.3039, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.4826301238129583, |
|
"grad_norm": 0.46735164523124695, |
|
"learning_rate": 0.0004981594914046532, |
|
"loss": 1.0281, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 0.4832311575910566, |
|
"grad_norm": 0.6298277378082275, |
|
"learning_rate": 0.0004981537116885484, |
|
"loss": 1.0656, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.48383219136915495, |
|
"grad_norm": 0.6472790837287903, |
|
"learning_rate": 0.0004981479229453292, |
|
"loss": 1.0418, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.4844332251472533, |
|
"grad_norm": 0.5558010339736938, |
|
"learning_rate": 0.0004981421251752063, |
|
"loss": 1.0992, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.4850342589253516, |
|
"grad_norm": 0.46242016553878784, |
|
"learning_rate": 0.0004981363183783903, |
|
"loss": 1.002, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.48563529270344996, |
|
"grad_norm": 0.48881420493125916, |
|
"learning_rate": 0.0004981305025550929, |
|
"loss": 1.1867, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.48623632648154824, |
|
"grad_norm": 0.4354248642921448, |
|
"learning_rate": 0.0004981246777055252, |
|
"loss": 1.4141, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 0.4868373602596466, |
|
"grad_norm": 0.5004878044128418, |
|
"learning_rate": 0.0004981188438298995, |
|
"loss": 0.9684, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.4874383940377449, |
|
"grad_norm": 0.5613626837730408, |
|
"learning_rate": 0.0004981130009284277, |
|
"loss": 1.1555, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 0.48803942781584325, |
|
"grad_norm": 0.4278530776500702, |
|
"learning_rate": 0.0004981071490013225, |
|
"loss": 1.1289, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.4886404615939416, |
|
"grad_norm": 0.6259580850601196, |
|
"learning_rate": 0.0004981012880487968, |
|
"loss": 0.9828, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.4892414953720399, |
|
"grad_norm": 0.5686140656471252, |
|
"learning_rate": 0.0004980954180710636, |
|
"loss": 1.1805, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.48984252915013826, |
|
"grad_norm": 1.0444506406784058, |
|
"learning_rate": 0.0004980895390683367, |
|
"loss": 0.9242, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.49044356292823654, |
|
"grad_norm": 0.49180054664611816, |
|
"learning_rate": 0.0004980836510408297, |
|
"loss": 0.9906, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.4910445967063349, |
|
"grad_norm": 0.39391833543777466, |
|
"learning_rate": 0.000498077753988757, |
|
"loss": 0.9578, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 0.4916456304844332, |
|
"grad_norm": 0.6872186660766602, |
|
"learning_rate": 0.0004980718479123332, |
|
"loss": 1.2711, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.49224666426253155, |
|
"grad_norm": 0.4728960692882538, |
|
"learning_rate": 0.0004980659328117728, |
|
"loss": 1.0828, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.4928476980406299, |
|
"grad_norm": 0.4187394380569458, |
|
"learning_rate": 0.0004980600086872913, |
|
"loss": 1.0727, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.4934487318187282, |
|
"grad_norm": 0.5191169381141663, |
|
"learning_rate": 0.000498054075539104, |
|
"loss": 0.9246, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 0.49404976559682656, |
|
"grad_norm": 0.5990204811096191, |
|
"learning_rate": 0.0004980481333674269, |
|
"loss": 1.2625, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.49465079937492484, |
|
"grad_norm": 0.5898841619491577, |
|
"learning_rate": 0.0004980421821724759, |
|
"loss": 0.9133, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 0.4952518331530232, |
|
"grad_norm": 0.576693594455719, |
|
"learning_rate": 0.0004980362219544677, |
|
"loss": 1.2969, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.4958528669311215, |
|
"grad_norm": 0.5620527267456055, |
|
"learning_rate": 0.000498030252713619, |
|
"loss": 0.9883, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.49645390070921985, |
|
"grad_norm": 0.46309694647789, |
|
"learning_rate": 0.0004980242744501472, |
|
"loss": 1.3211, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.4970549344873182, |
|
"grad_norm": 0.6207218170166016, |
|
"learning_rate": 0.0004980182871642694, |
|
"loss": 1.2563, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 0.4976559682654165, |
|
"grad_norm": 0.5662885904312134, |
|
"learning_rate": 0.0004980122908562036, |
|
"loss": 1.3328, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.49825700204351486, |
|
"grad_norm": 0.6134732961654663, |
|
"learning_rate": 0.000498006285526168, |
|
"loss": 1.132, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 0.4988580358216132, |
|
"grad_norm": 0.7582330107688904, |
|
"learning_rate": 0.0004980002711743809, |
|
"loss": 0.8836, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.4994590695997115, |
|
"grad_norm": 0.6320111155509949, |
|
"learning_rate": 0.0004979942478010612, |
|
"loss": 1.1727, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.5000601033778098, |
|
"grad_norm": 0.5162654519081116, |
|
"learning_rate": 0.0004979882154064279, |
|
"loss": 1.2016, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.5006611371559082, |
|
"grad_norm": 0.25934234261512756, |
|
"learning_rate": 0.0004979821739907005, |
|
"loss": 1.0988, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.5012621709340065, |
|
"grad_norm": 0.6046193838119507, |
|
"learning_rate": 0.0004979761235540988, |
|
"loss": 1.3953, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.5018632047121048, |
|
"grad_norm": 0.5944646596908569, |
|
"learning_rate": 0.0004979700640968429, |
|
"loss": 1.1305, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.5024642384902032, |
|
"grad_norm": 0.5635605454444885, |
|
"learning_rate": 0.0004979639956191531, |
|
"loss": 1.2656, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.5030652722683014, |
|
"grad_norm": 0.5029686689376831, |
|
"learning_rate": 0.0004979579181212504, |
|
"loss": 1.2305, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.5036663060463998, |
|
"grad_norm": 0.8044158816337585, |
|
"learning_rate": 0.0004979518316033556, |
|
"loss": 1.225, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.5042673398244981, |
|
"grad_norm": 0.6193830966949463, |
|
"learning_rate": 0.0004979457360656902, |
|
"loss": 0.9453, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 0.5048683736025965, |
|
"grad_norm": 0.5999032258987427, |
|
"learning_rate": 0.0004979396315084761, |
|
"loss": 1.2148, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.5048683736025965, |
|
"eval_loss": 2.069140672683716, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1981, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.136, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.5054694073806948, |
|
"grad_norm": 0.6504601836204529, |
|
"learning_rate": 0.0004979335179319352, |
|
"loss": 1.082, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 0.5060704411587931, |
|
"grad_norm": 0.5235728025436401, |
|
"learning_rate": 0.00049792739533629, |
|
"loss": 0.9156, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.5066714749368915, |
|
"grad_norm": 0.46760815382003784, |
|
"learning_rate": 0.0004979212637217631, |
|
"loss": 1.007, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.5072725087149897, |
|
"grad_norm": 0.5325601696968079, |
|
"learning_rate": 0.0004979151230885776, |
|
"loss": 1.1094, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.5078735424930881, |
|
"grad_norm": 0.4098140299320221, |
|
"learning_rate": 0.0004979089734369568, |
|
"loss": 0.8449, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.5084745762711864, |
|
"grad_norm": 0.6042998433113098, |
|
"learning_rate": 0.0004979028147671246, |
|
"loss": 1.1672, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.5090756100492848, |
|
"grad_norm": 0.41027069091796875, |
|
"learning_rate": 0.0004978966470793049, |
|
"loss": 1.1992, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.5096766438273831, |
|
"grad_norm": 0.5344287753105164, |
|
"learning_rate": 0.0004978904703737221, |
|
"loss": 0.9934, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.5102776776054815, |
|
"grad_norm": 0.5234737992286682, |
|
"learning_rate": 0.000497884284650601, |
|
"loss": 1.0617, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.5108787113835798, |
|
"grad_norm": 0.4804365336894989, |
|
"learning_rate": 0.0004978780899101663, |
|
"loss": 1.0098, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.511479745161678, |
|
"grad_norm": 0.6976252198219299, |
|
"learning_rate": 0.0004978718861526438, |
|
"loss": 1.2852, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.5120807789397764, |
|
"grad_norm": 0.4136990010738373, |
|
"learning_rate": 0.0004978656733782588, |
|
"loss": 1.1062, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.5126818127178747, |
|
"grad_norm": 0.5344454646110535, |
|
"learning_rate": 0.0004978594515872373, |
|
"loss": 1.2984, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 0.5132828464959731, |
|
"grad_norm": 0.42136499285697937, |
|
"learning_rate": 0.0004978532207798059, |
|
"loss": 1.0766, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.5138838802740714, |
|
"grad_norm": 0.4809805750846863, |
|
"learning_rate": 0.0004978469809561911, |
|
"loss": 1.1469, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.5144849140521698, |
|
"grad_norm": 0.5629724264144897, |
|
"learning_rate": 0.0004978407321166199, |
|
"loss": 1.218, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.5150859478302681, |
|
"grad_norm": 0.34920796751976013, |
|
"learning_rate": 0.0004978344742613195, |
|
"loss": 0.8383, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 0.5156869816083663, |
|
"grad_norm": 0.5682929158210754, |
|
"learning_rate": 0.0004978282073905178, |
|
"loss": 1.198, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.5162880153864647, |
|
"grad_norm": 0.49201318621635437, |
|
"learning_rate": 0.0004978219315044426, |
|
"loss": 1.2352, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 0.516889049164563, |
|
"grad_norm": 0.721458911895752, |
|
"learning_rate": 0.0004978156466033222, |
|
"loss": 0.9203, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.5174900829426614, |
|
"grad_norm": 0.5688953995704651, |
|
"learning_rate": 0.0004978093526873853, |
|
"loss": 1.1977, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.5180911167207597, |
|
"grad_norm": 0.6028774976730347, |
|
"learning_rate": 0.0004978030497568607, |
|
"loss": 1.475, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.5186921504988581, |
|
"grad_norm": 0.6064693331718445, |
|
"learning_rate": 0.000497796737811978, |
|
"loss": 1.1285, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 0.5192931842769564, |
|
"grad_norm": 0.47966837882995605, |
|
"learning_rate": 0.0004977904168529664, |
|
"loss": 1.2102, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.5198942180550546, |
|
"grad_norm": 0.7327610850334167, |
|
"learning_rate": 0.0004977840868800561, |
|
"loss": 1.1367, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.520495251833153, |
|
"grad_norm": 0.5901091694831848, |
|
"learning_rate": 0.0004977777478934774, |
|
"loss": 1.3352, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.5210962856112513, |
|
"grad_norm": 0.5034139752388, |
|
"learning_rate": 0.0004977713998934607, |
|
"loss": 0.9094, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.5216973193893497, |
|
"grad_norm": 0.43822070956230164, |
|
"learning_rate": 0.0004977650428802371, |
|
"loss": 1.3539, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.522298353167448, |
|
"grad_norm": 0.5016989707946777, |
|
"learning_rate": 0.0004977586768540377, |
|
"loss": 1.2398, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 0.5228993869455464, |
|
"grad_norm": 0.4488658607006073, |
|
"learning_rate": 0.0004977523018150941, |
|
"loss": 1.0188, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.5235004207236447, |
|
"grad_norm": 0.5360990762710571, |
|
"learning_rate": 0.0004977459177636384, |
|
"loss": 1.1133, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 0.524101454501743, |
|
"grad_norm": 0.41790762543678284, |
|
"learning_rate": 0.0004977395246999026, |
|
"loss": 1.0016, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.5247024882798413, |
|
"grad_norm": 0.8560382723808289, |
|
"learning_rate": 0.0004977331226241194, |
|
"loss": 1.1953, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.5253035220579396, |
|
"grad_norm": 0.5320670008659363, |
|
"learning_rate": 0.0004977267115365216, |
|
"loss": 1.0711, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.525904555836038, |
|
"grad_norm": 0.5246152877807617, |
|
"learning_rate": 0.0004977202914373426, |
|
"loss": 1.2891, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.5265055896141363, |
|
"grad_norm": 0.6537752747535706, |
|
"learning_rate": 0.0004977138623268156, |
|
"loss": 1.2719, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.5271066233922347, |
|
"grad_norm": 0.5286105275154114, |
|
"learning_rate": 0.0004977074242051748, |
|
"loss": 1.1391, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 0.527707657170333, |
|
"grad_norm": 0.5731997489929199, |
|
"learning_rate": 0.0004977009770726541, |
|
"loss": 1.0484, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.5283086909484312, |
|
"grad_norm": 0.6958596110343933, |
|
"learning_rate": 0.0004976945209294884, |
|
"loss": 0.9648, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.5289097247265296, |
|
"grad_norm": 0.8171592950820923, |
|
"learning_rate": 0.0004976880557759124, |
|
"loss": 1.5773, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5289097247265296, |
|
"eval_loss": 2.049023389816284, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1838, |
|
"eval_samples_per_second": 4.548, |
|
"eval_steps_per_second": 1.137, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5295107585046279, |
|
"grad_norm": 0.4283256530761719, |
|
"learning_rate": 0.000497681581612161, |
|
"loss": 0.9379, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 0.5301117922827263, |
|
"grad_norm": 0.6683317422866821, |
|
"learning_rate": 0.0004976750984384701, |
|
"loss": 1.3484, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.5307128260608246, |
|
"grad_norm": 0.7179862260818481, |
|
"learning_rate": 0.0004976686062550754, |
|
"loss": 0.7375, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 0.531313859838923, |
|
"grad_norm": 0.5547930002212524, |
|
"learning_rate": 0.000497662105062213, |
|
"loss": 0.932, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"grad_norm": 0.6749304533004761, |
|
"learning_rate": 0.0004976555948601194, |
|
"loss": 1.0969, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.5325159273951197, |
|
"grad_norm": 0.4598415493965149, |
|
"learning_rate": 0.0004976490756490316, |
|
"loss": 1.2563, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.5331169611732179, |
|
"grad_norm": 1.0315395593643188, |
|
"learning_rate": 0.0004976425474291866, |
|
"loss": 0.9234, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 0.5337179949513162, |
|
"grad_norm": 0.5202517509460449, |
|
"learning_rate": 0.0004976360102008219, |
|
"loss": 0.9219, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.5343190287294146, |
|
"grad_norm": 0.6117017269134521, |
|
"learning_rate": 0.0004976294639641753, |
|
"loss": 1.1805, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 0.5349200625075129, |
|
"grad_norm": 0.6244452595710754, |
|
"learning_rate": 0.000497622908719485, |
|
"loss": 1.1406, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.5355210962856113, |
|
"grad_norm": 0.6520891785621643, |
|
"learning_rate": 0.0004976163444669893, |
|
"loss": 1.2078, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 0.5361221300637096, |
|
"grad_norm": 0.6139251589775085, |
|
"learning_rate": 0.0004976097712069272, |
|
"loss": 1.0719, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.536723163841808, |
|
"grad_norm": 0.46594560146331787, |
|
"learning_rate": 0.0004976031889395376, |
|
"loss": 0.9531, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 0.5373241976199062, |
|
"grad_norm": 0.6443547606468201, |
|
"learning_rate": 0.0004975965976650601, |
|
"loss": 1.0945, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.5379252313980045, |
|
"grad_norm": 0.6175803542137146, |
|
"learning_rate": 0.0004975899973837344, |
|
"loss": 1.5594, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.5385262651761029, |
|
"grad_norm": 0.7328972220420837, |
|
"learning_rate": 0.0004975833880958006, |
|
"loss": 1.1953, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.5391272989542012, |
|
"grad_norm": 0.485140860080719, |
|
"learning_rate": 0.0004975767698014992, |
|
"loss": 1.1117, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.5397283327322996, |
|
"grad_norm": 0.5293512344360352, |
|
"learning_rate": 0.0004975701425010709, |
|
"loss": 1.025, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.5403293665103979, |
|
"grad_norm": 0.7554160952568054, |
|
"learning_rate": 0.0004975635061947568, |
|
"loss": 1.0391, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 0.5409304002884963, |
|
"grad_norm": 0.41734662652015686, |
|
"learning_rate": 0.0004975568608827982, |
|
"loss": 0.8695, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5415314340665945, |
|
"grad_norm": 0.39808404445648193, |
|
"learning_rate": 0.0004975502065654371, |
|
"loss": 1.1461, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 0.5421324678446928, |
|
"grad_norm": 0.6031619906425476, |
|
"learning_rate": 0.0004975435432429153, |
|
"loss": 1.2563, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.5427335016227912, |
|
"grad_norm": 0.7665862441062927, |
|
"learning_rate": 0.0004975368709154753, |
|
"loss": 1.2812, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.5433345354008895, |
|
"grad_norm": 0.7608307003974915, |
|
"learning_rate": 0.0004975301895833598, |
|
"loss": 1.1766, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.5439355691789879, |
|
"grad_norm": 0.6584916114807129, |
|
"learning_rate": 0.0004975234992468118, |
|
"loss": 1.0078, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.5445366029570862, |
|
"grad_norm": 0.5199896097183228, |
|
"learning_rate": 0.0004975167999060748, |
|
"loss": 1.2406, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.5451376367351846, |
|
"grad_norm": 0.5209721922874451, |
|
"learning_rate": 0.0004975100915613925, |
|
"loss": 1.0367, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 0.5457386705132828, |
|
"grad_norm": 0.6140447854995728, |
|
"learning_rate": 0.0004975033742130087, |
|
"loss": 0.9734, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.5463397042913811, |
|
"grad_norm": 0.5982779860496521, |
|
"learning_rate": 0.0004974966478611681, |
|
"loss": 1.182, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 0.5469407380694795, |
|
"grad_norm": 0.48514458537101746, |
|
"learning_rate": 0.0004974899125061151, |
|
"loss": 1.2273, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.5475417718475778, |
|
"grad_norm": 0.5001315474510193, |
|
"learning_rate": 0.0004974831681480949, |
|
"loss": 1.2789, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 0.5481428056256762, |
|
"grad_norm": 0.4541556239128113, |
|
"learning_rate": 0.0004974764147873526, |
|
"loss": 0.9297, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.5487438394037745, |
|
"grad_norm": 0.5090661644935608, |
|
"learning_rate": 0.0004974696524241342, |
|
"loss": 0.984, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 0.5493448731818729, |
|
"grad_norm": 0.49955224990844727, |
|
"learning_rate": 0.0004974628810586854, |
|
"loss": 1.0625, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.5499459069599711, |
|
"grad_norm": 0.6976388692855835, |
|
"learning_rate": 0.0004974561006912527, |
|
"loss": 1.1914, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.5505469407380695, |
|
"grad_norm": 0.4535270929336548, |
|
"learning_rate": 0.0004974493113220827, |
|
"loss": 0.7617, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.5511479745161678, |
|
"grad_norm": 0.8714334964752197, |
|
"learning_rate": 0.0004974425129514224, |
|
"loss": 0.9938, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 0.5517490082942661, |
|
"grad_norm": 0.49312853813171387, |
|
"learning_rate": 0.000497435705579519, |
|
"loss": 1.2945, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.5523500420723645, |
|
"grad_norm": 0.5688885450363159, |
|
"learning_rate": 0.0004974288892066203, |
|
"loss": 1.0527, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 0.5529510758504628, |
|
"grad_norm": 0.5999502539634705, |
|
"learning_rate": 0.0004974220638329741, |
|
"loss": 0.9891, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5529510758504628, |
|
"eval_loss": 2.064257860183716, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2145, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 1.136, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5535521096285612, |
|
"grad_norm": 0.5289875268936157, |
|
"learning_rate": 0.0004974152294588289, |
|
"loss": 1.1328, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 0.5541531434066594, |
|
"grad_norm": 0.7003397941589355, |
|
"learning_rate": 0.000497408386084433, |
|
"loss": 0.8477, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.5547541771847578, |
|
"grad_norm": 0.5111108422279358, |
|
"learning_rate": 0.0004974015337100357, |
|
"loss": 1.168, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 0.5553552109628561, |
|
"grad_norm": 0.4160408675670624, |
|
"learning_rate": 0.0004973946723358858, |
|
"loss": 1.3211, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.5559562447409544, |
|
"grad_norm": 0.5501172542572021, |
|
"learning_rate": 0.0004973878019622335, |
|
"loss": 1.3945, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.5565572785190528, |
|
"grad_norm": 0.4727821350097656, |
|
"learning_rate": 0.0004973809225893282, |
|
"loss": 1.3078, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.5571583122971511, |
|
"grad_norm": 0.5417948365211487, |
|
"learning_rate": 0.0004973740342174204, |
|
"loss": 1.1062, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 0.5577593460752495, |
|
"grad_norm": 0.39009690284729004, |
|
"learning_rate": 0.0004973671368467607, |
|
"loss": 0.7863, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.5583603798533477, |
|
"grad_norm": 0.6143473386764526, |
|
"learning_rate": 0.0004973602304776, |
|
"loss": 1.0836, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 0.5589614136314461, |
|
"grad_norm": 0.46591776609420776, |
|
"learning_rate": 0.0004973533151101893, |
|
"loss": 1.0797, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.5595624474095444, |
|
"grad_norm": 0.636924684047699, |
|
"learning_rate": 0.0004973463907447804, |
|
"loss": 0.9961, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 0.5601634811876427, |
|
"grad_norm": 0.7269019484519958, |
|
"learning_rate": 0.0004973394573816252, |
|
"loss": 1.0727, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.5607645149657411, |
|
"grad_norm": 0.5581966638565063, |
|
"learning_rate": 0.0004973325150209758, |
|
"loss": 0.832, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 0.5613655487438394, |
|
"grad_norm": 0.586113691329956, |
|
"learning_rate": 0.0004973255636630847, |
|
"loss": 1.132, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.5619665825219378, |
|
"grad_norm": 0.7046157121658325, |
|
"learning_rate": 0.0004973186033082049, |
|
"loss": 1.1008, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.562567616300036, |
|
"grad_norm": 0.6651023626327515, |
|
"learning_rate": 0.0004973116339565897, |
|
"loss": 1.1445, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.5631686500781344, |
|
"grad_norm": 0.6627675890922546, |
|
"learning_rate": 0.0004973046556084923, |
|
"loss": 1.2563, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 0.5637696838562327, |
|
"grad_norm": 0.4929293692111969, |
|
"learning_rate": 0.0004972976682641668, |
|
"loss": 1.1836, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.564370717634331, |
|
"grad_norm": 0.500619649887085, |
|
"learning_rate": 0.0004972906719238673, |
|
"loss": 1.1789, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 0.5649717514124294, |
|
"grad_norm": 0.456321120262146, |
|
"learning_rate": 0.0004972836665878483, |
|
"loss": 1.1375, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.5655727851905277, |
|
"grad_norm": 0.43033653497695923, |
|
"learning_rate": 0.0004972766522563648, |
|
"loss": 0.7367, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 0.5661738189686261, |
|
"grad_norm": 0.5689796805381775, |
|
"learning_rate": 0.0004972696289296715, |
|
"loss": 0.7828, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.5667748527467243, |
|
"grad_norm": 0.6046550273895264, |
|
"learning_rate": 0.0004972625966080244, |
|
"loss": 1.082, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 0.5673758865248227, |
|
"grad_norm": 0.6092924475669861, |
|
"learning_rate": 0.0004972555552916791, |
|
"loss": 1.0945, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.567976920302921, |
|
"grad_norm": 0.6022126078605652, |
|
"learning_rate": 0.0004972485049808918, |
|
"loss": 1.0648, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.5685779540810193, |
|
"grad_norm": 0.6475672721862793, |
|
"learning_rate": 0.0004972414456759189, |
|
"loss": 1.4375, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.5691789878591177, |
|
"grad_norm": 0.5474737882614136, |
|
"learning_rate": 0.0004972343773770172, |
|
"loss": 1.2367, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 0.569780021637216, |
|
"grad_norm": 0.5624226331710815, |
|
"learning_rate": 0.0004972273000844439, |
|
"loss": 0.9305, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.5703810554153144, |
|
"grad_norm": 0.5779100656509399, |
|
"learning_rate": 0.0004972202137984564, |
|
"loss": 0.9156, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 0.5709820891934126, |
|
"grad_norm": 0.8340283036231995, |
|
"learning_rate": 0.0004972131185193123, |
|
"loss": 1.3273, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.571583122971511, |
|
"grad_norm": 0.5648449063301086, |
|
"learning_rate": 0.0004972060142472702, |
|
"loss": 1.1727, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 0.5721841567496093, |
|
"grad_norm": 0.6421749591827393, |
|
"learning_rate": 0.0004971989009825879, |
|
"loss": 1.0184, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.5727851905277077, |
|
"grad_norm": 0.5583392381668091, |
|
"learning_rate": 0.0004971917787255247, |
|
"loss": 1.3945, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 0.573386224305806, |
|
"grad_norm": 0.6780814528465271, |
|
"learning_rate": 0.0004971846474763394, |
|
"loss": 1.2617, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.5739872580839043, |
|
"grad_norm": 0.6801932454109192, |
|
"learning_rate": 0.0004971775072352914, |
|
"loss": 1.127, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.5745882918620027, |
|
"grad_norm": 0.7682610154151917, |
|
"learning_rate": 0.0004971703580026407, |
|
"loss": 1.093, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.575189325640101, |
|
"grad_norm": 0.5586374998092651, |
|
"learning_rate": 0.000497163199778647, |
|
"loss": 1.1852, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 0.5757903594181993, |
|
"grad_norm": 0.574213445186615, |
|
"learning_rate": 0.000497156032563571, |
|
"loss": 1.257, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.5763913931962976, |
|
"grad_norm": 0.4758005738258362, |
|
"learning_rate": 0.0004971488563576732, |
|
"loss": 0.7699, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 0.576992426974396, |
|
"grad_norm": 0.7012805938720703, |
|
"learning_rate": 0.0004971416711612149, |
|
"loss": 0.9437, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.576992426974396, |
|
"eval_loss": 2.10546875, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1867, |
|
"eval_samples_per_second": 4.547, |
|
"eval_steps_per_second": 1.137, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5775934607524943, |
|
"grad_norm": 0.5702618360519409, |
|
"learning_rate": 0.0004971344769744572, |
|
"loss": 1.4156, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 0.5781944945305926, |
|
"grad_norm": 0.5781665444374084, |
|
"learning_rate": 0.000497127273797662, |
|
"loss": 1.1836, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.578795528308691, |
|
"grad_norm": 0.6150456070899963, |
|
"learning_rate": 0.0004971200616310914, |
|
"loss": 1.0434, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 0.5793965620867892, |
|
"grad_norm": 0.49010351300239563, |
|
"learning_rate": 0.0004971128404750075, |
|
"loss": 1.0312, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.5799975958648876, |
|
"grad_norm": 0.5255641341209412, |
|
"learning_rate": 0.000497105610329673, |
|
"loss": 1.1898, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.5805986296429859, |
|
"grad_norm": 0.5797913670539856, |
|
"learning_rate": 0.0004970983711953512, |
|
"loss": 0.9797, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.5811996634210843, |
|
"grad_norm": 0.3937300741672516, |
|
"learning_rate": 0.0004970911230723052, |
|
"loss": 1.1031, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 0.5818006971991826, |
|
"grad_norm": 0.5963497161865234, |
|
"learning_rate": 0.0004970838659607987, |
|
"loss": 1.0164, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.5824017309772809, |
|
"grad_norm": 0.7100831866264343, |
|
"learning_rate": 0.0004970765998610957, |
|
"loss": 0.943, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 0.5830027647553793, |
|
"grad_norm": 0.5534042119979858, |
|
"learning_rate": 0.0004970693247734606, |
|
"loss": 1.143, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.5836037985334775, |
|
"grad_norm": 0.8651660084724426, |
|
"learning_rate": 0.000497062040698158, |
|
"loss": 0.8562, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 0.5842048323115759, |
|
"grad_norm": 0.5462768077850342, |
|
"learning_rate": 0.0004970547476354528, |
|
"loss": 1.118, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.5848058660896742, |
|
"grad_norm": 0.5735900402069092, |
|
"learning_rate": 0.0004970474455856103, |
|
"loss": 1.0475, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 0.5854068998677726, |
|
"grad_norm": 0.7304772138595581, |
|
"learning_rate": 0.0004970401345488962, |
|
"loss": 0.9406, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.5860079336458709, |
|
"grad_norm": 0.8543664813041687, |
|
"learning_rate": 0.0004970328145255767, |
|
"loss": 1.1461, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.5866089674239692, |
|
"grad_norm": 0.47991877794265747, |
|
"learning_rate": 0.0004970254855159176, |
|
"loss": 1.2852, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.5872100012020676, |
|
"grad_norm": 0.5912867188453674, |
|
"learning_rate": 0.0004970181475201857, |
|
"loss": 1.0023, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 0.5878110349801658, |
|
"grad_norm": 0.598426878452301, |
|
"learning_rate": 0.0004970108005386482, |
|
"loss": 1.3953, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.5884120687582642, |
|
"grad_norm": 0.40192949771881104, |
|
"learning_rate": 0.0004970034445715719, |
|
"loss": 1.3156, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 0.5890131025363625, |
|
"grad_norm": 0.663162112236023, |
|
"learning_rate": 0.0004969960796192246, |
|
"loss": 1.0555, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.5896141363144609, |
|
"grad_norm": 0.5530306696891785, |
|
"learning_rate": 0.0004969887056818743, |
|
"loss": 1.3445, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 0.5902151700925592, |
|
"grad_norm": 0.5497463941574097, |
|
"learning_rate": 0.0004969813227597892, |
|
"loss": 1.3188, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.5908162038706576, |
|
"grad_norm": 0.40855851769447327, |
|
"learning_rate": 0.0004969739308532379, |
|
"loss": 1.1141, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 0.5914172376487559, |
|
"grad_norm": 0.6550401449203491, |
|
"learning_rate": 0.0004969665299624891, |
|
"loss": 1.0875, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.5920182714268541, |
|
"grad_norm": 0.6358505487442017, |
|
"learning_rate": 0.0004969591200878122, |
|
"loss": 1.4852, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.5926193052049525, |
|
"grad_norm": 0.45583033561706543, |
|
"learning_rate": 0.0004969517012294768, |
|
"loss": 1.1062, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.5932203389830508, |
|
"grad_norm": 0.6163007020950317, |
|
"learning_rate": 0.0004969442733877526, |
|
"loss": 1.1102, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.5938213727611492, |
|
"grad_norm": 0.7980563640594482, |
|
"learning_rate": 0.00049693683656291, |
|
"loss": 1.1547, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.5944224065392475, |
|
"grad_norm": 0.371162086725235, |
|
"learning_rate": 0.0004969293907552193, |
|
"loss": 1.2648, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 0.5950234403173459, |
|
"grad_norm": 0.4477928578853607, |
|
"learning_rate": 0.0004969219359649516, |
|
"loss": 1.193, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.5956244740954442, |
|
"grad_norm": 0.5350117683410645, |
|
"learning_rate": 0.0004969144721923779, |
|
"loss": 0.7547, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 0.5962255078735424, |
|
"grad_norm": 0.7265620231628418, |
|
"learning_rate": 0.0004969069994377697, |
|
"loss": 1.1359, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.5968265416516408, |
|
"grad_norm": 0.4857397973537445, |
|
"learning_rate": 0.0004968995177013991, |
|
"loss": 1.2906, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 0.5974275754297391, |
|
"grad_norm": 0.43424108624458313, |
|
"learning_rate": 0.000496892026983538, |
|
"loss": 1.1633, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.5980286092078375, |
|
"grad_norm": 0.5249903202056885, |
|
"learning_rate": 0.0004968845272844589, |
|
"loss": 0.8766, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.5986296429859358, |
|
"grad_norm": 0.591312825679779, |
|
"learning_rate": 0.0004968770186044347, |
|
"loss": 1.107, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.5992306767640342, |
|
"grad_norm": 0.5988960266113281, |
|
"learning_rate": 0.0004968695009437385, |
|
"loss": 1.0281, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 0.5998317105421325, |
|
"grad_norm": 0.5821179151535034, |
|
"learning_rate": 0.0004968619743026439, |
|
"loss": 1.0852, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.6004327443202307, |
|
"grad_norm": 0.5867944359779358, |
|
"learning_rate": 0.0004968544386814245, |
|
"loss": 1.1008, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 0.6010337780983291, |
|
"grad_norm": 0.461322158575058, |
|
"learning_rate": 0.0004968468940803546, |
|
"loss": 0.9305, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6010337780983291, |
|
"eval_loss": 2.072265625, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2054, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 1.136, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6016348118764274, |
|
"grad_norm": 0.47805655002593994, |
|
"learning_rate": 0.0004968393404997085, |
|
"loss": 0.8844, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 0.6022358456545258, |
|
"grad_norm": 0.6164557933807373, |
|
"learning_rate": 0.0004968317779397611, |
|
"loss": 1.0875, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.6028368794326241, |
|
"grad_norm": 0.6458466053009033, |
|
"learning_rate": 0.0004968242064007875, |
|
"loss": 0.8664, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 0.6034379132107225, |
|
"grad_norm": 0.5833812355995178, |
|
"learning_rate": 0.000496816625883063, |
|
"loss": 0.7797, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.6040389469888208, |
|
"grad_norm": 0.6610152721405029, |
|
"learning_rate": 0.0004968090363868634, |
|
"loss": 0.8711, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.604639980766919, |
|
"grad_norm": 0.5444918274879456, |
|
"learning_rate": 0.0004968014379124649, |
|
"loss": 1.1195, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.6052410145450174, |
|
"grad_norm": 0.7095007300376892, |
|
"learning_rate": 0.0004967938304601438, |
|
"loss": 1.2203, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 0.6058420483231157, |
|
"grad_norm": 0.5106276869773865, |
|
"learning_rate": 0.000496786214030177, |
|
"loss": 0.9242, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.6064430821012141, |
|
"grad_norm": 0.7541503310203552, |
|
"learning_rate": 0.0004967785886228414, |
|
"loss": 1.1953, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 0.6070441158793124, |
|
"grad_norm": 0.6698989868164062, |
|
"learning_rate": 0.0004967709542384142, |
|
"loss": 1.0977, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.6076451496574108, |
|
"grad_norm": 0.65958172082901, |
|
"learning_rate": 0.0004967633108771735, |
|
"loss": 1.1953, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 0.6082461834355091, |
|
"grad_norm": 0.4623226225376129, |
|
"learning_rate": 0.0004967556585393972, |
|
"loss": 1.0875, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.6088472172136074, |
|
"grad_norm": 0.5534173846244812, |
|
"learning_rate": 0.0004967479972253637, |
|
"loss": 0.8086, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 0.6094482509917057, |
|
"grad_norm": 0.5159333944320679, |
|
"learning_rate": 0.0004967403269353516, |
|
"loss": 1.0781, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.610049284769804, |
|
"grad_norm": 0.9555249810218811, |
|
"learning_rate": 0.00049673264766964, |
|
"loss": 1.3039, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.6106503185479024, |
|
"grad_norm": 0.5126526951789856, |
|
"learning_rate": 0.0004967249594285081, |
|
"loss": 1.3594, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.6112513523260007, |
|
"grad_norm": 0.4960355758666992, |
|
"learning_rate": 0.0004967172622122358, |
|
"loss": 1.0219, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 0.6118523861040991, |
|
"grad_norm": 0.6622259020805359, |
|
"learning_rate": 0.000496709556021103, |
|
"loss": 0.9633, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.6124534198821974, |
|
"grad_norm": 0.5355902910232544, |
|
"learning_rate": 0.0004967018408553901, |
|
"loss": 1.2641, |
|
"step": 5095 |
|
}, |
|
{ |
|
"epoch": 0.6130544536602958, |
|
"grad_norm": 0.5535646080970764, |
|
"learning_rate": 0.0004966941167153776, |
|
"loss": 1.125, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.613655487438394, |
|
"grad_norm": 0.40969082713127136, |
|
"learning_rate": 0.0004966863836013465, |
|
"loss": 1.5328, |
|
"step": 5105 |
|
}, |
|
{ |
|
"epoch": 0.6142565212164923, |
|
"grad_norm": 0.7018454074859619, |
|
"learning_rate": 0.0004966786415135783, |
|
"loss": 1.009, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.6148575549945907, |
|
"grad_norm": 0.7394018769264221, |
|
"learning_rate": 0.0004966708904523546, |
|
"loss": 1.4625, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 0.615458588772689, |
|
"grad_norm": 0.43013033270835876, |
|
"learning_rate": 0.0004966631304179571, |
|
"loss": 0.7812, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.6160596225507874, |
|
"grad_norm": 0.6355203986167908, |
|
"learning_rate": 0.0004966553614106684, |
|
"loss": 0.8352, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 0.6166606563288857, |
|
"grad_norm": 0.4331319332122803, |
|
"learning_rate": 0.0004966475834307708, |
|
"loss": 1.0063, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.6172616901069841, |
|
"grad_norm": 0.47516930103302, |
|
"learning_rate": 0.0004966397964785475, |
|
"loss": 1.1465, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 0.6178627238850823, |
|
"grad_norm": 0.46409985423088074, |
|
"learning_rate": 0.0004966320005542817, |
|
"loss": 1.3172, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.6184637576631806, |
|
"grad_norm": 0.5062604546546936, |
|
"learning_rate": 0.000496624195658257, |
|
"loss": 1.0352, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 0.619064791441279, |
|
"grad_norm": 0.5797644257545471, |
|
"learning_rate": 0.0004966163817907573, |
|
"loss": 1.043, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.6196658252193773, |
|
"grad_norm": 0.6194161176681519, |
|
"learning_rate": 0.0004966085589520668, |
|
"loss": 0.9633, |
|
"step": 5155 |
|
}, |
|
{ |
|
"epoch": 0.6202668589974757, |
|
"grad_norm": 0.570642352104187, |
|
"learning_rate": 0.0004966007271424701, |
|
"loss": 1.6047, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.620867892775574, |
|
"grad_norm": 0.6968045830726624, |
|
"learning_rate": 0.0004965928863622522, |
|
"loss": 1.2914, |
|
"step": 5165 |
|
}, |
|
{ |
|
"epoch": 0.6214689265536724, |
|
"grad_norm": 0.4688586890697479, |
|
"learning_rate": 0.0004965850366116982, |
|
"loss": 1.4031, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.6220699603317706, |
|
"grad_norm": 0.6257545351982117, |
|
"learning_rate": 0.0004965771778910936, |
|
"loss": 1.1641, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.6226709941098689, |
|
"grad_norm": 0.5326806902885437, |
|
"learning_rate": 0.0004965693102007245, |
|
"loss": 0.973, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.6232720278879673, |
|
"grad_norm": 0.5571873188018799, |
|
"learning_rate": 0.0004965614335408769, |
|
"loss": 0.9023, |
|
"step": 5185 |
|
}, |
|
{ |
|
"epoch": 0.6238730616660656, |
|
"grad_norm": 0.67780601978302, |
|
"learning_rate": 0.0004965535479118374, |
|
"loss": 1.1531, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.624474095444164, |
|
"grad_norm": 0.5032584071159363, |
|
"learning_rate": 0.0004965456533138928, |
|
"loss": 1.3109, |
|
"step": 5195 |
|
}, |
|
{ |
|
"epoch": 0.6250751292222623, |
|
"grad_norm": 0.5984988808631897, |
|
"learning_rate": 0.0004965377497473304, |
|
"loss": 1.3664, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.6250751292222623, |
|
"eval_loss": 1.9845702648162842, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2079, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 1.136, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.6256761630003607, |
|
"grad_norm": 0.6601026654243469, |
|
"learning_rate": 0.0004965298372124376, |
|
"loss": 1.0969, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 0.6262771967784589, |
|
"grad_norm": 0.6069896817207336, |
|
"learning_rate": 0.0004965219157095024, |
|
"loss": 0.7707, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.6268782305565572, |
|
"grad_norm": 0.5757645964622498, |
|
"learning_rate": 0.0004965139852388127, |
|
"loss": 0.9836, |
|
"step": 5215 |
|
}, |
|
{ |
|
"epoch": 0.6274792643346556, |
|
"grad_norm": 0.658743143081665, |
|
"learning_rate": 0.0004965060458006573, |
|
"loss": 0.7297, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.6280802981127539, |
|
"grad_norm": 0.6893699765205383, |
|
"learning_rate": 0.0004964980973953247, |
|
"loss": 1.0586, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 0.6286813318908523, |
|
"grad_norm": 0.6678071022033691, |
|
"learning_rate": 0.0004964901400231043, |
|
"loss": 0.9922, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.6292823656689506, |
|
"grad_norm": 0.8199008703231812, |
|
"learning_rate": 0.0004964821736842854, |
|
"loss": 1.0664, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 0.629883399447049, |
|
"grad_norm": 0.516659140586853, |
|
"learning_rate": 0.0004964741983791578, |
|
"loss": 1.0359, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.6304844332251472, |
|
"grad_norm": 0.5256388187408447, |
|
"learning_rate": 0.0004964662141080117, |
|
"loss": 0.8906, |
|
"step": 5245 |
|
}, |
|
{ |
|
"epoch": 0.6310854670032456, |
|
"grad_norm": 0.6450999975204468, |
|
"learning_rate": 0.0004964582208711375, |
|
"loss": 0.8363, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.6316865007813439, |
|
"grad_norm": 0.7471349239349365, |
|
"learning_rate": 0.000496450218668826, |
|
"loss": 1.198, |
|
"step": 5255 |
|
}, |
|
{ |
|
"epoch": 0.6322875345594422, |
|
"grad_norm": 0.5109617114067078, |
|
"learning_rate": 0.0004964422075013682, |
|
"loss": 0.9227, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.6328885683375406, |
|
"grad_norm": 0.48837974667549133, |
|
"learning_rate": 0.0004964341873690557, |
|
"loss": 0.9469, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 0.6334896021156389, |
|
"grad_norm": 0.7013422250747681, |
|
"learning_rate": 0.0004964261582721801, |
|
"loss": 1.1594, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.6340906358937373, |
|
"grad_norm": 0.3042369484901428, |
|
"learning_rate": 0.0004964181202110335, |
|
"loss": 1.3625, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 0.6346916696718355, |
|
"grad_norm": 0.8467724919319153, |
|
"learning_rate": 0.0004964100731859084, |
|
"loss": 0.9328, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.6352927034499339, |
|
"grad_norm": 0.7622195482254028, |
|
"learning_rate": 0.0004964020171970974, |
|
"loss": 1.0672, |
|
"step": 5285 |
|
}, |
|
{ |
|
"epoch": 0.6358937372280322, |
|
"grad_norm": 0.5378862023353577, |
|
"learning_rate": 0.0004963939522448936, |
|
"loss": 1.0277, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.6364947710061305, |
|
"grad_norm": 0.5819463133811951, |
|
"learning_rate": 0.0004963858783295905, |
|
"loss": 1.1039, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 0.6370958047842289, |
|
"grad_norm": 0.5175995826721191, |
|
"learning_rate": 0.0004963777954514816, |
|
"loss": 1.3875, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.6376968385623272, |
|
"grad_norm": 0.6846615076065063, |
|
"learning_rate": 0.000496369703610861, |
|
"loss": 0.8723, |
|
"step": 5305 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 0.6325473189353943, |
|
"learning_rate": 0.000496361602808023, |
|
"loss": 1.2469, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.6388989061185238, |
|
"grad_norm": 0.49080929160118103, |
|
"learning_rate": 0.0004963534930432625, |
|
"loss": 1.2977, |
|
"step": 5315 |
|
}, |
|
{ |
|
"epoch": 0.6394999398966222, |
|
"grad_norm": 0.4832301437854767, |
|
"learning_rate": 0.0004963453743168743, |
|
"loss": 1.1719, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.6401009736747205, |
|
"grad_norm": 0.7145053744316101, |
|
"learning_rate": 0.000496337246629154, |
|
"loss": 1.2859, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 0.6407020074528188, |
|
"grad_norm": 0.5795300602912903, |
|
"learning_rate": 0.0004963291099803969, |
|
"loss": 0.9617, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.6413030412309172, |
|
"grad_norm": 0.9390627145767212, |
|
"learning_rate": 0.0004963209643708991, |
|
"loss": 1.1703, |
|
"step": 5335 |
|
}, |
|
{ |
|
"epoch": 0.6419040750090155, |
|
"grad_norm": 0.7437942624092102, |
|
"learning_rate": 0.000496312809800957, |
|
"loss": 1.0711, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.6425051087871139, |
|
"grad_norm": 0.5893829464912415, |
|
"learning_rate": 0.0004963046462708673, |
|
"loss": 1.2203, |
|
"step": 5345 |
|
}, |
|
{ |
|
"epoch": 0.6431061425652121, |
|
"grad_norm": 0.430411696434021, |
|
"learning_rate": 0.0004962964737809268, |
|
"loss": 1.068, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.6437071763433105, |
|
"grad_norm": 0.4798862040042877, |
|
"learning_rate": 0.0004962882923314329, |
|
"loss": 1.0195, |
|
"step": 5355 |
|
}, |
|
{ |
|
"epoch": 0.6443082101214088, |
|
"grad_norm": 0.7981201410293579, |
|
"learning_rate": 0.0004962801019226833, |
|
"loss": 0.9211, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.6449092438995071, |
|
"grad_norm": 0.4801620841026306, |
|
"learning_rate": 0.0004962719025549757, |
|
"loss": 1.2961, |
|
"step": 5365 |
|
}, |
|
{ |
|
"epoch": 0.6455102776776055, |
|
"grad_norm": 0.4748445451259613, |
|
"learning_rate": 0.0004962636942286086, |
|
"loss": 1.207, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.6461113114557038, |
|
"grad_norm": 0.5984495282173157, |
|
"learning_rate": 0.0004962554769438802, |
|
"loss": 0.9734, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 0.6467123452338022, |
|
"grad_norm": 0.6702393889427185, |
|
"learning_rate": 0.0004962472507010901, |
|
"loss": 1.0977, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.6473133790119004, |
|
"grad_norm": 0.5429012775421143, |
|
"learning_rate": 0.0004962390155005369, |
|
"loss": 1.0641, |
|
"step": 5385 |
|
}, |
|
{ |
|
"epoch": 0.6479144127899988, |
|
"grad_norm": 0.39091694355010986, |
|
"learning_rate": 0.0004962307713425206, |
|
"loss": 0.9906, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.6485154465680971, |
|
"grad_norm": 0.4490283131599426, |
|
"learning_rate": 0.0004962225182273409, |
|
"loss": 1.1297, |
|
"step": 5395 |
|
}, |
|
{ |
|
"epoch": 0.6491164803461954, |
|
"grad_norm": 0.6250773072242737, |
|
"learning_rate": 0.0004962142561552981, |
|
"loss": 0.8102, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6491164803461954, |
|
"eval_loss": 2.007031202316284, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.2151, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 1.136, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6497175141242938, |
|
"grad_norm": 0.4846701920032501, |
|
"learning_rate": 0.0004962059851266926, |
|
"loss": 1.4289, |
|
"step": 5405 |
|
}, |
|
{ |
|
"epoch": 0.6503185479023921, |
|
"grad_norm": 0.5710358023643494, |
|
"learning_rate": 0.0004961977051418253, |
|
"loss": 1.0961, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.6509195816804905, |
|
"grad_norm": 0.46397438645362854, |
|
"learning_rate": 0.0004961894162009977, |
|
"loss": 0.9297, |
|
"step": 5415 |
|
}, |
|
{ |
|
"epoch": 0.6515206154585887, |
|
"grad_norm": 0.5259897708892822, |
|
"learning_rate": 0.0004961811183045111, |
|
"loss": 1.0891, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.6521216492366871, |
|
"grad_norm": 0.44706469774246216, |
|
"learning_rate": 0.0004961728114526672, |
|
"loss": 1.175, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 0.6527226830147854, |
|
"grad_norm": 0.6101322174072266, |
|
"learning_rate": 0.0004961644956457685, |
|
"loss": 1.143, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.6533237167928838, |
|
"grad_norm": 0.544548511505127, |
|
"learning_rate": 0.0004961561708841173, |
|
"loss": 0.9246, |
|
"step": 5435 |
|
}, |
|
{ |
|
"epoch": 0.6539247505709821, |
|
"grad_norm": 0.7637187242507935, |
|
"learning_rate": 0.0004961478371680165, |
|
"loss": 1.3031, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.6545257843490804, |
|
"grad_norm": 0.5239999294281006, |
|
"learning_rate": 0.0004961394944977692, |
|
"loss": 1.0242, |
|
"step": 5445 |
|
}, |
|
{ |
|
"epoch": 0.6551268181271788, |
|
"grad_norm": 0.6028966307640076, |
|
"learning_rate": 0.000496131142873679, |
|
"loss": 1.2961, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.655727851905277, |
|
"grad_norm": 0.9021722674369812, |
|
"learning_rate": 0.0004961227822960495, |
|
"loss": 1.0543, |
|
"step": 5455 |
|
}, |
|
{ |
|
"epoch": 0.6563288856833754, |
|
"grad_norm": 0.8960506916046143, |
|
"learning_rate": 0.0004961144127651851, |
|
"loss": 1.4555, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.6569299194614737, |
|
"grad_norm": 0.6960546374320984, |
|
"learning_rate": 0.0004961060342813901, |
|
"loss": 0.7801, |
|
"step": 5465 |
|
}, |
|
{ |
|
"epoch": 0.6575309532395721, |
|
"grad_norm": 0.7192292213439941, |
|
"learning_rate": 0.0004960976468449692, |
|
"loss": 0.9641, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.6581319870176704, |
|
"grad_norm": 0.605060875415802, |
|
"learning_rate": 0.0004960892504562277, |
|
"loss": 1.6898, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 0.6587330207957687, |
|
"grad_norm": 0.5543851256370544, |
|
"learning_rate": 0.000496080845115471, |
|
"loss": 1.4141, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.6593340545738671, |
|
"grad_norm": 0.6614383459091187, |
|
"learning_rate": 0.0004960724308230047, |
|
"loss": 1.0016, |
|
"step": 5485 |
|
}, |
|
{ |
|
"epoch": 0.6599350883519653, |
|
"grad_norm": 0.5792709589004517, |
|
"learning_rate": 0.0004960640075791351, |
|
"loss": 1.2906, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.6605361221300637, |
|
"grad_norm": 0.509619951248169, |
|
"learning_rate": 0.0004960555753841685, |
|
"loss": 0.9914, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 0.661137155908162, |
|
"grad_norm": 0.6596083045005798, |
|
"learning_rate": 0.0004960471342384116, |
|
"loss": 0.8969, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6617381896862604, |
|
"grad_norm": 0.43772363662719727, |
|
"learning_rate": 0.0004960386841421716, |
|
"loss": 1.3281, |
|
"step": 5505 |
|
}, |
|
{ |
|
"epoch": 0.6623392234643587, |
|
"grad_norm": 0.551668643951416, |
|
"learning_rate": 0.0004960302250957558, |
|
"loss": 0.9844, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.662940257242457, |
|
"grad_norm": 0.5039249658584595, |
|
"learning_rate": 0.0004960217570994719, |
|
"loss": 1.1273, |
|
"step": 5515 |
|
}, |
|
{ |
|
"epoch": 0.6635412910205554, |
|
"grad_norm": 0.48881009221076965, |
|
"learning_rate": 0.0004960132801536281, |
|
"loss": 1.3953, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.6641423247986536, |
|
"grad_norm": 0.7355161905288696, |
|
"learning_rate": 0.0004960047942585324, |
|
"loss": 0.868, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 0.664743358576752, |
|
"grad_norm": 0.38211506605148315, |
|
"learning_rate": 0.0004959962994144939, |
|
"loss": 1.1547, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.6653443923548503, |
|
"grad_norm": 0.5978872776031494, |
|
"learning_rate": 0.0004959877956218213, |
|
"loss": 1.1145, |
|
"step": 5535 |
|
}, |
|
{ |
|
"epoch": 0.6659454261329487, |
|
"grad_norm": 0.6254252791404724, |
|
"learning_rate": 0.0004959792828808241, |
|
"loss": 1.1023, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.666546459911047, |
|
"grad_norm": 0.5093739032745361, |
|
"learning_rate": 0.0004959707611918121, |
|
"loss": 1.1742, |
|
"step": 5545 |
|
}, |
|
{ |
|
"epoch": 0.6671474936891453, |
|
"grad_norm": 0.5325815677642822, |
|
"learning_rate": 0.0004959622305550951, |
|
"loss": 1.1641, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.6677485274672437, |
|
"grad_norm": 0.5326328873634338, |
|
"learning_rate": 0.0004959536909709834, |
|
"loss": 0.8969, |
|
"step": 5555 |
|
}, |
|
{ |
|
"epoch": 0.668349561245342, |
|
"grad_norm": 0.45378610491752625, |
|
"learning_rate": 0.0004959451424397879, |
|
"loss": 1.0051, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.6689505950234403, |
|
"grad_norm": 0.6395484209060669, |
|
"learning_rate": 0.0004959365849618192, |
|
"loss": 1.0594, |
|
"step": 5565 |
|
}, |
|
{ |
|
"epoch": 0.6695516288015386, |
|
"grad_norm": 0.6629273891448975, |
|
"learning_rate": 0.0004959280185373888, |
|
"loss": 1.182, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.670152662579637, |
|
"grad_norm": 0.6001206040382385, |
|
"learning_rate": 0.0004959194431668084, |
|
"loss": 0.825, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 0.6707536963577353, |
|
"grad_norm": 0.6217541098594666, |
|
"learning_rate": 0.0004959108588503898, |
|
"loss": 1.0355, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.6713547301358337, |
|
"grad_norm": 0.5813589096069336, |
|
"learning_rate": 0.0004959022655884453, |
|
"loss": 0.9965, |
|
"step": 5585 |
|
}, |
|
{ |
|
"epoch": 0.671955763913932, |
|
"grad_norm": 0.4870590269565582, |
|
"learning_rate": 0.0004958936633812876, |
|
"loss": 1.1906, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.6725567976920303, |
|
"grad_norm": 0.46450790762901306, |
|
"learning_rate": 0.0004958850522292295, |
|
"loss": 0.9266, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 0.6731578314701286, |
|
"grad_norm": 0.7803342938423157, |
|
"learning_rate": 0.0004958764321325843, |
|
"loss": 1.1867, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6731578314701286, |
|
"eval_loss": 1.960351586341858, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1953, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.137, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6737588652482269, |
|
"grad_norm": 0.39002323150634766, |
|
"learning_rate": 0.0004958678030916655, |
|
"loss": 0.9242, |
|
"step": 5605 |
|
}, |
|
{ |
|
"epoch": 0.6743598990263253, |
|
"grad_norm": 0.6863328814506531, |
|
"learning_rate": 0.0004958591651067872, |
|
"loss": 0.9203, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.6749609328044236, |
|
"grad_norm": 0.5728829503059387, |
|
"learning_rate": 0.0004958505181782635, |
|
"loss": 1.1797, |
|
"step": 5615 |
|
}, |
|
{ |
|
"epoch": 0.675561966582522, |
|
"grad_norm": 0.5186361074447632, |
|
"learning_rate": 0.0004958418623064088, |
|
"loss": 0.7125, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.6761630003606203, |
|
"grad_norm": 0.5343161225318909, |
|
"learning_rate": 0.0004958331974915382, |
|
"loss": 1.0008, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 0.6767640341387186, |
|
"grad_norm": 0.6222845315933228, |
|
"learning_rate": 0.0004958245237339669, |
|
"loss": 1.0906, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.6773650679168169, |
|
"grad_norm": 0.5673485398292542, |
|
"learning_rate": 0.0004958158410340103, |
|
"loss": 1.0516, |
|
"step": 5635 |
|
}, |
|
{ |
|
"epoch": 0.6779661016949152, |
|
"grad_norm": 0.6129093170166016, |
|
"learning_rate": 0.0004958071493919842, |
|
"loss": 0.932, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.6785671354730136, |
|
"grad_norm": 0.43740183115005493, |
|
"learning_rate": 0.0004957984488082049, |
|
"loss": 1.4109, |
|
"step": 5645 |
|
}, |
|
{ |
|
"epoch": 0.6791681692511119, |
|
"grad_norm": 0.632262647151947, |
|
"learning_rate": 0.0004957897392829889, |
|
"loss": 1.3547, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.6797692030292103, |
|
"grad_norm": 0.5351587533950806, |
|
"learning_rate": 0.0004957810208166531, |
|
"loss": 1.1938, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 0.6803702368073086, |
|
"grad_norm": 0.6918035745620728, |
|
"learning_rate": 0.0004957722934095145, |
|
"loss": 0.9172, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.6809712705854069, |
|
"grad_norm": 0.5031920671463013, |
|
"learning_rate": 0.0004957635570618906, |
|
"loss": 0.9242, |
|
"step": 5665 |
|
}, |
|
{ |
|
"epoch": 0.6815723043635052, |
|
"grad_norm": 0.8015581965446472, |
|
"learning_rate": 0.0004957548117740993, |
|
"loss": 0.884, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.6821733381416035, |
|
"grad_norm": 0.5075511336326599, |
|
"learning_rate": 0.0004957460575464586, |
|
"loss": 1.0656, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 0.6827743719197019, |
|
"grad_norm": 0.43553194403648376, |
|
"learning_rate": 0.000495737294379287, |
|
"loss": 1.1762, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.6833754056978002, |
|
"grad_norm": 0.48835834860801697, |
|
"learning_rate": 0.0004957285222729034, |
|
"loss": 1.0547, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 0.6839764394758986, |
|
"grad_norm": 0.5027529001235962, |
|
"learning_rate": 0.0004957197412276267, |
|
"loss": 1.1742, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.6845774732539969, |
|
"grad_norm": 0.581287682056427, |
|
"learning_rate": 0.0004957109512437766, |
|
"loss": 0.9242, |
|
"step": 5695 |
|
}, |
|
{ |
|
"epoch": 0.6851785070320952, |
|
"grad_norm": 0.5454129576683044, |
|
"learning_rate": 0.0004957021523216725, |
|
"loss": 1.1492, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.6857795408101935, |
|
"grad_norm": 0.5323740839958191, |
|
"learning_rate": 0.0004956933444616347, |
|
"loss": 0.8586, |
|
"step": 5705 |
|
}, |
|
{ |
|
"epoch": 0.6863805745882918, |
|
"grad_norm": 0.8149203658103943, |
|
"learning_rate": 0.0004956845276639836, |
|
"loss": 1.15, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.6869816083663902, |
|
"grad_norm": 0.6589764952659607, |
|
"learning_rate": 0.00049567570192904, |
|
"loss": 1.132, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 0.6875826421444885, |
|
"grad_norm": 0.5598886609077454, |
|
"learning_rate": 0.0004956668672571247, |
|
"loss": 0.9125, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.6881836759225869, |
|
"grad_norm": 0.4387410581111908, |
|
"learning_rate": 0.0004956580236485593, |
|
"loss": 0.8129, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 0.6887847097006852, |
|
"grad_norm": 1.346904993057251, |
|
"learning_rate": 0.0004956491711036654, |
|
"loss": 1.3328, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.6893857434787835, |
|
"grad_norm": 0.597332775592804, |
|
"learning_rate": 0.0004956403096227651, |
|
"loss": 1.2812, |
|
"step": 5735 |
|
}, |
|
{ |
|
"epoch": 0.6899867772568818, |
|
"grad_norm": 0.4443131685256958, |
|
"learning_rate": 0.0004956314392061808, |
|
"loss": 0.9059, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.6905878110349801, |
|
"grad_norm": 0.547560453414917, |
|
"learning_rate": 0.000495622559854235, |
|
"loss": 1.1344, |
|
"step": 5745 |
|
}, |
|
{ |
|
"epoch": 0.6911888448130785, |
|
"grad_norm": 0.5721062421798706, |
|
"learning_rate": 0.0004956136715672509, |
|
"loss": 1.1805, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.6917898785911768, |
|
"grad_norm": 0.5476487278938293, |
|
"learning_rate": 0.0004956047743455517, |
|
"loss": 1.2406, |
|
"step": 5755 |
|
}, |
|
{ |
|
"epoch": 0.6923909123692752, |
|
"grad_norm": 0.5772802829742432, |
|
"learning_rate": 0.0004955958681894611, |
|
"loss": 1.1797, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.6929919461473735, |
|
"grad_norm": 0.5622345805168152, |
|
"learning_rate": 0.000495586953099303, |
|
"loss": 1.0258, |
|
"step": 5765 |
|
}, |
|
{ |
|
"epoch": 0.6935929799254719, |
|
"grad_norm": 0.3974035680294037, |
|
"learning_rate": 0.0004955780290754018, |
|
"loss": 0.9348, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.6941940137035701, |
|
"grad_norm": 0.7566075921058655, |
|
"learning_rate": 0.0004955690961180822, |
|
"loss": 1.2188, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 0.6947950474816684, |
|
"grad_norm": 0.5623620748519897, |
|
"learning_rate": 0.0004955601542276691, |
|
"loss": 1.0195, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.6953960812597668, |
|
"grad_norm": 0.5877131819725037, |
|
"learning_rate": 0.0004955512034044876, |
|
"loss": 0.7508, |
|
"step": 5785 |
|
}, |
|
{ |
|
"epoch": 0.6959971150378651, |
|
"grad_norm": 0.35780635476112366, |
|
"learning_rate": 0.0004955422436488635, |
|
"loss": 1.0418, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.6965981488159635, |
|
"grad_norm": 0.6338667869567871, |
|
"learning_rate": 0.0004955332749611227, |
|
"loss": 0.866, |
|
"step": 5795 |
|
}, |
|
{ |
|
"epoch": 0.6971991825940618, |
|
"grad_norm": 0.6346995830535889, |
|
"learning_rate": 0.0004955242973415915, |
|
"loss": 0.9914, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.6971991825940618, |
|
"eval_loss": 1.979101538658142, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1923, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 1.137, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.6978002163721602, |
|
"grad_norm": 0.6727893948554993, |
|
"learning_rate": 0.0004955153107905964, |
|
"loss": 1.1859, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 0.6984012501502584, |
|
"grad_norm": 0.4857117235660553, |
|
"learning_rate": 0.0004955063153084642, |
|
"loss": 1.5766, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.6990022839283567, |
|
"grad_norm": 0.6355534791946411, |
|
"learning_rate": 0.0004954973108955223, |
|
"loss": 1.1781, |
|
"step": 5815 |
|
}, |
|
{ |
|
"epoch": 0.6996033177064551, |
|
"grad_norm": 0.5181488990783691, |
|
"learning_rate": 0.0004954882975520983, |
|
"loss": 1.2094, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.7002043514845534, |
|
"grad_norm": 0.3698882460594177, |
|
"learning_rate": 0.0004954792752785198, |
|
"loss": 1.0766, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 0.7008053852626518, |
|
"grad_norm": 0.6260622143745422, |
|
"learning_rate": 0.0004954702440751153, |
|
"loss": 0.9172, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.7014064190407501, |
|
"grad_norm": 0.5361654162406921, |
|
"learning_rate": 0.0004954612039422132, |
|
"loss": 1.0484, |
|
"step": 5835 |
|
}, |
|
{ |
|
"epoch": 0.7020074528188485, |
|
"grad_norm": 0.4724903404712677, |
|
"learning_rate": 0.0004954521548801424, |
|
"loss": 1.225, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.7026084865969467, |
|
"grad_norm": 0.5695177912712097, |
|
"learning_rate": 0.000495443096889232, |
|
"loss": 0.9734, |
|
"step": 5845 |
|
}, |
|
{ |
|
"epoch": 0.703209520375045, |
|
"grad_norm": 0.8220880627632141, |
|
"learning_rate": 0.0004954340299698116, |
|
"loss": 0.9305, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.7038105541531434, |
|
"grad_norm": 0.46409645676612854, |
|
"learning_rate": 0.000495424954122211, |
|
"loss": 1.5328, |
|
"step": 5855 |
|
}, |
|
{ |
|
"epoch": 0.7044115879312417, |
|
"grad_norm": 0.5940005779266357, |
|
"learning_rate": 0.0004954158693467603, |
|
"loss": 0.9586, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.7050126217093401, |
|
"grad_norm": 0.6850906014442444, |
|
"learning_rate": 0.00049540677564379, |
|
"loss": 1.2406, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 0.7056136554874384, |
|
"grad_norm": 0.6025176048278809, |
|
"learning_rate": 0.0004953976730136309, |
|
"loss": 1.0621, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.7062146892655368, |
|
"grad_norm": 0.5903462767601013, |
|
"learning_rate": 0.0004953885614566142, |
|
"loss": 0.8078, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 0.706815723043635, |
|
"grad_norm": 0.8402755856513977, |
|
"learning_rate": 0.0004953794409730713, |
|
"loss": 0.8309, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.7074167568217333, |
|
"grad_norm": 0.525725245475769, |
|
"learning_rate": 0.0004953703115633339, |
|
"loss": 1.1781, |
|
"step": 5885 |
|
}, |
|
{ |
|
"epoch": 0.7080177905998317, |
|
"grad_norm": 0.5770952701568604, |
|
"learning_rate": 0.0004953611732277342, |
|
"loss": 1.3102, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.70861882437793, |
|
"grad_norm": 0.5640763640403748, |
|
"learning_rate": 0.0004953520259666046, |
|
"loss": 1.1016, |
|
"step": 5895 |
|
}, |
|
{ |
|
"epoch": 0.7092198581560284, |
|
"grad_norm": 0.6585139036178589, |
|
"learning_rate": 0.0004953428697802778, |
|
"loss": 0.9719, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.7098208919341267, |
|
"grad_norm": 0.8663554191589355, |
|
"learning_rate": 0.0004953337046690871, |
|
"loss": 0.9508, |
|
"step": 5905 |
|
}, |
|
{ |
|
"epoch": 0.7104219257122251, |
|
"grad_norm": 0.7086400985717773, |
|
"learning_rate": 0.0004953245306333656, |
|
"loss": 0.8703, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.7110229594903233, |
|
"grad_norm": 0.36062702536582947, |
|
"learning_rate": 0.0004953153476734472, |
|
"loss": 0.8773, |
|
"step": 5915 |
|
}, |
|
{ |
|
"epoch": 0.7116239932684217, |
|
"grad_norm": 0.5696423053741455, |
|
"learning_rate": 0.0004953061557896658, |
|
"loss": 1.15, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.71222502704652, |
|
"grad_norm": 0.3699018955230713, |
|
"learning_rate": 0.000495296954982356, |
|
"loss": 1.0984, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 0.7128260608246183, |
|
"grad_norm": 0.5031812787055969, |
|
"learning_rate": 0.0004952877452518523, |
|
"loss": 0.802, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.7134270946027167, |
|
"grad_norm": 0.5342919230461121, |
|
"learning_rate": 0.0004952785265984898, |
|
"loss": 1.0727, |
|
"step": 5935 |
|
}, |
|
{ |
|
"epoch": 0.714028128380815, |
|
"grad_norm": 0.7044137120246887, |
|
"learning_rate": 0.0004952692990226039, |
|
"loss": 0.7984, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.7146291621589134, |
|
"grad_norm": 0.5580726861953735, |
|
"learning_rate": 0.0004952600625245301, |
|
"loss": 1.1875, |
|
"step": 5945 |
|
}, |
|
{ |
|
"epoch": 0.7152301959370116, |
|
"grad_norm": 0.5382014513015747, |
|
"learning_rate": 0.0004952508171046046, |
|
"loss": 0.9633, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.71583122971511, |
|
"grad_norm": 0.6365616321563721, |
|
"learning_rate": 0.0004952415627631636, |
|
"loss": 1.0867, |
|
"step": 5955 |
|
}, |
|
{ |
|
"epoch": 0.7164322634932083, |
|
"grad_norm": 0.572903573513031, |
|
"learning_rate": 0.0004952322995005438, |
|
"loss": 0.9461, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.7170332972713066, |
|
"grad_norm": 0.5480600595474243, |
|
"learning_rate": 0.0004952230273170822, |
|
"loss": 1.0547, |
|
"step": 5965 |
|
}, |
|
{ |
|
"epoch": 0.717634331049405, |
|
"grad_norm": 0.6455062031745911, |
|
"learning_rate": 0.000495213746213116, |
|
"loss": 0.8516, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.7182353648275033, |
|
"grad_norm": 0.4712236225605011, |
|
"learning_rate": 0.0004952044561889829, |
|
"loss": 0.8742, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 0.7188363986056017, |
|
"grad_norm": 0.4829672873020172, |
|
"learning_rate": 0.0004951951572450207, |
|
"loss": 0.9465, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.7194374323837, |
|
"grad_norm": 0.5550187826156616, |
|
"learning_rate": 0.000495185849381568, |
|
"loss": 0.8234, |
|
"step": 5985 |
|
}, |
|
{ |
|
"epoch": 0.7200384661617983, |
|
"grad_norm": 0.4478211998939514, |
|
"learning_rate": 0.000495176532598963, |
|
"loss": 1.3477, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.7206394999398966, |
|
"grad_norm": 0.5679304599761963, |
|
"learning_rate": 0.0004951672068975448, |
|
"loss": 1.0531, |
|
"step": 5995 |
|
}, |
|
{ |
|
"epoch": 0.7212405337179949, |
|
"grad_norm": 0.4058151841163635, |
|
"learning_rate": 0.0004951578722776526, |
|
"loss": 0.9563, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7212405337179949, |
|
"eval_loss": 1.9480469226837158, |
|
"eval_model_preparation_time": 0.0053, |
|
"eval_runtime": 35.1671, |
|
"eval_samples_per_second": 4.55, |
|
"eval_steps_per_second": 1.137, |
|
"step": 6000 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 83190, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.307687484653568e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|