{
  "best_metric": 0.4268312156200409,
  "best_model_checkpoint": "gte-modernbert-philosophy-v1-1-autotr/checkpoint-11543",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 11543,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0021658147795200556,
      "grad_norm": 9.93041706085205,
      "learning_rate": 6.493506493506493e-07,
      "loss": 1.2002,
      "step": 25
    },
    {
      "epoch": 0.004331629559040111,
      "grad_norm": 10.093696594238281,
      "learning_rate": 1.2987012987012986e-06,
      "loss": 1.1623,
      "step": 50
    },
    {
      "epoch": 0.006497444338560167,
      "grad_norm": 9.860174179077148,
      "learning_rate": 1.948051948051948e-06,
      "loss": 1.2012,
      "step": 75
    },
    {
      "epoch": 0.008663259118080222,
      "grad_norm": 8.943851470947266,
      "learning_rate": 2.597402597402597e-06,
      "loss": 1.1853,
      "step": 100
    },
    {
      "epoch": 0.010829073897600277,
      "grad_norm": 5.130438327789307,
      "learning_rate": 3.246753246753247e-06,
      "loss": 0.9767,
      "step": 125
    },
    {
      "epoch": 0.012994888677120333,
      "grad_norm": 8.450475692749023,
      "learning_rate": 3.896103896103896e-06,
      "loss": 0.865,
      "step": 150
    },
    {
      "epoch": 0.015160703456640388,
      "grad_norm": 3.7100517749786377,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.7733,
      "step": 175
    },
    {
      "epoch": 0.017326518236160444,
      "grad_norm": 15.069323539733887,
      "learning_rate": 5.194805194805194e-06,
      "loss": 0.9545,
      "step": 200
    },
    {
      "epoch": 0.0194923330156805,
      "grad_norm": 9.745051383972168,
      "learning_rate": 5.8181818181818185e-06,
      "loss": 0.8309,
      "step": 225
    },
    {
      "epoch": 0.021658147795200554,
      "grad_norm": 7.115631580352783,
      "learning_rate": 6.467532467532467e-06,
      "loss": 0.7514,
      "step": 250
    },
    {
      "epoch": 0.02382396257472061,
      "grad_norm": 15.047070503234863,
      "learning_rate": 7.116883116883117e-06,
      "loss": 0.5555,
      "step": 275
    },
    {
      "epoch": 0.025989777354240667,
      "grad_norm": 14.228421211242676,
      "learning_rate": 7.766233766233767e-06,
      "loss": 0.563,
      "step": 300
    },
    {
      "epoch": 0.02815559213376072,
      "grad_norm": 8.77304458618164,
      "learning_rate": 8.415584415584416e-06,
      "loss": 0.618,
      "step": 325
    },
    {
      "epoch": 0.030321406913280776,
      "grad_norm": 4.78096866607666,
      "learning_rate": 9.064935064935066e-06,
      "loss": 0.6538,
      "step": 350
    },
    {
      "epoch": 0.032487221692800834,
      "grad_norm": 9.978753089904785,
      "learning_rate": 9.714285714285715e-06,
      "loss": 0.5802,
      "step": 375
    },
    {
      "epoch": 0.03465303647232089,
      "grad_norm": 15.071464538574219,
      "learning_rate": 1.0363636363636364e-05,
      "loss": 0.6568,
      "step": 400
    },
    {
      "epoch": 0.036818851251840944,
      "grad_norm": 12.352958679199219,
      "learning_rate": 1.1012987012987013e-05,
      "loss": 0.4934,
      "step": 425
    },
    {
      "epoch": 0.038984666031361,
      "grad_norm": 8.754806518554688,
      "learning_rate": 1.1662337662337662e-05,
      "loss": 0.597,
      "step": 450
    },
    {
      "epoch": 0.04115048081088105,
      "grad_norm": 2.432300090789795,
      "learning_rate": 1.2311688311688312e-05,
      "loss": 0.3812,
      "step": 475
    },
    {
      "epoch": 0.04331629559040111,
      "grad_norm": 7.439950466156006,
      "learning_rate": 1.2961038961038961e-05,
      "loss": 0.482,
      "step": 500
    },
    {
      "epoch": 0.04548211036992116,
      "grad_norm": 14.198251724243164,
      "learning_rate": 1.361038961038961e-05,
      "loss": 0.5347,
      "step": 525
    },
    {
      "epoch": 0.04764792514944122,
      "grad_norm": 5.91489839553833,
      "learning_rate": 1.425974025974026e-05,
      "loss": 0.5012,
      "step": 550
    },
    {
      "epoch": 0.04981373992896128,
      "grad_norm": 14.394525527954102,
      "learning_rate": 1.490909090909091e-05,
      "loss": 0.5765,
      "step": 575
    },
    {
      "epoch": 0.05197955470848133,
      "grad_norm": 16.823543548583984,
      "learning_rate": 1.555844155844156e-05,
      "loss": 0.4286,
      "step": 600
    },
    {
      "epoch": 0.05414536948800139,
      "grad_norm": 4.72399377822876,
      "learning_rate": 1.6207792207792207e-05,
      "loss": 0.5167,
      "step": 625
    },
    {
      "epoch": 0.05631118426752144,
      "grad_norm": 18.0063419342041,
      "learning_rate": 1.6857142857142858e-05,
      "loss": 0.4791,
      "step": 650
    },
    {
      "epoch": 0.0584769990470415,
      "grad_norm": 11.925456047058105,
      "learning_rate": 1.750649350649351e-05,
      "loss": 0.5022,
      "step": 675
    },
    {
      "epoch": 0.06064281382656155,
      "grad_norm": 2.7437996864318848,
      "learning_rate": 1.8155844155844156e-05,
      "loss": 0.438,
      "step": 700
    },
    {
      "epoch": 0.0628086286060816,
      "grad_norm": 1.8270901441574097,
      "learning_rate": 1.8805194805194806e-05,
      "loss": 0.3995,
      "step": 725
    },
    {
      "epoch": 0.06497444338560167,
      "grad_norm": 4.187374591827393,
      "learning_rate": 1.9454545454545453e-05,
      "loss": 0.2924,
      "step": 750
    },
    {
      "epoch": 0.06714025816512172,
      "grad_norm": 12.709814071655273,
      "learning_rate": 2.0103896103896104e-05,
      "loss": 0.4391,
      "step": 775
    },
    {
      "epoch": 0.06930607294464178,
      "grad_norm": 8.789942741394043,
      "learning_rate": 2.0753246753246755e-05,
      "loss": 0.4328,
      "step": 800
    },
    {
      "epoch": 0.07147188772416183,
      "grad_norm": 10.182008743286133,
      "learning_rate": 2.137662337662338e-05,
      "loss": 0.5658,
      "step": 825
    },
    {
      "epoch": 0.07363770250368189,
      "grad_norm": 3.5178301334381104,
      "learning_rate": 2.2025974025974026e-05,
      "loss": 0.4541,
      "step": 850
    },
    {
      "epoch": 0.07580351728320193,
      "grad_norm": 8.124090194702148,
      "learning_rate": 2.2675324675324676e-05,
      "loss": 0.5381,
      "step": 875
    },
    {
      "epoch": 0.077969332062722,
      "grad_norm": 11.69704532623291,
      "learning_rate": 2.3324675324675324e-05,
      "loss": 0.4523,
      "step": 900
    },
    {
      "epoch": 0.08013514684224206,
      "grad_norm": 19.822145462036133,
      "learning_rate": 2.3974025974025974e-05,
      "loss": 0.3522,
      "step": 925
    },
    {
      "epoch": 0.0823009616217621,
      "grad_norm": 8.31993579864502,
      "learning_rate": 2.4623376623376625e-05,
      "loss": 0.4475,
      "step": 950
    },
    {
      "epoch": 0.08446677640128217,
      "grad_norm": 5.60876989364624,
      "learning_rate": 2.5246753246753246e-05,
      "loss": 0.4448,
      "step": 975
    },
    {
      "epoch": 0.08663259118080222,
      "grad_norm": 9.872743606567383,
      "learning_rate": 2.5896103896103896e-05,
      "loss": 0.407,
      "step": 1000
    },
    {
      "epoch": 0.08879840596032228,
      "grad_norm": 7.193666458129883,
      "learning_rate": 2.6545454545454547e-05,
      "loss": 0.4616,
      "step": 1025
    },
    {
      "epoch": 0.09096422073984232,
      "grad_norm": 17.595991134643555,
      "learning_rate": 2.7194805194805194e-05,
      "loss": 0.4213,
      "step": 1050
    },
    {
      "epoch": 0.09313003551936239,
      "grad_norm": 3.281184196472168,
      "learning_rate": 2.7844155844155844e-05,
      "loss": 0.465,
      "step": 1075
    },
    {
      "epoch": 0.09529585029888243,
      "grad_norm": 7.671459197998047,
      "learning_rate": 2.849350649350649e-05,
      "loss": 0.2964,
      "step": 1100
    },
    {
      "epoch": 0.0974616650784025,
      "grad_norm": 7.963995933532715,
      "learning_rate": 2.9142857142857142e-05,
      "loss": 0.4414,
      "step": 1125
    },
    {
      "epoch": 0.09962747985792256,
      "grad_norm": 1.8723474740982056,
      "learning_rate": 2.9792207792207793e-05,
      "loss": 0.3508,
      "step": 1150
    },
    {
      "epoch": 0.1017932946374426,
      "grad_norm": 5.1907877922058105,
      "learning_rate": 2.995090489025799e-05,
      "loss": 0.3362,
      "step": 1175
    },
    {
      "epoch": 0.10395910941696267,
      "grad_norm": 5.219175815582275,
      "learning_rate": 2.9878706199460916e-05,
      "loss": 0.4953,
      "step": 1200
    },
    {
      "epoch": 0.10612492419648271,
      "grad_norm": 15.204286575317383,
      "learning_rate": 2.9806507508663843e-05,
      "loss": 0.4041,
      "step": 1225
    },
    {
      "epoch": 0.10829073897600278,
      "grad_norm": 5.872297286987305,
      "learning_rate": 2.973430881786677e-05,
      "loss": 0.3773,
      "step": 1250
    },
    {
      "epoch": 0.11045655375552282,
      "grad_norm": 7.201790809631348,
      "learning_rate": 2.9662110127069697e-05,
      "loss": 0.3574,
      "step": 1275
    },
    {
      "epoch": 0.11262236853504289,
      "grad_norm": 2.872793674468994,
      "learning_rate": 2.9589911436272623e-05,
      "loss": 0.642,
      "step": 1300
    },
    {
      "epoch": 0.11478818331456293,
      "grad_norm": 10.854488372802734,
      "learning_rate": 2.951771274547555e-05,
      "loss": 0.3783,
      "step": 1325
    },
    {
      "epoch": 0.116953998094083,
      "grad_norm": 2.162464141845703,
      "learning_rate": 2.9445514054678477e-05,
      "loss": 0.4905,
      "step": 1350
    },
    {
      "epoch": 0.11911981287360304,
      "grad_norm": 14.541825294494629,
      "learning_rate": 2.9373315363881403e-05,
      "loss": 0.3937,
      "step": 1375
    },
    {
      "epoch": 0.1212856276531231,
      "grad_norm": 1.6897481679916382,
      "learning_rate": 2.9301116673084327e-05,
      "loss": 0.4245,
      "step": 1400
    },
    {
      "epoch": 0.12345144243264317,
      "grad_norm": 9.359882354736328,
      "learning_rate": 2.9228917982287253e-05,
      "loss": 0.4139,
      "step": 1425
    },
    {
      "epoch": 0.1256172572121632,
      "grad_norm": 39.94605255126953,
      "learning_rate": 2.915671929149018e-05,
      "loss": 0.4305,
      "step": 1450
    },
    {
      "epoch": 0.12778307199168326,
      "grad_norm": 10.268132209777832,
      "learning_rate": 2.908452060069311e-05,
      "loss": 0.675,
      "step": 1475
    },
    {
      "epoch": 0.12994888677120334,
      "grad_norm": 1.7209604978561401,
      "learning_rate": 2.9012321909896037e-05,
      "loss": 0.55,
      "step": 1500
    },
    {
      "epoch": 0.13211470155072338,
      "grad_norm": 8.541482925415039,
      "learning_rate": 2.894012321909896e-05,
      "loss": 0.4033,
      "step": 1525
    },
    {
      "epoch": 0.13428051633024343,
      "grad_norm": 10.4110107421875,
      "learning_rate": 2.8867924528301887e-05,
      "loss": 0.4167,
      "step": 1550
    },
    {
      "epoch": 0.13644633110976348,
      "grad_norm": 10.823756217956543,
      "learning_rate": 2.8795725837504814e-05,
      "loss": 0.3814,
      "step": 1575
    },
    {
      "epoch": 0.13861214588928356,
      "grad_norm": 0.6896539926528931,
      "learning_rate": 2.872352714670774e-05,
      "loss": 0.5183,
      "step": 1600
    },
    {
      "epoch": 0.1407779606688036,
      "grad_norm": 4.357579231262207,
      "learning_rate": 2.8651328455910667e-05,
      "loss": 0.3343,
      "step": 1625
    },
    {
      "epoch": 0.14294377544832365,
      "grad_norm": 12.074344635009766,
      "learning_rate": 2.857912976511359e-05,
      "loss": 0.4212,
      "step": 1650
    },
    {
      "epoch": 0.14510959022784373,
      "grad_norm": 11.660531997680664,
      "learning_rate": 2.850693107431652e-05,
      "loss": 0.4737,
      "step": 1675
    },
    {
      "epoch": 0.14727540500736377,
      "grad_norm": 15.467144966125488,
      "learning_rate": 2.8434732383519447e-05,
      "loss": 0.4563,
      "step": 1700
    },
    {
      "epoch": 0.14944121978688382,
      "grad_norm": 9.277994155883789,
      "learning_rate": 2.8362533692722374e-05,
      "loss": 0.4251,
      "step": 1725
    },
    {
      "epoch": 0.15160703456640387,
      "grad_norm": 3.6043941974639893,
      "learning_rate": 2.82903350019253e-05,
      "loss": 0.3497,
      "step": 1750
    },
    {
      "epoch": 0.15377284934592395,
      "grad_norm": 3.933353900909424,
      "learning_rate": 2.8218136311128224e-05,
      "loss": 0.3753,
      "step": 1775
    },
    {
      "epoch": 0.155938664125444,
      "grad_norm": 3.8728222846984863,
      "learning_rate": 2.814593762033115e-05,
      "loss": 0.4031,
      "step": 1800
    },
    {
      "epoch": 0.15810447890496404,
      "grad_norm": 8.067976951599121,
      "learning_rate": 2.8073738929534077e-05,
      "loss": 0.4037,
      "step": 1825
    },
    {
      "epoch": 0.16027029368448412,
      "grad_norm": 9.141134262084961,
      "learning_rate": 2.8001540238737004e-05,
      "loss": 0.4114,
      "step": 1850
    },
    {
      "epoch": 0.16243610846400416,
      "grad_norm": 1.8272747993469238,
      "learning_rate": 2.7929341547939934e-05,
      "loss": 0.3848,
      "step": 1875
    },
    {
      "epoch": 0.1646019232435242,
      "grad_norm": 0.4890976846218109,
      "learning_rate": 2.7857142857142858e-05,
      "loss": 0.5088,
      "step": 1900
    },
    {
      "epoch": 0.16676773802304426,
      "grad_norm": 9.043623924255371,
      "learning_rate": 2.7784944166345784e-05,
      "loss": 0.4032,
      "step": 1925
    },
    {
      "epoch": 0.16893355280256434,
      "grad_norm": 9.092608451843262,
      "learning_rate": 2.771274547554871e-05,
      "loss": 0.3354,
      "step": 1950
    },
    {
      "epoch": 0.17109936758208438,
      "grad_norm": 6.121222972869873,
      "learning_rate": 2.7640546784751638e-05,
      "loss": 0.4163,
      "step": 1975
    },
    {
      "epoch": 0.17326518236160443,
      "grad_norm": 1.539663314819336,
      "learning_rate": 2.7568348093954564e-05,
      "loss": 0.3715,
      "step": 2000
    },
    {
      "epoch": 0.17543099714112448,
      "grad_norm": 16.089406967163086,
      "learning_rate": 2.7496149403157488e-05,
      "loss": 0.3424,
      "step": 2025
    },
    {
      "epoch": 0.17759681192064455,
      "grad_norm": 12.510934829711914,
      "learning_rate": 2.7423950712360414e-05,
      "loss": 0.3311,
      "step": 2050
    },
    {
      "epoch": 0.1797626267001646,
      "grad_norm": 2.823338508605957,
      "learning_rate": 2.7351752021563345e-05,
      "loss": 0.4362,
      "step": 2075
    },
    {
      "epoch": 0.18192844147968465,
      "grad_norm": 6.191600322723389,
      "learning_rate": 2.727955333076627e-05,
      "loss": 0.4441,
      "step": 2100
    },
    {
      "epoch": 0.18409425625920472,
      "grad_norm": 4.86907434463501,
      "learning_rate": 2.7207354639969198e-05,
      "loss": 0.3122,
      "step": 2125
    },
    {
      "epoch": 0.18626007103872477,
      "grad_norm": 7.323814868927002,
      "learning_rate": 2.713515594917212e-05,
      "loss": 0.3717,
      "step": 2150
    },
    {
      "epoch": 0.18842588581824482,
      "grad_norm": 10.09737491607666,
      "learning_rate": 2.7062957258375048e-05,
      "loss": 0.3461,
      "step": 2175
    },
    {
      "epoch": 0.19059170059776487,
      "grad_norm": 8.536800384521484,
      "learning_rate": 2.6990758567577975e-05,
      "loss": 0.4816,
      "step": 2200
    },
    {
      "epoch": 0.19275751537728494,
      "grad_norm": 5.237682819366455,
      "learning_rate": 2.69185598767809e-05,
      "loss": 0.4784,
      "step": 2225
    },
    {
      "epoch": 0.194923330156805,
      "grad_norm": 10.763497352600098,
      "learning_rate": 2.6846361185983828e-05,
      "loss": 0.4334,
      "step": 2250
    },
    {
      "epoch": 0.19708914493632504,
      "grad_norm": 0.7019050121307373,
      "learning_rate": 2.6774162495186755e-05,
      "loss": 0.3437,
      "step": 2275
    },
    {
      "epoch": 0.19925495971584511,
      "grad_norm": 8.020634651184082,
      "learning_rate": 2.670196380438968e-05,
      "loss": 0.4333,
      "step": 2300
    },
    {
      "epoch": 0.20142077449536516,
      "grad_norm": 10.549779891967773,
      "learning_rate": 2.662976511359261e-05,
      "loss": 0.3609,
      "step": 2325
    },
    {
      "epoch": 0.2035865892748852,
      "grad_norm": 5.6236677169799805,
      "learning_rate": 2.6557566422795535e-05,
      "loss": 0.3437,
      "step": 2350
    },
    {
      "epoch": 0.20575240405440526,
      "grad_norm": 1.4388600587844849,
      "learning_rate": 2.648536773199846e-05,
      "loss": 0.4911,
      "step": 2375
    },
    {
      "epoch": 0.20791821883392533,
      "grad_norm": 4.445183277130127,
      "learning_rate": 2.6413169041201385e-05,
      "loss": 0.3872,
      "step": 2400
    },
    {
      "epoch": 0.21008403361344538,
      "grad_norm": 9.076152801513672,
      "learning_rate": 2.6340970350404312e-05,
      "loss": 0.276,
      "step": 2425
    },
    {
      "epoch": 0.21224984839296543,
      "grad_norm": 5.573355197906494,
      "learning_rate": 2.6268771659607242e-05,
      "loss": 0.3318,
      "step": 2450
    },
    {
      "epoch": 0.21441566317248548,
      "grad_norm": 5.015573024749756,
      "learning_rate": 2.619657296881017e-05,
      "loss": 0.4833,
      "step": 2475
    },
    {
      "epoch": 0.21658147795200555,
      "grad_norm": 3.9038755893707275,
      "learning_rate": 2.6124374278013092e-05,
      "loss": 0.4656,
      "step": 2500
    },
    {
      "epoch": 0.2187472927315256,
      "grad_norm": 2.66627836227417,
      "learning_rate": 2.605217558721602e-05,
      "loss": 0.4232,
      "step": 2525
    },
    {
      "epoch": 0.22091310751104565,
      "grad_norm": 8.859906196594238,
      "learning_rate": 2.5979976896418945e-05,
      "loss": 0.434,
      "step": 2550
    },
    {
      "epoch": 0.22307892229056572,
      "grad_norm": 3.2811522483825684,
      "learning_rate": 2.5907778205621872e-05,
      "loss": 0.2479,
      "step": 2575
    },
    {
      "epoch": 0.22524473707008577,
      "grad_norm": 8.53447437286377,
      "learning_rate": 2.58355795148248e-05,
      "loss": 0.4656,
      "step": 2600
    },
    {
      "epoch": 0.22741055184960582,
      "grad_norm": 6.359921455383301,
      "learning_rate": 2.5763380824027722e-05,
      "loss": 0.3881,
      "step": 2625
    },
    {
      "epoch": 0.22957636662912587,
      "grad_norm": 6.196253776550293,
      "learning_rate": 2.5691182133230652e-05,
      "loss": 0.3637,
      "step": 2650
    },
    {
      "epoch": 0.23174218140864594,
      "grad_norm": 7.805304050445557,
      "learning_rate": 2.561898344243358e-05,
      "loss": 0.3099,
      "step": 2675
    },
    {
      "epoch": 0.233907996188166,
      "grad_norm": 4.51755428314209,
      "learning_rate": 2.5546784751636506e-05,
      "loss": 0.3933,
      "step": 2700
    },
    {
      "epoch": 0.23607381096768604,
      "grad_norm": 5.72914981842041,
      "learning_rate": 2.5474586060839432e-05,
      "loss": 0.3789,
      "step": 2725
    },
    {
      "epoch": 0.23823962574720609,
      "grad_norm": 2.4809954166412354,
      "learning_rate": 2.5402387370042356e-05,
      "loss": 0.4056,
      "step": 2750
    },
    {
      "epoch": 0.24040544052672616,
      "grad_norm": 1.940656065940857,
      "learning_rate": 2.5330188679245282e-05,
      "loss": 0.4132,
      "step": 2775
    },
    {
      "epoch": 0.2425712553062462,
      "grad_norm": 3.452242851257324,
      "learning_rate": 2.525798998844821e-05,
      "loss": 0.375,
      "step": 2800
    },
    {
      "epoch": 0.24473707008576626,
      "grad_norm": 9.220993041992188,
      "learning_rate": 2.5185791297651136e-05,
      "loss": 0.3026,
      "step": 2825
    },
    {
      "epoch": 0.24690288486528633,
      "grad_norm": 10.027073860168457,
      "learning_rate": 2.5113592606854066e-05,
      "loss": 0.5372,
      "step": 2850
    },
    {
      "epoch": 0.24906869964480638,
      "grad_norm": 2.228799819946289,
      "learning_rate": 2.504139391605699e-05,
      "loss": 0.4233,
      "step": 2875
    },
    {
      "epoch": 0.2512345144243264,
      "grad_norm": 7.281198978424072,
      "learning_rate": 2.4969195225259916e-05,
      "loss": 0.2945,
      "step": 2900
    },
    {
      "epoch": 0.2534003292038465,
      "grad_norm": 1.4160314798355103,
      "learning_rate": 2.4896996534462843e-05,
      "loss": 0.2916,
      "step": 2925
    },
    {
      "epoch": 0.2555661439833665,
      "grad_norm": 4.095098972320557,
      "learning_rate": 2.482479784366577e-05,
      "loss": 0.3536,
      "step": 2950
    },
    {
      "epoch": 0.25773195876288657,
      "grad_norm": 1.413552165031433,
      "learning_rate": 2.4752599152868696e-05,
      "loss": 0.3246,
      "step": 2975
    },
    {
      "epoch": 0.2598977735424067,
      "grad_norm": 3.3196184635162354,
      "learning_rate": 2.468040046207162e-05,
      "loss": 0.4236,
      "step": 3000
    },
    {
      "epoch": 0.2620635883219267,
      "grad_norm": 11.855537414550781,
      "learning_rate": 2.4608201771274546e-05,
      "loss": 0.4088,
      "step": 3025
    },
    {
      "epoch": 0.26422940310144677,
      "grad_norm": 9.322809219360352,
      "learning_rate": 2.4536003080477476e-05,
      "loss": 0.4522,
      "step": 3050
    },
    {
      "epoch": 0.2663952178809668,
      "grad_norm": 7.581571578979492,
      "learning_rate": 2.4463804389680403e-05,
      "loss": 0.3445,
      "step": 3075
    },
    {
      "epoch": 0.26856103266048686,
      "grad_norm": 2.6131093502044678,
      "learning_rate": 2.439160569888333e-05,
      "loss": 0.3575,
      "step": 3100
    },
    {
      "epoch": 0.2707268474400069,
      "grad_norm": 3.68662166595459,
      "learning_rate": 2.4319407008086253e-05,
      "loss": 0.3809,
      "step": 3125
    },
    {
      "epoch": 0.27289266221952696,
      "grad_norm": 2.3688032627105713,
      "learning_rate": 2.424720831728918e-05,
      "loss": 0.3364,
      "step": 3150
    },
    {
      "epoch": 0.27505847699904706,
      "grad_norm": 1.155315637588501,
      "learning_rate": 2.4175009626492106e-05,
      "loss": 0.4103,
      "step": 3175
    },
    {
      "epoch": 0.2772242917785671,
      "grad_norm": 35.7138671875,
      "learning_rate": 2.4102810935695033e-05,
      "loss": 0.3502,
      "step": 3200
    },
    {
      "epoch": 0.27939010655808716,
      "grad_norm": 6.429433822631836,
      "learning_rate": 2.403061224489796e-05,
      "loss": 0.2632,
      "step": 3225
    },
    {
      "epoch": 0.2815559213376072,
      "grad_norm": 9.816515922546387,
      "learning_rate": 2.3958413554100887e-05,
      "loss": 0.406,
      "step": 3250
    },
    {
      "epoch": 0.28372173611712725,
      "grad_norm": 1.9653140306472778,
      "learning_rate": 2.3886214863303813e-05,
      "loss": 0.4363,
      "step": 3275
    },
    {
      "epoch": 0.2858875508966473,
      "grad_norm": 9.559599876403809,
      "learning_rate": 2.381401617250674e-05,
      "loss": 0.2819,
      "step": 3300
    },
    {
      "epoch": 0.28805336567616735,
      "grad_norm": 10.623549461364746,
      "learning_rate": 2.3741817481709667e-05,
      "loss": 0.3421,
      "step": 3325
    },
    {
      "epoch": 0.29021918045568745,
      "grad_norm": 2.4988913536071777,
      "learning_rate": 2.366961879091259e-05,
      "loss": 0.269,
      "step": 3350
    },
    {
      "epoch": 0.2923849952352075,
      "grad_norm": 4.704137802124023,
      "learning_rate": 2.3597420100115517e-05,
      "loss": 0.2902,
      "step": 3375
    },
    {
      "epoch": 0.29455081001472755,
      "grad_norm": 9.48901653289795,
      "learning_rate": 2.3525221409318443e-05,
      "loss": 0.3548,
      "step": 3400
    },
    {
      "epoch": 0.2967166247942476,
      "grad_norm": 0.5201269388198853,
      "learning_rate": 2.3453022718521374e-05,
      "loss": 0.4575,
      "step": 3425
    },
    {
      "epoch": 0.29888243957376764,
      "grad_norm": 8.074861526489258,
      "learning_rate": 2.33808240277243e-05,
      "loss": 0.3942,
      "step": 3450
    },
    {
      "epoch": 0.3010482543532877,
      "grad_norm": 8.45334243774414,
      "learning_rate": 2.3308625336927224e-05,
      "loss": 0.3537,
      "step": 3475
    },
    {
      "epoch": 0.30321406913280774,
      "grad_norm": 2.7069313526153564,
      "learning_rate": 2.323642664613015e-05,
      "loss": 0.3672,
      "step": 3500
    },
    {
      "epoch": 0.30537988391232784,
      "grad_norm": 13.849508285522461,
      "learning_rate": 2.3164227955333077e-05,
      "loss": 0.3502,
      "step": 3525
    },
    {
      "epoch": 0.3075456986918479,
      "grad_norm": 4.5892462730407715,
      "learning_rate": 2.3092029264536004e-05,
      "loss": 0.2545,
      "step": 3550
    },
    {
      "epoch": 0.30971151347136794,
      "grad_norm": 1.035447120666504,
      "learning_rate": 2.301983057373893e-05,
      "loss": 0.2544,
      "step": 3575
    },
    {
      "epoch": 0.311877328250888,
      "grad_norm": 5.170057773590088,
      "learning_rate": 2.2947631882941854e-05,
      "loss": 0.3443,
      "step": 3600
    },
    {
      "epoch": 0.31404314303040803,
      "grad_norm": 2.908191204071045,
      "learning_rate": 2.2875433192144784e-05,
      "loss": 0.3784,
      "step": 3625
    },
    {
      "epoch": 0.3162089578099281,
      "grad_norm": 9.946891784667969,
      "learning_rate": 2.280323450134771e-05,
      "loss": 0.3828,
      "step": 3650
    },
    {
      "epoch": 0.31837477258944813,
      "grad_norm": 10.337167739868164,
      "learning_rate": 2.2731035810550637e-05,
      "loss": 0.4032,
      "step": 3675
    },
    {
      "epoch": 0.32054058736896823,
      "grad_norm": 10.093758583068848,
      "learning_rate": 2.2658837119753564e-05,
      "loss": 0.2556,
      "step": 3700
    },
    {
      "epoch": 0.3227064021484883,
      "grad_norm": 7.309471130371094,
      "learning_rate": 2.2586638428956487e-05,
      "loss": 0.3352,
      "step": 3725
    },
    {
      "epoch": 0.32487221692800833,
      "grad_norm": 10.050370216369629,
      "learning_rate": 2.2514439738159414e-05,
      "loss": 0.4054,
      "step": 3750
    },
    {
      "epoch": 0.3270380317075284,
      "grad_norm": 3.858546733856201,
      "learning_rate": 2.244224104736234e-05,
      "loss": 0.3049,
      "step": 3775
    },
    {
      "epoch": 0.3292038464870484,
      "grad_norm": 5.640537261962891,
      "learning_rate": 2.2370042356565267e-05,
      "loss": 0.2223,
      "step": 3800
    },
    {
      "epoch": 0.33136966126656847,
      "grad_norm": 5.106541633605957,
      "learning_rate": 2.2297843665768198e-05,
      "loss": 0.4878,
      "step": 3825
    },
    {
      "epoch": 0.3335354760460885,
      "grad_norm": 7.738224029541016,
      "learning_rate": 2.222564497497112e-05,
      "loss": 0.3015,
      "step": 3850
    },
    {
      "epoch": 0.33570129082560857,
      "grad_norm": 12.313666343688965,
      "learning_rate": 2.2153446284174048e-05,
      "loss": 0.3816,
      "step": 3875
    },
    {
      "epoch": 0.33786710560512867,
      "grad_norm": 0.9929437041282654,
      "learning_rate": 2.2081247593376974e-05,
      "loss": 0.3334,
      "step": 3900
    },
    {
      "epoch": 0.3400329203846487,
      "grad_norm": 5.753032207489014,
      "learning_rate": 2.20090489025799e-05,
      "loss": 0.3724,
      "step": 3925
    },
    {
      "epoch": 0.34219873516416877,
      "grad_norm": 8.37396240234375,
      "learning_rate": 2.1936850211782828e-05,
      "loss": 0.4217,
      "step": 3950
    },
    {
      "epoch": 0.3443645499436888,
      "grad_norm": 7.365005016326904,
      "learning_rate": 2.186465152098575e-05,
      "loss": 0.4339,
      "step": 3975
    },
    {
      "epoch": 0.34653036472320886,
      "grad_norm": 1.91083824634552,
      "learning_rate": 2.1792452830188678e-05,
      "loss": 0.3642,
      "step": 4000
    },
    {
      "epoch": 0.3486961795027289,
      "grad_norm": 3.0427494049072266,
      "learning_rate": 2.1720254139391608e-05,
      "loss": 0.3819,
      "step": 4025
    },
    {
      "epoch": 0.35086199428224896,
      "grad_norm": 1.176952838897705,
      "learning_rate": 2.1648055448594535e-05,
      "loss": 0.2796,
      "step": 4050
    },
    {
      "epoch": 0.35302780906176906,
      "grad_norm": 1.0579583644866943,
      "learning_rate": 2.157585675779746e-05,
      "loss": 0.4277,
      "step": 4075
    },
    {
      "epoch": 0.3551936238412891,
      "grad_norm": 11.798035621643066,
      "learning_rate": 2.1503658067000385e-05,
      "loss": 0.3407,
      "step": 4100
    },
    {
      "epoch": 0.35735943862080916,
      "grad_norm": 15.57787036895752,
      "learning_rate": 2.143145937620331e-05,
      "loss": 0.2781,
      "step": 4125
    },
    {
      "epoch": 0.3595252534003292,
      "grad_norm": 8.533368110656738,
      "learning_rate": 2.1359260685406238e-05,
      "loss": 0.4274,
      "step": 4150
    },
    {
      "epoch": 0.36169106817984925,
      "grad_norm": 8.470250129699707,
      "learning_rate": 2.1287061994609165e-05,
      "loss": 0.3609,
      "step": 4175
    },
    {
      "epoch": 0.3638568829593693,
      "grad_norm": 6.417985439300537,
      "learning_rate": 2.121486330381209e-05,
      "loss": 0.3476,
      "step": 4200
    },
    {
      "epoch": 0.36602269773888935,
      "grad_norm": 8.685192108154297,
      "learning_rate": 2.1142664613015018e-05,
      "loss": 0.41,
      "step": 4225
    },
    {
      "epoch": 0.36818851251840945,
      "grad_norm": 7.082727432250977,
      "learning_rate": 2.1070465922217945e-05,
      "loss": 0.4003,
      "step": 4250
    },
    {
      "epoch": 0.3703543272979295,
      "grad_norm": 4.621776103973389,
      "learning_rate": 2.099826723142087e-05,
      "loss": 0.306,
      "step": 4275
    },
    {
      "epoch": 0.37252014207744955,
      "grad_norm": 3.1071817874908447,
      "learning_rate": 2.09260685406238e-05,
      "loss": 0.2335,
      "step": 4300
    },
    {
      "epoch": 0.3746859568569696,
      "grad_norm": 7.23638916015625,
      "learning_rate": 2.085386984982672e-05,
      "loss": 0.2733,
      "step": 4325
    },
    {
      "epoch": 0.37685177163648964,
      "grad_norm": 6.893523693084717,
      "learning_rate": 2.078167115902965e-05,
      "loss": 0.3007,
      "step": 4350
    },
    {
      "epoch": 0.3790175864160097,
      "grad_norm": 5.9917073249816895,
      "learning_rate": 2.0709472468232575e-05,
      "loss": 0.3086,
      "step": 4375
    },
    {
      "epoch": 0.38118340119552974,
      "grad_norm": 6.596795558929443,
      "learning_rate": 2.0637273777435502e-05,
      "loss": 0.365,
      "step": 4400
    },
    {
      "epoch": 0.38334921597504984,
      "grad_norm": 9.045963287353516,
      "learning_rate": 2.0565075086638432e-05,
      "loss": 0.3255,
      "step": 4425
    },
    {
      "epoch": 0.3855150307545699,
      "grad_norm": 6.755446434020996,
      "learning_rate": 2.0492876395841355e-05,
      "loss": 0.3765,
      "step": 4450
    },
    {
      "epoch": 0.38768084553408994,
      "grad_norm": 11.626537322998047,
      "learning_rate": 2.0420677705044282e-05,
      "loss": 0.2946,
      "step": 4475
    },
    {
      "epoch": 0.38984666031361,
      "grad_norm": 4.125662326812744,
      "learning_rate": 2.034847901424721e-05,
      "loss": 0.3298,
      "step": 4500
    },
    {
      "epoch": 0.39201247509313003,
      "grad_norm": 1.2437127828598022,
      "learning_rate": 2.0276280323450135e-05,
      "loss": 0.3645,
      "step": 4525
    },
    {
      "epoch": 0.3941782898726501,
      "grad_norm": 10.272943496704102,
      "learning_rate": 2.0204081632653062e-05,
      "loss": 0.2403,
      "step": 4550
    },
    {
      "epoch": 0.3963441046521701,
      "grad_norm": 2.164606809616089,
      "learning_rate": 2.0131882941855985e-05,
      "loss": 0.28,
      "step": 4575
    },
    {
      "epoch": 0.39850991943169023,
      "grad_norm": 9.157061576843262,
      "learning_rate": 2.0059684251058916e-05,
      "loss": 0.3814,
      "step": 4600
    },
    {
      "epoch": 0.4006757342112103,
      "grad_norm": 4.034579277038574,
      "learning_rate": 1.9987485560261842e-05,
      "loss": 0.3419,
      "step": 4625
    },
    {
      "epoch": 0.4028415489907303,
      "grad_norm": 2.5503344535827637,
      "learning_rate": 1.991528686946477e-05,
      "loss": 0.3374,
      "step": 4650
    },
    {
      "epoch": 0.4050073637702504,
      "grad_norm": 4.660188674926758,
      "learning_rate": 1.9843088178667696e-05,
      "loss": 0.3511,
      "step": 4675
    },
    {
      "epoch": 0.4071731785497704,
      "grad_norm": 7.020951747894287,
      "learning_rate": 1.977088948787062e-05,
      "loss": 0.4339,
      "step": 4700
    },
    {
      "epoch": 0.40933899332929047,
      "grad_norm": 5.4507269859313965,
      "learning_rate": 1.9698690797073546e-05,
      "loss": 0.3441,
      "step": 4725
    },
    {
      "epoch": 0.4115048081088105,
      "grad_norm": 11.266243934631348,
      "learning_rate": 1.9626492106276472e-05,
      "loss": 0.346,
      "step": 4750
    },
    {
      "epoch": 0.41367062288833056,
      "grad_norm": 2.1191511154174805,
      "learning_rate": 1.95542934154794e-05,
      "loss": 0.3723,
      "step": 4775
    },
    {
      "epoch": 0.41583643766785067,
      "grad_norm": 1.9068052768707275,
      "learning_rate": 1.948209472468233e-05,
      "loss": 0.2075,
      "step": 4800
    },
    {
      "epoch": 0.4180022524473707,
      "grad_norm": 0.36394304037094116,
      "learning_rate": 1.9409896033885253e-05,
      "loss": 0.2431,
      "step": 4825
    },
    {
      "epoch": 0.42016806722689076,
      "grad_norm": 6.177628993988037,
      "learning_rate": 1.933769734308818e-05,
      "loss": 0.2642,
      "step": 4850
    },
    {
      "epoch": 0.4223338820064108,
      "grad_norm": 3.6669273376464844,
      "learning_rate": 1.9265498652291106e-05,
      "loss": 0.1763,
      "step": 4875
    },
    {
      "epoch": 0.42449969678593086,
      "grad_norm": 7.836557865142822,
      "learning_rate": 1.9193299961494033e-05,
      "loss": 0.3862,
      "step": 4900
    },
    {
      "epoch": 0.4266655115654509,
      "grad_norm": 10.140486717224121,
      "learning_rate": 1.912110127069696e-05,
      "loss": 0.3053,
      "step": 4925
    },
    {
      "epoch": 0.42883132634497095,
      "grad_norm": 2.8873586654663086,
      "learning_rate": 1.9048902579899883e-05,
      "loss": 0.3162,
      "step": 4950
    },
    {
      "epoch": 0.43099714112449106,
      "grad_norm": 1.758466362953186,
      "learning_rate": 1.897670388910281e-05,
      "loss": 0.3178,
      "step": 4975
    },
    {
      "epoch": 0.4331629559040111,
      "grad_norm": 7.523651599884033,
      "learning_rate": 1.890450519830574e-05,
      "loss": 0.2789,
      "step": 5000
    },
    {
      "epoch": 0.43532877068353115,
      "grad_norm": 5.955496311187744,
      "learning_rate": 1.8832306507508666e-05,
      "loss": 0.1777,
      "step": 5025
    },
    {
      "epoch": 0.4374945854630512,
      "grad_norm": 9.068547248840332,
      "learning_rate": 1.8760107816711593e-05,
      "loss": 0.4155,
      "step": 5050
    },
    {
      "epoch": 0.43966040024257125,
      "grad_norm": 4.900373458862305,
      "learning_rate": 1.8687909125914516e-05,
      "loss": 0.2983,
      "step": 5075
    },
    {
      "epoch": 0.4418262150220913,
      "grad_norm": 3.5501790046691895,
      "learning_rate": 1.8615710435117443e-05,
      "loss": 0.3687,
      "step": 5100
    },
    {
      "epoch": 0.44399202980161134,
      "grad_norm": 1.0216822624206543,
      "learning_rate": 1.854351174432037e-05,
      "loss": 0.2428,
      "step": 5125
    },
    {
      "epoch": 0.44615784458113145,
      "grad_norm": 7.637403964996338,
      "learning_rate": 1.8471313053523296e-05,
      "loss": 0.3071,
      "step": 5150
    },
    {
      "epoch": 0.4483236593606515,
      "grad_norm": 9.478981018066406,
      "learning_rate": 1.8399114362726223e-05,
      "loss": 0.2911,
      "step": 5175
    },
    {
      "epoch": 0.45048947414017154,
      "grad_norm": 3.875411033630371,
      "learning_rate": 1.832691567192915e-05,
      "loss": 0.3152,
      "step": 5200
    },
    {
      "epoch": 0.4526552889196916,
      "grad_norm": 1.1700037717819214,
      "learning_rate": 1.8254716981132077e-05,
      "loss": 0.2776,
      "step": 5225
    },
    {
      "epoch": 0.45482110369921164,
      "grad_norm": 4.037864685058594,
      "learning_rate": 1.8182518290335003e-05,
      "loss": 0.2674,
      "step": 5250
    },
    {
      "epoch": 0.4569869184787317,
      "grad_norm": 2.6295673847198486,
      "learning_rate": 1.811031959953793e-05,
      "loss": 0.3035,
      "step": 5275
    },
    {
      "epoch": 0.45915273325825173,
      "grad_norm": 9.654006004333496,
      "learning_rate": 1.8038120908740853e-05,
      "loss": 0.3352,
      "step": 5300
    },
    {
      "epoch": 0.46131854803777184,
      "grad_norm": 7.339272975921631,
      "learning_rate": 1.796592221794378e-05,
      "loss": 0.3879,
      "step": 5325
    },
    {
      "epoch": 0.4634843628172919,
      "grad_norm": 5.668703079223633,
      "learning_rate": 1.7893723527146707e-05,
      "loss": 0.3828,
      "step": 5350
    },
    {
      "epoch": 0.46565017759681193,
      "grad_norm": 11.843222618103027,
      "learning_rate": 1.7821524836349633e-05,
      "loss": 0.2797,
      "step": 5375
    },
    {
      "epoch": 0.467815992376332,
      "grad_norm": 3.3071844577789307,
      "learning_rate": 1.7749326145552564e-05,
      "loss": 0.3492,
      "step": 5400
    },
    {
      "epoch": 0.469981807155852,
      "grad_norm": 11.303645133972168,
      "learning_rate": 1.7677127454755487e-05,
      "loss": 0.5,
      "step": 5425
    },
    {
      "epoch": 0.4721476219353721,
      "grad_norm": 1.1275362968444824,
      "learning_rate": 1.7604928763958414e-05,
      "loss": 0.2317,
      "step": 5450
    },
    {
      "epoch": 0.4743134367148921,
      "grad_norm": 11.97022533416748,
      "learning_rate": 1.753273007316134e-05,
      "loss": 0.2411,
      "step": 5475
    },
    {
      "epoch": 0.47647925149441217,
      "grad_norm": 2.9647443294525146,
      "learning_rate": 1.7460531382364267e-05,
      "loss": 0.277,
      "step": 5500
    },
    {
      "epoch": 0.4786450662739323,
      "grad_norm": 5.046292781829834,
      "learning_rate": 1.7388332691567194e-05,
      "loss": 0.4112,
      "step": 5525
    },
    {
      "epoch": 0.4808108810534523,
      "grad_norm": 8.11351203918457,
      "learning_rate": 1.7316134000770117e-05,
      "loss": 0.5116,
      "step": 5550
    },
    {
      "epoch": 0.48297669583297237,
      "grad_norm": 1.0861672163009644,
      "learning_rate": 1.7243935309973047e-05,
      "loss": 0.3264,
      "step": 5575
    },
    {
      "epoch": 0.4851425106124924,
      "grad_norm": 2.311553955078125,
      "learning_rate": 1.7171736619175974e-05,
      "loss": 0.3688,
      "step": 5600
    },
    {
      "epoch": 0.48730832539201246,
      "grad_norm": 2.371721029281616,
      "learning_rate": 1.70995379283789e-05,
      "loss": 0.3224,
      "step": 5625
    },
    {
      "epoch": 0.4894741401715325,
      "grad_norm": 7.7612714767456055,
      "learning_rate": 1.7027339237581827e-05,
      "loss": 0.3778,
      "step": 5650
    },
    {
      "epoch": 0.49163995495105256,
      "grad_norm": 7.416019916534424,
      "learning_rate": 1.695514054678475e-05,
      "loss": 0.3671,
      "step": 5675
    },
    {
      "epoch": 0.49380576973057266,
      "grad_norm": 7.0320940017700195,
      "learning_rate": 1.6882941855987677e-05,
      "loss": 0.3331,
      "step": 5700
    },
    {
      "epoch": 0.4959715845100927,
      "grad_norm": 0.8671308159828186,
      "learning_rate": 1.6810743165190604e-05,
      "loss": 0.3426,
      "step": 5725
    },
    {
      "epoch": 0.49813739928961276,
      "grad_norm": 6.607793807983398,
      "learning_rate": 1.673854447439353e-05,
      "loss": 0.2863,
      "step": 5750
    },
    {
      "epoch": 0.5003032140691328,
      "grad_norm": 10.399803161621094,
      "learning_rate": 1.666634578359646e-05,
      "loss": 0.5822,
      "step": 5775
    },
    {
      "epoch": 0.5024690288486529,
      "grad_norm": 2.4261348247528076,
      "learning_rate": 1.6594147092799384e-05,
      "loss": 0.2687,
      "step": 5800
    },
    {
      "epoch": 0.5046348436281729,
      "grad_norm": 0.30012401938438416,
      "learning_rate": 1.652194840200231e-05,
      "loss": 0.3365,
      "step": 5825
    },
    {
      "epoch": 0.506800658407693,
      "grad_norm": 8.255668640136719,
      "learning_rate": 1.6449749711205238e-05,
      "loss": 0.4609,
      "step": 5850
    },
    {
      "epoch": 0.508966473187213,
      "grad_norm": 6.495670795440674,
      "learning_rate": 1.6377551020408164e-05,
      "loss": 0.3127,
      "step": 5875
    },
    {
      "epoch": 0.511132287966733,
      "grad_norm": 4.311783790588379,
      "learning_rate": 1.630535232961109e-05,
      "loss": 0.2705,
      "step": 5900
    },
    {
      "epoch": 0.5132981027462531,
      "grad_norm": 7.5022430419921875,
      "learning_rate": 1.6233153638814014e-05,
      "loss": 0.3089,
      "step": 5925
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 9.813260078430176,
      "learning_rate": 1.616095494801694e-05,
      "loss": 0.3386,
      "step": 5950
    },
    {
      "epoch": 0.5176297323052933,
      "grad_norm": 8.11892318725586,
      "learning_rate": 1.608875625721987e-05,
      "loss": 0.3796,
      "step": 5975
    },
    {
      "epoch": 0.5197955470848133,
      "grad_norm": 8.750290870666504,
      "learning_rate": 1.6016557566422798e-05,
      "loss": 0.4231,
      "step": 6000
    },
    {
      "epoch": 0.5219613618643334,
      "grad_norm": 8.316088676452637,
      "learning_rate": 1.5944358875625725e-05,
      "loss": 0.3922,
      "step": 6025
    },
    {
      "epoch": 0.5241271766438534,
      "grad_norm": 4.458547592163086,
      "learning_rate": 1.5872160184828648e-05,
      "loss": 0.3138,
      "step": 6050
    },
    {
      "epoch": 0.5262929914233735,
      "grad_norm": 4.100847244262695,
      "learning_rate": 1.5799961494031575e-05,
      "loss": 0.3106,
      "step": 6075
    },
    {
      "epoch": 0.5284588062028935,
      "grad_norm": 3.5927000045776367,
      "learning_rate": 1.57277628032345e-05,
      "loss": 0.188,
      "step": 6100
    },
    {
      "epoch": 0.5306246209824136,
      "grad_norm": 0.6444216370582581,
      "learning_rate": 1.5655564112437428e-05,
      "loss": 0.209,
      "step": 6125
    },
    {
      "epoch": 0.5327904357619336,
      "grad_norm": 6.649785041809082,
      "learning_rate": 1.5583365421640355e-05,
      "loss": 0.2617,
      "step": 6150
    },
    {
      "epoch": 0.5349562505414537,
      "grad_norm": 8.491826057434082,
      "learning_rate": 1.551116673084328e-05,
      "loss": 0.3059,
      "step": 6175
    },
    {
      "epoch": 0.5371220653209737,
      "grad_norm": 22.71511459350586,
      "learning_rate": 1.5438968040046208e-05,
      "loss": 0.2764,
      "step": 6200
    },
    {
      "epoch": 0.5392878801004938,
      "grad_norm": 6.877171516418457,
      "learning_rate": 1.5366769349249135e-05,
      "loss": 0.2801,
      "step": 6225
    },
    {
      "epoch": 0.5414536948800138,
      "grad_norm": 0.46479833126068115,
      "learning_rate": 1.529457065845206e-05,
      "loss": 0.3744,
      "step": 6250
    },
    {
      "epoch": 0.5436195096595339,
      "grad_norm": 7.200215816497803,
      "learning_rate": 1.5222371967654987e-05,
      "loss": 0.3067,
      "step": 6275
    },
    {
      "epoch": 0.5457853244390539,
      "grad_norm": 6.230359077453613,
      "learning_rate": 1.5150173276857913e-05,
      "loss": 0.3305,
      "step": 6300
    },
    {
      "epoch": 0.5479511392185741,
      "grad_norm": 3.2241950035095215,
      "learning_rate": 1.5077974586060838e-05,
      "loss": 0.2827,
      "step": 6325
    },
    {
      "epoch": 0.5501169539980941,
      "grad_norm": 10.813590049743652,
      "learning_rate": 1.5005775895263765e-05,
      "loss": 0.2712,
      "step": 6350
    },
    {
      "epoch": 0.5522827687776142,
      "grad_norm": 3.5207877159118652,
      "learning_rate": 1.4933577204466692e-05,
      "loss": 0.2677,
      "step": 6375
    },
    {
      "epoch": 0.5544485835571342,
      "grad_norm": 6.884098529815674,
      "learning_rate": 1.4861378513669619e-05,
      "loss": 0.4269,
      "step": 6400
    },
    {
      "epoch": 0.5566143983366543,
      "grad_norm": 12.490416526794434,
      "learning_rate": 1.4789179822872547e-05,
      "loss": 0.3834,
      "step": 6425
    },
    {
      "epoch": 0.5587802131161743,
      "grad_norm": 6.844019889831543,
      "learning_rate": 1.4716981132075472e-05,
      "loss": 0.4177,
      "step": 6450
    },
    {
      "epoch": 0.5609460278956944,
      "grad_norm": 2.4574711322784424,
      "learning_rate": 1.4644782441278399e-05,
      "loss": 0.2457,
      "step": 6475
    },
    {
      "epoch": 0.5631118426752144,
      "grad_norm": 4.939560413360596,
      "learning_rate": 1.4572583750481324e-05,
      "loss": 0.348,
      "step": 6500
    },
    {
      "epoch": 0.5652776574547345,
      "grad_norm": 11.443745613098145,
      "learning_rate": 1.4500385059684252e-05,
      "loss": 0.3035,
      "step": 6525
    },
    {
      "epoch": 0.5674434722342545,
      "grad_norm": 5.136826515197754,
      "learning_rate": 1.4428186368887177e-05,
      "loss": 0.39,
      "step": 6550
    },
    {
      "epoch": 0.5696092870137746,
      "grad_norm": 8.772330284118652,
      "learning_rate": 1.4355987678090104e-05,
      "loss": 0.366,
      "step": 6575
    },
    {
      "epoch": 0.5717751017932946,
      "grad_norm": 0.46080633997917175,
      "learning_rate": 1.428378898729303e-05,
      "loss": 0.2299,
      "step": 6600
    },
    {
      "epoch": 0.5739409165728147,
      "grad_norm": 5.478773593902588,
      "learning_rate": 1.4211590296495957e-05,
      "loss": 0.1737,
      "step": 6625
    },
    {
      "epoch": 0.5761067313523347,
      "grad_norm": 11.235420227050781,
      "learning_rate": 1.4139391605698884e-05,
      "loss": 0.3773,
      "step": 6650
    },
    {
      "epoch": 0.5782725461318549,
      "grad_norm": 7.810971260070801,
      "learning_rate": 1.4067192914901809e-05,
      "loss": 0.3409,
      "step": 6675
    },
    {
      "epoch": 0.5804383609113749,
      "grad_norm": 2.817094087600708,
      "learning_rate": 1.3994994224104737e-05,
      "loss": 0.1739,
      "step": 6700
    },
    {
      "epoch": 0.582604175690895,
      "grad_norm": 0.4941748082637787,
      "learning_rate": 1.3922795533307664e-05,
      "loss": 0.3462,
      "step": 6725
    },
    {
      "epoch": 0.584769990470415,
      "grad_norm": 1.5013363361358643,
      "learning_rate": 1.3850596842510589e-05,
      "loss": 0.2976,
      "step": 6750
    },
    {
      "epoch": 0.586935805249935,
      "grad_norm": 4.63820219039917,
      "learning_rate": 1.3778398151713516e-05,
      "loss": 0.3246,
      "step": 6775
    },
    {
      "epoch": 0.5891016200294551,
      "grad_norm": 0.6134036779403687,
      "learning_rate": 1.3706199460916443e-05,
      "loss": 0.3808,
      "step": 6800
    },
    {
      "epoch": 0.5912674348089751,
      "grad_norm": 9.693577766418457,
      "learning_rate": 1.363400077011937e-05,
      "loss": 0.2926,
      "step": 6825
    },
    {
      "epoch": 0.5934332495884952,
      "grad_norm": 8.138602256774902,
      "learning_rate": 1.3561802079322296e-05,
      "loss": 0.2709,
      "step": 6850
    },
    {
      "epoch": 0.5955990643680152,
      "grad_norm": 5.065515041351318,
      "learning_rate": 1.3489603388525221e-05,
      "loss": 0.3777,
      "step": 6875
    },
    {
      "epoch": 0.5977648791475353,
      "grad_norm": 6.169302463531494,
      "learning_rate": 1.341740469772815e-05,
      "loss": 0.2834,
      "step": 6900
    },
    {
      "epoch": 0.5999306939270553,
      "grad_norm": 1.4236884117126465,
      "learning_rate": 1.3345206006931074e-05,
      "loss": 0.2965,
      "step": 6925
    },
    {
      "epoch": 0.6020965087065754,
      "grad_norm": 4.954479217529297,
      "learning_rate": 1.3273007316134001e-05,
      "loss": 0.2399,
      "step": 6950
    },
    {
      "epoch": 0.6042623234860954,
      "grad_norm": 1.1738444566726685,
      "learning_rate": 1.3200808625336928e-05,
      "loss": 0.2936,
      "step": 6975
    },
    {
      "epoch": 0.6064281382656155,
      "grad_norm": 6.822793006896973,
      "learning_rate": 1.3128609934539855e-05,
      "loss": 0.2674,
      "step": 7000
    },
    {
      "epoch": 0.6085939530451355,
      "grad_norm": 9.408463478088379,
      "learning_rate": 1.3056411243742781e-05,
      "loss": 0.265,
      "step": 7025
    },
    {
      "epoch": 0.6107597678246557,
      "grad_norm": 24.97877311706543,
      "learning_rate": 1.2984212552945706e-05,
      "loss": 0.3257,
      "step": 7050
    },
    {
      "epoch": 0.6129255826041757,
      "grad_norm": 2.854039192199707,
      "learning_rate": 1.2912013862148633e-05,
      "loss": 0.3504,
      "step": 7075
    },
    {
      "epoch": 0.6150913973836958,
      "grad_norm": 0.40900859236717224,
      "learning_rate": 1.283981517135156e-05,
      "loss": 0.1485,
      "step": 7100
    },
    {
      "epoch": 0.6172572121632158,
      "grad_norm": 5.776600360870361,
      "learning_rate": 1.2767616480554486e-05,
      "loss": 0.2598,
      "step": 7125
    },
    {
      "epoch": 0.6194230269427359,
      "grad_norm": 1.7507195472717285,
      "learning_rate": 1.2695417789757413e-05,
      "loss": 0.2838,
      "step": 7150
    },
    {
      "epoch": 0.6215888417222559,
      "grad_norm": 7.723363399505615,
      "learning_rate": 1.2623219098960338e-05,
      "loss": 0.3391,
      "step": 7175
    },
    {
      "epoch": 0.623754656501776,
      "grad_norm": 6.485815048217773,
      "learning_rate": 1.2551020408163267e-05,
      "loss": 0.3568,
      "step": 7200
    },
    {
      "epoch": 0.625920471281296,
      "grad_norm": 0.392874151468277,
      "learning_rate": 1.2481709664998075e-05,
      "loss": 0.3001,
      "step": 7225
    },
    {
      "epoch": 0.6280862860608161,
      "grad_norm": 1.3930811882019043,
      "learning_rate": 1.2409510974201001e-05,
      "loss": 0.2613,
      "step": 7250
    },
    {
      "epoch": 0.6302521008403361,
      "grad_norm": 0.3461158275604248,
      "learning_rate": 1.2337312283403928e-05,
      "loss": 0.3379,
      "step": 7275
    },
    {
      "epoch": 0.6324179156198562,
      "grad_norm": 3.489888906478882,
      "learning_rate": 1.2265113592606855e-05,
      "loss": 0.3347,
      "step": 7300
    },
    {
      "epoch": 0.6345837303993762,
      "grad_norm": 2.3235511779785156,
      "learning_rate": 1.219291490180978e-05,
      "loss": 0.242,
      "step": 7325
    },
    {
      "epoch": 0.6367495451788963,
      "grad_norm": 10.576093673706055,
      "learning_rate": 1.2120716211012708e-05,
      "loss": 0.3076,
      "step": 7350
    },
    {
      "epoch": 0.6389153599584163,
      "grad_norm": 4.862971305847168,
      "learning_rate": 1.2048517520215633e-05,
      "loss": 0.3055,
      "step": 7375
    },
    {
      "epoch": 0.6410811747379365,
      "grad_norm": 4.282524108886719,
      "learning_rate": 1.197631882941856e-05,
      "loss": 0.4014,
      "step": 7400
    },
    {
      "epoch": 0.6432469895174565,
      "grad_norm": 1.2869305610656738,
      "learning_rate": 1.1904120138621487e-05,
      "loss": 0.3723,
      "step": 7425
    },
    {
      "epoch": 0.6454128042969766,
      "grad_norm": 8.37488842010498,
      "learning_rate": 1.1831921447824414e-05,
      "loss": 0.3421,
      "step": 7450
    },
    {
      "epoch": 0.6475786190764966,
      "grad_norm": 8.292667388916016,
      "learning_rate": 1.175972275702734e-05,
      "loss": 0.4306,
      "step": 7475
    },
    {
      "epoch": 0.6497444338560167,
      "grad_norm": 7.678843975067139,
      "learning_rate": 1.1687524066230265e-05,
      "loss": 0.2536,
      "step": 7500
    },
    {
      "epoch": 0.6519102486355367,
      "grad_norm": 1.5608030557632446,
      "learning_rate": 1.1615325375433192e-05,
      "loss": 0.264,
      "step": 7525
    },
    {
      "epoch": 0.6540760634150568,
      "grad_norm": 7.649046897888184,
      "learning_rate": 1.1543126684636119e-05,
      "loss": 0.1767,
      "step": 7550
    },
    {
      "epoch": 0.6562418781945768,
      "grad_norm": 4.701557636260986,
      "learning_rate": 1.1470927993839045e-05,
      "loss": 0.259,
      "step": 7575
    },
    {
      "epoch": 0.6584076929740968,
      "grad_norm": 14.77114200592041,
      "learning_rate": 1.1398729303041972e-05,
      "loss": 0.2761,
      "step": 7600
    },
    {
      "epoch": 0.6605735077536169,
      "grad_norm": 0.08189712464809418,
      "learning_rate": 1.1326530612244897e-05,
      "loss": 0.2934,
      "step": 7625
    },
    {
      "epoch": 0.6627393225331369,
      "grad_norm": 8.246410369873047,
      "learning_rate": 1.1254331921447826e-05,
      "loss": 0.3055,
      "step": 7650
    },
    {
      "epoch": 0.664905137312657,
      "grad_norm": 2.8091800212860107,
      "learning_rate": 1.118213323065075e-05,
      "loss": 0.2532,
      "step": 7675
    },
    {
      "epoch": 0.667070952092177,
      "grad_norm": 8.43855094909668,
      "learning_rate": 1.1109934539853677e-05,
      "loss": 0.2942,
      "step": 7700
    },
    {
      "epoch": 0.6692367668716971,
      "grad_norm": 2.259917974472046,
      "learning_rate": 1.1037735849056604e-05,
      "loss": 0.2048,
      "step": 7725
    },
    {
      "epoch": 0.6714025816512171,
      "grad_norm": 13.296177864074707,
      "learning_rate": 1.096553715825953e-05,
      "loss": 0.2884,
      "step": 7750
    },
    {
      "epoch": 0.6735683964307373,
      "grad_norm": 7.745298862457275,
      "learning_rate": 1.0893338467462457e-05,
      "loss": 0.3598,
      "step": 7775
    },
    {
      "epoch": 0.6757342112102573,
      "grad_norm": 1.932173490524292,
      "learning_rate": 1.0821139776665382e-05,
      "loss": 0.3318,
      "step": 7800
    },
    {
      "epoch": 0.6779000259897774,
      "grad_norm": 7.833034515380859,
      "learning_rate": 1.0748941085868309e-05,
      "loss": 0.3058,
      "step": 7825
    },
    {
      "epoch": 0.6800658407692974,
      "grad_norm": 8.620037078857422,
      "learning_rate": 1.0676742395071238e-05,
      "loss": 0.3395,
      "step": 7850
    },
    {
      "epoch": 0.6822316555488175,
      "grad_norm": 8.948209762573242,
      "learning_rate": 1.0604543704274163e-05,
      "loss": 0.2973,
      "step": 7875
    },
    {
      "epoch": 0.6843974703283375,
      "grad_norm": 5.001883506774902,
      "learning_rate": 1.053234501347709e-05,
      "loss": 0.2741,
      "step": 7900
    },
    {
      "epoch": 0.6865632851078576,
      "grad_norm": 10.376258850097656,
      "learning_rate": 1.0460146322680016e-05,
      "loss": 0.2493,
      "step": 7925
    },
    {
      "epoch": 0.6887290998873776,
      "grad_norm": 9.021862030029297,
      "learning_rate": 1.0387947631882943e-05,
      "loss": 0.2966,
      "step": 7950
    },
    {
      "epoch": 0.6908949146668977,
      "grad_norm": 12.025108337402344,
      "learning_rate": 1.0315748941085868e-05,
      "loss": 0.3207,
      "step": 7975
    },
    {
      "epoch": 0.6930607294464177,
      "grad_norm": 0.8383066058158875,
      "learning_rate": 1.0243550250288794e-05,
      "loss": 0.2501,
      "step": 8000
    },
    {
      "epoch": 0.6952265442259378,
      "grad_norm": 2.6812140941619873,
      "learning_rate": 1.0171351559491723e-05,
      "loss": 0.4028,
      "step": 8025
    },
    {
      "epoch": 0.6973923590054578,
      "grad_norm": 11.301798820495605,
      "learning_rate": 1.0099152868694648e-05,
      "loss": 0.3549,
      "step": 8050
    },
    {
      "epoch": 0.6995581737849779,
      "grad_norm": 8.55245304107666,
      "learning_rate": 1.0026954177897575e-05,
      "loss": 0.3805,
      "step": 8075
    },
    {
      "epoch": 0.7017239885644979,
      "grad_norm": 1.9036015272140503,
      "learning_rate": 9.9547554871005e-06,
      "loss": 0.353,
      "step": 8100
    },
    {
      "epoch": 0.7038898033440181,
      "grad_norm": 1.0196151733398438,
      "learning_rate": 9.882556796303428e-06,
      "loss": 0.3569,
      "step": 8125
    },
    {
      "epoch": 0.7060556181235381,
      "grad_norm": 2.688908338546753,
      "learning_rate": 9.810358105506355e-06,
      "loss": 0.2588,
      "step": 8150
    },
    {
      "epoch": 0.7082214329030582,
      "grad_norm": 0.6335782408714294,
      "learning_rate": 9.73815941470928e-06,
      "loss": 0.2252,
      "step": 8175
    },
    {
      "epoch": 0.7103872476825782,
      "grad_norm": 4.539221286773682,
      "learning_rate": 9.665960723912206e-06,
      "loss": 0.2747,
      "step": 8200
    },
    {
      "epoch": 0.7125530624620983,
      "grad_norm": 8.757186889648438,
      "learning_rate": 9.593762033115133e-06,
      "loss": 0.3239,
      "step": 8225
    },
    {
      "epoch": 0.7147188772416183,
      "grad_norm": 1.7275235652923584,
      "learning_rate": 9.52156334231806e-06,
      "loss": 0.2954,
      "step": 8250
    },
    {
      "epoch": 0.7168846920211384,
      "grad_norm": 6.338670253753662,
      "learning_rate": 9.449364651520987e-06,
      "loss": 0.3749,
      "step": 8275
    },
    {
      "epoch": 0.7190505068006584,
      "grad_norm": 1.565496563911438,
      "learning_rate": 9.377165960723912e-06,
      "loss": 0.2757,
      "step": 8300
    },
    {
      "epoch": 0.7212163215801785,
      "grad_norm": 0.0664602667093277,
      "learning_rate": 9.30496726992684e-06,
      "loss": 0.3012,
      "step": 8325
    },
    {
      "epoch": 0.7233821363596985,
      "grad_norm": 10.375814437866211,
      "learning_rate": 9.232768579129765e-06,
      "loss": 0.2985,
      "step": 8350
    },
    {
      "epoch": 0.7255479511392185,
      "grad_norm": 16.607072830200195,
      "learning_rate": 9.160569888332692e-06,
      "loss": 0.2656,
      "step": 8375
    },
    {
      "epoch": 0.7277137659187386,
      "grad_norm": 0.6724597811698914,
      "learning_rate": 9.088371197535618e-06,
      "loss": 0.2007,
      "step": 8400
    },
    {
      "epoch": 0.7298795806982586,
      "grad_norm": 2.3397414684295654,
      "learning_rate": 9.016172506738545e-06,
      "loss": 0.2402,
      "step": 8425
    },
    {
      "epoch": 0.7320453954777787,
      "grad_norm": 11.172548294067383,
      "learning_rate": 8.943973815941472e-06,
      "loss": 0.3434,
      "step": 8450
    },
    {
      "epoch": 0.7342112102572987,
      "grad_norm": 12.031539916992188,
      "learning_rate": 8.871775125144397e-06,
      "loss": 0.2628,
      "step": 8475
    },
    {
      "epoch": 0.7363770250368189,
      "grad_norm": 0.37211769819259644,
      "learning_rate": 8.799576434347324e-06,
      "loss": 0.265,
      "step": 8500
    },
    {
      "epoch": 0.738542839816339,
      "grad_norm": 6.181528568267822,
      "learning_rate": 8.72737774355025e-06,
      "loss": 0.3748,
      "step": 8525
    },
    {
      "epoch": 0.740708654595859,
      "grad_norm": 2.7227742671966553,
      "learning_rate": 8.655179052753177e-06,
      "loss": 0.249,
      "step": 8550
    },
    {
      "epoch": 0.742874469375379,
      "grad_norm": 7.977476596832275,
      "learning_rate": 8.582980361956104e-06,
      "loss": 0.3375,
      "step": 8575
    },
    {
      "epoch": 0.7450402841548991,
      "grad_norm": 11.404130935668945,
      "learning_rate": 8.510781671159029e-06,
      "loss": 0.3336,
      "step": 8600
    },
    {
      "epoch": 0.7472060989344191,
      "grad_norm": 0.4421218931674957,
      "learning_rate": 8.438582980361957e-06,
      "loss": 0.3702,
      "step": 8625
    },
    {
      "epoch": 0.7493719137139392,
      "grad_norm": 4.386607646942139,
      "learning_rate": 8.366384289564882e-06,
      "loss": 0.3494,
      "step": 8650
    },
    {
      "epoch": 0.7515377284934592,
      "grad_norm": 5.428525924682617,
      "learning_rate": 8.294185598767809e-06,
      "loss": 0.2996,
      "step": 8675
    },
    {
      "epoch": 0.7537035432729793,
      "grad_norm": 0.3034394085407257,
      "learning_rate": 8.221986907970736e-06,
      "loss": 0.2433,
      "step": 8700
    },
    {
      "epoch": 0.7558693580524993,
      "grad_norm": 3.75878643989563,
      "learning_rate": 8.149788217173662e-06,
      "loss": 0.3027,
      "step": 8725
    },
    {
      "epoch": 0.7580351728320194,
      "grad_norm": 9.965909004211426,
      "learning_rate": 8.077589526376589e-06,
      "loss": 0.382,
      "step": 8750
    },
    {
      "epoch": 0.7602009876115394,
      "grad_norm": 7.314566135406494,
      "learning_rate": 8.005390835579514e-06,
      "loss": 0.2874,
      "step": 8775
    },
    {
      "epoch": 0.7623668023910595,
      "grad_norm": 8.704547882080078,
      "learning_rate": 7.93319214478244e-06,
      "loss": 0.2737,
      "step": 8800
    },
    {
      "epoch": 0.7645326171705795,
      "grad_norm": 10.275945663452148,
      "learning_rate": 7.86099345398537e-06,
      "loss": 0.3212,
      "step": 8825
    },
    {
      "epoch": 0.7666984319500997,
      "grad_norm": 4.1912641525268555,
      "learning_rate": 7.788794763188294e-06,
      "loss": 0.3475,
      "step": 8850
    },
    {
      "epoch": 0.7688642467296197,
      "grad_norm": 10.281148910522461,
      "learning_rate": 7.716596072391221e-06,
      "loss": 0.221,
      "step": 8875
    },
    {
      "epoch": 0.7710300615091398,
      "grad_norm": 9.613810539245605,
      "learning_rate": 7.644397381594146e-06,
      "loss": 0.2587,
      "step": 8900
    },
    {
      "epoch": 0.7731958762886598,
      "grad_norm": 1.2200976610183716,
      "learning_rate": 7.572198690797074e-06,
      "loss": 0.2852,
      "step": 8925
    },
    {
      "epoch": 0.7753616910681799,
      "grad_norm": 2.445672035217285,
      "learning_rate": 7.5e-06,
      "loss": 0.3837,
      "step": 8950
    },
    {
      "epoch": 0.7775275058476999,
      "grad_norm": 13.744851112365723,
      "learning_rate": 7.427801309202927e-06,
      "loss": 0.2333,
      "step": 8975
    },
    {
      "epoch": 0.77969332062722,
      "grad_norm": 4.426064968109131,
      "learning_rate": 7.355602618405853e-06,
      "loss": 0.3036,
      "step": 9000
    },
    {
      "epoch": 0.78185913540674,
      "grad_norm": 8.329988479614258,
      "learning_rate": 7.28340392760878e-06,
      "loss": 0.3287,
      "step": 9025
    },
    {
      "epoch": 0.7840249501862601,
      "grad_norm": 4.122848987579346,
      "learning_rate": 7.211205236811706e-06,
      "loss": 0.3248,
      "step": 9050
    },
    {
      "epoch": 0.7861907649657801,
      "grad_norm": 6.127285480499268,
      "learning_rate": 7.139006546014633e-06,
      "loss": 0.2395,
      "step": 9075
    },
    {
      "epoch": 0.7883565797453002,
      "grad_norm": 1.6887600421905518,
      "learning_rate": 7.066807855217559e-06,
      "loss": 0.2647,
      "step": 9100
    },
    {
      "epoch": 0.7905223945248202,
      "grad_norm": 1.4300670623779297,
      "learning_rate": 6.9946091644204855e-06,
      "loss": 0.3345,
      "step": 9125
    },
    {
      "epoch": 0.7926882093043403,
      "grad_norm": 9.334101676940918,
      "learning_rate": 6.922410473623411e-06,
      "loss": 0.3421,
      "step": 9150
    },
    {
      "epoch": 0.7948540240838603,
      "grad_norm": 6.996714115142822,
      "learning_rate": 6.850211782826339e-06,
|
"loss": 0.3496, |
|
"step": 9175 |
|
}, |
|
{ |
|
"epoch": 0.7970198388633805, |
|
"grad_norm": 8.47280216217041, |
|
"learning_rate": 6.778013092029265e-06, |
|
"loss": 0.253, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.7991856536429005, |
|
"grad_norm": 3.239483118057251, |
|
"learning_rate": 6.708702348864075e-06, |
|
"loss": 0.3462, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 0.8013514684224206, |
|
"grad_norm": 1.6153030395507812, |
|
"learning_rate": 6.6365036580670006e-06, |
|
"loss": 0.2688, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.8035172832019406, |
|
"grad_norm": 5.316878795623779, |
|
"learning_rate": 6.564304967269927e-06, |
|
"loss": 0.3301, |
|
"step": 9275 |
|
}, |
|
{ |
|
"epoch": 0.8056830979814606, |
|
"grad_norm": 8.06822395324707, |
|
"learning_rate": 6.492106276472853e-06, |
|
"loss": 0.3382, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.8078489127609807, |
|
"grad_norm": 2.8038644790649414, |
|
"learning_rate": 6.41990758567578e-06, |
|
"loss": 0.2219, |
|
"step": 9325 |
|
}, |
|
{ |
|
"epoch": 0.8100147275405007, |
|
"grad_norm": 5.063823223114014, |
|
"learning_rate": 6.3477088948787066e-06, |
|
"loss": 0.278, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.8121805423200208, |
|
"grad_norm": 6.974782466888428, |
|
"learning_rate": 6.275510204081633e-06, |
|
"loss": 0.2338, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 0.8143463570995408, |
|
"grad_norm": 2.8085834980010986, |
|
"learning_rate": 6.203311513284559e-06, |
|
"loss": 0.2732, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.8165121718790609, |
|
"grad_norm": 12.976601600646973, |
|
"learning_rate": 6.131112822487486e-06, |
|
"loss": 0.2973, |
|
"step": 9425 |
|
}, |
|
{ |
|
"epoch": 0.8186779866585809, |
|
"grad_norm": 2.7448630332946777, |
|
"learning_rate": 6.058914131690412e-06, |
|
"loss": 0.2783, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.820843801438101, |
|
"grad_norm": 2.347792387008667, |
|
"learning_rate": 5.986715440893339e-06, |
|
"loss": 0.2418, |
|
"step": 9475 |
|
}, |
|
{ |
|
"epoch": 0.823009616217621, |
|
"grad_norm": 2.851559638977051, |
|
"learning_rate": 5.914516750096265e-06, |
|
"loss": 0.2603, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8251754309971411, |
|
"grad_norm": 6.941406726837158, |
|
"learning_rate": 5.842318059299192e-06, |
|
"loss": 0.1888, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 0.8273412457766611, |
|
"grad_norm": 4.45375394821167, |
|
"learning_rate": 5.770119368502118e-06, |
|
"loss": 0.2581, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.8295070605561813, |
|
"grad_norm": 5.2709641456604, |
|
"learning_rate": 5.6979206777050444e-06, |
|
"loss": 0.2742, |
|
"step": 9575 |
|
}, |
|
{ |
|
"epoch": 0.8316728753357013, |
|
"grad_norm": 2.6814463138580322, |
|
"learning_rate": 5.62572198690797e-06, |
|
"loss": 0.2156, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.8338386901152214, |
|
"grad_norm": 0.12416364997625351, |
|
"learning_rate": 5.553523296110898e-06, |
|
"loss": 0.3317, |
|
"step": 9625 |
|
}, |
|
{ |
|
"epoch": 0.8360045048947414, |
|
"grad_norm": 5.639218807220459, |
|
"learning_rate": 5.481324605313824e-06, |
|
"loss": 0.1967, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.8381703196742615, |
|
"grad_norm": 0.8800064921379089, |
|
"learning_rate": 5.4091259145167504e-06, |
|
"loss": 0.1701, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 0.8403361344537815, |
|
"grad_norm": 2.7125442028045654, |
|
"learning_rate": 5.336927223719676e-06, |
|
"loss": 0.3064, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.8425019492333016, |
|
"grad_norm": 3.1365272998809814, |
|
"learning_rate": 5.264728532922603e-06, |
|
"loss": 0.3511, |
|
"step": 9725 |
|
}, |
|
{ |
|
"epoch": 0.8446677640128216, |
|
"grad_norm": 10.584244728088379, |
|
"learning_rate": 5.19252984212553e-06, |
|
"loss": 0.2461, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.8468335787923417, |
|
"grad_norm": 0.7926290035247803, |
|
"learning_rate": 5.1203311513284565e-06, |
|
"loss": 0.3047, |
|
"step": 9775 |
|
}, |
|
{ |
|
"epoch": 0.8489993935718617, |
|
"grad_norm": 10.744616508483887, |
|
"learning_rate": 5.048132460531382e-06, |
|
"loss": 0.3234, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.8511652083513818, |
|
"grad_norm": 3.9436535835266113, |
|
"learning_rate": 4.975933769734309e-06, |
|
"loss": 0.2843, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 0.8533310231309018, |
|
"grad_norm": 0.2785266637802124, |
|
"learning_rate": 4.903735078937235e-06, |
|
"loss": 0.3365, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.8554968379104219, |
|
"grad_norm": 7.446309566497803, |
|
"learning_rate": 4.831536388140162e-06, |
|
"loss": 0.3802, |
|
"step": 9875 |
|
}, |
|
{ |
|
"epoch": 0.8576626526899419, |
|
"grad_norm": 9.687524795532227, |
|
"learning_rate": 4.759337697343088e-06, |
|
"loss": 0.2587, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.8598284674694621, |
|
"grad_norm": 0.4837453067302704, |
|
"learning_rate": 4.687139006546015e-06, |
|
"loss": 0.2367, |
|
"step": 9925 |
|
}, |
|
{ |
|
"epoch": 0.8619942822489821, |
|
"grad_norm": 0.7170611023902893, |
|
"learning_rate": 4.614940315748941e-06, |
|
"loss": 0.2971, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.8641600970285022, |
|
"grad_norm": 16.417407989501953, |
|
"learning_rate": 4.542741624951868e-06, |
|
"loss": 0.2884, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 0.8663259118080222, |
|
"grad_norm": 7.771174430847168, |
|
"learning_rate": 4.4705429341547935e-06, |
|
"loss": 0.2296, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8684917265875423, |
|
"grad_norm": 1.540907859802246, |
|
"learning_rate": 4.398344243357721e-06, |
|
"loss": 0.3145, |
|
"step": 10025 |
|
}, |
|
{ |
|
"epoch": 0.8706575413670623, |
|
"grad_norm": 1.4157791137695312, |
|
"learning_rate": 4.326145552560647e-06, |
|
"loss": 0.178, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.8728233561465824, |
|
"grad_norm": 4.707205295562744, |
|
"learning_rate": 4.253946861763574e-06, |
|
"loss": 0.2681, |
|
"step": 10075 |
|
}, |
|
{ |
|
"epoch": 0.8749891709261024, |
|
"grad_norm": 3.7186520099639893, |
|
"learning_rate": 4.1817481709664995e-06, |
|
"loss": 0.3191, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.8771549857056224, |
|
"grad_norm": 1.6584956645965576, |
|
"learning_rate": 4.109549480169426e-06, |
|
"loss": 0.2544, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 0.8793208004851425, |
|
"grad_norm": 9.22360610961914, |
|
"learning_rate": 4.037350789372352e-06, |
|
"loss": 0.2965, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.8814866152646625, |
|
"grad_norm": 3.5934746265411377, |
|
"learning_rate": 3.96515209857528e-06, |
|
"loss": 0.317, |
|
"step": 10175 |
|
}, |
|
{ |
|
"epoch": 0.8836524300441826, |
|
"grad_norm": 1.5978528261184692, |
|
"learning_rate": 3.892953407778206e-06, |
|
"loss": 0.2149, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.8858182448237026, |
|
"grad_norm": 4.726417064666748, |
|
"learning_rate": 3.820754716981132e-06, |
|
"loss": 0.4876, |
|
"step": 10225 |
|
}, |
|
{ |
|
"epoch": 0.8879840596032227, |
|
"grad_norm": 7.836237907409668, |
|
"learning_rate": 3.7485560261840585e-06, |
|
"loss": 0.2984, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.8901498743827427, |
|
"grad_norm": 6.5479912757873535, |
|
"learning_rate": 3.676357335386985e-06, |
|
"loss": 0.3024, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 0.8923156891622629, |
|
"grad_norm": 1.180179476737976, |
|
"learning_rate": 3.6041586445899115e-06, |
|
"loss": 0.2447, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.8944815039417829, |
|
"grad_norm": 5.868828773498535, |
|
"learning_rate": 3.5319599537928378e-06, |
|
"loss": 0.2684, |
|
"step": 10325 |
|
}, |
|
{ |
|
"epoch": 0.896647318721303, |
|
"grad_norm": 6.2655816078186035, |
|
"learning_rate": 3.4597612629957645e-06, |
|
"loss": 0.1714, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.898813133500823, |
|
"grad_norm": 6.3384270668029785, |
|
"learning_rate": 3.387562572198691e-06, |
|
"loss": 0.2776, |
|
"step": 10375 |
|
}, |
|
{ |
|
"epoch": 0.9009789482803431, |
|
"grad_norm": 6.097102165222168, |
|
"learning_rate": 3.315363881401617e-06, |
|
"loss": 0.2745, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.9031447630598631, |
|
"grad_norm": 7.250086784362793, |
|
"learning_rate": 3.243165190604544e-06, |
|
"loss": 0.3299, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 0.9053105778393832, |
|
"grad_norm": 9.260988235473633, |
|
"learning_rate": 3.17096649980747e-06, |
|
"loss": 0.2629, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.9074763926189032, |
|
"grad_norm": 8.009949684143066, |
|
"learning_rate": 3.0987678090103964e-06, |
|
"loss": 0.3627, |
|
"step": 10475 |
|
}, |
|
{ |
|
"epoch": 0.9096422073984233, |
|
"grad_norm": 1.247878074645996, |
|
"learning_rate": 3.026569118213323e-06, |
|
"loss": 0.2236, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9118080221779433, |
|
"grad_norm": 6.759634971618652, |
|
"learning_rate": 2.9543704274162494e-06, |
|
"loss": 0.2819, |
|
"step": 10525 |
|
}, |
|
{ |
|
"epoch": 0.9139738369574634, |
|
"grad_norm": 0.09837600588798523, |
|
"learning_rate": 2.882171736619176e-06, |
|
"loss": 0.3129, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.9161396517369834, |
|
"grad_norm": 6.850848197937012, |
|
"learning_rate": 2.8099730458221024e-06, |
|
"loss": 0.3051, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 0.9183054665165035, |
|
"grad_norm": 8.94210147857666, |
|
"learning_rate": 2.7377743550250287e-06, |
|
"loss": 0.3955, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.9204712812960235, |
|
"grad_norm": 8.595787048339844, |
|
"learning_rate": 2.6655756642279554e-06, |
|
"loss": 0.2493, |
|
"step": 10625 |
|
}, |
|
{ |
|
"epoch": 0.9226370960755437, |
|
"grad_norm": 7.062394618988037, |
|
"learning_rate": 2.5933769734308817e-06, |
|
"loss": 0.2543, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.9248029108550637, |
|
"grad_norm": 3.371393918991089, |
|
"learning_rate": 2.521178282633808e-06, |
|
"loss": 0.2222, |
|
"step": 10675 |
|
}, |
|
{ |
|
"epoch": 0.9269687256345838, |
|
"grad_norm": 1.3468866348266602, |
|
"learning_rate": 2.4489795918367347e-06, |
|
"loss": 0.2823, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.9291345404141038, |
|
"grad_norm": 15.475239753723145, |
|
"learning_rate": 2.376780901039661e-06, |
|
"loss": 0.3098, |
|
"step": 10725 |
|
}, |
|
{ |
|
"epoch": 0.9313003551936239, |
|
"grad_norm": 6.605096340179443, |
|
"learning_rate": 2.3045822102425877e-06, |
|
"loss": 0.3009, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.9334661699731439, |
|
"grad_norm": 3.2146847248077393, |
|
"learning_rate": 2.2323835194455144e-06, |
|
"loss": 0.2623, |
|
"step": 10775 |
|
}, |
|
{ |
|
"epoch": 0.935631984752664, |
|
"grad_norm": 2.727200508117676, |
|
"learning_rate": 2.1601848286484407e-06, |
|
"loss": 0.1952, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.937797799532184, |
|
"grad_norm": 2.7418553829193115, |
|
"learning_rate": 2.0879861378513674e-06, |
|
"loss": 0.4527, |
|
"step": 10825 |
|
}, |
|
{ |
|
"epoch": 0.939963614311704, |
|
"grad_norm": 8.577201843261719, |
|
"learning_rate": 2.0157874470542937e-06, |
|
"loss": 0.2323, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.9421294290912241, |
|
"grad_norm": 4.514817237854004, |
|
"learning_rate": 1.94358875625722e-06, |
|
"loss": 0.3109, |
|
"step": 10875 |
|
}, |
|
{ |
|
"epoch": 0.9442952438707441, |
|
"grad_norm": 10.761394500732422, |
|
"learning_rate": 1.8713900654601463e-06, |
|
"loss": 0.3335, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.9464610586502642, |
|
"grad_norm": 8.004775047302246, |
|
"learning_rate": 1.7991913746630728e-06, |
|
"loss": 0.2862, |
|
"step": 10925 |
|
}, |
|
{ |
|
"epoch": 0.9486268734297842, |
|
"grad_norm": 7.491416931152344, |
|
"learning_rate": 1.7269926838659993e-06, |
|
"loss": 0.4005, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.9507926882093043, |
|
"grad_norm": 6.168478488922119, |
|
"learning_rate": 1.6547939930689255e-06, |
|
"loss": 0.2815, |
|
"step": 10975 |
|
}, |
|
{ |
|
"epoch": 0.9529585029888243, |
|
"grad_norm": 7.221772193908691, |
|
"learning_rate": 1.582595302271852e-06, |
|
"loss": 0.2157, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9551243177683445, |
|
"grad_norm": 5.9744086265563965, |
|
"learning_rate": 1.5103966114747788e-06, |
|
"loss": 0.3733, |
|
"step": 11025 |
|
}, |
|
{ |
|
"epoch": 0.9572901325478645, |
|
"grad_norm": 5.776475429534912, |
|
"learning_rate": 1.4381979206777053e-06, |
|
"loss": 0.2843, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.9594559473273846, |
|
"grad_norm": 1.3870640993118286, |
|
"learning_rate": 1.3659992298806316e-06, |
|
"loss": 0.1963, |
|
"step": 11075 |
|
}, |
|
{ |
|
"epoch": 0.9616217621069046, |
|
"grad_norm": 7.3776535987854, |
|
"learning_rate": 1.293800539083558e-06, |
|
"loss": 0.3081, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9637875768864247, |
|
"grad_norm": 11.289216995239258, |
|
"learning_rate": 1.2216018482864846e-06, |
|
"loss": 0.2317, |
|
"step": 11125 |
|
}, |
|
{ |
|
"epoch": 0.9659533916659447, |
|
"grad_norm": 11.621864318847656, |
|
"learning_rate": 1.1494031574894108e-06, |
|
"loss": 0.3027, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.9681192064454648, |
|
"grad_norm": 11.617834091186523, |
|
"learning_rate": 1.0772044666923373e-06, |
|
"loss": 0.3581, |
|
"step": 11175 |
|
}, |
|
{ |
|
"epoch": 0.9702850212249848, |
|
"grad_norm": 5.500637531280518, |
|
"learning_rate": 1.0050057758952638e-06, |
|
"loss": 0.3, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.9724508360045049, |
|
"grad_norm": 3.552578926086426, |
|
"learning_rate": 9.328070850981902e-07, |
|
"loss": 0.2797, |
|
"step": 11225 |
|
}, |
|
{ |
|
"epoch": 0.9746166507840249, |
|
"grad_norm": 1.074208378791809, |
|
"learning_rate": 8.606083943011167e-07, |
|
"loss": 0.2918, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.976782465563545, |
|
"grad_norm": 11.449936866760254, |
|
"learning_rate": 7.884097035040431e-07, |
|
"loss": 0.2519, |
|
"step": 11275 |
|
}, |
|
{ |
|
"epoch": 0.978948280343065, |
|
"grad_norm": 2.988003730773926, |
|
"learning_rate": 7.162110127069696e-07, |
|
"loss": 0.2183, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9811140951225851, |
|
"grad_norm": 2.9280929565429688, |
|
"learning_rate": 6.44012321909896e-07, |
|
"loss": 0.2764, |
|
"step": 11325 |
|
}, |
|
{ |
|
"epoch": 0.9832799099021051, |
|
"grad_norm": 3.2279105186462402, |
|
"learning_rate": 5.718136311128224e-07, |
|
"loss": 0.4107, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.9854457246816253, |
|
"grad_norm": 2.54160737991333, |
|
"learning_rate": 4.996149403157489e-07, |
|
"loss": 0.3135, |
|
"step": 11375 |
|
}, |
|
{ |
|
"epoch": 0.9876115394611453, |
|
"grad_norm": 1.3068925142288208, |
|
"learning_rate": 4.2741624951867543e-07, |
|
"loss": 0.2138, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.9897773542406654, |
|
"grad_norm": 8.606940269470215, |
|
"learning_rate": 3.5521755872160183e-07, |
|
"loss": 0.2984, |
|
"step": 11425 |
|
}, |
|
{ |
|
"epoch": 0.9919431690201854, |
|
"grad_norm": 1.2513303756713867, |
|
"learning_rate": 2.830188679245283e-07, |
|
"loss": 0.2407, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.9941089837997055, |
|
"grad_norm": 11.340466499328613, |
|
"learning_rate": 2.1082017712745478e-07, |
|
"loss": 0.2449, |
|
"step": 11475 |
|
}, |
|
{ |
|
"epoch": 0.9962747985792255, |
|
"grad_norm": 6.166193008422852, |
|
"learning_rate": 1.386214863303812e-07, |
|
"loss": 0.2629, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.9984406133587456, |
|
"grad_norm": 4.004662990570068, |
|
"learning_rate": 6.642279553330766e-08, |
|
"loss": 0.3488, |
|
"step": 11525 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_cosine_accuracy": 0.9693415637860082, |
|
"eval_loss": 0.4268312156200409, |
|
"eval_runtime": 50.4023, |
|
"eval_samples_per_second": 96.424, |
|
"eval_steps_per_second": 6.031, |
|
"step": 11543 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 11543, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.01 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|