|
{ |
|
"best_metric": 0.798653244972229, |
|
"best_model_checkpoint": "FastCoderL4-ITX/checkpoint-500", |
|
"epoch": 1.0, |
|
"eval_steps": 250, |
|
"global_step": 547, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0018281535648994515, |
|
"grad_norm": 16.024444580078125, |
|
"learning_rate": 1.2000000000000002e-07, |
|
"loss": 1.6383, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003656307129798903, |
|
"grad_norm": 16.114477157592773, |
|
"learning_rate": 2.4000000000000003e-07, |
|
"loss": 1.7323, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005484460694698354, |
|
"grad_norm": 14.292167663574219, |
|
"learning_rate": 3.6e-07, |
|
"loss": 1.4207, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007312614259597806, |
|
"grad_norm": 15.010176658630371, |
|
"learning_rate": 4.800000000000001e-07, |
|
"loss": 1.5956, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009140767824497258, |
|
"grad_norm": 13.827630996704102, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 1.49, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.010968921389396709, |
|
"grad_norm": 15.43071174621582, |
|
"learning_rate": 7.2e-07, |
|
"loss": 1.6081, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.012797074954296161, |
|
"grad_norm": 14.97592544555664, |
|
"learning_rate": 8.4e-07, |
|
"loss": 1.6164, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.014625228519195612, |
|
"grad_norm": 11.73971939086914, |
|
"learning_rate": 9.600000000000001e-07, |
|
"loss": 1.4299, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.016453382084095063, |
|
"grad_norm": 12.449714660644531, |
|
"learning_rate": 1.08e-06, |
|
"loss": 1.3328, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.018281535648994516, |
|
"grad_norm": 12.710100173950195, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 1.4129, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02010968921389397, |
|
"grad_norm": 12.13203239440918, |
|
"learning_rate": 1.3199999999999999e-06, |
|
"loss": 1.3971, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.021937842778793418, |
|
"grad_norm": 10.500185012817383, |
|
"learning_rate": 1.44e-06, |
|
"loss": 1.4321, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02376599634369287, |
|
"grad_norm": 10.064560890197754, |
|
"learning_rate": 1.5599999999999999e-06, |
|
"loss": 1.2872, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.025594149908592323, |
|
"grad_norm": 7.85143518447876, |
|
"learning_rate": 1.68e-06, |
|
"loss": 1.2345, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.027422303473491772, |
|
"grad_norm": 7.530126094818115, |
|
"learning_rate": 1.8e-06, |
|
"loss": 1.1803, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.029250457038391225, |
|
"grad_norm": 6.091775417327881, |
|
"learning_rate": 1.9200000000000003e-06, |
|
"loss": 1.2247, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.031078610603290677, |
|
"grad_norm": 4.9651384353637695, |
|
"learning_rate": 2.0400000000000004e-06, |
|
"loss": 1.1655, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03290676416819013, |
|
"grad_norm": 6.209571361541748, |
|
"learning_rate": 2.16e-06, |
|
"loss": 1.0649, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03473491773308958, |
|
"grad_norm": 4.946502208709717, |
|
"learning_rate": 2.28e-06, |
|
"loss": 1.1046, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03656307129798903, |
|
"grad_norm": 4.954932689666748, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 1.0964, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.038391224862888484, |
|
"grad_norm": 3.8354671001434326, |
|
"learning_rate": 2.52e-06, |
|
"loss": 1.2277, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.04021937842778794, |
|
"grad_norm": 4.310220718383789, |
|
"learning_rate": 2.6399999999999997e-06, |
|
"loss": 1.042, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04204753199268738, |
|
"grad_norm": 3.9748997688293457, |
|
"learning_rate": 2.76e-06, |
|
"loss": 1.0234, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.043875685557586835, |
|
"grad_norm": 3.9019360542297363, |
|
"learning_rate": 2.88e-06, |
|
"loss": 1.1286, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04570383912248629, |
|
"grad_norm": 4.246694564819336, |
|
"learning_rate": 3e-06, |
|
"loss": 0.9793, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04753199268738574, |
|
"grad_norm": 3.8797051906585693, |
|
"learning_rate": 3.1199999999999998e-06, |
|
"loss": 1.0747, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.04936014625228519, |
|
"grad_norm": 4.0023908615112305, |
|
"learning_rate": 3.24e-06, |
|
"loss": 1.1031, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.051188299817184646, |
|
"grad_norm": 4.26245641708374, |
|
"learning_rate": 3.36e-06, |
|
"loss": 1.003, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05301645338208409, |
|
"grad_norm": 4.6040215492248535, |
|
"learning_rate": 3.48e-06, |
|
"loss": 0.9311, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.054844606946983544, |
|
"grad_norm": 4.464705467224121, |
|
"learning_rate": 3.6e-06, |
|
"loss": 1.0341, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.056672760511883, |
|
"grad_norm": 3.787562608718872, |
|
"learning_rate": 3.72e-06, |
|
"loss": 0.984, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.05850091407678245, |
|
"grad_norm": 3.2259016036987305, |
|
"learning_rate": 3.8400000000000005e-06, |
|
"loss": 0.9167, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0603290676416819, |
|
"grad_norm": 3.7597789764404297, |
|
"learning_rate": 3.96e-06, |
|
"loss": 1.0784, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.062157221206581355, |
|
"grad_norm": 3.173090934753418, |
|
"learning_rate": 4.080000000000001e-06, |
|
"loss": 0.9436, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06398537477148081, |
|
"grad_norm": 3.336909055709839, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 0.8013, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06581352833638025, |
|
"grad_norm": 2.738156318664551, |
|
"learning_rate": 4.32e-06, |
|
"loss": 1.1238, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.06764168190127971, |
|
"grad_norm": 3.3270339965820312, |
|
"learning_rate": 4.44e-06, |
|
"loss": 0.8423, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.06946983546617916, |
|
"grad_norm": 2.872663736343384, |
|
"learning_rate": 4.56e-06, |
|
"loss": 0.9931, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0712979890310786, |
|
"grad_norm": 3.2571451663970947, |
|
"learning_rate": 4.68e-06, |
|
"loss": 0.9323, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07312614259597806, |
|
"grad_norm": 2.999234437942505, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.9247, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07495429616087751, |
|
"grad_norm": 2.9580419063568115, |
|
"learning_rate": 4.92e-06, |
|
"loss": 0.8751, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.07678244972577697, |
|
"grad_norm": 2.8437395095825195, |
|
"learning_rate": 5.04e-06, |
|
"loss": 0.8857, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.07861060329067641, |
|
"grad_norm": 3.175656318664551, |
|
"learning_rate": 5.16e-06, |
|
"loss": 0.8942, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.08043875685557587, |
|
"grad_norm": 2.684788703918457, |
|
"learning_rate": 5.279999999999999e-06, |
|
"loss": 0.8725, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08226691042047532, |
|
"grad_norm": 3.000286340713501, |
|
"learning_rate": 5.4e-06, |
|
"loss": 0.8803, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08409506398537477, |
|
"grad_norm": 2.856066942214966, |
|
"learning_rate": 5.52e-06, |
|
"loss": 0.9705, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08592321755027423, |
|
"grad_norm": 3.0575389862060547, |
|
"learning_rate": 5.64e-06, |
|
"loss": 0.8106, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08775137111517367, |
|
"grad_norm": 2.649608612060547, |
|
"learning_rate": 5.76e-06, |
|
"loss": 1.0701, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.08957952468007313, |
|
"grad_norm": 3.1014580726623535, |
|
"learning_rate": 5.8800000000000005e-06, |
|
"loss": 0.9607, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09140767824497258, |
|
"grad_norm": 2.6570193767547607, |
|
"learning_rate": 6e-06, |
|
"loss": 0.9685, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09323583180987204, |
|
"grad_norm": 3.082258462905884, |
|
"learning_rate": 6.12e-06, |
|
"loss": 1.0039, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.09506398537477148, |
|
"grad_norm": 2.4003512859344482, |
|
"learning_rate": 6.2399999999999995e-06, |
|
"loss": 0.8934, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.09689213893967093, |
|
"grad_norm": 2.605583667755127, |
|
"learning_rate": 6.36e-06, |
|
"loss": 0.8891, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.09872029250457039, |
|
"grad_norm": 2.541799306869507, |
|
"learning_rate": 6.48e-06, |
|
"loss": 0.8183, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10054844606946983, |
|
"grad_norm": 2.594459056854248, |
|
"learning_rate": 6.6e-06, |
|
"loss": 0.9906, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10237659963436929, |
|
"grad_norm": 2.9506289958953857, |
|
"learning_rate": 6.72e-06, |
|
"loss": 0.8263, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10420475319926874, |
|
"grad_norm": 2.8362669944763184, |
|
"learning_rate": 6.840000000000001e-06, |
|
"loss": 0.9, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.10603290676416818, |
|
"grad_norm": 2.6192896366119385, |
|
"learning_rate": 6.96e-06, |
|
"loss": 1.05, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.10786106032906764, |
|
"grad_norm": 2.7502949237823486, |
|
"learning_rate": 7.08e-06, |
|
"loss": 0.87, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.10968921389396709, |
|
"grad_norm": 2.6745474338531494, |
|
"learning_rate": 7.2e-06, |
|
"loss": 0.8163, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11151736745886655, |
|
"grad_norm": 2.6584086418151855, |
|
"learning_rate": 7.32e-06, |
|
"loss": 0.8813, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.113345521023766, |
|
"grad_norm": 2.689574956893921, |
|
"learning_rate": 7.44e-06, |
|
"loss": 0.9404, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.11517367458866545, |
|
"grad_norm": 2.754441738128662, |
|
"learning_rate": 7.5600000000000005e-06, |
|
"loss": 0.7416, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1170018281535649, |
|
"grad_norm": 2.8178014755249023, |
|
"learning_rate": 7.680000000000001e-06, |
|
"loss": 0.8377, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11882998171846434, |
|
"grad_norm": 2.8821122646331787, |
|
"learning_rate": 7.8e-06, |
|
"loss": 0.7101, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1206581352833638, |
|
"grad_norm": 2.6646909713745117, |
|
"learning_rate": 7.92e-06, |
|
"loss": 1.0581, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.12248628884826325, |
|
"grad_norm": 2.9155476093292236, |
|
"learning_rate": 8.040000000000001e-06, |
|
"loss": 0.8417, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12431444241316271, |
|
"grad_norm": 2.7877771854400635, |
|
"learning_rate": 8.160000000000001e-06, |
|
"loss": 0.9266, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12614259597806216, |
|
"grad_norm": 2.625126361846924, |
|
"learning_rate": 8.28e-06, |
|
"loss": 1.0048, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.12797074954296161, |
|
"grad_norm": 2.7259960174560547, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.9485, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12979890310786105, |
|
"grad_norm": 2.743478536605835, |
|
"learning_rate": 8.52e-06, |
|
"loss": 0.9221, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1316270566727605, |
|
"grad_norm": 2.586174964904785, |
|
"learning_rate": 8.64e-06, |
|
"loss": 0.8967, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13345521023765997, |
|
"grad_norm": 2.817873954772949, |
|
"learning_rate": 8.759999999999999e-06, |
|
"loss": 0.943, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.13528336380255943, |
|
"grad_norm": 2.692861557006836, |
|
"learning_rate": 8.88e-06, |
|
"loss": 0.8334, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.13711151736745886, |
|
"grad_norm": 2.9305572509765625, |
|
"learning_rate": 9e-06, |
|
"loss": 0.8215, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.13893967093235832, |
|
"grad_norm": 2.898930072784424, |
|
"learning_rate": 9.12e-06, |
|
"loss": 0.8979, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.14076782449725778, |
|
"grad_norm": 2.8066327571868896, |
|
"learning_rate": 9.24e-06, |
|
"loss": 1.0717, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1425959780621572, |
|
"grad_norm": 3.126624584197998, |
|
"learning_rate": 9.36e-06, |
|
"loss": 0.8887, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14442413162705667, |
|
"grad_norm": 2.469200611114502, |
|
"learning_rate": 9.48e-06, |
|
"loss": 0.9542, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.14625228519195613, |
|
"grad_norm": 2.6940770149230957, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.9756, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1480804387568556, |
|
"grad_norm": 2.847891330718994, |
|
"learning_rate": 9.72e-06, |
|
"loss": 0.8966, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.14990859232175502, |
|
"grad_norm": 2.9159109592437744, |
|
"learning_rate": 9.84e-06, |
|
"loss": 0.8055, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.15173674588665448, |
|
"grad_norm": 2.9693570137023926, |
|
"learning_rate": 9.960000000000001e-06, |
|
"loss": 0.8913, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.15356489945155394, |
|
"grad_norm": 2.6382272243499756, |
|
"learning_rate": 1.008e-05, |
|
"loss": 0.8565, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15539305301645337, |
|
"grad_norm": 2.7299423217773438, |
|
"learning_rate": 1.02e-05, |
|
"loss": 0.8096, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15722120658135283, |
|
"grad_norm": 2.7661237716674805, |
|
"learning_rate": 1.032e-05, |
|
"loss": 0.9193, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.1590493601462523, |
|
"grad_norm": 3.0896854400634766, |
|
"learning_rate": 1.044e-05, |
|
"loss": 0.7745, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.16087751371115175, |
|
"grad_norm": 2.6443893909454346, |
|
"learning_rate": 1.0559999999999999e-05, |
|
"loss": 0.8674, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16270566727605118, |
|
"grad_norm": 3.047353506088257, |
|
"learning_rate": 1.068e-05, |
|
"loss": 0.9062, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16453382084095064, |
|
"grad_norm": 2.7751214504241943, |
|
"learning_rate": 1.08e-05, |
|
"loss": 0.8222, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1663619744058501, |
|
"grad_norm": 2.5556681156158447, |
|
"learning_rate": 1.092e-05, |
|
"loss": 0.7737, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.16819012797074953, |
|
"grad_norm": 2.840104103088379, |
|
"learning_rate": 1.104e-05, |
|
"loss": 0.9967, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.170018281535649, |
|
"grad_norm": 2.784130811691284, |
|
"learning_rate": 1.116e-05, |
|
"loss": 0.8571, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.17184643510054845, |
|
"grad_norm": 2.5982677936553955, |
|
"learning_rate": 1.128e-05, |
|
"loss": 0.7934, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.1736745886654479, |
|
"grad_norm": 3.1838393211364746, |
|
"learning_rate": 1.1400000000000001e-05, |
|
"loss": 0.8569, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17550274223034734, |
|
"grad_norm": 2.793653726577759, |
|
"learning_rate": 1.152e-05, |
|
"loss": 0.9144, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1773308957952468, |
|
"grad_norm": 2.6756796836853027, |
|
"learning_rate": 1.164e-05, |
|
"loss": 0.8517, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.17915904936014626, |
|
"grad_norm": 2.6979010105133057, |
|
"learning_rate": 1.1760000000000001e-05, |
|
"loss": 0.7551, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.1809872029250457, |
|
"grad_norm": 2.9032483100891113, |
|
"learning_rate": 1.1880000000000001e-05, |
|
"loss": 0.777, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.18281535648994515, |
|
"grad_norm": 2.555727243423462, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.7583, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1846435100548446, |
|
"grad_norm": 2.7780463695526123, |
|
"learning_rate": 1.2120000000000001e-05, |
|
"loss": 1.0916, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.18647166361974407, |
|
"grad_norm": 2.791424512863159, |
|
"learning_rate": 1.224e-05, |
|
"loss": 0.9344, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.1882998171846435, |
|
"grad_norm": 2.590106248855591, |
|
"learning_rate": 1.236e-05, |
|
"loss": 0.8391, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.19012797074954296, |
|
"grad_norm": 2.7519073486328125, |
|
"learning_rate": 1.2479999999999999e-05, |
|
"loss": 0.7809, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.19195612431444242, |
|
"grad_norm": 2.8074002265930176, |
|
"learning_rate": 1.26e-05, |
|
"loss": 0.8258, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.19378427787934185, |
|
"grad_norm": 2.6220719814300537, |
|
"learning_rate": 1.272e-05, |
|
"loss": 0.7542, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.1956124314442413, |
|
"grad_norm": 2.8143625259399414, |
|
"learning_rate": 1.284e-05, |
|
"loss": 0.8587, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.19744058500914077, |
|
"grad_norm": 2.4876911640167236, |
|
"learning_rate": 1.296e-05, |
|
"loss": 0.8425, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.19926873857404023, |
|
"grad_norm": 2.7102651596069336, |
|
"learning_rate": 1.308e-05, |
|
"loss": 0.9726, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.20109689213893966, |
|
"grad_norm": 2.375572919845581, |
|
"learning_rate": 1.32e-05, |
|
"loss": 0.8122, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20292504570383912, |
|
"grad_norm": 2.485874652862549, |
|
"learning_rate": 1.3320000000000001e-05, |
|
"loss": 0.7726, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.20475319926873858, |
|
"grad_norm": 2.5263822078704834, |
|
"learning_rate": 1.344e-05, |
|
"loss": 0.9219, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.20658135283363802, |
|
"grad_norm": 2.5467567443847656, |
|
"learning_rate": 1.356e-05, |
|
"loss": 0.8116, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.20840950639853748, |
|
"grad_norm": 2.3540358543395996, |
|
"learning_rate": 1.3680000000000001e-05, |
|
"loss": 1.0343, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.21023765996343693, |
|
"grad_norm": 2.6379354000091553, |
|
"learning_rate": 1.3800000000000002e-05, |
|
"loss": 0.8242, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.21206581352833637, |
|
"grad_norm": 2.5178139209747314, |
|
"learning_rate": 1.392e-05, |
|
"loss": 0.8899, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.21389396709323583, |
|
"grad_norm": 2.802619695663452, |
|
"learning_rate": 1.4040000000000001e-05, |
|
"loss": 0.8031, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.21572212065813529, |
|
"grad_norm": 2.7448935508728027, |
|
"learning_rate": 1.416e-05, |
|
"loss": 0.8676, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.21755027422303475, |
|
"grad_norm": 2.626340627670288, |
|
"learning_rate": 1.428e-05, |
|
"loss": 0.9465, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.21937842778793418, |
|
"grad_norm": 2.5691044330596924, |
|
"learning_rate": 1.44e-05, |
|
"loss": 0.712, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22120658135283364, |
|
"grad_norm": 2.877453565597534, |
|
"learning_rate": 1.452e-05, |
|
"loss": 0.8605, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2230347349177331, |
|
"grad_norm": 2.409876585006714, |
|
"learning_rate": 1.464e-05, |
|
"loss": 0.8972, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.22486288848263253, |
|
"grad_norm": 2.517220973968506, |
|
"learning_rate": 1.4760000000000001e-05, |
|
"loss": 0.822, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.226691042047532, |
|
"grad_norm": 2.53521728515625, |
|
"learning_rate": 1.488e-05, |
|
"loss": 0.7721, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.22851919561243145, |
|
"grad_norm": 2.533579111099243, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.7182, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2303473491773309, |
|
"grad_norm": 2.8807780742645264, |
|
"learning_rate": 1.5120000000000001e-05, |
|
"loss": 0.8755, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.23217550274223034, |
|
"grad_norm": 2.8886823654174805, |
|
"learning_rate": 1.524e-05, |
|
"loss": 0.8119, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.2340036563071298, |
|
"grad_norm": 2.710432529449463, |
|
"learning_rate": 1.5360000000000002e-05, |
|
"loss": 0.7054, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.23583180987202926, |
|
"grad_norm": 2.3780925273895264, |
|
"learning_rate": 1.548e-05, |
|
"loss": 0.9101, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.2376599634369287, |
|
"grad_norm": 2.6293869018554688, |
|
"learning_rate": 1.56e-05, |
|
"loss": 0.7895, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23948811700182815, |
|
"grad_norm": 2.584303617477417, |
|
"learning_rate": 1.5720000000000002e-05, |
|
"loss": 1.0317, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2413162705667276, |
|
"grad_norm": 2.4637179374694824, |
|
"learning_rate": 1.584e-05, |
|
"loss": 0.7805, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.24314442413162707, |
|
"grad_norm": 2.4105379581451416, |
|
"learning_rate": 1.596e-05, |
|
"loss": 0.8044, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.2449725776965265, |
|
"grad_norm": 2.476205825805664, |
|
"learning_rate": 1.6080000000000002e-05, |
|
"loss": 0.7283, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.24680073126142596, |
|
"grad_norm": 2.620548725128174, |
|
"learning_rate": 1.62e-05, |
|
"loss": 0.8035, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24862888482632542, |
|
"grad_norm": 2.4662225246429443, |
|
"learning_rate": 1.6320000000000003e-05, |
|
"loss": 0.8235, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.25045703839122485, |
|
"grad_norm": 2.405362367630005, |
|
"learning_rate": 1.6440000000000002e-05, |
|
"loss": 0.8681, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.2522851919561243, |
|
"grad_norm": 2.331638813018799, |
|
"learning_rate": 1.656e-05, |
|
"loss": 0.8784, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.25411334552102377, |
|
"grad_norm": 2.796093463897705, |
|
"learning_rate": 1.6680000000000003e-05, |
|
"loss": 0.9942, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.25594149908592323, |
|
"grad_norm": 2.3736331462860107, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 0.7229, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2577696526508227, |
|
"grad_norm": 2.4110031127929688, |
|
"learning_rate": 1.6919999999999997e-05, |
|
"loss": 0.8202, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.2595978062157221, |
|
"grad_norm": 2.3349928855895996, |
|
"learning_rate": 1.704e-05, |
|
"loss": 0.7966, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.26142595978062155, |
|
"grad_norm": 2.4862008094787598, |
|
"learning_rate": 1.716e-05, |
|
"loss": 0.8141, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.263254113345521, |
|
"grad_norm": 2.787587881088257, |
|
"learning_rate": 1.728e-05, |
|
"loss": 0.7861, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.26508226691042047, |
|
"grad_norm": 2.687865972518921, |
|
"learning_rate": 1.74e-05, |
|
"loss": 0.9085, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26691042047531993, |
|
"grad_norm": 2.517024278640747, |
|
"learning_rate": 1.7519999999999998e-05, |
|
"loss": 0.8719, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2687385740402194, |
|
"grad_norm": 2.4157791137695312, |
|
"learning_rate": 1.764e-05, |
|
"loss": 0.8469, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.27056672760511885, |
|
"grad_norm": 2.647015333175659, |
|
"learning_rate": 1.776e-05, |
|
"loss": 0.8133, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.27239488117001825, |
|
"grad_norm": 2.7705986499786377, |
|
"learning_rate": 1.7879999999999998e-05, |
|
"loss": 0.8819, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.2742230347349177, |
|
"grad_norm": 2.2369964122772217, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.88, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2760511882998172, |
|
"grad_norm": 2.239433765411377, |
|
"learning_rate": 1.812e-05, |
|
"loss": 0.7873, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.27787934186471663, |
|
"grad_norm": 2.493117332458496, |
|
"learning_rate": 1.824e-05, |
|
"loss": 0.8111, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2797074954296161, |
|
"grad_norm": 2.5309877395629883, |
|
"learning_rate": 1.836e-05, |
|
"loss": 0.7235, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.28153564899451555, |
|
"grad_norm": 2.403522491455078, |
|
"learning_rate": 1.848e-05, |
|
"loss": 0.816, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.283363802559415, |
|
"grad_norm": 2.8262531757354736, |
|
"learning_rate": 1.86e-05, |
|
"loss": 0.9069, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2851919561243144, |
|
"grad_norm": 2.51188588142395, |
|
"learning_rate": 1.872e-05, |
|
"loss": 0.8979, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2870201096892139, |
|
"grad_norm": 2.493990659713745, |
|
"learning_rate": 1.884e-05, |
|
"loss": 0.798, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.28884826325411334, |
|
"grad_norm": 2.5412824153900146, |
|
"learning_rate": 1.896e-05, |
|
"loss": 0.7898, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2906764168190128, |
|
"grad_norm": 2.4731011390686035, |
|
"learning_rate": 1.908e-05, |
|
"loss": 0.8854, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.29250457038391225, |
|
"grad_norm": 2.6185050010681152, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.8163, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2943327239488117, |
|
"grad_norm": 2.384073495864868, |
|
"learning_rate": 1.932e-05, |
|
"loss": 0.7888, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.2961608775137112, |
|
"grad_norm": 2.566452741622925, |
|
"learning_rate": 1.944e-05, |
|
"loss": 0.8344, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2979890310786106, |
|
"grad_norm": 2.4498672485351562, |
|
"learning_rate": 1.9560000000000002e-05, |
|
"loss": 0.8288, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.29981718464351004, |
|
"grad_norm": 2.7561299800872803, |
|
"learning_rate": 1.968e-05, |
|
"loss": 0.8321, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.3016453382084095, |
|
"grad_norm": 2.5148916244506836, |
|
"learning_rate": 1.98e-05, |
|
"loss": 0.8343, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.30347349177330896, |
|
"grad_norm": 2.444960594177246, |
|
"learning_rate": 1.9920000000000002e-05, |
|
"loss": 0.6833, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3053016453382084, |
|
"grad_norm": 2.5153768062591553, |
|
"learning_rate": 2.004e-05, |
|
"loss": 0.9192, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3071297989031079, |
|
"grad_norm": 2.301560640335083, |
|
"learning_rate": 2.016e-05, |
|
"loss": 0.7864, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.30895795246800734, |
|
"grad_norm": 2.628103733062744, |
|
"learning_rate": 2.0280000000000002e-05, |
|
"loss": 0.8426, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.31078610603290674, |
|
"grad_norm": 2.4587066173553467, |
|
"learning_rate": 2.04e-05, |
|
"loss": 0.8344, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3126142595978062, |
|
"grad_norm": 2.4356703758239746, |
|
"learning_rate": 2.0520000000000003e-05, |
|
"loss": 0.7558, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.31444241316270566, |
|
"grad_norm": 2.531304121017456, |
|
"learning_rate": 2.064e-05, |
|
"loss": 0.855, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.3162705667276051, |
|
"grad_norm": 2.2168610095977783, |
|
"learning_rate": 2.0759999999999998e-05, |
|
"loss": 0.8551, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3180987202925046, |
|
"grad_norm": 2.4772465229034424, |
|
"learning_rate": 2.088e-05, |
|
"loss": 0.8782, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.31992687385740404, |
|
"grad_norm": 2.4406375885009766, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.775, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3217550274223035, |
|
"grad_norm": 2.638505697250366, |
|
"learning_rate": 2.1119999999999998e-05, |
|
"loss": 0.9181, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.3235831809872029, |
|
"grad_norm": 2.452930212020874, |
|
"learning_rate": 2.124e-05, |
|
"loss": 0.8452, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.32541133455210236, |
|
"grad_norm": 2.370314836502075, |
|
"learning_rate": 2.136e-05, |
|
"loss": 1.0293, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.3272394881170018, |
|
"grad_norm": 2.4259750843048096, |
|
"learning_rate": 2.148e-05, |
|
"loss": 0.7744, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3290676416819013, |
|
"grad_norm": 2.374286413192749, |
|
"learning_rate": 2.16e-05, |
|
"loss": 0.8336, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.33089579524680074, |
|
"grad_norm": 2.4372458457946777, |
|
"learning_rate": 2.172e-05, |
|
"loss": 0.9673, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.3327239488117002, |
|
"grad_norm": 2.6595754623413086, |
|
"learning_rate": 2.184e-05, |
|
"loss": 0.8805, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.33455210237659966, |
|
"grad_norm": 2.521261692047119, |
|
"learning_rate": 2.196e-05, |
|
"loss": 0.962, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.33638025594149906, |
|
"grad_norm": 2.559983015060425, |
|
"learning_rate": 2.208e-05, |
|
"loss": 0.8236, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.3382084095063985, |
|
"grad_norm": 2.5021865367889404, |
|
"learning_rate": 2.22e-05, |
|
"loss": 0.7696, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.340036563071298, |
|
"grad_norm": 2.389669418334961, |
|
"learning_rate": 2.232e-05, |
|
"loss": 0.9296, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.34186471663619744, |
|
"grad_norm": 2.8006410598754883, |
|
"learning_rate": 2.2440000000000002e-05, |
|
"loss": 1.1051, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.3436928702010969, |
|
"grad_norm": 2.246638774871826, |
|
"learning_rate": 2.256e-05, |
|
"loss": 0.67, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.34552102376599636, |
|
"grad_norm": 2.3323843479156494, |
|
"learning_rate": 2.268e-05, |
|
"loss": 0.7483, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.3473491773308958, |
|
"grad_norm": 2.599168539047241, |
|
"learning_rate": 2.2800000000000002e-05, |
|
"loss": 0.7095, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3491773308957952, |
|
"grad_norm": 2.5335357189178467, |
|
"learning_rate": 2.292e-05, |
|
"loss": 0.7943, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.3510054844606947, |
|
"grad_norm": 2.523808717727661, |
|
"learning_rate": 2.304e-05, |
|
"loss": 0.8714, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.35283363802559414, |
|
"grad_norm": 2.3433940410614014, |
|
"learning_rate": 2.3160000000000002e-05, |
|
"loss": 0.7879, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3546617915904936, |
|
"grad_norm": 2.5101304054260254, |
|
"learning_rate": 2.328e-05, |
|
"loss": 0.9299, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.35648994515539306, |
|
"grad_norm": 2.652029275894165, |
|
"learning_rate": 2.3400000000000003e-05, |
|
"loss": 0.813, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.3583180987202925, |
|
"grad_norm": 2.250645160675049, |
|
"learning_rate": 2.3520000000000002e-05, |
|
"loss": 0.9784, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.360146252285192, |
|
"grad_norm": 2.2848877906799316, |
|
"learning_rate": 2.364e-05, |
|
"loss": 0.9483, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.3619744058500914, |
|
"grad_norm": 2.4996519088745117, |
|
"learning_rate": 2.3760000000000003e-05, |
|
"loss": 0.8746, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.36380255941499084, |
|
"grad_norm": 2.451387882232666, |
|
"learning_rate": 2.3880000000000002e-05, |
|
"loss": 0.8514, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.3656307129798903, |
|
"grad_norm": 2.382949113845825, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.0895, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36745886654478976, |
|
"grad_norm": 2.407252788543701, |
|
"learning_rate": 2.4120000000000003e-05, |
|
"loss": 0.9273, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.3692870201096892, |
|
"grad_norm": 2.554053544998169, |
|
"learning_rate": 2.4240000000000002e-05, |
|
"loss": 0.8187, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.3711151736745887, |
|
"grad_norm": 2.1548268795013428, |
|
"learning_rate": 2.4360000000000004e-05, |
|
"loss": 0.9683, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.37294332723948814, |
|
"grad_norm": 2.419849395751953, |
|
"learning_rate": 2.448e-05, |
|
"loss": 0.8276, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.37477148080438755, |
|
"grad_norm": 2.300262451171875, |
|
"learning_rate": 2.4599999999999998e-05, |
|
"loss": 0.8748, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.376599634369287, |
|
"grad_norm": 2.4870543479919434, |
|
"learning_rate": 2.472e-05, |
|
"loss": 0.8901, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.37842778793418647, |
|
"grad_norm": 2.703481435775757, |
|
"learning_rate": 2.484e-05, |
|
"loss": 0.871, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.3802559414990859, |
|
"grad_norm": 2.597571611404419, |
|
"learning_rate": 2.4959999999999998e-05, |
|
"loss": 0.747, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3820840950639854, |
|
"grad_norm": 2.4933812618255615, |
|
"learning_rate": 2.508e-05, |
|
"loss": 0.7869, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.38391224862888484, |
|
"grad_norm": 2.566986322402954, |
|
"learning_rate": 2.52e-05, |
|
"loss": 0.9081, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3857404021937843, |
|
"grad_norm": 2.4893436431884766, |
|
"learning_rate": 2.5319999999999998e-05, |
|
"loss": 0.866, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.3875685557586837, |
|
"grad_norm": 2.5950074195861816, |
|
"learning_rate": 2.544e-05, |
|
"loss": 0.8783, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.38939670932358317, |
|
"grad_norm": 2.3816328048706055, |
|
"learning_rate": 2.556e-05, |
|
"loss": 0.8963, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.3912248628884826, |
|
"grad_norm": 2.064539670944214, |
|
"learning_rate": 2.568e-05, |
|
"loss": 0.8979, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.3930530164533821, |
|
"grad_norm": 2.43748140335083, |
|
"learning_rate": 2.58e-05, |
|
"loss": 0.8466, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.39488117001828155, |
|
"grad_norm": 2.2571210861206055, |
|
"learning_rate": 2.592e-05, |
|
"loss": 0.8433, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.396709323583181, |
|
"grad_norm": 2.3223443031311035, |
|
"learning_rate": 2.604e-05, |
|
"loss": 0.7485, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.39853747714808047, |
|
"grad_norm": 2.435385227203369, |
|
"learning_rate": 2.616e-05, |
|
"loss": 0.8868, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.40036563071297987, |
|
"grad_norm": 2.4609930515289307, |
|
"learning_rate": 2.628e-05, |
|
"loss": 0.7649, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.40219378427787933, |
|
"grad_norm": 2.3334007263183594, |
|
"learning_rate": 2.64e-05, |
|
"loss": 0.8722, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4040219378427788, |
|
"grad_norm": 2.4103660583496094, |
|
"learning_rate": 2.652e-05, |
|
"loss": 0.8687, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.40585009140767825, |
|
"grad_norm": 2.386665105819702, |
|
"learning_rate": 2.6640000000000002e-05, |
|
"loss": 0.9062, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4076782449725777, |
|
"grad_norm": 2.420870065689087, |
|
"learning_rate": 2.676e-05, |
|
"loss": 0.9941, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.40950639853747717, |
|
"grad_norm": 2.643944025039673, |
|
"learning_rate": 2.688e-05, |
|
"loss": 0.8953, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4113345521023766, |
|
"grad_norm": 2.400880813598633, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.8583, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.41316270566727603, |
|
"grad_norm": 2.415785312652588, |
|
"learning_rate": 2.712e-05, |
|
"loss": 0.7549, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4149908592321755, |
|
"grad_norm": 2.6550943851470947, |
|
"learning_rate": 2.724e-05, |
|
"loss": 0.9005, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.41681901279707495, |
|
"grad_norm": 2.31974720954895, |
|
"learning_rate": 2.7360000000000002e-05, |
|
"loss": 0.9962, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4186471663619744, |
|
"grad_norm": 2.463061571121216, |
|
"learning_rate": 2.748e-05, |
|
"loss": 0.7754, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.42047531992687387, |
|
"grad_norm": 2.5701842308044434, |
|
"learning_rate": 2.7600000000000003e-05, |
|
"loss": 0.772, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.42230347349177333, |
|
"grad_norm": 2.3573224544525146, |
|
"learning_rate": 2.7720000000000002e-05, |
|
"loss": 0.8872, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.42413162705667273, |
|
"grad_norm": 2.345667600631714, |
|
"learning_rate": 2.784e-05, |
|
"loss": 0.7977, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.4259597806215722, |
|
"grad_norm": 2.583740234375, |
|
"learning_rate": 2.7960000000000003e-05, |
|
"loss": 0.9406, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.42778793418647165, |
|
"grad_norm": 2.51877760887146, |
|
"learning_rate": 2.8080000000000002e-05, |
|
"loss": 0.8245, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.4296160877513711, |
|
"grad_norm": 2.6624832153320312, |
|
"learning_rate": 2.8199999999999998e-05, |
|
"loss": 0.8747, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.43144424131627057, |
|
"grad_norm": 2.6126315593719482, |
|
"learning_rate": 2.832e-05, |
|
"loss": 0.881, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.43327239488117003, |
|
"grad_norm": 2.533567428588867, |
|
"learning_rate": 2.844e-05, |
|
"loss": 0.9505, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.4351005484460695, |
|
"grad_norm": 2.4115335941314697, |
|
"learning_rate": 2.856e-05, |
|
"loss": 0.9703, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.4369287020109689, |
|
"grad_norm": 2.2946977615356445, |
|
"learning_rate": 2.868e-05, |
|
"loss": 0.8025, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.43875685557586835, |
|
"grad_norm": 2.7821929454803467, |
|
"learning_rate": 2.88e-05, |
|
"loss": 0.8108, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4405850091407678, |
|
"grad_norm": 2.5924153327941895, |
|
"learning_rate": 2.892e-05, |
|
"loss": 0.7716, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4424131627056673, |
|
"grad_norm": 2.484504222869873, |
|
"learning_rate": 2.904e-05, |
|
"loss": 0.8917, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.44424131627056673, |
|
"grad_norm": 2.4044761657714844, |
|
"learning_rate": 2.916e-05, |
|
"loss": 0.9806, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4460694698354662, |
|
"grad_norm": 2.3332765102386475, |
|
"learning_rate": 2.928e-05, |
|
"loss": 0.7616, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.44789762340036565, |
|
"grad_norm": 2.3703112602233887, |
|
"learning_rate": 2.94e-05, |
|
"loss": 0.8937, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.44972577696526506, |
|
"grad_norm": 2.3351054191589355, |
|
"learning_rate": 2.9520000000000002e-05, |
|
"loss": 0.83, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.4515539305301645, |
|
"grad_norm": 2.3738510608673096, |
|
"learning_rate": 2.964e-05, |
|
"loss": 0.904, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.453382084095064, |
|
"grad_norm": 2.5012619495391846, |
|
"learning_rate": 2.976e-05, |
|
"loss": 0.8809, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.45521023765996343, |
|
"grad_norm": 2.5719287395477295, |
|
"learning_rate": 2.9880000000000002e-05, |
|
"loss": 0.773, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.4570383912248629, |
|
"grad_norm": 2.3036999702453613, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7487, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4570383912248629, |
|
"eval_loss": 0.8340924382209778, |
|
"eval_runtime": 11.3221, |
|
"eval_samples_per_second": 98.215, |
|
"eval_steps_per_second": 3.091, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.45886654478976235, |
|
"grad_norm": 2.355015754699707, |
|
"learning_rate": 2.9999160841378727e-05, |
|
"loss": 0.7973, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.4606946983546618, |
|
"grad_norm": 2.296038866043091, |
|
"learning_rate": 2.9996643459406528e-05, |
|
"loss": 0.8632, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4625228519195612, |
|
"grad_norm": 2.2504048347473145, |
|
"learning_rate": 2.999244813574778e-05, |
|
"loss": 0.704, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.4643510054844607, |
|
"grad_norm": 2.4145545959472656, |
|
"learning_rate": 2.9986575339808077e-05, |
|
"loss": 0.7892, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.46617915904936014, |
|
"grad_norm": 2.3196182250976562, |
|
"learning_rate": 2.997902572868174e-05, |
|
"loss": 0.9237, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.4680073126142596, |
|
"grad_norm": 2.5195236206054688, |
|
"learning_rate": 2.9969800147078265e-05, |
|
"loss": 0.8632, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.46983546617915906, |
|
"grad_norm": 2.3776962757110596, |
|
"learning_rate": 2.995889962722784e-05, |
|
"loss": 0.8948, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.4716636197440585, |
|
"grad_norm": 2.3582563400268555, |
|
"learning_rate": 2.9946325388765812e-05, |
|
"loss": 0.8258, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.473491773308958, |
|
"grad_norm": 2.4774725437164307, |
|
"learning_rate": 2.993207883859627e-05, |
|
"loss": 0.8687, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.4753199268738574, |
|
"grad_norm": 2.2049193382263184, |
|
"learning_rate": 2.99161615707346e-05, |
|
"loss": 0.9289, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.47714808043875684, |
|
"grad_norm": 2.2471542358398438, |
|
"learning_rate": 2.9898575366129145e-05, |
|
"loss": 0.8769, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.4789762340036563, |
|
"grad_norm": 2.2609918117523193, |
|
"learning_rate": 2.9879322192461932e-05, |
|
"loss": 1.0632, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.48080438756855576, |
|
"grad_norm": 2.3569087982177734, |
|
"learning_rate": 2.985840420392851e-05, |
|
"loss": 0.854, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.4826325411334552, |
|
"grad_norm": 2.398346185684204, |
|
"learning_rate": 2.9835823740996944e-05, |
|
"loss": 0.7765, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4844606946983547, |
|
"grad_norm": 2.251390218734741, |
|
"learning_rate": 2.9811583330145915e-05, |
|
"loss": 0.8045, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.48628884826325414, |
|
"grad_norm": 2.3630456924438477, |
|
"learning_rate": 2.9785685683582057e-05, |
|
"loss": 0.8945, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.48811700182815354, |
|
"grad_norm": 2.259655714035034, |
|
"learning_rate": 2.975813369893649e-05, |
|
"loss": 0.7409, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.489945155393053, |
|
"grad_norm": 2.4072036743164062, |
|
"learning_rate": 2.97289304589406e-05, |
|
"loss": 0.8358, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.49177330895795246, |
|
"grad_norm": 2.3019490242004395, |
|
"learning_rate": 2.9698079231081144e-05, |
|
"loss": 0.8837, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.4936014625228519, |
|
"grad_norm": 2.3812527656555176, |
|
"learning_rate": 2.966558346723463e-05, |
|
"loss": 0.8772, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4954296160877514, |
|
"grad_norm": 2.3249640464782715, |
|
"learning_rate": 2.963144680328111e-05, |
|
"loss": 0.7369, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.49725776965265084, |
|
"grad_norm": 2.431414842605591, |
|
"learning_rate": 2.959567305869736e-05, |
|
"loss": 0.8207, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.4990859232175503, |
|
"grad_norm": 2.3795621395111084, |
|
"learning_rate": 2.955826623612954e-05, |
|
"loss": 0.73, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.5009140767824497, |
|
"grad_norm": 2.426405906677246, |
|
"learning_rate": 2.9519230520945346e-05, |
|
"loss": 0.9324, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.5027422303473492, |
|
"grad_norm": 2.2649593353271484, |
|
"learning_rate": 2.947857028076569e-05, |
|
"loss": 0.8003, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5045703839122486, |
|
"grad_norm": 2.481842041015625, |
|
"learning_rate": 2.943629006497606e-05, |
|
"loss": 0.7915, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.506398537477148, |
|
"grad_norm": 2.5210118293762207, |
|
"learning_rate": 2.939239460421746e-05, |
|
"loss": 0.7953, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5082266910420475, |
|
"grad_norm": 2.3630707263946533, |
|
"learning_rate": 2.934688880985714e-05, |
|
"loss": 0.8232, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5100548446069469, |
|
"grad_norm": 2.3418996334075928, |
|
"learning_rate": 2.9299777773439056e-05, |
|
"loss": 0.909, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5118829981718465, |
|
"grad_norm": 2.34122633934021, |
|
"learning_rate": 2.925106676611418e-05, |
|
"loss": 0.7633, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5137111517367459, |
|
"grad_norm": 2.499547243118286, |
|
"learning_rate": 2.9200761238050756e-05, |
|
"loss": 0.851, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5155393053016454, |
|
"grad_norm": 2.456969738006592, |
|
"learning_rate": 2.9148866817824454e-05, |
|
"loss": 0.8803, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5173674588665448, |
|
"grad_norm": 2.2602295875549316, |
|
"learning_rate": 2.9095389311788626e-05, |
|
"loss": 0.8049, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5191956124314442, |
|
"grad_norm": 2.1520049571990967, |
|
"learning_rate": 2.9040334703424637e-05, |
|
"loss": 0.7233, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5210237659963437, |
|
"grad_norm": 2.4685440063476562, |
|
"learning_rate": 2.8983709152672386e-05, |
|
"loss": 0.9514, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5228519195612431, |
|
"grad_norm": 2.296013593673706, |
|
"learning_rate": 2.892551899524109e-05, |
|
"loss": 0.7938, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5246800731261426, |
|
"grad_norm": 2.3713924884796143, |
|
"learning_rate": 2.8865770741900382e-05, |
|
"loss": 0.93, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.526508226691042, |
|
"grad_norm": 2.6389975547790527, |
|
"learning_rate": 2.8804471077751847e-05, |
|
"loss": 0.9036, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5283363802559415, |
|
"grad_norm": 2.4582440853118896, |
|
"learning_rate": 2.8741626861481043e-05, |
|
"loss": 0.9437, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5301645338208409, |
|
"grad_norm": 2.3008275032043457, |
|
"learning_rate": 2.8677245124590087e-05, |
|
"loss": 0.7939, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5319926873857403, |
|
"grad_norm": 2.319469928741455, |
|
"learning_rate": 2.8611333070610918e-05, |
|
"loss": 0.8535, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5338208409506399, |
|
"grad_norm": 2.295746088027954, |
|
"learning_rate": 2.8543898074299322e-05, |
|
"loss": 0.736, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5356489945155393, |
|
"grad_norm": 2.5527262687683105, |
|
"learning_rate": 2.8474947680809754e-05, |
|
"loss": 0.8192, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5374771480804388, |
|
"grad_norm": 2.308958053588867, |
|
"learning_rate": 2.8404489604851186e-05, |
|
"loss": 0.9077, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5393053016453382, |
|
"grad_norm": 2.524796724319458, |
|
"learning_rate": 2.8332531729823853e-05, |
|
"loss": 0.8038, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5411334552102377, |
|
"grad_norm": 2.420640468597412, |
|
"learning_rate": 2.8259082106937255e-05, |
|
"loss": 0.7417, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5429616087751371, |
|
"grad_norm": 2.364328384399414, |
|
"learning_rate": 2.8184148954309295e-05, |
|
"loss": 0.8791, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5447897623400365, |
|
"grad_norm": 2.412336587905884, |
|
"learning_rate": 2.8107740656046775e-05, |
|
"loss": 0.83, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.546617915904936, |
|
"grad_norm": 2.5241622924804688, |
|
"learning_rate": 2.802986576130733e-05, |
|
"loss": 0.8886, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5484460694698354, |
|
"grad_norm": 2.330146074295044, |
|
"learning_rate": 2.7950532983342863e-05, |
|
"loss": 0.8117, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5502742230347349, |
|
"grad_norm": 2.1738884449005127, |
|
"learning_rate": 2.7869751198524656e-05, |
|
"loss": 0.8588, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.5521023765996343, |
|
"grad_norm": 2.343388319015503, |
|
"learning_rate": 2.7787529445350192e-05, |
|
"loss": 0.7355, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.5539305301645339, |
|
"grad_norm": 2.2163190841674805, |
|
"learning_rate": 2.7703876923431882e-05, |
|
"loss": 0.8508, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.5557586837294333, |
|
"grad_norm": 2.1025807857513428, |
|
"learning_rate": 2.7618802992467718e-05, |
|
"loss": 0.7909, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5575868372943327, |
|
"grad_norm": 2.4115538597106934, |
|
"learning_rate": 2.753231717119405e-05, |
|
"loss": 0.7964, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5594149908592322, |
|
"grad_norm": 2.2953007221221924, |
|
"learning_rate": 2.744442913632054e-05, |
|
"loss": 0.8284, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5612431444241316, |
|
"grad_norm": 2.4674270153045654, |
|
"learning_rate": 2.7355148721447492e-05, |
|
"loss": 0.9302, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.5630712979890311, |
|
"grad_norm": 2.447037935256958, |
|
"learning_rate": 2.7264485915965548e-05, |
|
"loss": 0.9281, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5648994515539305, |
|
"grad_norm": 2.1784889698028564, |
|
"learning_rate": 2.717245086393801e-05, |
|
"loss": 0.7989, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.56672760511883, |
|
"grad_norm": 2.2562270164489746, |
|
"learning_rate": 2.707905386296588e-05, |
|
"loss": 0.8856, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5685557586837294, |
|
"grad_norm": 2.272416591644287, |
|
"learning_rate": 2.6984305363035616e-05, |
|
"loss": 1.0322, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.5703839122486288, |
|
"grad_norm": 2.2202160358428955, |
|
"learning_rate": 2.6888215965349974e-05, |
|
"loss": 0.9454, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5722120658135283, |
|
"grad_norm": 2.4724793434143066, |
|
"learning_rate": 2.6790796421141813e-05, |
|
"loss": 0.8584, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.5740402193784278, |
|
"grad_norm": 2.3383536338806152, |
|
"learning_rate": 2.6692057630471184e-05, |
|
"loss": 0.978, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.5758683729433273, |
|
"grad_norm": 2.173809766769409, |
|
"learning_rate": 2.6592010641005745e-05, |
|
"loss": 0.8318, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5776965265082267, |
|
"grad_norm": 2.306762456893921, |
|
"learning_rate": 2.649066664678467e-05, |
|
"loss": 0.841, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5795246800731262, |
|
"grad_norm": 2.038734197616577, |
|
"learning_rate": 2.638803698696615e-05, |
|
"loss": 0.8219, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.5813528336380256, |
|
"grad_norm": 2.2740612030029297, |
|
"learning_rate": 2.6284133144558697e-05, |
|
"loss": 0.8945, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.583180987202925, |
|
"grad_norm": 2.338181972503662, |
|
"learning_rate": 2.6178966745136322e-05, |
|
"loss": 1.0114, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.5850091407678245, |
|
"grad_norm": 2.357879877090454, |
|
"learning_rate": 2.60725495555378e-05, |
|
"loss": 0.7024, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5868372943327239, |
|
"grad_norm": 2.271117925643921, |
|
"learning_rate": 2.5964893482550076e-05, |
|
"loss": 0.8802, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.5886654478976234, |
|
"grad_norm": 2.092961072921753, |
|
"learning_rate": 2.5856010571576052e-05, |
|
"loss": 0.8343, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.5904936014625228, |
|
"grad_norm": 2.297849655151367, |
|
"learning_rate": 2.574591300528686e-05, |
|
"loss": 0.8124, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.5923217550274223, |
|
"grad_norm": 2.293593645095825, |
|
"learning_rate": 2.563461310225875e-05, |
|
"loss": 0.7819, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.5941499085923218, |
|
"grad_norm": 2.2364585399627686, |
|
"learning_rate": 2.552212331559482e-05, |
|
"loss": 0.9649, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5959780621572212, |
|
"grad_norm": 2.2145204544067383, |
|
"learning_rate": 2.5408456231531634e-05, |
|
"loss": 0.8959, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.5978062157221207, |
|
"grad_norm": 2.4612884521484375, |
|
"learning_rate": 2.5293624568031008e-05, |
|
"loss": 0.929, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.5996343692870201, |
|
"grad_norm": 2.4367892742156982, |
|
"learning_rate": 2.5177641173356985e-05, |
|
"loss": 0.7942, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6014625228519196, |
|
"grad_norm": 2.5621209144592285, |
|
"learning_rate": 2.5060519024638312e-05, |
|
"loss": 0.9107, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.603290676416819, |
|
"grad_norm": 2.2086422443389893, |
|
"learning_rate": 2.4942271226416444e-05, |
|
"loss": 0.7485, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6051188299817185, |
|
"grad_norm": 2.4878604412078857, |
|
"learning_rate": 2.482291100917928e-05, |
|
"loss": 0.8663, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.6069469835466179, |
|
"grad_norm": 2.4622035026550293, |
|
"learning_rate": 2.4702451727880862e-05, |
|
"loss": 0.9976, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6087751371115173, |
|
"grad_norm": 2.313488245010376, |
|
"learning_rate": 2.458090686044712e-05, |
|
"loss": 0.86, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.6106032906764168, |
|
"grad_norm": 2.495249032974243, |
|
"learning_rate": 2.445829000626784e-05, |
|
"loss": 0.7586, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6124314442413162, |
|
"grad_norm": 2.2994625568389893, |
|
"learning_rate": 2.433461488467505e-05, |
|
"loss": 0.9011, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6142595978062158, |
|
"grad_norm": 2.410585403442383, |
|
"learning_rate": 2.4209895333408028e-05, |
|
"loss": 0.7784, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6160877513711152, |
|
"grad_norm": 2.371408462524414, |
|
"learning_rate": 2.4084145307065e-05, |
|
"loss": 0.9034, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6179159049360147, |
|
"grad_norm": 2.2253592014312744, |
|
"learning_rate": 2.3957378875541795e-05, |
|
"loss": 0.8581, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6197440585009141, |
|
"grad_norm": 2.18859601020813, |
|
"learning_rate": 2.382961022245759e-05, |
|
"loss": 0.8338, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6215722120658135, |
|
"grad_norm": 2.1277389526367188, |
|
"learning_rate": 2.3700853643567973e-05, |
|
"loss": 0.7985, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.623400365630713, |
|
"grad_norm": 2.2631025314331055, |
|
"learning_rate": 2.3571123545165362e-05, |
|
"loss": 0.865, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6252285191956124, |
|
"grad_norm": 2.4531781673431396, |
|
"learning_rate": 2.3440434442467155e-05, |
|
"loss": 0.8673, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6270566727605119, |
|
"grad_norm": 2.3396685123443604, |
|
"learning_rate": 2.3308800957991657e-05, |
|
"loss": 0.868, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6288848263254113, |
|
"grad_norm": 2.2110092639923096, |
|
"learning_rate": 2.3176237819921975e-05, |
|
"loss": 0.7553, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6307129798903108, |
|
"grad_norm": 2.3857622146606445, |
|
"learning_rate": 2.3042759860458142e-05, |
|
"loss": 0.7463, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6325411334552102, |
|
"grad_norm": 2.304614782333374, |
|
"learning_rate": 2.2908382014157536e-05, |
|
"loss": 0.939, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6343692870201096, |
|
"grad_norm": 2.360813617706299, |
|
"learning_rate": 2.2773119316263935e-05, |
|
"loss": 0.7792, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6361974405850092, |
|
"grad_norm": 2.41550612449646, |
|
"learning_rate": 2.2636986901025208e-05, |
|
"loss": 0.8776, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6380255941499086, |
|
"grad_norm": 2.514841318130493, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.8356, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.6398537477148081, |
|
"grad_norm": 2.2054624557495117, |
|
"learning_rate": 2.2362173940353522e-05, |
|
"loss": 0.7899, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6416819012797075, |
|
"grad_norm": 2.144213914871216, |
|
"learning_rate": 2.2223524143142595e-05, |
|
"loss": 0.8054, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.643510054844607, |
|
"grad_norm": 2.340751886367798, |
|
"learning_rate": 2.2084066121590242e-05, |
|
"loss": 0.8224, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6453382084095064, |
|
"grad_norm": 2.3917925357818604, |
|
"learning_rate": 2.194381547934994e-05, |
|
"loss": 0.8739, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.6471663619744058, |
|
"grad_norm": 2.30846905708313, |
|
"learning_rate": 2.1802787908759767e-05, |
|
"loss": 0.866, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.6489945155393053, |
|
"grad_norm": 2.0527448654174805, |
|
"learning_rate": 2.1660999189086613e-05, |
|
"loss": 0.8253, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6508226691042047, |
|
"grad_norm": 2.263025999069214, |
|
"learning_rate": 2.1518465184760686e-05, |
|
"loss": 0.8838, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.6526508226691042, |
|
"grad_norm": 2.3904080390930176, |
|
"learning_rate": 2.1375201843600452e-05, |
|
"loss": 0.9442, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.6544789762340036, |
|
"grad_norm": 2.1965222358703613, |
|
"learning_rate": 2.12312251950283e-05, |
|
"loss": 0.6803, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.6563071297989032, |
|
"grad_norm": 2.2777087688446045, |
|
"learning_rate": 2.108655134827701e-05, |
|
"loss": 0.8077, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.6581352833638026, |
|
"grad_norm": 2.2738406658172607, |
|
"learning_rate": 2.0941196490587352e-05, |
|
"loss": 0.855, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.659963436928702, |
|
"grad_norm": 2.04484486579895, |
|
"learning_rate": 2.0795176885396928e-05, |
|
"loss": 0.8816, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.6617915904936015, |
|
"grad_norm": 2.364666223526001, |
|
"learning_rate": 2.064850887052048e-05, |
|
"loss": 0.9707, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.6636197440585009, |
|
"grad_norm": 2.2735183238983154, |
|
"learning_rate": 2.0501208856321895e-05, |
|
"loss": 0.8226, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.6654478976234004, |
|
"grad_norm": 2.370248794555664, |
|
"learning_rate": 2.035329332387808e-05, |
|
"loss": 0.797, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6672760511882998, |
|
"grad_norm": 2.614694595336914, |
|
"learning_rate": 2.0204778823134936e-05, |
|
"loss": 0.8665, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.6691042047531993, |
|
"grad_norm": 2.3441321849823, |
|
"learning_rate": 2.0055681971055626e-05, |
|
"loss": 0.8658, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.6709323583180987, |
|
"grad_norm": 2.3217623233795166, |
|
"learning_rate": 1.990601944976133e-05, |
|
"loss": 0.8256, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.6727605118829981, |
|
"grad_norm": 2.209233522415161, |
|
"learning_rate": 1.9755808004664702e-05, |
|
"loss": 0.7482, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.6745886654478976, |
|
"grad_norm": 2.4364049434661865, |
|
"learning_rate": 1.9605064442596316e-05, |
|
"loss": 0.8031, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.676416819012797, |
|
"grad_norm": 2.168339967727661, |
|
"learning_rate": 1.9453805629924126e-05, |
|
"loss": 0.8416, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6782449725776966, |
|
"grad_norm": 2.428342580795288, |
|
"learning_rate": 1.9302048490666356e-05, |
|
"loss": 0.8554, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.680073126142596, |
|
"grad_norm": 1.9630411863327026, |
|
"learning_rate": 1.9149810004597906e-05, |
|
"loss": 0.7988, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.6819012797074955, |
|
"grad_norm": 2.591010570526123, |
|
"learning_rate": 1.8997107205350525e-05, |
|
"loss": 1.048, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.6837294332723949, |
|
"grad_norm": 2.476414442062378, |
|
"learning_rate": 1.884395717850694e-05, |
|
"loss": 0.8041, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.6855575868372943, |
|
"grad_norm": 2.514333486557007, |
|
"learning_rate": 1.8690377059689202e-05, |
|
"loss": 0.8906, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.6873857404021938, |
|
"grad_norm": 2.299752712249756, |
|
"learning_rate": 1.853638403264141e-05, |
|
"loss": 0.9203, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.6892138939670932, |
|
"grad_norm": 2.3039369583129883, |
|
"learning_rate": 1.8381995327307067e-05, |
|
"loss": 0.8833, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.6910420475319927, |
|
"grad_norm": 2.3373348712921143, |
|
"learning_rate": 1.822722821790126e-05, |
|
"loss": 0.7324, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.6928702010968921, |
|
"grad_norm": 2.774083137512207, |
|
"learning_rate": 1.807210002097786e-05, |
|
"loss": 0.8778, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.6946983546617916, |
|
"grad_norm": 2.214552402496338, |
|
"learning_rate": 1.791662809349206e-05, |
|
"loss": 0.8044, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.696526508226691, |
|
"grad_norm": 2.298497438430786, |
|
"learning_rate": 1.7760829830858305e-05, |
|
"loss": 0.8667, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.6983546617915904, |
|
"grad_norm": 2.23805570602417, |
|
"learning_rate": 1.760472266500396e-05, |
|
"loss": 0.7938, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.70018281535649, |
|
"grad_norm": 2.18110990524292, |
|
"learning_rate": 1.744832406241889e-05, |
|
"loss": 0.8147, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.7020109689213894, |
|
"grad_norm": 2.2718112468719482, |
|
"learning_rate": 1.7291651522201208e-05, |
|
"loss": 0.973, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7038391224862889, |
|
"grad_norm": 2.254279375076294, |
|
"learning_rate": 1.713472257409928e-05, |
|
"loss": 0.7439, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7056672760511883, |
|
"grad_norm": 2.268983840942383, |
|
"learning_rate": 1.6977554776550403e-05, |
|
"loss": 0.8309, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7074954296160878, |
|
"grad_norm": 2.189608097076416, |
|
"learning_rate": 1.682016571471623e-05, |
|
"loss": 0.8748, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.7093235831809872, |
|
"grad_norm": 2.231454610824585, |
|
"learning_rate": 1.6662572998515166e-05, |
|
"loss": 0.8759, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7111517367458866, |
|
"grad_norm": 2.324653148651123, |
|
"learning_rate": 1.6504794260652077e-05, |
|
"loss": 0.7731, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7129798903107861, |
|
"grad_norm": 2.113718271255493, |
|
"learning_rate": 1.6346847154645376e-05, |
|
"loss": 0.7961, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7148080438756855, |
|
"grad_norm": 2.413463830947876, |
|
"learning_rate": 1.6188749352851825e-05, |
|
"loss": 0.9315, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.716636197440585, |
|
"grad_norm": 2.175915002822876, |
|
"learning_rate": 1.6030518544489215e-05, |
|
"loss": 0.7061, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7184643510054844, |
|
"grad_norm": 2.2238268852233887, |
|
"learning_rate": 1.587217243365714e-05, |
|
"loss": 0.8585, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.720292504570384, |
|
"grad_norm": 2.3010525703430176, |
|
"learning_rate": 1.5713728737356138e-05, |
|
"loss": 0.8064, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7221206581352834, |
|
"grad_norm": 2.2713418006896973, |
|
"learning_rate": 1.555520518350537e-05, |
|
"loss": 0.8125, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7239488117001828, |
|
"grad_norm": 2.311316967010498, |
|
"learning_rate": 1.5396619508959102e-05, |
|
"loss": 0.7494, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7257769652650823, |
|
"grad_norm": 2.3094563484191895, |
|
"learning_rate": 1.523798945752212e-05, |
|
"loss": 0.8135, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7276051188299817, |
|
"grad_norm": 2.1408050060272217, |
|
"learning_rate": 1.5079332777964467e-05, |
|
"loss": 0.8519, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7294332723948812, |
|
"grad_norm": 2.196596622467041, |
|
"learning_rate": 1.4920667222035532e-05, |
|
"loss": 0.9019, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7312614259597806, |
|
"grad_norm": 2.4077069759368896, |
|
"learning_rate": 1.4762010542477881e-05, |
|
"loss": 0.8437, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7330895795246801, |
|
"grad_norm": 2.138925075531006, |
|
"learning_rate": 1.46033804910409e-05, |
|
"loss": 0.7867, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.7349177330895795, |
|
"grad_norm": 2.280134439468384, |
|
"learning_rate": 1.4444794816494629e-05, |
|
"loss": 1.0417, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.7367458866544789, |
|
"grad_norm": 2.484534502029419, |
|
"learning_rate": 1.4286271262643866e-05, |
|
"loss": 0.7929, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.7385740402193784, |
|
"grad_norm": 2.2009499073028564, |
|
"learning_rate": 1.4127827566342864e-05, |
|
"loss": 0.7963, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7404021937842779, |
|
"grad_norm": 2.313990831375122, |
|
"learning_rate": 1.3969481455510787e-05, |
|
"loss": 0.9538, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.7422303473491774, |
|
"grad_norm": 2.1209707260131836, |
|
"learning_rate": 1.3811250647148172e-05, |
|
"loss": 0.8327, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.7440585009140768, |
|
"grad_norm": 2.3821375370025635, |
|
"learning_rate": 1.3653152845354625e-05, |
|
"loss": 0.8677, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.7458866544789763, |
|
"grad_norm": 2.179967164993286, |
|
"learning_rate": 1.3495205739347925e-05, |
|
"loss": 0.8095, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.7477148080438757, |
|
"grad_norm": 2.5116395950317383, |
|
"learning_rate": 1.3337427001484836e-05, |
|
"loss": 0.9218, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.7495429616087751, |
|
"grad_norm": 2.173802375793457, |
|
"learning_rate": 1.3179834285283773e-05, |
|
"loss": 0.7475, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7513711151736746, |
|
"grad_norm": 2.0795040130615234, |
|
"learning_rate": 1.3022445223449596e-05, |
|
"loss": 0.8749, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.753199268738574, |
|
"grad_norm": 2.1474385261535645, |
|
"learning_rate": 1.2865277425900725e-05, |
|
"loss": 0.8277, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.7550274223034735, |
|
"grad_norm": 2.243417978286743, |
|
"learning_rate": 1.2708348477798795e-05, |
|
"loss": 0.8147, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.7568555758683729, |
|
"grad_norm": 2.3106589317321777, |
|
"learning_rate": 1.255167593758111e-05, |
|
"loss": 0.7848, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.7586837294332724, |
|
"grad_norm": 2.397627830505371, |
|
"learning_rate": 1.2395277334996045e-05, |
|
"loss": 0.9778, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7605118829981719, |
|
"grad_norm": 2.3535757064819336, |
|
"learning_rate": 1.2239170169141696e-05, |
|
"loss": 0.7996, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.7623400365630713, |
|
"grad_norm": 2.224731922149658, |
|
"learning_rate": 1.2083371906507939e-05, |
|
"loss": 0.8442, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.7641681901279708, |
|
"grad_norm": 2.4303503036499023, |
|
"learning_rate": 1.1927899979022143e-05, |
|
"loss": 0.8317, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.7659963436928702, |
|
"grad_norm": 2.4696667194366455, |
|
"learning_rate": 1.1772771782098748e-05, |
|
"loss": 0.8581, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.7678244972577697, |
|
"grad_norm": 2.2766096591949463, |
|
"learning_rate": 1.1618004672692937e-05, |
|
"loss": 0.781, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7696526508226691, |
|
"grad_norm": 2.2170205116271973, |
|
"learning_rate": 1.146361596735859e-05, |
|
"loss": 0.6847, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.7714808043875686, |
|
"grad_norm": 2.301888942718506, |
|
"learning_rate": 1.1309622940310798e-05, |
|
"loss": 0.9334, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.773308957952468, |
|
"grad_norm": 2.0786006450653076, |
|
"learning_rate": 1.1156042821493062e-05, |
|
"loss": 0.8339, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.7751371115173674, |
|
"grad_norm": 2.1867787837982178, |
|
"learning_rate": 1.1002892794649478e-05, |
|
"loss": 0.8398, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.7769652650822669, |
|
"grad_norm": 2.1924829483032227, |
|
"learning_rate": 1.0850189995402096e-05, |
|
"loss": 0.8241, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.7787934186471663, |
|
"grad_norm": 2.104240655899048, |
|
"learning_rate": 1.069795150933365e-05, |
|
"loss": 0.83, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.7806215722120659, |
|
"grad_norm": 2.301518201828003, |
|
"learning_rate": 1.0546194370075882e-05, |
|
"loss": 0.7494, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.7824497257769653, |
|
"grad_norm": 2.3547585010528564, |
|
"learning_rate": 1.0394935557403684e-05, |
|
"loss": 0.7907, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.7842778793418648, |
|
"grad_norm": 2.225034713745117, |
|
"learning_rate": 1.0244191995335299e-05, |
|
"loss": 0.8484, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.7861060329067642, |
|
"grad_norm": 2.3130884170532227, |
|
"learning_rate": 1.0093980550238676e-05, |
|
"loss": 0.8425, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7879341864716636, |
|
"grad_norm": 2.425241708755493, |
|
"learning_rate": 9.944318028944374e-06, |
|
"loss": 0.9269, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.7897623400365631, |
|
"grad_norm": 2.1149165630340576, |
|
"learning_rate": 9.795221176865065e-06, |
|
"loss": 0.7503, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.7915904936014625, |
|
"grad_norm": 2.3856897354125977, |
|
"learning_rate": 9.646706676121924e-06, |
|
"loss": 0.8628, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.793418647166362, |
|
"grad_norm": 2.1912615299224854, |
|
"learning_rate": 9.49879114367811e-06, |
|
"loss": 0.8198, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.7952468007312614, |
|
"grad_norm": 2.1112685203552246, |
|
"learning_rate": 9.351491129479519e-06, |
|
"loss": 0.8933, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.7970749542961609, |
|
"grad_norm": 2.3817248344421387, |
|
"learning_rate": 9.20482311460307e-06, |
|
"loss": 0.8212, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.7989031078610603, |
|
"grad_norm": 2.216339349746704, |
|
"learning_rate": 9.058803509412647e-06, |
|
"loss": 0.7964, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.8007312614259597, |
|
"grad_norm": 2.2197396755218506, |
|
"learning_rate": 8.913448651722994e-06, |
|
"loss": 0.7535, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8025594149908593, |
|
"grad_norm": 2.083980083465576, |
|
"learning_rate": 8.768774804971705e-06, |
|
"loss": 0.9009, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.8043875685557587, |
|
"grad_norm": 2.0909934043884277, |
|
"learning_rate": 8.624798156399554e-06, |
|
"loss": 0.8016, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8062157221206582, |
|
"grad_norm": 2.4581222534179688, |
|
"learning_rate": 8.481534815239323e-06, |
|
"loss": 0.9227, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.8080438756855576, |
|
"grad_norm": 2.1503217220306396, |
|
"learning_rate": 8.339000810913388e-06, |
|
"loss": 0.7305, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8098720292504571, |
|
"grad_norm": 1.9855475425720215, |
|
"learning_rate": 8.197212091240237e-06, |
|
"loss": 0.7195, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.8117001828153565, |
|
"grad_norm": 2.25361967086792, |
|
"learning_rate": 8.056184520650064e-06, |
|
"loss": 0.7594, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.8135283363802559, |
|
"grad_norm": 2.2054708003997803, |
|
"learning_rate": 7.915933878409762e-06, |
|
"loss": 0.7931, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.8153564899451554, |
|
"grad_norm": 2.134115219116211, |
|
"learning_rate": 7.776475856857409e-06, |
|
"loss": 0.7195, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.8171846435100548, |
|
"grad_norm": 1.9758131504058838, |
|
"learning_rate": 7.63782605964648e-06, |
|
"loss": 0.872, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.8190127970749543, |
|
"grad_norm": 2.291642904281616, |
|
"learning_rate": 7.500000000000004e-06, |
|
"loss": 0.8467, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8208409506398537, |
|
"grad_norm": 2.2243387699127197, |
|
"learning_rate": 7.3630130989748e-06, |
|
"loss": 0.9038, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.8226691042047533, |
|
"grad_norm": 2.283393383026123, |
|
"learning_rate": 7.226880683736066e-06, |
|
"loss": 0.8102, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8244972577696527, |
|
"grad_norm": 2.078200101852417, |
|
"learning_rate": 7.091617985842463e-06, |
|
"loss": 0.761, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.8263254113345521, |
|
"grad_norm": 2.3057701587677, |
|
"learning_rate": 6.9572401395418615e-06, |
|
"loss": 0.8682, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8281535648994516, |
|
"grad_norm": 2.171827793121338, |
|
"learning_rate": 6.8237621800780255e-06, |
|
"loss": 0.7561, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.829981718464351, |
|
"grad_norm": 2.3417348861694336, |
|
"learning_rate": 6.691199042008346e-06, |
|
"loss": 0.8277, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.8318098720292505, |
|
"grad_norm": 2.1309165954589844, |
|
"learning_rate": 6.559565557532847e-06, |
|
"loss": 0.8441, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8336380255941499, |
|
"grad_norm": 2.3415029048919678, |
|
"learning_rate": 6.428876454834643e-06, |
|
"loss": 0.787, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.8354661791590493, |
|
"grad_norm": 2.2141568660736084, |
|
"learning_rate": 6.2991463564320296e-06, |
|
"loss": 0.8158, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.8372943327239488, |
|
"grad_norm": 2.0096514225006104, |
|
"learning_rate": 6.170389777542409e-06, |
|
"loss": 0.7489, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.8391224862888482, |
|
"grad_norm": 2.125929355621338, |
|
"learning_rate": 6.0426211244582105e-06, |
|
"loss": 0.8803, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.8409506398537477, |
|
"grad_norm": 2.0805740356445312, |
|
"learning_rate": 5.915854692935002e-06, |
|
"loss": 0.773, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8427787934186471, |
|
"grad_norm": 2.357139825820923, |
|
"learning_rate": 5.790104666591974e-06, |
|
"loss": 0.7609, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.8446069469835467, |
|
"grad_norm": 2.277031898498535, |
|
"learning_rate": 5.665385115324954e-06, |
|
"loss": 0.8573, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.8464351005484461, |
|
"grad_norm": 2.2020912170410156, |
|
"learning_rate": 5.541709993732168e-06, |
|
"loss": 0.9261, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.8482632541133455, |
|
"grad_norm": 2.294649362564087, |
|
"learning_rate": 5.419093139552878e-06, |
|
"loss": 0.8164, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.850091407678245, |
|
"grad_norm": 2.047896385192871, |
|
"learning_rate": 5.297548272119138e-06, |
|
"loss": 0.8419, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.8519195612431444, |
|
"grad_norm": 2.4558777809143066, |
|
"learning_rate": 5.177088990820725e-06, |
|
"loss": 0.8319, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.8537477148080439, |
|
"grad_norm": 2.008725643157959, |
|
"learning_rate": 5.05772877358356e-06, |
|
"loss": 0.7503, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.8555758683729433, |
|
"grad_norm": 2.16011643409729, |
|
"learning_rate": 4.939480975361687e-06, |
|
"loss": 0.7007, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.8574040219378428, |
|
"grad_norm": 2.166571855545044, |
|
"learning_rate": 4.822358826643019e-06, |
|
"loss": 0.7383, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.8592321755027422, |
|
"grad_norm": 2.3428239822387695, |
|
"learning_rate": 4.706375431968998e-06, |
|
"loss": 0.792, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8610603290676416, |
|
"grad_norm": 2.3133058547973633, |
|
"learning_rate": 4.591543768468364e-06, |
|
"loss": 0.7791, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.8628884826325411, |
|
"grad_norm": 2.227383852005005, |
|
"learning_rate": 4.4778766844051795e-06, |
|
"loss": 0.8838, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.8647166361974405, |
|
"grad_norm": 1.9852975606918335, |
|
"learning_rate": 4.365386897741249e-06, |
|
"loss": 0.8375, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.8665447897623401, |
|
"grad_norm": 2.151278018951416, |
|
"learning_rate": 4.254086994713141e-06, |
|
"loss": 0.7966, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.8683729433272395, |
|
"grad_norm": 2.355102777481079, |
|
"learning_rate": 4.1439894284239474e-06, |
|
"loss": 0.8264, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.870201096892139, |
|
"grad_norm": 2.390646457672119, |
|
"learning_rate": 4.035106517449926e-06, |
|
"loss": 0.8292, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.8720292504570384, |
|
"grad_norm": 2.1484568119049072, |
|
"learning_rate": 3.9274504444622025e-06, |
|
"loss": 0.8624, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.8738574040219378, |
|
"grad_norm": 2.134361505508423, |
|
"learning_rate": 3.82103325486368e-06, |
|
"loss": 0.8226, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.8756855575868373, |
|
"grad_norm": 2.1799209117889404, |
|
"learning_rate": 3.715866855441309e-06, |
|
"loss": 0.7563, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.8775137111517367, |
|
"grad_norm": 2.338834285736084, |
|
"learning_rate": 3.6119630130338537e-06, |
|
"loss": 0.8319, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8793418647166362, |
|
"grad_norm": 2.032010555267334, |
|
"learning_rate": 3.5093333532153316e-06, |
|
"loss": 0.7693, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.8811700182815356, |
|
"grad_norm": 2.1978771686553955, |
|
"learning_rate": 3.4079893589942544e-06, |
|
"loss": 0.7642, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.8829981718464351, |
|
"grad_norm": 2.5220754146575928, |
|
"learning_rate": 3.3079423695288204e-06, |
|
"loss": 0.9182, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.8848263254113345, |
|
"grad_norm": 2.1148622035980225, |
|
"learning_rate": 3.2092035788581907e-06, |
|
"loss": 0.8411, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.886654478976234, |
|
"grad_norm": 2.1336936950683594, |
|
"learning_rate": 3.1117840346500287e-06, |
|
"loss": 0.7711, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.8884826325411335, |
|
"grad_norm": 2.175741672515869, |
|
"learning_rate": 3.0156946369643803e-06, |
|
"loss": 0.9526, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.8903107861060329, |
|
"grad_norm": 2.207550525665283, |
|
"learning_rate": 2.9209461370341204e-06, |
|
"loss": 0.7538, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.8921389396709324, |
|
"grad_norm": 2.0048232078552246, |
|
"learning_rate": 2.8275491360619875e-06, |
|
"loss": 0.8079, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.8939670932358318, |
|
"grad_norm": 2.2302756309509277, |
|
"learning_rate": 2.735514084034457e-06, |
|
"loss": 0.8385, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.8957952468007313, |
|
"grad_norm": 2.7533788681030273, |
|
"learning_rate": 2.64485127855251e-06, |
|
"loss": 0.7718, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8976234003656307, |
|
"grad_norm": 2.3614344596862793, |
|
"learning_rate": 2.5555708636794594e-06, |
|
"loss": 0.7767, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.8994515539305301, |
|
"grad_norm": 2.726402521133423, |
|
"learning_rate": 2.467682828805956e-06, |
|
"loss": 0.7917, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9012797074954296, |
|
"grad_norm": 2.2285687923431396, |
|
"learning_rate": 2.38119700753228e-06, |
|
"loss": 0.8958, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.903107861060329, |
|
"grad_norm": 2.1934146881103516, |
|
"learning_rate": 2.2961230765681158e-06, |
|
"loss": 0.7796, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9049360146252285, |
|
"grad_norm": 2.349043607711792, |
|
"learning_rate": 2.212470554649805e-06, |
|
"loss": 0.8538, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.906764168190128, |
|
"grad_norm": 1.995997667312622, |
|
"learning_rate": 2.130248801475344e-06, |
|
"loss": 0.8433, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.9085923217550275, |
|
"grad_norm": 2.1767685413360596, |
|
"learning_rate": 2.0494670166571356e-06, |
|
"loss": 0.8276, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.9104204753199269, |
|
"grad_norm": 2.255619525909424, |
|
"learning_rate": 1.9701342386926712e-06, |
|
"loss": 0.7797, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.9122486288848263, |
|
"grad_norm": 2.3576643466949463, |
|
"learning_rate": 1.892259343953226e-06, |
|
"loss": 0.9015, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.9140767824497258, |
|
"grad_norm": 1.9980827569961548, |
|
"learning_rate": 1.815851045690708e-06, |
|
"loss": 0.6846, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9140767824497258, |
|
"eval_loss": 0.798653244972229, |
|
"eval_runtime": 11.4055, |
|
"eval_samples_per_second": 97.497, |
|
"eval_steps_per_second": 3.069, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9159049360146252, |
|
"grad_norm": 2.24575138092041, |
|
"learning_rate": 1.7409178930627473e-06, |
|
"loss": 0.8362, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.9177330895795247, |
|
"grad_norm": 2.058715343475342, |
|
"learning_rate": 1.6674682701761496e-06, |
|
"loss": 0.8225, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.9195612431444241, |
|
"grad_norm": 2.0738391876220703, |
|
"learning_rate": 1.5955103951488177e-06, |
|
"loss": 0.7747, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.9213893967093236, |
|
"grad_norm": 2.142606735229492, |
|
"learning_rate": 1.5250523191902455e-06, |
|
"loss": 0.8331, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.923217550274223, |
|
"grad_norm": 2.2022759914398193, |
|
"learning_rate": 1.456101925700684e-06, |
|
"loss": 0.8037, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.9250457038391224, |
|
"grad_norm": 2.1481759548187256, |
|
"learning_rate": 1.3886669293890837e-06, |
|
"loss": 0.7431, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.926873857404022, |
|
"grad_norm": 2.3185274600982666, |
|
"learning_rate": 1.322754875409915e-06, |
|
"loss": 0.7726, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.9287020109689214, |
|
"grad_norm": 2.315138816833496, |
|
"learning_rate": 1.2583731385189562e-06, |
|
"loss": 0.7026, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9305301645338209, |
|
"grad_norm": 2.050353527069092, |
|
"learning_rate": 1.1955289222481513e-06, |
|
"loss": 0.7373, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.9323583180987203, |
|
"grad_norm": 2.3529744148254395, |
|
"learning_rate": 1.1342292580996195e-06, |
|
"loss": 0.8461, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9341864716636198, |
|
"grad_norm": 2.264411687850952, |
|
"learning_rate": 1.0744810047589116e-06, |
|
"loss": 1.05, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.9360146252285192, |
|
"grad_norm": 2.2528390884399414, |
|
"learning_rate": 1.0162908473276133e-06, |
|
"loss": 0.8218, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.9378427787934186, |
|
"grad_norm": 2.23812198638916, |
|
"learning_rate": 9.596652965753632e-07, |
|
"loss": 0.8533, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.9396709323583181, |
|
"grad_norm": 2.4503235816955566, |
|
"learning_rate": 9.046106882113753e-07, |
|
"loss": 0.8821, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.9414990859232175, |
|
"grad_norm": 2.152954578399658, |
|
"learning_rate": 8.511331821755459e-07, |
|
"loss": 0.7932, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.943327239488117, |
|
"grad_norm": 2.1594455242156982, |
|
"learning_rate": 7.992387619492436e-07, |
|
"loss": 0.7988, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.9451553930530164, |
|
"grad_norm": 2.086651086807251, |
|
"learning_rate": 7.489332338858202e-07, |
|
"loss": 0.8552, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.946983546617916, |
|
"grad_norm": 2.134727954864502, |
|
"learning_rate": 7.002222265609476e-07, |
|
"loss": 0.8825, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.9488117001828154, |
|
"grad_norm": 2.169853448867798, |
|
"learning_rate": 6.53111190142861e-07, |
|
"loss": 0.8105, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.9506398537477148, |
|
"grad_norm": 2.000743865966797, |
|
"learning_rate": 6.076053957825411e-07, |
|
"loss": 0.6882, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9524680073126143, |
|
"grad_norm": 2.1314992904663086, |
|
"learning_rate": 5.637099350239427e-07, |
|
"loss": 0.7354, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.9542961608775137, |
|
"grad_norm": 2.3546230792999268, |
|
"learning_rate": 5.214297192343104e-07, |
|
"loss": 0.8793, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.9561243144424132, |
|
"grad_norm": 2.054684638977051, |
|
"learning_rate": 4.807694790546563e-07, |
|
"loss": 0.8644, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.9579524680073126, |
|
"grad_norm": 2.0605905055999756, |
|
"learning_rate": 4.417337638704588e-07, |
|
"loss": 0.675, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.9597806215722121, |
|
"grad_norm": 2.196253776550293, |
|
"learning_rate": 4.043269413026429e-07, |
|
"loss": 0.8171, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.9616087751371115, |
|
"grad_norm": 2.239720582962036, |
|
"learning_rate": 3.6855319671889433e-07, |
|
"loss": 0.7863, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.9634369287020109, |
|
"grad_norm": 2.3303980827331543, |
|
"learning_rate": 3.3441653276537253e-07, |
|
"loss": 0.7169, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.9652650822669104, |
|
"grad_norm": 2.10151743888855, |
|
"learning_rate": 3.0192076891885745e-07, |
|
"loss": 0.8925, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.9670932358318098, |
|
"grad_norm": 2.475900411605835, |
|
"learning_rate": 2.710695410593994e-07, |
|
"loss": 0.8043, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.9689213893967094, |
|
"grad_norm": 2.0351574420928955, |
|
"learning_rate": 2.418663010635114e-07, |
|
"loss": 0.6677, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9707495429616088, |
|
"grad_norm": 2.2573163509368896, |
|
"learning_rate": 2.1431431641794287e-07, |
|
"loss": 0.8685, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.9725776965265083, |
|
"grad_norm": 2.2806551456451416, |
|
"learning_rate": 1.8841666985408566e-07, |
|
"loss": 1.0264, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.9744058500914077, |
|
"grad_norm": 2.0971100330352783, |
|
"learning_rate": 1.6417625900305656e-07, |
|
"loss": 0.663, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.9762340036563071, |
|
"grad_norm": 2.1479334831237793, |
|
"learning_rate": 1.4159579607148976e-07, |
|
"loss": 0.7461, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.9780621572212066, |
|
"grad_norm": 2.1846601963043213, |
|
"learning_rate": 1.206778075380699e-07, |
|
"loss": 0.7843, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.979890310786106, |
|
"grad_norm": 2.18355131149292, |
|
"learning_rate": 1.0142463387085465e-07, |
|
"loss": 0.8233, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.9817184643510055, |
|
"grad_norm": 1.9972505569458008, |
|
"learning_rate": 8.38384292653993e-08, |
|
"loss": 0.6412, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.9835466179159049, |
|
"grad_norm": 2.2325432300567627, |
|
"learning_rate": 6.792116140373117e-08, |
|
"loss": 0.7315, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.9853747714808044, |
|
"grad_norm": 2.270096778869629, |
|
"learning_rate": 5.367461123419071e-08, |
|
"loss": 0.7166, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.9872029250457038, |
|
"grad_norm": 2.084451675415039, |
|
"learning_rate": 4.110037277216427e-08, |
|
"loss": 0.7703, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9890310786106032, |
|
"grad_norm": 2.1931207180023193, |
|
"learning_rate": 3.0199852921735104e-08, |
|
"loss": 0.9388, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.9908592321755028, |
|
"grad_norm": 2.08048939704895, |
|
"learning_rate": 2.0974271318260907e-08, |
|
"loss": 0.669, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.9926873857404022, |
|
"grad_norm": 2.402120351791382, |
|
"learning_rate": 1.342466019192301e-08, |
|
"loss": 0.8257, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.9945155393053017, |
|
"grad_norm": 2.3177034854888916, |
|
"learning_rate": 7.551864252223762e-09, |
|
"loss": 0.8117, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.9963436928702011, |
|
"grad_norm": 2.3477089405059814, |
|
"learning_rate": 3.3565405934721237e-09, |
|
"loss": 0.8285, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.9981718464351006, |
|
"grad_norm": 2.4188199043273926, |
|
"learning_rate": 8.391586212741498e-10, |
|
"loss": 0.8643, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.4587948322296143, |
|
"learning_rate": 0.0, |
|
"loss": 0.8511, |
|
"step": 547 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 547, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.615833264128e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|