|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9856, |
|
"eval_steps": 500, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 2.451486825942993, |
|
"learning_rate": 2.1276595744680853e-06, |
|
"loss": 1.5704, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.047212839126587, |
|
"learning_rate": 4.255319148936171e-06, |
|
"loss": 1.3347, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 1.2422709465026855, |
|
"learning_rate": 6.3829787234042555e-06, |
|
"loss": 1.4116, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 2.0192582607269287, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 1.5004, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 1.6840580701828003, |
|
"learning_rate": 1.0638297872340426e-05, |
|
"loss": 1.478, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 1.680579662322998, |
|
"learning_rate": 1.2765957446808511e-05, |
|
"loss": 1.3965, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 1.8126763105392456, |
|
"learning_rate": 1.4893617021276596e-05, |
|
"loss": 1.4944, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 2.288001537322998, |
|
"learning_rate": 1.7021276595744682e-05, |
|
"loss": 1.4845, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 1.4367578029632568, |
|
"learning_rate": 1.9148936170212766e-05, |
|
"loss": 1.3447, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.576250433921814, |
|
"learning_rate": 2.1276595744680852e-05, |
|
"loss": 1.3918, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 1.6590297222137451, |
|
"learning_rate": 2.340425531914894e-05, |
|
"loss": 1.3442, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 2.338789701461792, |
|
"learning_rate": 2.5531914893617022e-05, |
|
"loss": 1.7413, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 3.2776601314544678, |
|
"learning_rate": 2.765957446808511e-05, |
|
"loss": 1.7825, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 1.463249921798706, |
|
"learning_rate": 2.9787234042553192e-05, |
|
"loss": 1.2387, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 2.5417563915252686, |
|
"learning_rate": 3.191489361702128e-05, |
|
"loss": 1.5182, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 1.448764681816101, |
|
"learning_rate": 3.4042553191489365e-05, |
|
"loss": 1.1844, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 1.5268467664718628, |
|
"learning_rate": 3.617021276595745e-05, |
|
"loss": 1.3816, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 2.561483383178711, |
|
"learning_rate": 3.829787234042553e-05, |
|
"loss": 1.3909, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 1.2603929042816162, |
|
"learning_rate": 4.0425531914893614e-05, |
|
"loss": 1.241, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.7591760158538818, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 1.1528, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 1.6874586343765259, |
|
"learning_rate": 4.468085106382979e-05, |
|
"loss": 1.3256, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 1.5209052562713623, |
|
"learning_rate": 4.680851063829788e-05, |
|
"loss": 1.3031, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 0.5414145588874817, |
|
"learning_rate": 4.893617021276596e-05, |
|
"loss": 1.192, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 1.071704626083374, |
|
"learning_rate": 5.1063829787234044e-05, |
|
"loss": 1.0929, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5227977633476257, |
|
"learning_rate": 5.319148936170213e-05, |
|
"loss": 1.1455, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 0.5635082721710205, |
|
"learning_rate": 5.531914893617022e-05, |
|
"loss": 1.2057, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.6073344945907593, |
|
"learning_rate": 5.744680851063831e-05, |
|
"loss": 1.2319, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 1.3395496606826782, |
|
"learning_rate": 5.9574468085106384e-05, |
|
"loss": 1.3086, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.5415111184120178, |
|
"learning_rate": 6.170212765957447e-05, |
|
"loss": 1.2653, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.7579400539398193, |
|
"learning_rate": 6.382978723404256e-05, |
|
"loss": 1.0565, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 0.4650651514530182, |
|
"learning_rate": 6.595744680851063e-05, |
|
"loss": 1.3408, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 0.31583133339881897, |
|
"learning_rate": 6.808510638297873e-05, |
|
"loss": 1.1341, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 0.3642672896385193, |
|
"learning_rate": 7.021276595744681e-05, |
|
"loss": 1.2263, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 0.40529701113700867, |
|
"learning_rate": 7.23404255319149e-05, |
|
"loss": 1.2229, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.2421422302722931, |
|
"learning_rate": 7.446808510638298e-05, |
|
"loss": 0.9803, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.3055451810359955, |
|
"learning_rate": 7.659574468085106e-05, |
|
"loss": 1.0436, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 0.26383817195892334, |
|
"learning_rate": 7.872340425531916e-05, |
|
"loss": 0.9907, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 0.3040568232536316, |
|
"learning_rate": 8.085106382978723e-05, |
|
"loss": 1.1131, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 0.20183025300502777, |
|
"learning_rate": 8.297872340425533e-05, |
|
"loss": 0.955, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.24510295689105988, |
|
"learning_rate": 8.510638297872341e-05, |
|
"loss": 1.0317, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 0.22165146470069885, |
|
"learning_rate": 8.723404255319149e-05, |
|
"loss": 0.9927, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 0.18096181750297546, |
|
"learning_rate": 8.936170212765958e-05, |
|
"loss": 0.9169, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 0.2374049425125122, |
|
"learning_rate": 9.148936170212766e-05, |
|
"loss": 0.9629, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.28989022970199585, |
|
"learning_rate": 9.361702127659576e-05, |
|
"loss": 1.0841, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.25799044966697693, |
|
"learning_rate": 9.574468085106384e-05, |
|
"loss": 1.0746, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 0.22019030153751373, |
|
"learning_rate": 9.787234042553192e-05, |
|
"loss": 0.9699, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 0.18696561455726624, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8615, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.2744862735271454, |
|
"learning_rate": 9.999860789001946e-05, |
|
"loss": 1.0763, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 0.19078461825847626, |
|
"learning_rate": 9.999443163759668e-05, |
|
"loss": 0.8894, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.19642512500286102, |
|
"learning_rate": 9.998747147528374e-05, |
|
"loss": 0.8941, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.1972423791885376, |
|
"learning_rate": 9.997772779065312e-05, |
|
"loss": 0.9058, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.19428911805152893, |
|
"learning_rate": 9.996520112627602e-05, |
|
"loss": 0.9221, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.20967088639736176, |
|
"learning_rate": 9.994989217969224e-05, |
|
"loss": 0.9024, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.19962789118289948, |
|
"learning_rate": 9.993180180337126e-05, |
|
"loss": 0.9491, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.1546671986579895, |
|
"learning_rate": 9.991093100466482e-05, |
|
"loss": 0.7734, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.15869870781898499, |
|
"learning_rate": 9.988728094575082e-05, |
|
"loss": 0.8626, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.2690005302429199, |
|
"learning_rate": 9.986085294356857e-05, |
|
"loss": 0.905, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 0.265375018119812, |
|
"learning_rate": 9.983164846974548e-05, |
|
"loss": 0.9522, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 0.1881999671459198, |
|
"learning_rate": 9.979966915051517e-05, |
|
"loss": 0.8813, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.1687404066324234, |
|
"learning_rate": 9.97649167666268e-05, |
|
"loss": 0.7873, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 0.16878949105739594, |
|
"learning_rate": 9.972739325324596e-05, |
|
"loss": 0.8416, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 0.1852271556854248, |
|
"learning_rate": 9.968710069984698e-05, |
|
"loss": 0.8725, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 0.20418129861354828, |
|
"learning_rate": 9.964404135009648e-05, |
|
"loss": 0.8924, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 0.17017708718776703, |
|
"learning_rate": 9.95982176017285e-05, |
|
"loss": 0.8465, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.200260192155838, |
|
"learning_rate": 9.95496320064109e-05, |
|
"loss": 0.897, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.19314530491828918, |
|
"learning_rate": 9.94982872696034e-05, |
|
"loss": 0.8314, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 0.16122443974018097, |
|
"learning_rate": 9.94441862504068e-05, |
|
"loss": 0.8258, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.1727280616760254, |
|
"learning_rate": 9.938733196140386e-05, |
|
"loss": 0.852, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 0.1809166669845581, |
|
"learning_rate": 9.932772756849153e-05, |
|
"loss": 0.7956, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.13888217508792877, |
|
"learning_rate": 9.926537639070457e-05, |
|
"loss": 0.7456, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 0.1737329661846161, |
|
"learning_rate": 9.92002819000309e-05, |
|
"loss": 0.7735, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.21496297419071198, |
|
"learning_rate": 9.91324477212181e-05, |
|
"loss": 0.8925, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 0.1533055156469345, |
|
"learning_rate": 9.906187763157168e-05, |
|
"loss": 0.7513, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 0.17420215904712677, |
|
"learning_rate": 9.898857556074468e-05, |
|
"loss": 0.82, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.17036575078964233, |
|
"learning_rate": 9.891254559051885e-05, |
|
"loss": 0.8445, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 0.19612562656402588, |
|
"learning_rate": 9.883379195457746e-05, |
|
"loss": 0.7657, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4928, |
|
"grad_norm": 0.18145954608917236, |
|
"learning_rate": 9.875231903826936e-05, |
|
"loss": 0.8554, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 0.19653621315956116, |
|
"learning_rate": 9.866813137836499e-05, |
|
"loss": 0.8247, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5056, |
|
"grad_norm": 0.15563753247261047, |
|
"learning_rate": 9.858123366280358e-05, |
|
"loss": 0.8427, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.18898862600326538, |
|
"learning_rate": 9.849163073043223e-05, |
|
"loss": 0.8435, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 0.2034827470779419, |
|
"learning_rate": 9.839932757073638e-05, |
|
"loss": 0.8755, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5248, |
|
"grad_norm": 0.18574374914169312, |
|
"learning_rate": 9.830432932356206e-05, |
|
"loss": 0.7832, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5312, |
|
"grad_norm": 0.1965630203485489, |
|
"learning_rate": 9.820664127882957e-05, |
|
"loss": 0.8227, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.1712363362312317, |
|
"learning_rate": 9.8106268876239e-05, |
|
"loss": 0.7677, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.1882326453924179, |
|
"learning_rate": 9.800321770496726e-05, |
|
"loss": 0.8642, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5504, |
|
"grad_norm": 0.21462184190750122, |
|
"learning_rate": 9.789749350335693e-05, |
|
"loss": 0.8289, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 0.16833196580410004, |
|
"learning_rate": 9.778910215859667e-05, |
|
"loss": 0.7127, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 0.23049375414848328, |
|
"learning_rate": 9.767804970639339e-05, |
|
"loss": 0.8281, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5696, |
|
"grad_norm": 0.22619804739952087, |
|
"learning_rate": 9.756434233063616e-05, |
|
"loss": 0.8528, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.17294900119304657, |
|
"learning_rate": 9.744798636305188e-05, |
|
"loss": 0.7163, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5824, |
|
"grad_norm": 0.21446748077869415, |
|
"learning_rate": 9.732898828285273e-05, |
|
"loss": 0.76, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5888, |
|
"grad_norm": 0.19360435009002686, |
|
"learning_rate": 9.72073547163753e-05, |
|
"loss": 0.7694, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5952, |
|
"grad_norm": 0.16693833470344543, |
|
"learning_rate": 9.708309243671165e-05, |
|
"loss": 0.7513, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6016, |
|
"grad_norm": 0.2053443044424057, |
|
"learning_rate": 9.695620836333219e-05, |
|
"loss": 0.7768, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.20644915103912354, |
|
"learning_rate": 9.68267095617003e-05, |
|
"loss": 0.7645, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 0.179234117269516, |
|
"learning_rate": 9.669460324287898e-05, |
|
"loss": 0.8299, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6208, |
|
"grad_norm": 0.18114909529685974, |
|
"learning_rate": 9.655989676312918e-05, |
|
"loss": 0.7326, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6272, |
|
"grad_norm": 0.19695445895195007, |
|
"learning_rate": 9.642259762350033e-05, |
|
"loss": 0.7831, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6336, |
|
"grad_norm": 0.17625164985656738, |
|
"learning_rate": 9.628271346941252e-05, |
|
"loss": 0.7511, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2164350301027298, |
|
"learning_rate": 9.614025209023084e-05, |
|
"loss": 0.7882, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6464, |
|
"grad_norm": 0.18725153803825378, |
|
"learning_rate": 9.59952214188316e-05, |
|
"loss": 0.7671, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6528, |
|
"grad_norm": 0.18197496235370636, |
|
"learning_rate": 9.58476295311606e-05, |
|
"loss": 0.7698, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6592, |
|
"grad_norm": 0.19369439780712128, |
|
"learning_rate": 9.569748464578343e-05, |
|
"loss": 0.7534, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 0.2009640485048294, |
|
"learning_rate": 9.554479512342784e-05, |
|
"loss": 0.7415, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.19398680329322815, |
|
"learning_rate": 9.538956946651815e-05, |
|
"loss": 0.7429, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6784, |
|
"grad_norm": 0.20824606716632843, |
|
"learning_rate": 9.52318163187018e-05, |
|
"loss": 0.815, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6848, |
|
"grad_norm": 0.21556821465492249, |
|
"learning_rate": 9.507154446436805e-05, |
|
"loss": 0.7476, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 0.17891979217529297, |
|
"learning_rate": 9.490876282815884e-05, |
|
"loss": 0.7209, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6976, |
|
"grad_norm": 0.21326586604118347, |
|
"learning_rate": 9.474348047447177e-05, |
|
"loss": 0.7386, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.17727015912532806, |
|
"learning_rate": 9.457570660695541e-05, |
|
"loss": 0.7015, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 0.20510976016521454, |
|
"learning_rate": 9.440545056799677e-05, |
|
"loss": 0.7266, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 0.17708668112754822, |
|
"learning_rate": 9.423272183820108e-05, |
|
"loss": 0.7472, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7232, |
|
"grad_norm": 0.18563494086265564, |
|
"learning_rate": 9.405753003586395e-05, |
|
"loss": 0.7652, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 0.17155848443508148, |
|
"learning_rate": 9.387988491643558e-05, |
|
"loss": 0.7821, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.22160275280475616, |
|
"learning_rate": 9.369979637197775e-05, |
|
"loss": 0.7631, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 0.20093092322349548, |
|
"learning_rate": 9.351727443061283e-05, |
|
"loss": 0.7896, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 0.19560478627681732, |
|
"learning_rate": 9.333232925596552e-05, |
|
"loss": 0.7933, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7552, |
|
"grad_norm": 0.1992943435907364, |
|
"learning_rate": 9.314497114659671e-05, |
|
"loss": 0.7548, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7616, |
|
"grad_norm": 0.19478629529476166, |
|
"learning_rate": 9.295521053543019e-05, |
|
"loss": 0.7775, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.18368546664714813, |
|
"learning_rate": 9.276305798917159e-05, |
|
"loss": 0.7601, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7744, |
|
"grad_norm": 0.2044031322002411, |
|
"learning_rate": 9.256852420771998e-05, |
|
"loss": 0.7497, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7808, |
|
"grad_norm": 0.1834166944026947, |
|
"learning_rate": 9.237162002357214e-05, |
|
"loss": 0.7278, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 0.19447724521160126, |
|
"learning_rate": 9.217235640121926e-05, |
|
"loss": 0.72, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 0.1712876558303833, |
|
"learning_rate": 9.197074443653642e-05, |
|
"loss": 0.7404, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.20997127890586853, |
|
"learning_rate": 9.176679535616477e-05, |
|
"loss": 0.7825, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 0.1855696141719818, |
|
"learning_rate": 9.156052051688632e-05, |
|
"loss": 0.7547, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8128, |
|
"grad_norm": 0.20914626121520996, |
|
"learning_rate": 9.135193140499156e-05, |
|
"loss": 0.7972, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 0.21978524327278137, |
|
"learning_rate": 9.114103963563986e-05, |
|
"loss": 0.7325, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 0.2079222947359085, |
|
"learning_rate": 9.092785695221271e-05, |
|
"loss": 0.8084, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.1860748827457428, |
|
"learning_rate": 9.071239522565977e-05, |
|
"loss": 0.7359, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8384, |
|
"grad_norm": 0.1871631145477295, |
|
"learning_rate": 9.049466645383784e-05, |
|
"loss": 0.7138, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 0.19846683740615845, |
|
"learning_rate": 9.027468276084275e-05, |
|
"loss": 0.7176, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8512, |
|
"grad_norm": 0.2049422711133957, |
|
"learning_rate": 9.00524563963343e-05, |
|
"loss": 0.7595, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8576, |
|
"grad_norm": 0.17989754676818848, |
|
"learning_rate": 8.982799973485407e-05, |
|
"loss": 0.7445, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.32045578956604004, |
|
"learning_rate": 8.960132527513643e-05, |
|
"loss": 0.7214, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 0.2243393063545227, |
|
"learning_rate": 8.937244563941247e-05, |
|
"loss": 0.8011, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8768, |
|
"grad_norm": 0.20574022829532623, |
|
"learning_rate": 8.914137357270723e-05, |
|
"loss": 0.768, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 0.2292594015598297, |
|
"learning_rate": 8.890812194212988e-05, |
|
"loss": 0.7083, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8896, |
|
"grad_norm": 0.2046431005001068, |
|
"learning_rate": 8.867270373615734e-05, |
|
"loss": 0.77, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.22153380513191223, |
|
"learning_rate": 8.843513206391101e-05, |
|
"loss": 0.7262, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 0.214860737323761, |
|
"learning_rate": 8.81954201544267e-05, |
|
"loss": 0.8284, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9088, |
|
"grad_norm": 0.21554942429065704, |
|
"learning_rate": 8.795358135591811e-05, |
|
"loss": 0.7712, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9152, |
|
"grad_norm": 0.21957093477249146, |
|
"learning_rate": 8.77096291350334e-05, |
|
"loss": 0.7235, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 0.20161031186580658, |
|
"learning_rate": 8.746357707610545e-05, |
|
"loss": 0.7071, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.20437265932559967, |
|
"learning_rate": 8.721543888039533e-05, |
|
"loss": 0.7134, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9344, |
|
"grad_norm": 0.19950765371322632, |
|
"learning_rate": 8.69652283653294e-05, |
|
"loss": 0.6978, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 0.19119159877300262, |
|
"learning_rate": 8.671295946372988e-05, |
|
"loss": 0.7583, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 0.20924100279808044, |
|
"learning_rate": 8.645864622303898e-05, |
|
"loss": 0.7563, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9536, |
|
"grad_norm": 0.18858124315738678, |
|
"learning_rate": 8.620230280453673e-05, |
|
"loss": 0.7121, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.17950883507728577, |
|
"learning_rate": 8.594394348255238e-05, |
|
"loss": 0.6578, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9664, |
|
"grad_norm": 0.20606084167957306, |
|
"learning_rate": 8.568358264366957e-05, |
|
"loss": 0.71, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 0.18285425007343292, |
|
"learning_rate": 8.542123478592518e-05, |
|
"loss": 0.6941, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9792, |
|
"grad_norm": 0.20143665373325348, |
|
"learning_rate": 8.515691451800205e-05, |
|
"loss": 0.636, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9856, |
|
"grad_norm": 0.20506098866462708, |
|
"learning_rate": 8.489063655841551e-05, |
|
"loss": 0.7111, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.18759530782699585, |
|
"learning_rate": 8.462241573469379e-05, |
|
"loss": 0.733, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 0.22259001433849335, |
|
"learning_rate": 8.435226698255227e-05, |
|
"loss": 0.7405, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3408335745334625, |
|
"learning_rate": 8.408020534506195e-05, |
|
"loss": 0.6648, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0064, |
|
"grad_norm": 0.18838848173618317, |
|
"learning_rate": 8.380624597181165e-05, |
|
"loss": 0.6529, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0128, |
|
"grad_norm": 0.20577505230903625, |
|
"learning_rate": 8.353040411806447e-05, |
|
"loss": 0.6837, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0192, |
|
"grad_norm": 0.20288875699043274, |
|
"learning_rate": 8.325269514390835e-05, |
|
"loss": 0.7313, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0256, |
|
"grad_norm": 0.2250303328037262, |
|
"learning_rate": 8.297313451340064e-05, |
|
"loss": 0.7675, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 0.21464917063713074, |
|
"learning_rate": 8.269173779370711e-05, |
|
"loss": 0.778, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0384, |
|
"grad_norm": 0.19386309385299683, |
|
"learning_rate": 8.240852065423506e-05, |
|
"loss": 0.7163, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0448, |
|
"grad_norm": 0.1796194165945053, |
|
"learning_rate": 8.21234988657607e-05, |
|
"loss": 0.6736, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0512, |
|
"grad_norm": 0.21359069645404816, |
|
"learning_rate": 8.183668829955111e-05, |
|
"loss": 0.7748, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0576, |
|
"grad_norm": 0.19759395718574524, |
|
"learning_rate": 8.154810492648037e-05, |
|
"loss": 0.746, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 0.19113849103450775, |
|
"learning_rate": 8.125776481614024e-05, |
|
"loss": 0.6629, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.0704, |
|
"grad_norm": 0.18363501131534576, |
|
"learning_rate": 8.096568413594533e-05, |
|
"loss": 0.674, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0768, |
|
"grad_norm": 0.20529989898204803, |
|
"learning_rate": 8.067187915023282e-05, |
|
"loss": 0.7051, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0832, |
|
"grad_norm": 0.176677405834198, |
|
"learning_rate": 8.037636621935685e-05, |
|
"loss": 0.7434, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0896, |
|
"grad_norm": 0.2223319411277771, |
|
"learning_rate": 8.007916179877741e-05, |
|
"loss": 0.6931, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 0.18287041783332825, |
|
"learning_rate": 7.978028243814415e-05, |
|
"loss": 0.7043, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1024, |
|
"grad_norm": 0.2017446905374527, |
|
"learning_rate": 7.947974478037468e-05, |
|
"loss": 0.7687, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1088, |
|
"grad_norm": 0.17362761497497559, |
|
"learning_rate": 7.91775655607279e-05, |
|
"loss": 0.7475, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.1152, |
|
"grad_norm": 0.23194265365600586, |
|
"learning_rate": 7.887376160587215e-05, |
|
"loss": 0.7347, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1216, |
|
"grad_norm": 0.19501113891601562, |
|
"learning_rate": 7.85683498329481e-05, |
|
"loss": 0.7044, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 0.20629528164863586, |
|
"learning_rate": 7.826134724862687e-05, |
|
"loss": 0.7334, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.1344, |
|
"grad_norm": 0.21323485672473907, |
|
"learning_rate": 7.795277094816291e-05, |
|
"loss": 0.7211, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1408, |
|
"grad_norm": 0.23606210947036743, |
|
"learning_rate": 7.764263811444215e-05, |
|
"loss": 0.6876, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1472, |
|
"grad_norm": 0.1839892566204071, |
|
"learning_rate": 7.733096601702507e-05, |
|
"loss": 0.7583, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1536, |
|
"grad_norm": 0.21783791482448578, |
|
"learning_rate": 7.70177720111852e-05, |
|
"loss": 0.6689, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.1926935911178589, |
|
"learning_rate": 7.67030735369426e-05, |
|
"loss": 0.76, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1663999999999999, |
|
"grad_norm": 0.18627937138080597, |
|
"learning_rate": 7.638688811809274e-05, |
|
"loss": 0.6958, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.1728, |
|
"grad_norm": 0.20846357941627502, |
|
"learning_rate": 7.60692333612307e-05, |
|
"loss": 0.6813, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1792, |
|
"grad_norm": 0.1936255395412445, |
|
"learning_rate": 7.575012695477076e-05, |
|
"loss": 0.6926, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1856, |
|
"grad_norm": 0.18793527781963348, |
|
"learning_rate": 7.54295866679615e-05, |
|
"loss": 0.6974, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 0.19964028894901276, |
|
"learning_rate": 7.510763034989617e-05, |
|
"loss": 0.7603, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.1984, |
|
"grad_norm": 0.20926351845264435, |
|
"learning_rate": 7.478427592851893e-05, |
|
"loss": 0.6682, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2048, |
|
"grad_norm": 0.22575032711029053, |
|
"learning_rate": 7.44595414096265e-05, |
|
"loss": 0.7939, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2112, |
|
"grad_norm": 0.19395670294761658, |
|
"learning_rate": 7.413344487586542e-05, |
|
"loss": 0.6991, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2176, |
|
"grad_norm": 0.19221967458724976, |
|
"learning_rate": 7.380600448572531e-05, |
|
"loss": 0.7237, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 0.2095392495393753, |
|
"learning_rate": 7.347723847252756e-05, |
|
"loss": 0.7686, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.2304, |
|
"grad_norm": 0.19817109405994415, |
|
"learning_rate": 7.314716514341006e-05, |
|
"loss": 0.711, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.2368000000000001, |
|
"grad_norm": 0.19115367531776428, |
|
"learning_rate": 7.28158028783079e-05, |
|
"loss": 0.6759, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.2432, |
|
"grad_norm": 0.1996709704399109, |
|
"learning_rate": 7.248317012892969e-05, |
|
"loss": 0.7026, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2496, |
|
"grad_norm": 0.1884952187538147, |
|
"learning_rate": 7.214928541773027e-05, |
|
"loss": 0.7281, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 0.1934693157672882, |
|
"learning_rate": 7.181416733687919e-05, |
|
"loss": 0.6471, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2624, |
|
"grad_norm": 0.19305215775966644, |
|
"learning_rate": 7.147783454722545e-05, |
|
"loss": 0.7218, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2688, |
|
"grad_norm": 0.17377148568630219, |
|
"learning_rate": 7.114030577725836e-05, |
|
"loss": 0.6985, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2752, |
|
"grad_norm": 0.19253912568092346, |
|
"learning_rate": 7.080159982206471e-05, |
|
"loss": 0.723, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2816, |
|
"grad_norm": 0.2046697437763214, |
|
"learning_rate": 7.046173554228213e-05, |
|
"loss": 0.6556, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 0.18565595149993896, |
|
"learning_rate": 7.012073186304886e-05, |
|
"loss": 0.7111, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.2944, |
|
"grad_norm": 0.1991768181324005, |
|
"learning_rate": 6.977860777294988e-05, |
|
"loss": 0.7236, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3008, |
|
"grad_norm": 0.20029094815254211, |
|
"learning_rate": 6.943538232295964e-05, |
|
"loss": 0.6902, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3072, |
|
"grad_norm": 0.20728713274002075, |
|
"learning_rate": 6.909107462538113e-05, |
|
"loss": 0.723, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3136, |
|
"grad_norm": 0.20966534316539764, |
|
"learning_rate": 6.874570385278161e-05, |
|
"loss": 0.7214, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.19986563920974731, |
|
"learning_rate": 6.839928923692504e-05, |
|
"loss": 0.7464, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.3264, |
|
"grad_norm": 0.22603270411491394, |
|
"learning_rate": 6.805185006770124e-05, |
|
"loss": 0.7287, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3328, |
|
"grad_norm": 0.1987605094909668, |
|
"learning_rate": 6.770340569205158e-05, |
|
"loss": 0.7483, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3392, |
|
"grad_norm": 0.23768657445907593, |
|
"learning_rate": 6.735397551289178e-05, |
|
"loss": 0.685, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3456000000000001, |
|
"grad_norm": 0.19051390886306763, |
|
"learning_rate": 6.700357898803145e-05, |
|
"loss": 0.7484, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.1968153864145279, |
|
"learning_rate": 6.665223562909058e-05, |
|
"loss": 0.7095, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3584, |
|
"grad_norm": 0.21399809420108795, |
|
"learning_rate": 6.629996500041299e-05, |
|
"loss": 0.6991, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.3648, |
|
"grad_norm": 0.21878115832805634, |
|
"learning_rate": 6.594678671797703e-05, |
|
"loss": 0.7192, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3712, |
|
"grad_norm": 0.19469957053661346, |
|
"learning_rate": 6.559272044830317e-05, |
|
"loss": 0.6861, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3776, |
|
"grad_norm": 0.23191341757774353, |
|
"learning_rate": 6.523778590735891e-05, |
|
"loss": 0.7194, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 0.21132110059261322, |
|
"learning_rate": 6.488200285946094e-05, |
|
"loss": 0.685, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.3904, |
|
"grad_norm": 0.22124455869197845, |
|
"learning_rate": 6.452539111617453e-05, |
|
"loss": 0.6685, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.3968, |
|
"grad_norm": 0.19542793929576874, |
|
"learning_rate": 6.416797053521038e-05, |
|
"loss": 0.6813, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.4032, |
|
"grad_norm": 0.2289544641971588, |
|
"learning_rate": 6.38097610193188e-05, |
|
"loss": 0.6714, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4096, |
|
"grad_norm": 0.2579493224620819, |
|
"learning_rate": 6.345078251518143e-05, |
|
"loss": 0.7159, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 0.19421054422855377, |
|
"learning_rate": 6.309105501230067e-05, |
|
"loss": 0.7322, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4224, |
|
"grad_norm": 0.20962068438529968, |
|
"learning_rate": 6.273059854188636e-05, |
|
"loss": 0.7126, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.4288, |
|
"grad_norm": 0.19890280067920685, |
|
"learning_rate": 6.236943317574054e-05, |
|
"loss": 0.7368, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4352, |
|
"grad_norm": 0.20143720507621765, |
|
"learning_rate": 6.200757902513962e-05, |
|
"loss": 0.7455, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4416, |
|
"grad_norm": 0.23619429767131805, |
|
"learning_rate": 6.164505623971457e-05, |
|
"loss": 0.7332, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.18844173848628998, |
|
"learning_rate": 6.128188500632892e-05, |
|
"loss": 0.726, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4544000000000001, |
|
"grad_norm": 0.2009935975074768, |
|
"learning_rate": 6.091808554795462e-05, |
|
"loss": 0.7047, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4607999999999999, |
|
"grad_norm": 0.21597343683242798, |
|
"learning_rate": 6.055367812254592e-05, |
|
"loss": 0.6953, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4672, |
|
"grad_norm": 0.21373218297958374, |
|
"learning_rate": 6.0188683021911396e-05, |
|
"loss": 0.7517, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4736, |
|
"grad_norm": 0.2007780373096466, |
|
"learning_rate": 5.9823120570583926e-05, |
|
"loss": 0.6925, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.24474382400512695, |
|
"learning_rate": 5.9457011124689023e-05, |
|
"loss": 0.6962, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.4864, |
|
"grad_norm": 0.20391108095645905, |
|
"learning_rate": 5.909037507081121e-05, |
|
"loss": 0.75, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.4928, |
|
"grad_norm": 0.20164735615253448, |
|
"learning_rate": 5.8723232824858886e-05, |
|
"loss": 0.7141, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.4992, |
|
"grad_norm": 0.19647496938705444, |
|
"learning_rate": 5.835560483092743e-05, |
|
"loss": 0.7297, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.5056, |
|
"grad_norm": 0.1943722367286682, |
|
"learning_rate": 5.798751156016085e-05, |
|
"loss": 0.7056, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 0.1920538693666458, |
|
"learning_rate": 5.761897350961175e-05, |
|
"loss": 0.7046, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.5184, |
|
"grad_norm": 0.2010536789894104, |
|
"learning_rate": 5.7250011201100095e-05, |
|
"loss": 0.7222, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5248, |
|
"grad_norm": 0.20179545879364014, |
|
"learning_rate": 5.688064518007036e-05, |
|
"loss": 0.6906, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5312000000000001, |
|
"grad_norm": 0.19752287864685059, |
|
"learning_rate": 5.6510896014447526e-05, |
|
"loss": 0.7016, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5375999999999999, |
|
"grad_norm": 0.20254461467266083, |
|
"learning_rate": 5.6140784293491725e-05, |
|
"loss": 0.6707, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 0.2074304223060608, |
|
"learning_rate": 5.577033062665179e-05, |
|
"loss": 0.7068, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.5504, |
|
"grad_norm": 0.1754094362258911, |
|
"learning_rate": 5.53995556424176e-05, |
|
"loss": 0.6778, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5568, |
|
"grad_norm": 0.19732975959777832, |
|
"learning_rate": 5.50284799871714e-05, |
|
"loss": 0.7431, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.5632000000000001, |
|
"grad_norm": 0.22642773389816284, |
|
"learning_rate": 5.465712432403812e-05, |
|
"loss": 0.7276, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.5695999999999999, |
|
"grad_norm": 0.19384561479091644, |
|
"learning_rate": 5.428550933173476e-05, |
|
"loss": 0.6988, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 0.19731836020946503, |
|
"learning_rate": 5.391365570341893e-05, |
|
"loss": 0.6659, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.5824, |
|
"grad_norm": 0.1964057832956314, |
|
"learning_rate": 5.3541584145536475e-05, |
|
"loss": 0.6876, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.5888, |
|
"grad_norm": 0.21582961082458496, |
|
"learning_rate": 5.316931537666856e-05, |
|
"loss": 0.7048, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.5952, |
|
"grad_norm": 0.18675744533538818, |
|
"learning_rate": 5.279687012637799e-05, |
|
"loss": 0.657, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6016, |
|
"grad_norm": 0.21063970029354095, |
|
"learning_rate": 5.24242691340547e-05, |
|
"loss": 0.6786, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 0.20948725938796997, |
|
"learning_rate": 5.2051533147761155e-05, |
|
"loss": 0.712, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.6143999999999998, |
|
"grad_norm": 0.2167293280363083, |
|
"learning_rate": 5.167868292307678e-05, |
|
"loss": 0.668, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6208, |
|
"grad_norm": 0.19093649089336395, |
|
"learning_rate": 5.1305739221942364e-05, |
|
"loss": 0.7175, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.6272, |
|
"grad_norm": 0.2434680163860321, |
|
"learning_rate": 5.093272281150383e-05, |
|
"loss": 0.6888, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6336, |
|
"grad_norm": 0.21365131437778473, |
|
"learning_rate": 5.05596544629559e-05, |
|
"loss": 0.6907, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.24530403316020966, |
|
"learning_rate": 5.018655495038541e-05, |
|
"loss": 0.6995, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.6463999999999999, |
|
"grad_norm": 0.2155607044696808, |
|
"learning_rate": 4.981344504961459e-05, |
|
"loss": 0.6894, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.6528, |
|
"grad_norm": 0.193691223859787, |
|
"learning_rate": 4.944034553704412e-05, |
|
"loss": 0.6693, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.6592, |
|
"grad_norm": 0.23936335742473602, |
|
"learning_rate": 4.9067277188496185e-05, |
|
"loss": 0.7161, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6656, |
|
"grad_norm": 0.19866809248924255, |
|
"learning_rate": 4.869426077805765e-05, |
|
"loss": 0.6558, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 0.21519921720027924, |
|
"learning_rate": 4.8321317076923223e-05, |
|
"loss": 0.6388, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.6784, |
|
"grad_norm": 0.23403239250183105, |
|
"learning_rate": 4.794846685223884e-05, |
|
"loss": 0.723, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.6848, |
|
"grad_norm": 0.20778563618659973, |
|
"learning_rate": 4.757573086594529e-05, |
|
"loss": 0.7281, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.6912, |
|
"grad_norm": 0.2214445173740387, |
|
"learning_rate": 4.7203129873622045e-05, |
|
"loss": 0.6819, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.6976, |
|
"grad_norm": 0.23840227723121643, |
|
"learning_rate": 4.6830684623331446e-05, |
|
"loss": 0.6612, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 0.21336662769317627, |
|
"learning_rate": 4.645841585446356e-05, |
|
"loss": 0.6873, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.7104, |
|
"grad_norm": 0.20929306745529175, |
|
"learning_rate": 4.60863442965811e-05, |
|
"loss": 0.6937, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.7168, |
|
"grad_norm": 0.2372131198644638, |
|
"learning_rate": 4.5714490668265245e-05, |
|
"loss": 0.6603, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.7231999999999998, |
|
"grad_norm": 0.19182223081588745, |
|
"learning_rate": 4.5342875675961885e-05, |
|
"loss": 0.7286, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7296, |
|
"grad_norm": 0.19139911234378815, |
|
"learning_rate": 4.497152001282861e-05, |
|
"loss": 0.6884, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 0.22471372783184052, |
|
"learning_rate": 4.460044435758241e-05, |
|
"loss": 0.6634, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.7424, |
|
"grad_norm": 0.1903676688671112, |
|
"learning_rate": 4.4229669373348226e-05, |
|
"loss": 0.6813, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.7488000000000001, |
|
"grad_norm": 0.18955154716968536, |
|
"learning_rate": 4.3859215706508294e-05, |
|
"loss": 0.6851, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.7551999999999999, |
|
"grad_norm": 0.208612322807312, |
|
"learning_rate": 4.348910398555249e-05, |
|
"loss": 0.6685, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7616, |
|
"grad_norm": 0.19398434460163116, |
|
"learning_rate": 4.3119354819929655e-05, |
|
"loss": 0.6544, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 0.1975647658109665, |
|
"learning_rate": 4.27499887988999e-05, |
|
"loss": 0.6744, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.7744, |
|
"grad_norm": 0.19207845628261566, |
|
"learning_rate": 4.2381026490388245e-05, |
|
"loss": 0.6928, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.7808000000000002, |
|
"grad_norm": 0.19270960986614227, |
|
"learning_rate": 4.201248843983918e-05, |
|
"loss": 0.6818, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.7872, |
|
"grad_norm": 0.20096321403980255, |
|
"learning_rate": 4.164439516907258e-05, |
|
"loss": 0.6712, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7936, |
|
"grad_norm": 0.20221710205078125, |
|
"learning_rate": 4.127676717514113e-05, |
|
"loss": 0.697, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.19503732025623322, |
|
"learning_rate": 4.0909624929188805e-05, |
|
"loss": 0.6939, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.8064, |
|
"grad_norm": 0.20334869623184204, |
|
"learning_rate": 4.0542988875310995e-05, |
|
"loss": 0.644, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.8128, |
|
"grad_norm": 0.20078979432582855, |
|
"learning_rate": 4.0176879429416086e-05, |
|
"loss": 0.7029, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.8192, |
|
"grad_norm": 0.20540271699428558, |
|
"learning_rate": 3.981131697808862e-05, |
|
"loss": 0.6893, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.8256000000000001, |
|
"grad_norm": 0.21139805018901825, |
|
"learning_rate": 3.9446321877454094e-05, |
|
"loss": 0.7459, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.2058684229850769, |
|
"learning_rate": 3.90819144520454e-05, |
|
"loss": 0.6872, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.8384, |
|
"grad_norm": 0.20217950642108917, |
|
"learning_rate": 3.8718114993671084e-05, |
|
"loss": 0.679, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.8448, |
|
"grad_norm": 0.1952074021100998, |
|
"learning_rate": 3.835494376028544e-05, |
|
"loss": 0.7296, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.8512, |
|
"grad_norm": 0.20614856481552124, |
|
"learning_rate": 3.7992420974860384e-05, |
|
"loss": 0.6543, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.8576000000000001, |
|
"grad_norm": 0.19841645658016205, |
|
"learning_rate": 3.7630566824259456e-05, |
|
"loss": 0.6866, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 0.20447316765785217, |
|
"learning_rate": 3.726940145811363e-05, |
|
"loss": 0.6684, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.8704, |
|
"grad_norm": 0.2253882735967636, |
|
"learning_rate": 3.6908944987699345e-05, |
|
"loss": 0.6934, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.8768, |
|
"grad_norm": 0.20601052045822144, |
|
"learning_rate": 3.654921748481858e-05, |
|
"loss": 0.6936, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.8832, |
|
"grad_norm": 0.19950821995735168, |
|
"learning_rate": 3.6190238980681236e-05, |
|
"loss": 0.6804, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.8896, |
|
"grad_norm": 0.23119482398033142, |
|
"learning_rate": 3.583202946478963e-05, |
|
"loss": 0.682, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 0.19434580206871033, |
|
"learning_rate": 3.547460888382547e-05, |
|
"loss": 0.6698, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.9024, |
|
"grad_norm": 0.19283151626586914, |
|
"learning_rate": 3.511799714053907e-05, |
|
"loss": 0.6838, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.9088, |
|
"grad_norm": 0.2131282240152359, |
|
"learning_rate": 3.47622140926411e-05, |
|
"loss": 0.7203, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.9152, |
|
"grad_norm": 0.20812910795211792, |
|
"learning_rate": 3.4407279551696846e-05, |
|
"loss": 0.697, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9216, |
|
"grad_norm": 0.19355922937393188, |
|
"learning_rate": 3.4053213282022984e-05, |
|
"loss": 0.7057, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 0.19805647432804108, |
|
"learning_rate": 3.370003499958703e-05, |
|
"loss": 0.7373, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.9344000000000001, |
|
"grad_norm": 0.21588417887687683, |
|
"learning_rate": 3.334776437090944e-05, |
|
"loss": 0.7187, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.9407999999999999, |
|
"grad_norm": 0.17682795226573944, |
|
"learning_rate": 3.299642101196854e-05, |
|
"loss": 0.7133, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.9472, |
|
"grad_norm": 0.201612651348114, |
|
"learning_rate": 3.2646024487108215e-05, |
|
"loss": 0.6767, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.9536, |
|
"grad_norm": 0.20109272003173828, |
|
"learning_rate": 3.2296594307948425e-05, |
|
"loss": 0.6907, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.1948445439338684, |
|
"learning_rate": 3.1948149932298774e-05, |
|
"loss": 0.7196, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.9664000000000001, |
|
"grad_norm": 0.20404204726219177, |
|
"learning_rate": 3.160071076307497e-05, |
|
"loss": 0.7123, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.9727999999999999, |
|
"grad_norm": 0.20908042788505554, |
|
"learning_rate": 3.125429614721842e-05, |
|
"loss": 0.6982, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.9792, |
|
"grad_norm": 0.21416354179382324, |
|
"learning_rate": 3.0908925374618895e-05, |
|
"loss": 0.6476, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9856, |
|
"grad_norm": 0.20553742349147797, |
|
"learning_rate": 3.056461767704037e-05, |
|
"loss": 0.729, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 0.2184964120388031, |
|
"learning_rate": 3.0221392227050126e-05, |
|
"loss": 0.6833, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.9984, |
|
"grad_norm": 0.19415715336799622, |
|
"learning_rate": 2.987926813695116e-05, |
|
"loss": 0.75, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.4077971279621124, |
|
"learning_rate": 2.9538264457717878e-05, |
|
"loss": 0.7006, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.0064, |
|
"grad_norm": 0.23124489188194275, |
|
"learning_rate": 2.9198400177935305e-05, |
|
"loss": 0.6592, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.0128, |
|
"grad_norm": 0.2515321969985962, |
|
"learning_rate": 2.885969422274165e-05, |
|
"loss": 0.6851, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.0192, |
|
"grad_norm": 0.21909447014331818, |
|
"learning_rate": 2.8522165452774557e-05, |
|
"loss": 0.7244, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.0256, |
|
"grad_norm": 0.20850086212158203, |
|
"learning_rate": 2.8185832663120815e-05, |
|
"loss": 0.7144, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 0.2121347337961197, |
|
"learning_rate": 2.7850714582269722e-05, |
|
"loss": 0.6756, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.0384, |
|
"grad_norm": 0.22937630116939545, |
|
"learning_rate": 2.7516829871070292e-05, |
|
"loss": 0.6907, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0448, |
|
"grad_norm": 0.20349667966365814, |
|
"learning_rate": 2.7184197121692127e-05, |
|
"loss": 0.6523, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.0512, |
|
"grad_norm": 0.19774508476257324, |
|
"learning_rate": 2.6852834856589947e-05, |
|
"loss": 0.6745, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.0576, |
|
"grad_norm": 0.1831101030111313, |
|
"learning_rate": 2.652276152747246e-05, |
|
"loss": 0.6973, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 0.20878373086452484, |
|
"learning_rate": 2.6193995514274705e-05, |
|
"loss": 0.67, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.0704, |
|
"grad_norm": 0.21983082592487335, |
|
"learning_rate": 2.5866555124134577e-05, |
|
"loss": 0.6811, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.0768, |
|
"grad_norm": 0.19198405742645264, |
|
"learning_rate": 2.5540458590373527e-05, |
|
"loss": 0.6282, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.0832, |
|
"grad_norm": 0.19146208465099335, |
|
"learning_rate": 2.5215724071481072e-05, |
|
"loss": 0.7243, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.0896, |
|
"grad_norm": 0.22032876312732697, |
|
"learning_rate": 2.4892369650103836e-05, |
|
"loss": 0.6853, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 0.19633299112319946, |
|
"learning_rate": 2.457041333203852e-05, |
|
"loss": 0.6914, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.1024, |
|
"grad_norm": 0.17531536519527435, |
|
"learning_rate": 2.4249873045229244e-05, |
|
"loss": 0.7256, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1088, |
|
"grad_norm": 0.2074475884437561, |
|
"learning_rate": 2.3930766638769326e-05, |
|
"loss": 0.7376, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.1152, |
|
"grad_norm": 0.20909680426120758, |
|
"learning_rate": 2.3613111881907275e-05, |
|
"loss": 0.6866, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.1216, |
|
"grad_norm": 0.19580084085464478, |
|
"learning_rate": 2.3296926463057396e-05, |
|
"loss": 0.7069, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.18059836328029633, |
|
"learning_rate": 2.2982227988814796e-05, |
|
"loss": 0.6449, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.1344, |
|
"grad_norm": 0.1941729485988617, |
|
"learning_rate": 2.2669033982974945e-05, |
|
"loss": 0.6295, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.1408, |
|
"grad_norm": 0.2014259248971939, |
|
"learning_rate": 2.235736188555787e-05, |
|
"loss": 0.6776, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.1471999999999998, |
|
"grad_norm": 0.21426711976528168, |
|
"learning_rate": 2.2047229051837102e-05, |
|
"loss": 0.6988, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.1536, |
|
"grad_norm": 0.21534332633018494, |
|
"learning_rate": 2.173865275137314e-05, |
|
"loss": 0.6451, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.21011656522750854, |
|
"learning_rate": 2.143165016705192e-05, |
|
"loss": 0.6699, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.1664, |
|
"grad_norm": 0.21305830776691437, |
|
"learning_rate": 2.1126238394127868e-05, |
|
"loss": 0.6535, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.1728, |
|
"grad_norm": 0.22721970081329346, |
|
"learning_rate": 2.0822434439272122e-05, |
|
"loss": 0.6317, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.1792, |
|
"grad_norm": 0.20172137022018433, |
|
"learning_rate": 2.052025521962534e-05, |
|
"loss": 0.6911, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.1856, |
|
"grad_norm": 0.19113659858703613, |
|
"learning_rate": 2.0219717561855855e-05, |
|
"loss": 0.7039, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 0.2135518491268158, |
|
"learning_rate": 1.992083820122259e-05, |
|
"loss": 0.684, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.1984, |
|
"grad_norm": 0.19133660197257996, |
|
"learning_rate": 1.962363378064316e-05, |
|
"loss": 0.6835, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.2048, |
|
"grad_norm": 0.19742318987846375, |
|
"learning_rate": 1.9328120849767194e-05, |
|
"loss": 0.6411, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.2112, |
|
"grad_norm": 0.2134242057800293, |
|
"learning_rate": 1.903431586405468e-05, |
|
"loss": 0.6987, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.2176, |
|
"grad_norm": 0.19351407885551453, |
|
"learning_rate": 1.8742235183859747e-05, |
|
"loss": 0.6387, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 0.19710959494113922, |
|
"learning_rate": 1.8451895073519643e-05, |
|
"loss": 0.6638, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.2304, |
|
"grad_norm": 0.18867988884449005, |
|
"learning_rate": 1.8163311700448898e-05, |
|
"loss": 0.6697, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.2368, |
|
"grad_norm": 0.18838374316692352, |
|
"learning_rate": 1.7876501134239316e-05, |
|
"loss": 0.7276, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.2432, |
|
"grad_norm": 0.1833486109972, |
|
"learning_rate": 1.7591479345764973e-05, |
|
"loss": 0.6753, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.2496, |
|
"grad_norm": 0.23941516876220703, |
|
"learning_rate": 1.7308262206292897e-05, |
|
"loss": 0.6962, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.19911228120326996, |
|
"learning_rate": 1.7026865486599377e-05, |
|
"loss": 0.6764, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.2624, |
|
"grad_norm": 0.21614587306976318, |
|
"learning_rate": 1.6747304856091662e-05, |
|
"loss": 0.6218, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.2688, |
|
"grad_norm": 0.1931372880935669, |
|
"learning_rate": 1.6469595881935525e-05, |
|
"loss": 0.7262, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.2752, |
|
"grad_norm": 0.19275271892547607, |
|
"learning_rate": 1.6193754028188364e-05, |
|
"loss": 0.7365, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.2816, |
|
"grad_norm": 0.19287358224391937, |
|
"learning_rate": 1.591979465493806e-05, |
|
"loss": 0.6888, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 0.18417416512966156, |
|
"learning_rate": 1.564773301744774e-05, |
|
"loss": 0.6533, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.2944, |
|
"grad_norm": 0.1791851967573166, |
|
"learning_rate": 1.5377584265306223e-05, |
|
"loss": 0.6738, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.3008, |
|
"grad_norm": 0.1805466115474701, |
|
"learning_rate": 1.510936344158448e-05, |
|
"loss": 0.664, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.3072, |
|
"grad_norm": 0.1946578025817871, |
|
"learning_rate": 1.4843085481997959e-05, |
|
"loss": 0.7261, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.3136, |
|
"grad_norm": 0.1858345866203308, |
|
"learning_rate": 1.457876521407484e-05, |
|
"loss": 0.6374, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.19666247069835663, |
|
"learning_rate": 1.431641735633044e-05, |
|
"loss": 0.6469, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.3264, |
|
"grad_norm": 0.16774997115135193, |
|
"learning_rate": 1.4056056517447635e-05, |
|
"loss": 0.6785, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.3327999999999998, |
|
"grad_norm": 0.19413676857948303, |
|
"learning_rate": 1.3797697195463278e-05, |
|
"loss": 0.6797, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.3392, |
|
"grad_norm": 0.1797027438879013, |
|
"learning_rate": 1.3541353776961036e-05, |
|
"loss": 0.6968, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.3456, |
|
"grad_norm": 0.17702320218086243, |
|
"learning_rate": 1.3287040536270135e-05, |
|
"loss": 0.6696, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 0.19950032234191895, |
|
"learning_rate": 1.3034771634670601e-05, |
|
"loss": 0.6559, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.3584, |
|
"grad_norm": 0.19765135645866394, |
|
"learning_rate": 1.2784561119604682e-05, |
|
"loss": 0.6154, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.3648, |
|
"grad_norm": 0.21111543476581573, |
|
"learning_rate": 1.2536422923894564e-05, |
|
"loss": 0.6153, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.3712, |
|
"grad_norm": 0.20043286681175232, |
|
"learning_rate": 1.2290370864966622e-05, |
|
"loss": 0.6988, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.3776, |
|
"grad_norm": 0.19548365473747253, |
|
"learning_rate": 1.2046418644081903e-05, |
|
"loss": 0.7292, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 0.1912689507007599, |
|
"learning_rate": 1.1804579845573289e-05, |
|
"loss": 0.7139, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.3904, |
|
"grad_norm": 0.18108686804771423, |
|
"learning_rate": 1.1564867936088992e-05, |
|
"loss": 0.6878, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.3968, |
|
"grad_norm": 0.2090510129928589, |
|
"learning_rate": 1.1327296263842652e-05, |
|
"loss": 0.6882, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.4032, |
|
"grad_norm": 0.19618839025497437, |
|
"learning_rate": 1.1091878057870136e-05, |
|
"loss": 0.6613, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.4096, |
|
"grad_norm": 0.18068960309028625, |
|
"learning_rate": 1.0858626427292795e-05, |
|
"loss": 0.6645, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 0.1936527043581009, |
|
"learning_rate": 1.0627554360587534e-05, |
|
"loss": 0.6942, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.4224, |
|
"grad_norm": 0.19242267310619354, |
|
"learning_rate": 1.0398674724863583e-05, |
|
"loss": 0.707, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.4288, |
|
"grad_norm": 0.17682623863220215, |
|
"learning_rate": 1.0172000265145936e-05, |
|
"loss": 0.6429, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.4352, |
|
"grad_norm": 0.18140776455402374, |
|
"learning_rate": 9.94754360366571e-06, |
|
"loss": 0.7112, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.4416, |
|
"grad_norm": 0.1756039410829544, |
|
"learning_rate": 9.72531723915726e-06, |
|
"loss": 0.6616, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 0.19845832884311676, |
|
"learning_rate": 9.505333546162171e-06, |
|
"loss": 0.6774, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.4544, |
|
"grad_norm": 0.19389702379703522, |
|
"learning_rate": 9.287604774340236e-06, |
|
"loss": 0.7189, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.4608, |
|
"grad_norm": 0.20373724400997162, |
|
"learning_rate": 9.07214304778729e-06, |
|
"loss": 0.7034, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.4672, |
|
"grad_norm": 0.1839911788702011, |
|
"learning_rate": 8.858960364360141e-06, |
|
"loss": 0.6394, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.4736000000000002, |
|
"grad_norm": 0.18362964689731598, |
|
"learning_rate": 8.648068595008457e-06, |
|
"loss": 0.673, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.18894684314727783, |
|
"learning_rate": 8.439479483113683e-06, |
|
"loss": 0.6781, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.4864, |
|
"grad_norm": 0.1816522479057312, |
|
"learning_rate": 8.233204643835236e-06, |
|
"loss": 0.6894, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.4928, |
|
"grad_norm": 0.1781761348247528, |
|
"learning_rate": 8.029255563463589e-06, |
|
"loss": 0.681, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.4992, |
|
"grad_norm": 0.18166184425354004, |
|
"learning_rate": 7.827643598780749e-06, |
|
"loss": 0.6752, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.5056000000000003, |
|
"grad_norm": 0.19368097186088562, |
|
"learning_rate": 7.628379976427868e-06, |
|
"loss": 0.6476, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.1752738654613495, |
|
"learning_rate": 7.431475792280018e-06, |
|
"loss": 0.6838, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.5183999999999997, |
|
"grad_norm": 0.1779792755842209, |
|
"learning_rate": 7.236942010828429e-06, |
|
"loss": 0.7104, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.5248, |
|
"grad_norm": 0.18969357013702393, |
|
"learning_rate": 7.0447894645698175e-06, |
|
"loss": 0.6944, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.5312, |
|
"grad_norm": 0.18680952489376068, |
|
"learning_rate": 6.855028853403294e-06, |
|
"loss": 0.6931, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.5376, |
|
"grad_norm": 0.19200138747692108, |
|
"learning_rate": 6.667670744034499e-06, |
|
"loss": 0.732, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 0.1732376366853714, |
|
"learning_rate": 6.482725569387172e-06, |
|
"loss": 0.6958, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.5504, |
|
"grad_norm": 0.1787935346364975, |
|
"learning_rate": 6.300203628022272e-06, |
|
"loss": 0.6913, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.5568, |
|
"grad_norm": 0.18525420129299164, |
|
"learning_rate": 6.120115083564431e-06, |
|
"loss": 0.6651, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.5632, |
|
"grad_norm": 0.19965125620365143, |
|
"learning_rate": 5.942469964136055e-06, |
|
"loss": 0.6701, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.5696, |
|
"grad_norm": 0.20355017483234406, |
|
"learning_rate": 5.767278161798911e-06, |
|
"loss": 0.6546, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 0.18207506835460663, |
|
"learning_rate": 5.5945494320032434e-06, |
|
"loss": 0.651, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.5824, |
|
"grad_norm": 0.16914696991443634, |
|
"learning_rate": 5.424293393044611e-06, |
|
"loss": 0.6754, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.5888, |
|
"grad_norm": 0.17765933275222778, |
|
"learning_rate": 5.256519525528253e-06, |
|
"loss": 0.7015, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.5952, |
|
"grad_norm": 0.17559665441513062, |
|
"learning_rate": 5.091237171841173e-06, |
|
"loss": 0.7134, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.6016, |
|
"grad_norm": 0.17291496694087982, |
|
"learning_rate": 4.928455535631959e-06, |
|
"loss": 0.7031, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 0.18128253519535065, |
|
"learning_rate": 4.768183681298211e-06, |
|
"loss": 0.7073, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.6144, |
|
"grad_norm": 0.1818084418773651, |
|
"learning_rate": 4.610430533481857e-06, |
|
"loss": 0.7378, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.6208, |
|
"grad_norm": 0.17898201942443848, |
|
"learning_rate": 4.455204876572172e-06, |
|
"loss": 0.7126, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.6272, |
|
"grad_norm": 0.18443121016025543, |
|
"learning_rate": 4.302515354216574e-06, |
|
"loss": 0.6359, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.6336, |
|
"grad_norm": 0.16818998754024506, |
|
"learning_rate": 4.1523704688394176e-06, |
|
"loss": 0.6955, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.17661628127098083, |
|
"learning_rate": 4.004778581168412e-06, |
|
"loss": 0.6518, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.6464, |
|
"grad_norm": 0.17434482276439667, |
|
"learning_rate": 3.859747909769162e-06, |
|
"loss": 0.6369, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.6528, |
|
"grad_norm": 0.1895863562822342, |
|
"learning_rate": 3.7172865305874826e-06, |
|
"loss": 0.6902, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.6592000000000002, |
|
"grad_norm": 0.17753319442272186, |
|
"learning_rate": 3.5774023764996723e-06, |
|
"loss": 0.7559, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.6656, |
|
"grad_norm": 0.16980338096618652, |
|
"learning_rate": 3.440103236870823e-06, |
|
"loss": 0.6977, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 0.17221416532993317, |
|
"learning_rate": 3.3053967571210378e-06, |
|
"loss": 0.673, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.6784, |
|
"grad_norm": 0.18261651694774628, |
|
"learning_rate": 3.1732904382996976e-06, |
|
"loss": 0.6809, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.6848, |
|
"grad_norm": 0.18153472244739532, |
|
"learning_rate": 3.04379163666782e-06, |
|
"loss": 0.6428, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.6912000000000003, |
|
"grad_norm": 0.1869465857744217, |
|
"learning_rate": 2.916907563288357e-06, |
|
"loss": 0.6529, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.6976, |
|
"grad_norm": 0.18409712612628937, |
|
"learning_rate": 2.792645283624712e-06, |
|
"loss": 0.6518, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 0.19606178998947144, |
|
"learning_rate": 2.671011717147276e-06, |
|
"loss": 0.6725, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.7104, |
|
"grad_norm": 0.17141272127628326, |
|
"learning_rate": 2.5520136369481198e-06, |
|
"loss": 0.6554, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.7168, |
|
"grad_norm": 0.16549499332904816, |
|
"learning_rate": 2.4356576693638556e-06, |
|
"loss": 0.6629, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.7232, |
|
"grad_norm": 0.18200421333312988, |
|
"learning_rate": 2.321950293606623e-06, |
|
"loss": 0.7158, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.7296, |
|
"grad_norm": 0.1918717324733734, |
|
"learning_rate": 2.210897841403331e-06, |
|
"loss": 0.7156, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 0.16785910725593567, |
|
"learning_rate": 2.1025064966430696e-06, |
|
"loss": 0.6626, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.7424, |
|
"grad_norm": 0.17111513018608093, |
|
"learning_rate": 1.9967822950327454e-06, |
|
"loss": 0.6273, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.7488, |
|
"grad_norm": 0.18554243445396423, |
|
"learning_rate": 1.8937311237610166e-06, |
|
"loss": 0.6827, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.7552, |
|
"grad_norm": 0.1815970093011856, |
|
"learning_rate": 1.793358721170435e-06, |
|
"loss": 0.6945, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.7616, |
|
"grad_norm": 0.18188947439193726, |
|
"learning_rate": 1.6956706764379438e-06, |
|
"loss": 0.6751, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 0.17477133870124817, |
|
"learning_rate": 1.6006724292636166e-06, |
|
"loss": 0.6617, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.7744, |
|
"grad_norm": 0.17050299048423767, |
|
"learning_rate": 1.5083692695677832e-06, |
|
"loss": 0.6813, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.7808, |
|
"grad_norm": 0.18110086023807526, |
|
"learning_rate": 1.418766337196431e-06, |
|
"loss": 0.654, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.7872, |
|
"grad_norm": 0.17326004803180695, |
|
"learning_rate": 1.331868621635024e-06, |
|
"loss": 0.6791, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.7936, |
|
"grad_norm": 0.16975191235542297, |
|
"learning_rate": 1.2476809617306407e-06, |
|
"loss": 0.6558, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.18156126141548157, |
|
"learning_rate": 1.166208045422551e-06, |
|
"loss": 0.6519, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.8064, |
|
"grad_norm": 0.17490343749523163, |
|
"learning_rate": 1.0874544094811423e-06, |
|
"loss": 0.6796, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.8128, |
|
"grad_norm": 0.2003561407327652, |
|
"learning_rate": 1.0114244392553317e-06, |
|
"loss": 0.6837, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.8192, |
|
"grad_norm": 0.17029455304145813, |
|
"learning_rate": 9.381223684283291e-07, |
|
"loss": 0.619, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.8256, |
|
"grad_norm": 0.18762946128845215, |
|
"learning_rate": 8.675522787819023e-07, |
|
"loss": 0.7032, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 0.1766004115343094, |
|
"learning_rate": 7.9971809996911e-07, |
|
"loss": 0.6922, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.8384, |
|
"grad_norm": 0.17784501612186432, |
|
"learning_rate": 7.346236092954318e-07, |
|
"loss": 0.735, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.8448, |
|
"grad_norm": 0.1895914524793625, |
|
"learning_rate": 6.722724315084805e-07, |
|
"loss": 0.7254, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.8512, |
|
"grad_norm": 0.17988507449626923, |
|
"learning_rate": 6.12668038596137e-07, |
|
"loss": 0.6898, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.8576, |
|
"grad_norm": 0.18528032302856445, |
|
"learning_rate": 5.558137495932037e-07, |
|
"loss": 0.6719, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 0.1748681515455246, |
|
"learning_rate": 5.017127303966085e-07, |
|
"loss": 0.6558, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.8704, |
|
"grad_norm": 0.1760055422782898, |
|
"learning_rate": 4.5036799358910697e-07, |
|
"loss": 0.7183, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.8768000000000002, |
|
"grad_norm": 0.17728447914123535, |
|
"learning_rate": 4.0178239827151075e-07, |
|
"loss": 0.6566, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.8832, |
|
"grad_norm": 0.1707494556903839, |
|
"learning_rate": 3.5595864990352056e-07, |
|
"loss": 0.7113, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.8895999999999997, |
|
"grad_norm": 0.17298643290996552, |
|
"learning_rate": 3.128993001530245e-07, |
|
"loss": 0.636, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 0.18352839350700378, |
|
"learning_rate": 2.72606746754045e-07, |
|
"loss": 0.6611, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.9024, |
|
"grad_norm": 0.1850634068250656, |
|
"learning_rate": 2.3508323337321226e-07, |
|
"loss": 0.6805, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.9088000000000003, |
|
"grad_norm": 0.176837757229805, |
|
"learning_rate": 2.0033084948483105e-07, |
|
"loss": 0.7048, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.9152, |
|
"grad_norm": 0.1697586327791214, |
|
"learning_rate": 1.6835153025451245e-07, |
|
"loss": 0.6847, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.9215999999999998, |
|
"grad_norm": 0.16893458366394043, |
|
"learning_rate": 1.3914705643143789e-07, |
|
"loss": 0.6545, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 0.17263327538967133, |
|
"learning_rate": 1.1271905424918295e-07, |
|
"loss": 0.7085, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.9344, |
|
"grad_norm": 0.17428065836429596, |
|
"learning_rate": 8.906899533517865e-08, |
|
"loss": 0.6696, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.9408, |
|
"grad_norm": 0.17830806970596313, |
|
"learning_rate": 6.819819662874371e-08, |
|
"loss": 0.6562, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.9472, |
|
"grad_norm": 0.16810065507888794, |
|
"learning_rate": 5.0107820307770947e-08, |
|
"loss": 0.7167, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.9536, |
|
"grad_norm": 0.18890230357646942, |
|
"learning_rate": 3.4798873723984605e-08, |
|
"loss": 0.7027, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.181377574801445, |
|
"learning_rate": 2.227220934688523e-08, |
|
"loss": 0.6166, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.9664, |
|
"grad_norm": 0.16959308087825775, |
|
"learning_rate": 1.2528524716259871e-08, |
|
"loss": 0.7082, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.9728, |
|
"grad_norm": 0.16611914336681366, |
|
"learning_rate": 5.568362403318705e-09, |
|
"loss": 0.6582, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.9792, |
|
"grad_norm": 0.16966696083545685, |
|
"learning_rate": 1.3921099805302984e-09, |
|
"loss": 0.676, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.9856, |
|
"grad_norm": 0.16860099136829376, |
|
"learning_rate": 0.0, |
|
"loss": 0.6525, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.9856, |
|
"step": 468, |
|
"total_flos": 6.618679011990045e+18, |
|
"train_loss": 0.7691191567314996, |
|
"train_runtime": 29137.5675, |
|
"train_samples_per_second": 1.03, |
|
"train_steps_per_second": 0.016 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.618679011990045e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|