{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.012672665061462425,
  "eval_steps": 50,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 6.336332530731213e-05,
      "grad_norm": 0.6353956460952759,
      "learning_rate": 2e-05,
      "loss": 1.0504,
      "step": 1
    },
    {
      "epoch": 6.336332530731213e-05,
      "eval_loss": 1.031355619430542,
      "eval_runtime": 244.6123,
      "eval_samples_per_second": 27.17,
      "eval_steps_per_second": 13.585,
      "step": 1
    },
    {
      "epoch": 0.00012672665061462426,
      "grad_norm": 0.6307565569877625,
      "learning_rate": 4e-05,
      "loss": 1.0031,
      "step": 2
    },
    {
      "epoch": 0.0001900899759219364,
      "grad_norm": 0.5707690119743347,
      "learning_rate": 6e-05,
      "loss": 1.0266,
      "step": 3
    },
    {
      "epoch": 0.0002534533012292485,
      "grad_norm": 0.4813165068626404,
      "learning_rate": 8e-05,
      "loss": 0.813,
      "step": 4
    },
    {
      "epoch": 0.0003168166265365606,
      "grad_norm": 0.5120643377304077,
      "learning_rate": 0.0001,
      "loss": 0.9762,
      "step": 5
    },
    {
      "epoch": 0.0003801799518438728,
      "grad_norm": 0.5939896106719971,
      "learning_rate": 0.00012,
      "loss": 1.2066,
      "step": 6
    },
    {
      "epoch": 0.0004435432771511849,
      "grad_norm": 0.5686677098274231,
      "learning_rate": 0.00014,
      "loss": 0.909,
      "step": 7
    },
    {
      "epoch": 0.000506906602458497,
      "grad_norm": 0.5287855267524719,
      "learning_rate": 0.00016,
      "loss": 0.8397,
      "step": 8
    },
    {
      "epoch": 0.0005702699277658091,
      "grad_norm": 0.7295874357223511,
      "learning_rate": 0.00018,
      "loss": 1.0453,
      "step": 9
    },
    {
      "epoch": 0.0006336332530731213,
      "grad_norm": 0.540809154510498,
      "learning_rate": 0.0002,
      "loss": 0.8007,
      "step": 10
    },
    {
      "epoch": 0.0006969965783804334,
      "grad_norm": 0.8492153882980347,
      "learning_rate": 0.0001999863304992469,
      "loss": 0.7473,
      "step": 11
    },
    {
      "epoch": 0.0007603599036877456,
      "grad_norm": 0.9232691526412964,
      "learning_rate": 0.00019994532573409262,
      "loss": 0.7589,
      "step": 12
    },
    {
      "epoch": 0.0008237232289950577,
      "grad_norm": 0.683419942855835,
      "learning_rate": 0.00019987699691483048,
      "loss": 0.7474,
      "step": 13
    },
    {
      "epoch": 0.0008870865543023698,
      "grad_norm": 0.7936184406280518,
      "learning_rate": 0.00019978136272187747,
      "loss": 0.7031,
      "step": 14
    },
    {
      "epoch": 0.0009504498796096819,
      "grad_norm": 0.9965357184410095,
      "learning_rate": 0.000199658449300667,
      "loss": 0.8313,
      "step": 15
    },
    {
      "epoch": 0.001013813204916994,
      "grad_norm": 0.6645439863204956,
      "learning_rate": 0.00019950829025450114,
      "loss": 0.6294,
      "step": 16
    },
    {
      "epoch": 0.001077176530224306,
      "grad_norm": 0.6381065249443054,
      "learning_rate": 0.00019933092663536382,
      "loss": 0.5783,
      "step": 17
    },
    {
      "epoch": 0.0011405398555316183,
      "grad_norm": 0.72495037317276,
      "learning_rate": 0.00019912640693269752,
      "loss": 0.5646,
      "step": 18
    },
    {
      "epoch": 0.0012039031808389305,
      "grad_norm": 0.714322030544281,
      "learning_rate": 0.00019889478706014687,
      "loss": 0.5865,
      "step": 19
    },
    {
      "epoch": 0.0012672665061462425,
      "grad_norm": 0.6124002933502197,
      "learning_rate": 0.00019863613034027224,
      "loss": 0.5744,
      "step": 20
    },
    {
      "epoch": 0.0013306298314535547,
      "grad_norm": 0.6906410455703735,
      "learning_rate": 0.00019835050748723824,
      "loss": 0.588,
      "step": 21
    },
    {
      "epoch": 0.0013939931567608667,
      "grad_norm": 0.6728127598762512,
      "learning_rate": 0.00019803799658748094,
      "loss": 0.5253,
      "step": 22
    },
    {
      "epoch": 0.001457356482068179,
      "grad_norm": 0.6391797065734863,
      "learning_rate": 0.00019769868307835994,
      "loss": 0.4975,
      "step": 23
    },
    {
      "epoch": 0.0015207198073754911,
      "grad_norm": 0.7494686841964722,
      "learning_rate": 0.0001973326597248006,
      "loss": 0.5511,
      "step": 24
    },
    {
      "epoch": 0.0015840831326828031,
      "grad_norm": 0.6722864508628845,
      "learning_rate": 0.00019694002659393305,
      "loss": 0.5188,
      "step": 25
    },
    {
      "epoch": 0.0016474464579901153,
      "grad_norm": 0.7420151233673096,
      "learning_rate": 0.00019652089102773488,
      "loss": 0.5475,
      "step": 26
    },
    {
      "epoch": 0.0017108097832974276,
      "grad_norm": 0.8725250959396362,
      "learning_rate": 0.00019607536761368484,
      "loss": 0.5319,
      "step": 27
    },
    {
      "epoch": 0.0017741731086047396,
      "grad_norm": 0.6723807454109192,
      "learning_rate": 0.00019560357815343577,
      "loss": 0.4288,
      "step": 28
    },
    {
      "epoch": 0.0018375364339120518,
      "grad_norm": 0.9087620377540588,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.7248,
      "step": 29
    },
    {
      "epoch": 0.0019008997592193638,
      "grad_norm": 0.6110153794288635,
      "learning_rate": 0.00019458172417006347,
      "loss": 0.3915,
      "step": 30
    },
    {
      "epoch": 0.001964263084526676,
      "grad_norm": 0.8582053184509277,
      "learning_rate": 0.00019403193901161613,
      "loss": 0.3134,
      "step": 31
    },
    {
      "epoch": 0.002027626409833988,
      "grad_norm": 0.5912528038024902,
      "learning_rate": 0.0001934564464599461,
      "loss": 0.5711,
      "step": 32
    },
    {
      "epoch": 0.0020909897351413004,
      "grad_norm": 0.9212374091148376,
      "learning_rate": 0.00019285540384897073,
      "loss": 0.5806,
      "step": 33
    },
    {
      "epoch": 0.002154353060448612,
      "grad_norm": 0.5697605609893799,
      "learning_rate": 0.00019222897549773848,
      "loss": 0.4163,
      "step": 34
    },
    {
      "epoch": 0.0022177163857559244,
      "grad_norm": 0.8068739175796509,
      "learning_rate": 0.00019157733266550575,
      "loss": 0.5182,
      "step": 35
    },
    {
      "epoch": 0.0022810797110632366,
      "grad_norm": 0.6407749056816101,
      "learning_rate": 0.00019090065350491626,
      "loss": 0.4277,
      "step": 36
    },
    {
      "epoch": 0.002344443036370549,
      "grad_norm": 0.6839185953140259,
      "learning_rate": 0.00019019912301329592,
      "loss": 0.4279,
      "step": 37
    },
    {
      "epoch": 0.002407806361677861,
      "grad_norm": 0.7499701380729675,
      "learning_rate": 0.00018947293298207635,
      "loss": 0.8125,
      "step": 38
    },
    {
      "epoch": 0.002471169686985173,
      "grad_norm": 0.6437113285064697,
      "learning_rate": 0.0001887222819443612,
      "loss": 0.413,
      "step": 39
    },
    {
      "epoch": 0.002534533012292485,
      "grad_norm": 0.908708930015564,
      "learning_rate": 0.0001879473751206489,
      "loss": 0.3385,
      "step": 40
    },
    {
      "epoch": 0.0025978963375997972,
      "grad_norm": 0.4454040229320526,
      "learning_rate": 0.00018714842436272773,
      "loss": 0.3258,
      "step": 41
    },
    {
      "epoch": 0.0026612596629071094,
      "grad_norm": 0.7585754990577698,
      "learning_rate": 0.00018632564809575742,
      "loss": 0.5409,
      "step": 42
    },
    {
      "epoch": 0.0027246229882144216,
      "grad_norm": 0.5129997134208679,
      "learning_rate": 0.0001854792712585539,
      "loss": 0.373,
      "step": 43
    },
    {
      "epoch": 0.0027879863135217334,
      "grad_norm": 0.5935911536216736,
      "learning_rate": 0.00018460952524209355,
      "loss": 0.4932,
      "step": 44
    },
    {
      "epoch": 0.0028513496388290456,
      "grad_norm": 0.7715666890144348,
      "learning_rate": 0.00018371664782625287,
      "loss": 0.4267,
      "step": 45
    },
    {
      "epoch": 0.002914712964136358,
      "grad_norm": 0.4729037880897522,
      "learning_rate": 0.00018280088311480201,
      "loss": 0.4372,
      "step": 46
    },
    {
      "epoch": 0.00297807628944367,
      "grad_norm": 0.5748558640480042,
      "learning_rate": 0.00018186248146866927,
      "loss": 0.4251,
      "step": 47
    },
    {
      "epoch": 0.0030414396147509823,
      "grad_norm": 0.6334372758865356,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.4073,
      "step": 48
    },
    {
      "epoch": 0.003104802940058294,
      "grad_norm": 0.5685266256332397,
      "learning_rate": 0.0001799187996894925,
      "loss": 0.5214,
      "step": 49
    },
    {
      "epoch": 0.0031681662653656063,
      "grad_norm": 0.8813576698303223,
      "learning_rate": 0.00017891405093963938,
      "loss": 0.4797,
      "step": 50
    },
    {
      "epoch": 0.0031681662653656063,
      "eval_loss": 0.4177018702030182,
      "eval_runtime": 243.0252,
      "eval_samples_per_second": 27.347,
      "eval_steps_per_second": 13.673,
      "step": 50
    },
    {
      "epoch": 0.0032315295906729185,
      "grad_norm": 0.8580302596092224,
      "learning_rate": 0.00017788772787621126,
      "loss": 0.3992,
      "step": 51
    },
    {
      "epoch": 0.0032948929159802307,
      "grad_norm": 0.7331790328025818,
      "learning_rate": 0.00017684011108568592,
      "loss": 0.5749,
      "step": 52
    },
    {
      "epoch": 0.003358256241287543,
      "grad_norm": 1.4232603311538696,
      "learning_rate": 0.0001757714869760335,
      "loss": 0.633,
      "step": 53
    },
    {
      "epoch": 0.003421619566594855,
      "grad_norm": 0.6469828486442566,
      "learning_rate": 0.0001746821476984154,
      "loss": 0.4805,
      "step": 54
    },
    {
      "epoch": 0.003484982891902167,
      "grad_norm": 0.5833075046539307,
      "learning_rate": 0.00017357239106731317,
      "loss": 0.303,
      "step": 55
    },
    {
      "epoch": 0.003548346217209479,
      "grad_norm": 0.5430967807769775,
      "learning_rate": 0.00017244252047910892,
      "loss": 0.3606,
      "step": 56
    },
    {
      "epoch": 0.0036117095425167913,
      "grad_norm": 0.7270248532295227,
      "learning_rate": 0.00017129284482913972,
      "loss": 0.519,
      "step": 57
    },
    {
      "epoch": 0.0036750728678241035,
      "grad_norm": 0.497071236371994,
      "learning_rate": 0.00017012367842724887,
      "loss": 0.3192,
      "step": 58
    },
    {
      "epoch": 0.0037384361931314157,
      "grad_norm": 0.5522649884223938,
      "learning_rate": 0.0001689353409118566,
      "loss": 0.4403,
      "step": 59
    },
    {
      "epoch": 0.0038017995184387275,
      "grad_norm": 1.4259494543075562,
      "learning_rate": 0.00016772815716257412,
      "loss": 0.4538,
      "step": 60
    },
    {
      "epoch": 0.0038651628437460397,
      "grad_norm": 0.5942732095718384,
      "learning_rate": 0.0001665024572113848,
      "loss": 0.3762,
      "step": 61
    },
    {
      "epoch": 0.003928526169053352,
      "grad_norm": 0.5439912676811218,
      "learning_rate": 0.00016525857615241687,
      "loss": 0.4122,
      "step": 62
    },
    {
      "epoch": 0.003991889494360664,
      "grad_norm": 0.4867478311061859,
      "learning_rate": 0.00016399685405033167,
      "loss": 0.2925,
      "step": 63
    },
    {
      "epoch": 0.004055252819667976,
      "grad_norm": 0.8213767409324646,
      "learning_rate": 0.0001627176358473537,
      "loss": 0.4217,
      "step": 64
    },
    {
      "epoch": 0.004118616144975288,
      "grad_norm": 1.1848950386047363,
      "learning_rate": 0.0001614212712689668,
      "loss": 0.3365,
      "step": 65
    },
    {
      "epoch": 0.004181979470282601,
      "grad_norm": 0.7401292324066162,
      "learning_rate": 0.00016010811472830252,
      "loss": 0.7397,
      "step": 66
    },
    {
      "epoch": 0.0042453427955899126,
      "grad_norm": 0.910925030708313,
      "learning_rate": 0.00015877852522924732,
      "loss": 0.431,
      "step": 67
    },
    {
      "epoch": 0.004308706120897224,
      "grad_norm": 0.8558295965194702,
      "learning_rate": 0.00015743286626829437,
      "loss": 0.3999,
      "step": 68
    },
    {
      "epoch": 0.004372069446204537,
      "grad_norm": 0.5612680315971375,
      "learning_rate": 0.0001560715057351673,
      "loss": 0.4328,
      "step": 69
    },
    {
      "epoch": 0.004435432771511849,
      "grad_norm": 0.38900530338287354,
      "learning_rate": 0.00015469481581224272,
      "loss": 0.4551,
      "step": 70
    },
    {
      "epoch": 0.004498796096819161,
      "grad_norm": 0.5994769334793091,
      "learning_rate": 0.0001533031728727994,
      "loss": 0.4326,
      "step": 71
    },
    {
      "epoch": 0.004562159422126473,
      "grad_norm": 0.5441716909408569,
      "learning_rate": 0.00015189695737812152,
      "loss": 0.7896,
      "step": 72
    },
    {
      "epoch": 0.004625522747433785,
      "grad_norm": 1.3651723861694336,
      "learning_rate": 0.0001504765537734844,
      "loss": 0.3994,
      "step": 73
    },
    {
      "epoch": 0.004688886072741098,
      "grad_norm": 1.0307939052581787,
      "learning_rate": 0.00014904235038305083,
      "loss": 0.4365,
      "step": 74
    },
    {
      "epoch": 0.004752249398048409,
      "grad_norm": 0.5589604377746582,
      "learning_rate": 0.00014759473930370736,
      "loss": 0.4381,
      "step": 75
    },
    {
      "epoch": 0.004815612723355722,
      "grad_norm": 0.6483279466629028,
      "learning_rate": 0.0001461341162978688,
      "loss": 1.2379,
      "step": 76
    },
    {
      "epoch": 0.004878976048663034,
      "grad_norm": 0.688217282295227,
      "learning_rate": 0.00014466088068528068,
      "loss": 0.3769,
      "step": 77
    },
    {
      "epoch": 0.004942339373970346,
      "grad_norm": 0.6985648274421692,
      "learning_rate": 0.00014317543523384928,
      "loss": 0.2771,
      "step": 78
    },
    {
      "epoch": 0.005005702699277658,
      "grad_norm": 1.2414263486862183,
      "learning_rate": 0.00014167818604952906,
      "loss": 0.3534,
      "step": 79
    },
    {
      "epoch": 0.00506906602458497,
      "grad_norm": 0.6788727045059204,
      "learning_rate": 0.00014016954246529696,
      "loss": 0.5284,
      "step": 80
    },
    {
      "epoch": 0.005132429349892283,
      "grad_norm": 0.7818289995193481,
      "learning_rate": 0.00013864991692924523,
      "loss": 0.2941,
      "step": 81
    },
    {
      "epoch": 0.0051957926751995944,
      "grad_norm": 0.6787352561950684,
      "learning_rate": 0.00013711972489182208,
      "loss": 0.3318,
      "step": 82
    },
    {
      "epoch": 0.005259156000506906,
      "grad_norm": 0.971752941608429,
      "learning_rate": 0.00013557938469225167,
      "loss": 0.4559,
      "step": 83
    },
    {
      "epoch": 0.005322519325814219,
      "grad_norm": 0.5854300260543823,
      "learning_rate": 0.00013402931744416433,
      "loss": 0.3601,
      "step": 84
    },
    {
      "epoch": 0.005385882651121531,
      "grad_norm": 0.7647449374198914,
      "learning_rate": 0.00013246994692046836,
      "loss": 0.435,
      "step": 85
    },
    {
      "epoch": 0.005449245976428843,
      "grad_norm": 0.5981462001800537,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.4311,
      "step": 86
    },
    {
      "epoch": 0.005512609301736155,
      "grad_norm": 0.4233483374118805,
      "learning_rate": 0.0001293250037384465,
      "loss": 0.415,
      "step": 87
    },
    {
      "epoch": 0.005575972627043467,
      "grad_norm": 0.8820663094520569,
      "learning_rate": 0.00012774029087618446,
      "loss": 0.3958,
      "step": 88
    },
    {
      "epoch": 0.0056393359523507795,
      "grad_norm": 0.6327323317527771,
      "learning_rate": 0.00012614799409538198,
      "loss": 0.6534,
      "step": 89
    },
    {
      "epoch": 0.005702699277658091,
      "grad_norm": 0.5511671304702759,
      "learning_rate": 0.00012454854871407994,
      "loss": 0.3668,
      "step": 90
    },
    {
      "epoch": 0.005766062602965404,
      "grad_norm": 0.7432234883308411,
      "learning_rate": 0.00012294239200467516,
      "loss": 0.3317,
      "step": 91
    },
    {
      "epoch": 0.005829425928272716,
      "grad_norm": 0.6933262348175049,
      "learning_rate": 0.0001213299630743747,
      "loss": 0.3444,
      "step": 92
    },
    {
      "epoch": 0.0058927892535800275,
      "grad_norm": 0.7745187878608704,
      "learning_rate": 0.00011971170274514802,
      "loss": 0.1975,
      "step": 93
    },
    {
      "epoch": 0.00595615257888734,
      "grad_norm": 0.6152411699295044,
      "learning_rate": 0.000118088053433211,
      "loss": 0.5463,
      "step": 94
    },
    {
      "epoch": 0.006019515904194652,
      "grad_norm": 1.3992512226104736,
      "learning_rate": 0.00011645945902807341,
      "loss": 0.4622,
      "step": 95
    },
    {
      "epoch": 0.0060828792295019645,
      "grad_norm": 0.8983330726623535,
      "learning_rate": 0.0001148263647711842,
      "loss": 0.466,
      "step": 96
    },
    {
      "epoch": 0.006146242554809276,
      "grad_norm": 0.5503897666931152,
      "learning_rate": 0.00011318921713420691,
      "loss": 0.37,
      "step": 97
    },
    {
      "epoch": 0.006209605880116588,
      "grad_norm": 0.5463910102844238,
      "learning_rate": 0.00011154846369695863,
      "loss": 0.3198,
      "step": 98
    },
    {
      "epoch": 0.006272969205423901,
      "grad_norm": 0.4767880141735077,
      "learning_rate": 0.0001099045530250463,
      "loss": 0.3526,
      "step": 99
    },
    {
      "epoch": 0.0063363325307312125,
      "grad_norm": 1.2058966159820557,
      "learning_rate": 0.00010825793454723325,
      "loss": 0.4968,
      "step": 100
    },
    {
      "epoch": 0.0063363325307312125,
      "eval_loss": 0.3803197145462036,
      "eval_runtime": 243.4104,
      "eval_samples_per_second": 27.304,
      "eval_steps_per_second": 13.652,
      "step": 100
    },
    {
      "epoch": 0.006399695856038525,
      "grad_norm": 0.4597567319869995,
      "learning_rate": 0.00010660905843256994,
      "loss": 0.4651,
      "step": 101
    },
    {
      "epoch": 0.006463059181345837,
      "grad_norm": 0.6151206493377686,
      "learning_rate": 0.00010495837546732224,
      "loss": 0.443,
      "step": 102
    },
    {
      "epoch": 0.006526422506653149,
      "grad_norm": 1.1683275699615479,
      "learning_rate": 0.00010330633693173082,
      "loss": 0.629,
      "step": 103
    },
    {
      "epoch": 0.006589785831960461,
      "grad_norm": 1.0897656679153442,
      "learning_rate": 0.00010165339447663587,
      "loss": 0.42,
      "step": 104
    },
    {
      "epoch": 0.006653149157267773,
      "grad_norm": 0.5582302212715149,
      "learning_rate": 0.0001,
      "loss": 0.2376,
      "step": 105
    },
    {
      "epoch": 0.006716512482575086,
      "grad_norm": 0.5862799286842346,
      "learning_rate": 9.834660552336415e-05,
      "loss": 0.291,
      "step": 106
    },
    {
      "epoch": 0.006779875807882398,
      "grad_norm": 0.7772594094276428,
      "learning_rate": 9.669366306826919e-05,
      "loss": 0.3586,
      "step": 107
    },
    {
      "epoch": 0.00684323913318971,
      "grad_norm": 0.7611879110336304,
      "learning_rate": 9.504162453267777e-05,
      "loss": 0.3803,
      "step": 108
    },
    {
      "epoch": 0.006906602458497022,
      "grad_norm": 1.1162878274917603,
      "learning_rate": 9.339094156743007e-05,
      "loss": 0.4645,
      "step": 109
    },
    {
      "epoch": 0.006969965783804334,
      "grad_norm": 2.089069128036499,
      "learning_rate": 9.174206545276677e-05,
      "loss": 0.3853,
      "step": 110
    },
    {
      "epoch": 0.007033329109111646,
      "grad_norm": 0.658745527267456,
      "learning_rate": 9.009544697495374e-05,
      "loss": 0.5383,
      "step": 111
    },
    {
      "epoch": 0.007096692434418958,
      "grad_norm": 0.7592114210128784,
      "learning_rate": 8.845153630304139e-05,
      "loss": 0.6006,
      "step": 112
    },
    {
      "epoch": 0.007160055759726271,
      "grad_norm": 0.666191041469574,
      "learning_rate": 8.681078286579311e-05,
      "loss": 0.2725,
      "step": 113
    },
    {
      "epoch": 0.007223419085033583,
      "grad_norm": 0.5870088934898376,
      "learning_rate": 8.517363522881579e-05,
      "loss": 0.2916,
      "step": 114
    },
    {
      "epoch": 0.007286782410340894,
      "grad_norm": 0.5232657790184021,
      "learning_rate": 8.35405409719266e-05,
      "loss": 0.4027,
      "step": 115
    },
    {
      "epoch": 0.007350145735648207,
      "grad_norm": 0.8465791344642639,
      "learning_rate": 8.191194656678904e-05,
      "loss": 0.396,
      "step": 116
    },
    {
      "epoch": 0.007413509060955519,
      "grad_norm": 0.4592651128768921,
      "learning_rate": 8.028829725485199e-05,
      "loss": 0.2962,
      "step": 117
    },
    {
      "epoch": 0.0074768723862628315,
      "grad_norm": 0.4681718051433563,
      "learning_rate": 7.867003692562534e-05,
      "loss": 0.2729,
      "step": 118
    },
    {
      "epoch": 0.007540235711570143,
      "grad_norm": 0.5922088623046875,
      "learning_rate": 7.705760799532485e-05,
      "loss": 0.5202,
      "step": 119
    },
    {
      "epoch": 0.007603599036877455,
      "grad_norm": 0.9889622330665588,
      "learning_rate": 7.54514512859201e-05,
      "loss": 0.4689,
      "step": 120
    },
    {
      "epoch": 0.007666962362184768,
      "grad_norm": 0.49433234333992004,
      "learning_rate": 7.385200590461803e-05,
      "loss": 0.332,
      "step": 121
    },
    {
      "epoch": 0.0077303256874920795,
      "grad_norm": 0.6642858386039734,
      "learning_rate": 7.225970912381556e-05,
      "loss": 0.4718,
      "step": 122
    },
    {
      "epoch": 0.007793689012799392,
      "grad_norm": 1.099217414855957,
      "learning_rate": 7.067499626155354e-05,
      "loss": 0.3997,
      "step": 123
    },
    {
      "epoch": 0.007857052338106704,
      "grad_norm": 0.4867766201496124,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.3403,
      "step": 124
    },
    {
      "epoch": 0.007920415663414017,
      "grad_norm": 1.1562979221343994,
      "learning_rate": 6.753005307953167e-05,
      "loss": 0.4399,
      "step": 125
    },
    {
      "epoch": 0.007983778988721327,
      "grad_norm": 0.48987850546836853,
      "learning_rate": 6.59706825558357e-05,
      "loss": 0.7933,
      "step": 126
    },
    {
      "epoch": 0.00804714231402864,
      "grad_norm": 0.7575961947441101,
      "learning_rate": 6.442061530774834e-05,
      "loss": 0.5499,
      "step": 127
    },
    {
      "epoch": 0.008110505639335953,
      "grad_norm": 0.48674753308296204,
      "learning_rate": 6.28802751081779e-05,
      "loss": 0.4209,
      "step": 128
    },
    {
      "epoch": 0.008173868964643264,
      "grad_norm": 4.047654628753662,
      "learning_rate": 6.135008307075481e-05,
      "loss": 0.4438,
      "step": 129
    },
    {
      "epoch": 0.008237232289950576,
      "grad_norm": 0.9471072554588318,
      "learning_rate": 5.983045753470308e-05,
      "loss": 0.4107,
      "step": 130
    },
    {
      "epoch": 0.008300595615257889,
      "grad_norm": 0.49057498574256897,
      "learning_rate": 5.832181395047098e-05,
      "loss": 0.2653,
      "step": 131
    },
    {
      "epoch": 0.008363958940565202,
      "grad_norm": 0.7635347247123718,
      "learning_rate": 5.6824564766150726e-05,
      "loss": 0.4394,
      "step": 132
    },
    {
      "epoch": 0.008427322265872512,
      "grad_norm": 0.7269777059555054,
      "learning_rate": 5.533911931471936e-05,
      "loss": 0.414,
      "step": 133
    },
    {
      "epoch": 0.008490685591179825,
      "grad_norm": 0.7226413488388062,
      "learning_rate": 5.386588370213124e-05,
      "loss": 0.4055,
      "step": 134
    },
    {
      "epoch": 0.008554048916487138,
      "grad_norm": 0.6612048745155334,
      "learning_rate": 5.240526069629265e-05,
      "loss": 0.369,
      "step": 135
    },
    {
      "epoch": 0.008617412241794449,
      "grad_norm": 0.6469954252243042,
      "learning_rate": 5.095764961694922e-05,
      "loss": 0.3586,
      "step": 136
    },
    {
      "epoch": 0.008680775567101761,
      "grad_norm": 3.354370355606079,
      "learning_rate": 4.952344622651566e-05,
      "loss": 0.6585,
      "step": 137
    },
    {
      "epoch": 0.008744138892409074,
      "grad_norm": 0.4419330954551697,
      "learning_rate": 4.810304262187852e-05,
      "loss": 0.4426,
      "step": 138
    },
    {
      "epoch": 0.008807502217716385,
      "grad_norm": 0.6738267540931702,
      "learning_rate": 4.669682712720065e-05,
      "loss": 0.4253,
      "step": 139
    },
    {
      "epoch": 0.008870865543023698,
      "grad_norm": 0.39082711935043335,
      "learning_rate": 4.530518418775733e-05,
      "loss": 0.3901,
      "step": 140
    },
    {
      "epoch": 0.00893422886833101,
      "grad_norm": 0.42554858326911926,
      "learning_rate": 4.392849426483274e-05,
      "loss": 0.3449,
      "step": 141
    },
    {
      "epoch": 0.008997592193638323,
      "grad_norm": 0.7555180788040161,
      "learning_rate": 4.256713373170564e-05,
      "loss": 0.3519,
      "step": 142
    },
    {
      "epoch": 0.009060955518945634,
      "grad_norm": 0.6981064081192017,
      "learning_rate": 4.12214747707527e-05,
      "loss": 0.4164,
      "step": 143
    },
    {
      "epoch": 0.009124318844252946,
      "grad_norm": 0.632774293422699,
      "learning_rate": 3.9891885271697496e-05,
      "loss": 0.2556,
      "step": 144
    },
    {
      "epoch": 0.009187682169560259,
      "grad_norm": 0.9713982343673706,
      "learning_rate": 3.857872873103322e-05,
      "loss": 0.315,
      "step": 145
    },
    {
      "epoch": 0.00925104549486757,
      "grad_norm": 0.6205018758773804,
      "learning_rate": 3.7282364152646297e-05,
      "loss": 0.5835,
      "step": 146
    },
    {
      "epoch": 0.009314408820174883,
      "grad_norm": 0.5199056267738342,
      "learning_rate": 3.600314594966834e-05,
      "loss": 0.4307,
      "step": 147
    },
    {
      "epoch": 0.009377772145482195,
      "grad_norm": 0.4700050354003906,
      "learning_rate": 3.4741423847583134e-05,
      "loss": 0.435,
      "step": 148
    },
    {
      "epoch": 0.009441135470789508,
      "grad_norm": 0.4441298246383667,
      "learning_rate": 3.349754278861517e-05,
      "loss": 0.321,
      "step": 149
    },
    {
      "epoch": 0.009504498796096819,
      "grad_norm": 0.5568394064903259,
      "learning_rate": 3.227184283742591e-05,
      "loss": 0.3498,
      "step": 150
    },
    {
      "epoch": 0.009504498796096819,
      "eval_loss": 0.3687565326690674,
      "eval_runtime": 243.2458,
      "eval_samples_per_second": 27.322,
      "eval_steps_per_second": 13.661,
      "step": 150
    },
    {
      "epoch": 0.009567862121404131,
      "grad_norm": 0.6333056092262268,
      "learning_rate": 3.106465908814342e-05,
      "loss": 0.5005,
      "step": 151
    },
    {
      "epoch": 0.009631225446711444,
      "grad_norm": 0.5589426159858704,
      "learning_rate": 2.9876321572751144e-05,
      "loss": 0.3283,
      "step": 152
    },
    {
      "epoch": 0.009694588772018755,
      "grad_norm": 0.6089442372322083,
      "learning_rate": 2.87071551708603e-05,
      "loss": 0.407,
      "step": 153
    },
    {
      "epoch": 0.009757952097326068,
      "grad_norm": 0.6092202663421631,
      "learning_rate": 2.7557479520891104e-05,
      "loss": 0.3826,
      "step": 154
    },
    {
      "epoch": 0.00982131542263338,
      "grad_norm": 0.7596146464347839,
      "learning_rate": 2.6427608932686843e-05,
      "loss": 0.3406,
      "step": 155
    },
    {
      "epoch": 0.009884678747940691,
      "grad_norm": 0.5944835543632507,
      "learning_rate": 2.5317852301584643e-05,
      "loss": 0.3796,
      "step": 156
    },
    {
      "epoch": 0.009948042073248004,
      "grad_norm": 0.7699212431907654,
      "learning_rate": 2.422851302396655e-05,
      "loss": 0.3407,
      "step": 157
    },
    {
      "epoch": 0.010011405398555316,
      "grad_norm": 0.640291690826416,
      "learning_rate": 2.315988891431412e-05,
      "loss": 0.2453,
      "step": 158
    },
    {
      "epoch": 0.010074768723862629,
      "grad_norm": 0.6129955649375916,
      "learning_rate": 2.2112272123788768e-05,
      "loss": 0.5138,
      "step": 159
    },
    {
      "epoch": 0.01013813204916994,
      "grad_norm": 0.633854329586029,
      "learning_rate": 2.1085949060360654e-05,
      "loss": 0.2867,
      "step": 160
    },
    {
      "epoch": 0.010201495374477253,
      "grad_norm": 0.5275560021400452,
      "learning_rate": 2.008120031050753e-05,
      "loss": 0.497,
      "step": 161
    },
    {
      "epoch": 0.010264858699784565,
      "grad_norm": 0.5423012971878052,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 0.3148,
      "step": 162
    },
    {
      "epoch": 0.010328222025091876,
      "grad_norm": 0.6173859238624573,
      "learning_rate": 1.8137518531330767e-05,
      "loss": 0.4203,
      "step": 163
    },
    {
      "epoch": 0.010391585350399189,
      "grad_norm": 0.5317500829696655,
      "learning_rate": 1.7199116885197995e-05,
      "loss": 0.3832,
      "step": 164
    },
    {
      "epoch": 0.010454948675706502,
      "grad_norm": 0.7663280963897705,
      "learning_rate": 1.6283352173747145e-05,
      "loss": 0.4067,
      "step": 165
    },
    {
      "epoch": 0.010518312001013812,
      "grad_norm": 0.721696138381958,
      "learning_rate": 1.5390474757906446e-05,
      "loss": 0.5179,
      "step": 166
    },
    {
      "epoch": 0.010581675326321125,
      "grad_norm": 0.5552138686180115,
      "learning_rate": 1.4520728741446089e-05,
      "loss": 0.3094,
      "step": 167
    },
    {
      "epoch": 0.010645038651628438,
      "grad_norm": 0.9579009413719177,
      "learning_rate": 1.3674351904242611e-05,
      "loss": 0.5804,
      "step": 168
    },
    {
      "epoch": 0.01070840197693575,
      "grad_norm": 0.6748710870742798,
      "learning_rate": 1.2851575637272262e-05,
      "loss": 0.3898,
      "step": 169
    },
    {
      "epoch": 0.010771765302243061,
      "grad_norm": 0.4435841143131256,
      "learning_rate": 1.2052624879351104e-05,
      "loss": 0.2155,
      "step": 170
    },
    {
      "epoch": 0.010835128627550374,
      "grad_norm": 0.5674950480461121,
      "learning_rate": 1.1277718055638819e-05,
      "loss": 0.4185,
      "step": 171
    },
    {
      "epoch": 0.010898491952857687,
      "grad_norm": 0.6614958047866821,
      "learning_rate": 1.0527067017923654e-05,
      "loss": 0.3382,
      "step": 172
    },
    {
      "epoch": 0.010961855278164997,
      "grad_norm": 0.6172627806663513,
      "learning_rate": 9.80087698670411e-06,
      "loss": 0.2709,
      "step": 173
    },
    {
      "epoch": 0.01102521860347231,
      "grad_norm": 0.6699212789535522,
      "learning_rate": 9.09934649508375e-06,
      "loss": 0.3927,
      "step": 174
    },
    {
      "epoch": 0.011088581928779623,
      "grad_norm": 0.6174507141113281,
      "learning_rate": 8.422667334494249e-06,
      "loss": 0.3937,
      "step": 175
    },
    {
      "epoch": 0.011151945254086934,
      "grad_norm": 1.5301414728164673,
      "learning_rate": 7.771024502261526e-06,
      "loss": 0.2859,
      "step": 176
    },
    {
      "epoch": 0.011215308579394246,
      "grad_norm": 0.6501045823097229,
      "learning_rate": 7.144596151029303e-06,
      "loss": 0.4025,
      "step": 177
    },
    {
      "epoch": 0.011278671904701559,
      "grad_norm": 0.8940390348434448,
      "learning_rate": 6.543553540053926e-06,
      "loss": 0.3392,
      "step": 178
    },
    {
      "epoch": 0.011342035230008872,
      "grad_norm": 0.5241157412528992,
      "learning_rate": 5.968060988383883e-06,
      "loss": 0.3524,
      "step": 179
    },
    {
      "epoch": 0.011405398555316183,
      "grad_norm": 0.44170692563056946,
      "learning_rate": 5.418275829936537e-06,
      "loss": 0.3085,
      "step": 180
    },
    {
      "epoch": 0.011468761880623495,
      "grad_norm": 0.5839574933052063,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 0.4545,
      "step": 181
    },
    {
      "epoch": 0.011532125205930808,
      "grad_norm": 0.768035352230072,
      "learning_rate": 4.3964218465642355e-06,
      "loss": 0.3086,
      "step": 182
    },
    {
      "epoch": 0.011595488531238119,
      "grad_norm": 0.39008185267448425,
      "learning_rate": 3.924632386315186e-06,
      "loss": 0.2691,
      "step": 183
    },
    {
      "epoch": 0.011658851856545431,
      "grad_norm": 0.5092243552207947,
      "learning_rate": 3.4791089722651436e-06,
      "loss": 0.3891,
      "step": 184
    },
    {
      "epoch": 0.011722215181852744,
      "grad_norm": 0.6592461466789246,
      "learning_rate": 3.059973406066963e-06,
      "loss": 0.3093,
      "step": 185
    },
    {
      "epoch": 0.011785578507160055,
      "grad_norm": 0.7416530251502991,
      "learning_rate": 2.667340275199426e-06,
      "loss": 0.3265,
      "step": 186
    },
    {
      "epoch": 0.011848941832467368,
      "grad_norm": 0.9776325821876526,
      "learning_rate": 2.3013169216400733e-06,
      "loss": 0.3259,
      "step": 187
    },
    {
      "epoch": 0.01191230515777468,
      "grad_norm": 0.668077290058136,
      "learning_rate": 1.9620034125190644e-06,
      "loss": 0.3664,
      "step": 188
    },
    {
      "epoch": 0.011975668483081993,
      "grad_norm": 0.677909791469574,
      "learning_rate": 1.6494925127617634e-06,
      "loss": 0.3525,
      "step": 189
    },
    {
      "epoch": 0.012039031808389304,
      "grad_norm": 0.7730016708374023,
      "learning_rate": 1.3638696597277679e-06,
      "loss": 0.385,
      "step": 190
    },
    {
      "epoch": 0.012102395133696616,
      "grad_norm": 0.3925216495990753,
      "learning_rate": 1.1052129398531507e-06,
      "loss": 0.2316,
      "step": 191
    },
    {
      "epoch": 0.012165758459003929,
      "grad_norm": 0.5705835223197937,
      "learning_rate": 8.735930673024806e-07,
      "loss": 0.3251,
      "step": 192
    },
    {
      "epoch": 0.01222912178431124,
      "grad_norm": 0.6721336841583252,
      "learning_rate": 6.690733646361857e-07,
      "loss": 0.3518,
      "step": 193
    },
    {
      "epoch": 0.012292485109618553,
      "grad_norm": 0.4080224931240082,
      "learning_rate": 4.917097454988584e-07,
      "loss": 0.373,
      "step": 194
    },
    {
      "epoch": 0.012355848434925865,
      "grad_norm": 0.45140305161476135,
      "learning_rate": 3.415506993330153e-07,
      "loss": 0.3109,
      "step": 195
    },
    {
      "epoch": 0.012419211760233176,
      "grad_norm": 0.5861942768096924,
      "learning_rate": 2.1863727812254653e-07,
      "loss": 0.4429,
      "step": 196
    },
    {
      "epoch": 0.012482575085540489,
      "grad_norm": 0.6811448335647583,
      "learning_rate": 1.230030851695263e-07,
      "loss": 0.3463,
      "step": 197
    },
    {
      "epoch": 0.012545938410847801,
      "grad_norm": 0.6060794591903687,
      "learning_rate": 5.467426590739511e-08,
      "loss": 0.4508,
      "step": 198
    },
    {
      "epoch": 0.012609301736155114,
      "grad_norm": 0.542587399482727,
      "learning_rate": 1.3669500753099585e-08,
      "loss": 0.3189,
      "step": 199
    },
    {
      "epoch": 0.012672665061462425,
      "grad_norm": 0.4969688951969147,
      "learning_rate": 0.0,
      "loss": 0.2483,
      "step": 200
    },
    {
      "epoch": 0.012672665061462425,
      "eval_loss": 0.36642253398895264,
      "eval_runtime": 243.7079,
      "eval_samples_per_second": 27.27,
      "eval_steps_per_second": 13.635,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.737284494603059e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}