{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7562164797659678,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005850804485616773,
      "grad_norm": 4.189145565032959,
      "learning_rate": 1.9607843137254904e-07,
      "loss": 0.6022,
      "step": 1
    },
    {
      "epoch": 0.011701608971233545,
      "grad_norm": 4.088385105133057,
      "learning_rate": 3.921568627450981e-07,
      "loss": 0.6105,
      "step": 2
    },
    {
      "epoch": 0.017552413456850317,
      "grad_norm": 4.105137348175049,
      "learning_rate": 5.882352941176471e-07,
      "loss": 0.6234,
      "step": 3
    },
    {
      "epoch": 0.02340321794246709,
      "grad_norm": 4.010756015777588,
      "learning_rate": 7.843137254901962e-07,
      "loss": 0.5629,
      "step": 4
    },
    {
      "epoch": 0.02925402242808386,
      "grad_norm": 4.201730728149414,
      "learning_rate": 9.80392156862745e-07,
      "loss": 0.6236,
      "step": 5
    },
    {
      "epoch": 0.035104826913700635,
      "grad_norm": 4.13097620010376,
      "learning_rate": 1.1764705882352942e-06,
      "loss": 0.6058,
      "step": 6
    },
    {
      "epoch": 0.040955631399317405,
      "grad_norm": 3.753781318664551,
      "learning_rate": 1.3725490196078434e-06,
      "loss": 0.5798,
      "step": 7
    },
    {
      "epoch": 0.04680643588493418,
      "grad_norm": 3.1203114986419678,
      "learning_rate": 1.5686274509803923e-06,
      "loss": 0.5575,
      "step": 8
    },
    {
      "epoch": 0.05265724037055095,
      "grad_norm": 3.1326870918273926,
      "learning_rate": 1.7647058823529414e-06,
      "loss": 0.5794,
      "step": 9
    },
    {
      "epoch": 0.05850804485616772,
      "grad_norm": 3.01350736618042,
      "learning_rate": 1.96078431372549e-06,
      "loss": 0.5721,
      "step": 10
    },
    {
      "epoch": 0.0643588493417845,
      "grad_norm": 2.0586817264556885,
      "learning_rate": 2.1568627450980393e-06,
      "loss": 0.5389,
      "step": 11
    },
    {
      "epoch": 0.07020965382740127,
      "grad_norm": 2.056138753890991,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 0.5578,
      "step": 12
    },
    {
      "epoch": 0.07606045831301804,
      "grad_norm": 1.8458319902420044,
      "learning_rate": 2.549019607843137e-06,
      "loss": 0.5432,
      "step": 13
    },
    {
      "epoch": 0.08191126279863481,
      "grad_norm": 1.3385547399520874,
      "learning_rate": 2.7450980392156867e-06,
      "loss": 0.5375,
      "step": 14
    },
    {
      "epoch": 0.08776206728425158,
      "grad_norm": 2.10184383392334,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 0.4834,
      "step": 15
    },
    {
      "epoch": 0.09361287176986836,
      "grad_norm": 2.354717254638672,
      "learning_rate": 3.1372549019607846e-06,
      "loss": 0.5087,
      "step": 16
    },
    {
      "epoch": 0.09946367625548513,
      "grad_norm": 2.4186935424804688,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.5408,
      "step": 17
    },
    {
      "epoch": 0.1053144807411019,
      "grad_norm": 2.02093243598938,
      "learning_rate": 3.529411764705883e-06,
      "loss": 0.4967,
      "step": 18
    },
    {
      "epoch": 0.11116528522671867,
      "grad_norm": 1.9769740104675293,
      "learning_rate": 3.7254901960784316e-06,
      "loss": 0.5429,
      "step": 19
    },
    {
      "epoch": 0.11701608971233544,
      "grad_norm": 1.4087600708007812,
      "learning_rate": 3.92156862745098e-06,
      "loss": 0.4855,
      "step": 20
    },
    {
      "epoch": 0.12286689419795221,
      "grad_norm": 1.4071195125579834,
      "learning_rate": 4.11764705882353e-06,
      "loss": 0.4956,
      "step": 21
    },
    {
      "epoch": 0.128717698683569,
      "grad_norm": 1.4400174617767334,
      "learning_rate": 4.313725490196079e-06,
      "loss": 0.4966,
      "step": 22
    },
    {
      "epoch": 0.13456850316918575,
      "grad_norm": 1.2176562547683716,
      "learning_rate": 4.509803921568628e-06,
      "loss": 0.4892,
      "step": 23
    },
    {
      "epoch": 0.14041930765480254,
      "grad_norm": 1.0557763576507568,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.4664,
      "step": 24
    },
    {
      "epoch": 0.1462701121404193,
      "grad_norm": 1.0654219388961792,
      "learning_rate": 4.901960784313726e-06,
      "loss": 0.4427,
      "step": 25
    },
    {
      "epoch": 0.15212091662603608,
      "grad_norm": 0.8639155626296997,
      "learning_rate": 5.098039215686274e-06,
      "loss": 0.4676,
      "step": 26
    },
    {
      "epoch": 0.15797172111165286,
      "grad_norm": 0.8091264963150024,
      "learning_rate": 5.294117647058824e-06,
      "loss": 0.4339,
      "step": 27
    },
    {
      "epoch": 0.16382252559726962,
      "grad_norm": 0.7697594165802002,
      "learning_rate": 5.4901960784313735e-06,
      "loss": 0.4164,
      "step": 28
    },
    {
      "epoch": 0.1696733300828864,
      "grad_norm": 0.8522382378578186,
      "learning_rate": 5.686274509803922e-06,
      "loss": 0.4512,
      "step": 29
    },
    {
      "epoch": 0.17552413456850316,
      "grad_norm": 0.7640376687049866,
      "learning_rate": 5.882352941176471e-06,
      "loss": 0.432,
      "step": 30
    },
    {
      "epoch": 0.18137493905411994,
      "grad_norm": 0.6247867941856384,
      "learning_rate": 6.07843137254902e-06,
      "loss": 0.408,
      "step": 31
    },
    {
      "epoch": 0.18722574353973673,
      "grad_norm": 0.6288900971412659,
      "learning_rate": 6.274509803921569e-06,
      "loss": 0.4611,
      "step": 32
    },
    {
      "epoch": 0.19307654802535348,
      "grad_norm": 0.6182562708854675,
      "learning_rate": 6.470588235294119e-06,
      "loss": 0.4257,
      "step": 33
    },
    {
      "epoch": 0.19892735251097027,
      "grad_norm": 0.6193389892578125,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.4063,
      "step": 34
    },
    {
      "epoch": 0.20477815699658702,
      "grad_norm": 0.6892727017402649,
      "learning_rate": 6.862745098039216e-06,
      "loss": 0.3967,
      "step": 35
    },
    {
      "epoch": 0.2106289614822038,
      "grad_norm": 0.6725057363510132,
      "learning_rate": 7.058823529411766e-06,
      "loss": 0.4428,
      "step": 36
    },
    {
      "epoch": 0.21647976596782056,
      "grad_norm": 0.5203535556793213,
      "learning_rate": 7.2549019607843145e-06,
      "loss": 0.4151,
      "step": 37
    },
    {
      "epoch": 0.22233057045343735,
      "grad_norm": 0.45232418179512024,
      "learning_rate": 7.450980392156863e-06,
      "loss": 0.3666,
      "step": 38
    },
    {
      "epoch": 0.22818137493905413,
      "grad_norm": 0.5872768759727478,
      "learning_rate": 7.647058823529411e-06,
      "loss": 0.4144,
      "step": 39
    },
    {
      "epoch": 0.2340321794246709,
      "grad_norm": 0.526172399520874,
      "learning_rate": 7.84313725490196e-06,
      "loss": 0.4346,
      "step": 40
    },
    {
      "epoch": 0.23988298391028767,
      "grad_norm": 0.5474228858947754,
      "learning_rate": 8.03921568627451e-06,
      "loss": 0.3965,
      "step": 41
    },
    {
      "epoch": 0.24573378839590443,
      "grad_norm": 0.46727877855300903,
      "learning_rate": 8.23529411764706e-06,
      "loss": 0.4417,
      "step": 42
    },
    {
      "epoch": 0.2515845928815212,
      "grad_norm": 0.40532198548316956,
      "learning_rate": 8.43137254901961e-06,
      "loss": 0.3851,
      "step": 43
    },
    {
      "epoch": 0.257435397367138,
      "grad_norm": 0.4897397458553314,
      "learning_rate": 8.627450980392157e-06,
      "loss": 0.4013,
      "step": 44
    },
    {
      "epoch": 0.26328620185275475,
      "grad_norm": 0.4565890431404114,
      "learning_rate": 8.823529411764707e-06,
      "loss": 0.3745,
      "step": 45
    },
    {
      "epoch": 0.2691370063383715,
      "grad_norm": 0.38417261838912964,
      "learning_rate": 9.019607843137256e-06,
      "loss": 0.3783,
      "step": 46
    },
    {
      "epoch": 0.2749878108239883,
      "grad_norm": 0.40912356972694397,
      "learning_rate": 9.215686274509804e-06,
      "loss": 0.3879,
      "step": 47
    },
    {
      "epoch": 0.2808386153096051,
      "grad_norm": 0.42792415618896484,
      "learning_rate": 9.411764705882354e-06,
      "loss": 0.3837,
      "step": 48
    },
    {
      "epoch": 0.28668941979522183,
      "grad_norm": 0.4394405484199524,
      "learning_rate": 9.607843137254903e-06,
      "loss": 0.4004,
      "step": 49
    },
    {
      "epoch": 0.2925402242808386,
      "grad_norm": 0.4622238576412201,
      "learning_rate": 9.803921568627451e-06,
      "loss": 0.409,
      "step": 50
    },
    {
      "epoch": 0.2983910287664554,
      "grad_norm": 0.3894466757774353,
      "learning_rate": 1e-05,
      "loss": 0.3766,
      "step": 51
    },
    {
      "epoch": 0.30424183325207216,
      "grad_norm": 0.39314836263656616,
      "learning_rate": 9.999882884955554e-06,
      "loss": 0.3418,
      "step": 52
    },
    {
      "epoch": 0.3100926377376889,
      "grad_norm": 0.44764766097068787,
      "learning_rate": 9.999531545308584e-06,
      "loss": 0.3909,
      "step": 53
    },
    {
      "epoch": 0.3159434422233057,
      "grad_norm": 0.403144896030426,
      "learning_rate": 9.998945997517957e-06,
      "loss": 0.3716,
      "step": 54
    },
    {
      "epoch": 0.3217942467089225,
      "grad_norm": 0.4303280711174011,
      "learning_rate": 9.998126269014255e-06,
      "loss": 0.4026,
      "step": 55
    },
    {
      "epoch": 0.32764505119453924,
      "grad_norm": 0.4083136022090912,
      "learning_rate": 9.997072398198492e-06,
      "loss": 0.3842,
      "step": 56
    },
    {
      "epoch": 0.333495855680156,
      "grad_norm": 0.3750261664390564,
      "learning_rate": 9.99578443444032e-06,
      "loss": 0.3605,
      "step": 57
    },
    {
      "epoch": 0.3393466601657728,
      "grad_norm": 0.43343302607536316,
      "learning_rate": 9.994262438075713e-06,
      "loss": 0.4119,
      "step": 58
    },
    {
      "epoch": 0.34519746465138956,
      "grad_norm": 0.3778004050254822,
      "learning_rate": 9.992506480404137e-06,
      "loss": 0.3616,
      "step": 59
    },
    {
      "epoch": 0.3510482691370063,
      "grad_norm": 0.36973798274993896,
      "learning_rate": 9.990516643685222e-06,
      "loss": 0.3793,
      "step": 60
    },
    {
      "epoch": 0.35689907362262313,
      "grad_norm": 0.3836229145526886,
      "learning_rate": 9.988293021134888e-06,
      "loss": 0.3492,
      "step": 61
    },
    {
      "epoch": 0.3627498781082399,
      "grad_norm": 0.3700697720050812,
      "learning_rate": 9.985835716921e-06,
      "loss": 0.3583,
      "step": 62
    },
    {
      "epoch": 0.36860068259385664,
      "grad_norm": 0.4023352861404419,
      "learning_rate": 9.983144846158472e-06,
      "loss": 0.3697,
      "step": 63
    },
    {
      "epoch": 0.37445148707947346,
      "grad_norm": 0.38035494089126587,
      "learning_rate": 9.980220534903889e-06,
      "loss": 0.3772,
      "step": 64
    },
    {
      "epoch": 0.3803022915650902,
      "grad_norm": 0.3641819953918457,
      "learning_rate": 9.977062920149583e-06,
      "loss": 0.3562,
      "step": 65
    },
    {
      "epoch": 0.38615309605070697,
      "grad_norm": 0.39018484950065613,
      "learning_rate": 9.973672149817232e-06,
      "loss": 0.3377,
      "step": 66
    },
    {
      "epoch": 0.3920039005363237,
      "grad_norm": 0.351622998714447,
      "learning_rate": 9.970048382750925e-06,
      "loss": 0.351,
      "step": 67
    },
    {
      "epoch": 0.39785470502194054,
      "grad_norm": 0.40039461851119995,
      "learning_rate": 9.966191788709716e-06,
      "loss": 0.3775,
      "step": 68
    },
    {
      "epoch": 0.4037055095075573,
      "grad_norm": 0.3892274796962738,
      "learning_rate": 9.96210254835968e-06,
      "loss": 0.4034,
      "step": 69
    },
    {
      "epoch": 0.40955631399317405,
      "grad_norm": 0.4052744507789612,
      "learning_rate": 9.957780853265441e-06,
      "loss": 0.4079,
      "step": 70
    },
    {
      "epoch": 0.41540711847879086,
      "grad_norm": 0.3877456486225128,
      "learning_rate": 9.953226905881208e-06,
      "loss": 0.3342,
      "step": 71
    },
    {
      "epoch": 0.4212579229644076,
      "grad_norm": 0.4107078015804291,
      "learning_rate": 9.948440919541277e-06,
      "loss": 0.358,
      "step": 72
    },
    {
      "epoch": 0.4271087274500244,
      "grad_norm": 0.37597158551216125,
      "learning_rate": 9.943423118450051e-06,
      "loss": 0.3948,
      "step": 73
    },
    {
      "epoch": 0.43295953193564113,
      "grad_norm": 0.4590906798839569,
      "learning_rate": 9.938173737671531e-06,
      "loss": 0.3847,
      "step": 74
    },
    {
      "epoch": 0.43881033642125794,
      "grad_norm": 0.48799118399620056,
      "learning_rate": 9.932693023118299e-06,
      "loss": 0.3845,
      "step": 75
    },
    {
      "epoch": 0.4446611409068747,
      "grad_norm": 0.39222586154937744,
      "learning_rate": 9.926981231540007e-06,
      "loss": 0.3872,
      "step": 76
    },
    {
      "epoch": 0.45051194539249145,
      "grad_norm": 0.4158020615577698,
      "learning_rate": 9.921038630511345e-06,
      "loss": 0.388,
      "step": 77
    },
    {
      "epoch": 0.45636274987810826,
      "grad_norm": 0.40331101417541504,
      "learning_rate": 9.91486549841951e-06,
      "loss": 0.3705,
      "step": 78
    },
    {
      "epoch": 0.462213554363725,
      "grad_norm": 0.4275971055030823,
      "learning_rate": 9.908462124451152e-06,
      "loss": 0.3849,
      "step": 79
    },
    {
      "epoch": 0.4680643588493418,
      "grad_norm": 0.3466413915157318,
      "learning_rate": 9.901828808578846e-06,
      "loss": 0.347,
      "step": 80
    },
    {
      "epoch": 0.47391516333495853,
      "grad_norm": 0.44375771284103394,
      "learning_rate": 9.894965861547023e-06,
      "loss": 0.373,
      "step": 81
    },
    {
      "epoch": 0.47976596782057535,
      "grad_norm": 0.38661712408065796,
      "learning_rate": 9.887873604857424e-06,
      "loss": 0.3702,
      "step": 82
    },
    {
      "epoch": 0.4856167723061921,
      "grad_norm": 0.41488274931907654,
      "learning_rate": 9.88055237075403e-06,
      "loss": 0.3574,
      "step": 83
    },
    {
      "epoch": 0.49146757679180886,
      "grad_norm": 0.41137149930000305,
      "learning_rate": 9.873002502207502e-06,
      "loss": 0.3901,
      "step": 84
    },
    {
      "epoch": 0.49731838127742567,
      "grad_norm": 0.39136987924575806,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.38,
      "step": 85
    },
    {
      "epoch": 0.5031691857630424,
      "grad_norm": 0.37086671590805054,
      "learning_rate": 9.857218287204204e-06,
      "loss": 0.3541,
      "step": 86
    },
    {
      "epoch": 0.5090199902486592,
      "grad_norm": 0.43105342984199524,
      "learning_rate": 9.848984680175049e-06,
      "loss": 0.4087,
      "step": 87
    },
    {
      "epoch": 0.514870794734276,
      "grad_norm": 0.36811238527297974,
      "learning_rate": 9.840523917523354e-06,
      "loss": 0.3639,
      "step": 88
    },
    {
      "epoch": 0.5207215992198927,
      "grad_norm": 0.378967821598053,
      "learning_rate": 9.831836395602164e-06,
      "loss": 0.3251,
      "step": 89
    },
    {
      "epoch": 0.5265724037055095,
      "grad_norm": 0.36341214179992676,
      "learning_rate": 9.822922521387277e-06,
      "loss": 0.3705,
      "step": 90
    },
    {
      "epoch": 0.5324232081911263,
      "grad_norm": 0.37682002782821655,
      "learning_rate": 9.813782712458206e-06,
      "loss": 0.3513,
      "step": 91
    },
    {
      "epoch": 0.538274012676743,
      "grad_norm": 0.4142582416534424,
      "learning_rate": 9.804417396978605e-06,
      "loss": 0.3716,
      "step": 92
    },
    {
      "epoch": 0.5441248171623598,
      "grad_norm": 0.4432157278060913,
      "learning_rate": 9.794827013676206e-06,
      "loss": 0.4126,
      "step": 93
    },
    {
      "epoch": 0.5499756216479766,
      "grad_norm": 0.47457224130630493,
      "learning_rate": 9.78501201182228e-06,
      "loss": 0.3941,
      "step": 94
    },
    {
      "epoch": 0.5558264261335933,
      "grad_norm": 0.35374128818511963,
      "learning_rate": 9.774972851210572e-06,
      "loss": 0.3893,
      "step": 95
    },
    {
      "epoch": 0.5616772306192102,
      "grad_norm": 0.37110310792922974,
      "learning_rate": 9.764710002135784e-06,
      "loss": 0.3453,
      "step": 96
    },
    {
      "epoch": 0.567528035104827,
      "grad_norm": 0.4286816716194153,
      "learning_rate": 9.754223945371524e-06,
      "loss": 0.3674,
      "step": 97
    },
    {
      "epoch": 0.5733788395904437,
      "grad_norm": 0.3735758662223816,
      "learning_rate": 9.743515172147793e-06,
      "loss": 0.3572,
      "step": 98
    },
    {
      "epoch": 0.5792296440760605,
      "grad_norm": 0.3784080445766449,
      "learning_rate": 9.732584184127973e-06,
      "loss": 0.3864,
      "step": 99
    },
    {
      "epoch": 0.5850804485616772,
      "grad_norm": 0.40882179141044617,
      "learning_rate": 9.721431493385322e-06,
      "loss": 0.3458,
      "step": 100
    },
    {
      "epoch": 0.590931253047294,
      "grad_norm": 0.3924429416656494,
      "learning_rate": 9.710057622378992e-06,
      "loss": 0.3497,
      "step": 101
    },
    {
      "epoch": 0.5967820575329108,
      "grad_norm": 0.41799789667129517,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.3915,
      "step": 102
    },
    {
      "epoch": 0.6026328620185275,
      "grad_norm": 0.4201458990573883,
      "learning_rate": 9.686648481193994e-06,
      "loss": 0.3797,
      "step": 103
    },
    {
      "epoch": 0.6084836665041443,
      "grad_norm": 0.3876160979270935,
      "learning_rate": 9.674614307640368e-06,
      "loss": 0.3667,
      "step": 104
    },
    {
      "epoch": 0.6143344709897611,
      "grad_norm": 0.39733994007110596,
      "learning_rate": 9.66236114702178e-06,
      "loss": 0.3746,
      "step": 105
    },
    {
      "epoch": 0.6201852754753778,
      "grad_norm": 0.4422380030155182,
      "learning_rate": 9.649889573350006e-06,
      "loss": 0.3657,
      "step": 106
    },
    {
      "epoch": 0.6260360799609946,
      "grad_norm": 0.34534451365470886,
      "learning_rate": 9.637200170868607e-06,
      "loss": 0.3173,
      "step": 107
    },
    {
      "epoch": 0.6318868844466115,
      "grad_norm": 0.49448907375335693,
      "learning_rate": 9.62429353402556e-06,
      "loss": 0.3528,
      "step": 108
    },
    {
      "epoch": 0.6377376889322282,
      "grad_norm": 0.4157074987888336,
      "learning_rate": 9.611170267445401e-06,
      "loss": 0.3647,
      "step": 109
    },
    {
      "epoch": 0.643588493417845,
      "grad_norm": 0.3649308383464813,
      "learning_rate": 9.597830985900913e-06,
      "loss": 0.3592,
      "step": 110
    },
    {
      "epoch": 0.6494392979034618,
      "grad_norm": 0.38802069425582886,
      "learning_rate": 9.584276314284316e-06,
      "loss": 0.3749,
      "step": 111
    },
    {
      "epoch": 0.6552901023890785,
      "grad_norm": 0.41905415058135986,
      "learning_rate": 9.570506887577994e-06,
      "loss": 0.3761,
      "step": 112
    },
    {
      "epoch": 0.6611409068746953,
      "grad_norm": 0.34973040223121643,
      "learning_rate": 9.556523350824759e-06,
      "loss": 0.3377,
      "step": 113
    },
    {
      "epoch": 0.666991711360312,
      "grad_norm": 0.42152735590934753,
      "learning_rate": 9.542326359097619e-06,
      "loss": 0.3758,
      "step": 114
    },
    {
      "epoch": 0.6728425158459288,
      "grad_norm": 0.34654316306114197,
      "learning_rate": 9.527916577469104e-06,
      "loss": 0.3612,
      "step": 115
    },
    {
      "epoch": 0.6786933203315456,
      "grad_norm": 0.3440297842025757,
      "learning_rate": 9.5132946809801e-06,
      "loss": 0.37,
      "step": 116
    },
    {
      "epoch": 0.6845441248171623,
      "grad_norm": 0.36565279960632324,
      "learning_rate": 9.498461354608228e-06,
      "loss": 0.352,
      "step": 117
    },
    {
      "epoch": 0.6903949293027791,
      "grad_norm": 0.3970431983470917,
      "learning_rate": 9.483417293235759e-06,
      "loss": 0.3694,
      "step": 118
    },
    {
      "epoch": 0.6962457337883959,
      "grad_norm": 0.3433384895324707,
      "learning_rate": 9.468163201617063e-06,
      "loss": 0.3657,
      "step": 119
    },
    {
      "epoch": 0.7020965382740126,
      "grad_norm": 0.39245930314064026,
      "learning_rate": 9.452699794345583e-06,
      "loss": 0.362,
      "step": 120
    },
    {
      "epoch": 0.7079473427596294,
      "grad_norm": 0.38453614711761475,
      "learning_rate": 9.437027795820373e-06,
      "loss": 0.3675,
      "step": 121
    },
    {
      "epoch": 0.7137981472452463,
      "grad_norm": 0.369517058134079,
      "learning_rate": 9.421147940212152e-06,
      "loss": 0.3634,
      "step": 122
    },
    {
      "epoch": 0.719648951730863,
      "grad_norm": 0.38849949836730957,
      "learning_rate": 9.405060971428924e-06,
      "loss": 0.3387,
      "step": 123
    },
    {
      "epoch": 0.7254997562164798,
      "grad_norm": 0.4063083231449127,
      "learning_rate": 9.388767643081109e-06,
      "loss": 0.3719,
      "step": 124
    },
    {
      "epoch": 0.7313505607020966,
      "grad_norm": 0.40234676003456116,
      "learning_rate": 9.372268718446259e-06,
      "loss": 0.3939,
      "step": 125
    },
    {
      "epoch": 0.7372013651877133,
      "grad_norm": 0.3845783770084381,
      "learning_rate": 9.355564970433288e-06,
      "loss": 0.3699,
      "step": 126
    },
    {
      "epoch": 0.7430521696733301,
      "grad_norm": 0.3887750506401062,
      "learning_rate": 9.338657181546277e-06,
      "loss": 0.3686,
      "step": 127
    },
    {
      "epoch": 0.7489029741589469,
      "grad_norm": 0.3700850307941437,
      "learning_rate": 9.321546143847802e-06,
      "loss": 0.3431,
      "step": 128
    },
    {
      "epoch": 0.7547537786445636,
      "grad_norm": 0.44235607981681824,
      "learning_rate": 9.30423265892184e-06,
      "loss": 0.3836,
      "step": 129
    },
    {
      "epoch": 0.7606045831301804,
      "grad_norm": 0.39945074915885925,
      "learning_rate": 9.286717537836211e-06,
      "loss": 0.3706,
      "step": 130
    },
    {
      "epoch": 0.7664553876157971,
      "grad_norm": 0.42615601420402527,
      "learning_rate": 9.269001601104593e-06,
      "loss": 0.369,
      "step": 131
    },
    {
      "epoch": 0.7723061921014139,
      "grad_norm": 0.4713898003101349,
      "learning_rate": 9.251085678648072e-06,
      "loss": 0.3818,
      "step": 132
    },
    {
      "epoch": 0.7781569965870307,
      "grad_norm": 0.3744489550590515,
      "learning_rate": 9.232970609756267e-06,
      "loss": 0.3542,
      "step": 133
    },
    {
      "epoch": 0.7840078010726474,
      "grad_norm": 0.3802720308303833,
      "learning_rate": 9.214657243048021e-06,
      "loss": 0.3346,
      "step": 134
    },
    {
      "epoch": 0.7898586055582643,
      "grad_norm": 0.45320552587509155,
      "learning_rate": 9.196146436431635e-06,
      "loss": 0.3766,
      "step": 135
    },
    {
      "epoch": 0.7957094100438811,
      "grad_norm": 0.3729214370250702,
      "learning_rate": 9.177439057064684e-06,
      "loss": 0.3694,
      "step": 136
    },
    {
      "epoch": 0.8015602145294978,
      "grad_norm": 0.3678078055381775,
      "learning_rate": 9.158535981313395e-06,
      "loss": 0.3515,
      "step": 137
    },
    {
      "epoch": 0.8074110190151146,
      "grad_norm": 0.4144746959209442,
      "learning_rate": 9.13943809471159e-06,
      "loss": 0.3756,
      "step": 138
    },
    {
      "epoch": 0.8132618235007314,
      "grad_norm": 0.3548150658607483,
      "learning_rate": 9.120146291919206e-06,
      "loss": 0.3494,
      "step": 139
    },
    {
      "epoch": 0.8191126279863481,
      "grad_norm": 0.3966399133205414,
      "learning_rate": 9.100661476680379e-06,
      "loss": 0.3427,
      "step": 140
    },
    {
      "epoch": 0.8249634324719649,
      "grad_norm": 0.4523519277572632,
      "learning_rate": 9.08098456178111e-06,
      "loss": 0.3641,
      "step": 141
    },
    {
      "epoch": 0.8308142369575817,
      "grad_norm": 0.45737963914871216,
      "learning_rate": 9.061116469006504e-06,
      "loss": 0.3643,
      "step": 142
    },
    {
      "epoch": 0.8366650414431984,
      "grad_norm": 0.34355804324150085,
      "learning_rate": 9.041058129097586e-06,
      "loss": 0.3227,
      "step": 143
    },
    {
      "epoch": 0.8425158459288152,
      "grad_norm": 0.4239197373390198,
      "learning_rate": 9.020810481707709e-06,
      "loss": 0.3604,
      "step": 144
    },
    {
      "epoch": 0.8483666504144319,
      "grad_norm": 0.4363431930541992,
      "learning_rate": 9.00037447535852e-06,
      "loss": 0.3785,
      "step": 145
    },
    {
      "epoch": 0.8542174549000487,
      "grad_norm": 0.383635550737381,
      "learning_rate": 8.979751067395534e-06,
      "loss": 0.355,
      "step": 146
    },
    {
      "epoch": 0.8600682593856656,
      "grad_norm": 0.3972126543521881,
      "learning_rate": 8.958941223943292e-06,
      "loss": 0.394,
      "step": 147
    },
    {
      "epoch": 0.8659190638712823,
      "grad_norm": 0.3762996196746826,
      "learning_rate": 8.937945919860086e-06,
      "loss": 0.3779,
      "step": 148
    },
    {
      "epoch": 0.8717698683568991,
      "grad_norm": 0.40220147371292114,
      "learning_rate": 8.916766138692303e-06,
      "loss": 0.3725,
      "step": 149
    },
    {
      "epoch": 0.8776206728425159,
      "grad_norm": 0.35849395394325256,
      "learning_rate": 8.895402872628352e-06,
      "loss": 0.3533,
      "step": 150
    },
    {
      "epoch": 0.8834714773281326,
      "grad_norm": 0.3301231861114502,
      "learning_rate": 8.873857122452174e-06,
      "loss": 0.3156,
      "step": 151
    },
    {
      "epoch": 0.8893222818137494,
      "grad_norm": 0.39462047815322876,
      "learning_rate": 8.852129897496367e-06,
      "loss": 0.3538,
      "step": 152
    },
    {
      "epoch": 0.8951730862993662,
      "grad_norm": 0.3844425082206726,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.3913,
      "step": 153
    },
    {
      "epoch": 0.9010238907849829,
      "grad_norm": 0.37792298197746277,
      "learning_rate": 8.808135103035407e-06,
      "loss": 0.3495,
      "step": 154
    },
    {
      "epoch": 0.9068746952705997,
      "grad_norm": 0.39290040731430054,
      "learning_rate": 8.785869594511182e-06,
      "loss": 0.3784,
      "step": 155
    },
    {
      "epoch": 0.9127254997562165,
      "grad_norm": 0.3619037866592407,
      "learning_rate": 8.763426733072624e-06,
      "loss": 0.3614,
      "step": 156
    },
    {
      "epoch": 0.9185763042418332,
      "grad_norm": 0.3633933663368225,
      "learning_rate": 8.740807570078419e-06,
      "loss": 0.3902,
      "step": 157
    },
    {
      "epoch": 0.92442710872745,
      "grad_norm": 0.3714929223060608,
      "learning_rate": 8.718013165146275e-06,
      "loss": 0.3274,
      "step": 158
    },
    {
      "epoch": 0.9302779132130667,
      "grad_norm": 0.38371893763542175,
      "learning_rate": 8.695044586103297e-06,
      "loss": 0.3507,
      "step": 159
    },
    {
      "epoch": 0.9361287176986836,
      "grad_norm": 0.34635236859321594,
      "learning_rate": 8.671902908935942e-06,
      "loss": 0.3275,
      "step": 160
    },
    {
      "epoch": 0.9419795221843004,
      "grad_norm": 0.34420835971832275,
      "learning_rate": 8.648589217739635e-06,
      "loss": 0.3461,
      "step": 161
    },
    {
      "epoch": 0.9478303266699171,
      "grad_norm": 0.3969476819038391,
      "learning_rate": 8.625104604667965e-06,
      "loss": 0.3579,
      "step": 162
    },
    {
      "epoch": 0.9536811311555339,
      "grad_norm": 0.3697619140148163,
      "learning_rate": 8.601450169881533e-06,
      "loss": 0.3476,
      "step": 163
    },
    {
      "epoch": 0.9595319356411507,
      "grad_norm": 0.3809903860092163,
      "learning_rate": 8.577627021496413e-06,
      "loss": 0.36,
      "step": 164
    },
    {
      "epoch": 0.9653827401267674,
      "grad_norm": 0.3934761881828308,
      "learning_rate": 8.553636275532236e-06,
      "loss": 0.3704,
      "step": 165
    },
    {
      "epoch": 0.9712335446123842,
      "grad_norm": 0.3420058786869049,
      "learning_rate": 8.529479055859918e-06,
      "loss": 0.3335,
      "step": 166
    },
    {
      "epoch": 0.977084349098001,
      "grad_norm": 0.3801231384277344,
      "learning_rate": 8.505156494148997e-06,
      "loss": 0.3723,
      "step": 167
    },
    {
      "epoch": 0.9829351535836177,
      "grad_norm": 0.38984423875808716,
      "learning_rate": 8.480669729814635e-06,
      "loss": 0.3563,
      "step": 168
    },
    {
      "epoch": 0.9887859580692345,
      "grad_norm": 0.369872123003006,
      "learning_rate": 8.456019909964224e-06,
      "loss": 0.3494,
      "step": 169
    },
    {
      "epoch": 0.9946367625548513,
      "grad_norm": 0.3835128843784332,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.3672,
      "step": 170
    },
    {
      "epoch": 1.0014627011214041,
      "grad_norm": 0.4482472538948059,
      "learning_rate": 8.40623573028327e-06,
      "loss": 0.4454,
      "step": 171
    },
    {
      "epoch": 1.007313505607021,
      "grad_norm": 0.45144927501678467,
      "learning_rate": 8.381103702643295e-06,
      "loss": 0.3454,
      "step": 172
    },
    {
      "epoch": 1.0131643100926377,
      "grad_norm": 0.3322243094444275,
      "learning_rate": 8.35581328375915e-06,
      "loss": 0.2828,
      "step": 173
    },
    {
      "epoch": 1.0190151145782544,
      "grad_norm": 0.397659033536911,
      "learning_rate": 8.330365658386252e-06,
      "loss": 0.3287,
      "step": 174
    },
    {
      "epoch": 1.0248659190638714,
      "grad_norm": 0.3485862910747528,
      "learning_rate": 8.30476201864451e-06,
      "loss": 0.2744,
      "step": 175
    },
    {
      "epoch": 1.030716723549488,
      "grad_norm": 0.3832169473171234,
      "learning_rate": 8.27900356396249e-06,
      "loss": 0.2868,
      "step": 176
    },
    {
      "epoch": 1.0365675280351048,
      "grad_norm": 0.4184396266937256,
      "learning_rate": 8.25309150102121e-06,
      "loss": 0.3291,
      "step": 177
    },
    {
      "epoch": 1.0424183325207217,
      "grad_norm": 0.45518970489501953,
      "learning_rate": 8.227027043697642e-06,
      "loss": 0.3489,
      "step": 178
    },
    {
      "epoch": 1.0482691370063384,
      "grad_norm": 0.3730817437171936,
      "learning_rate": 8.200811413007808e-06,
      "loss": 0.3055,
      "step": 179
    },
    {
      "epoch": 1.054119941491955,
      "grad_norm": 0.398185133934021,
      "learning_rate": 8.174445837049614e-06,
      "loss": 0.326,
      "step": 180
    },
    {
      "epoch": 1.059970745977572,
      "grad_norm": 0.4147329032421112,
      "learning_rate": 8.147931550945301e-06,
      "loss": 0.2961,
      "step": 181
    },
    {
      "epoch": 1.0658215504631887,
      "grad_norm": 0.4088496267795563,
      "learning_rate": 8.121269796783585e-06,
      "loss": 0.3239,
      "step": 182
    },
    {
      "epoch": 1.0716723549488054,
      "grad_norm": 0.35450735688209534,
      "learning_rate": 8.094461823561473e-06,
      "loss": 0.2851,
      "step": 183
    },
    {
      "epoch": 1.0775231594344223,
      "grad_norm": 0.4081903100013733,
      "learning_rate": 8.06750888712576e-06,
      "loss": 0.3188,
      "step": 184
    },
    {
      "epoch": 1.083373963920039,
      "grad_norm": 0.3934895396232605,
      "learning_rate": 8.040412250114184e-06,
      "loss": 0.2891,
      "step": 185
    },
    {
      "epoch": 1.0892247684056557,
      "grad_norm": 0.35631951689720154,
      "learning_rate": 8.013173181896283e-06,
      "loss": 0.2667,
      "step": 186
    },
    {
      "epoch": 1.0950755728912727,
      "grad_norm": 0.42703738808631897,
      "learning_rate": 7.985792958513932e-06,
      "loss": 0.312,
      "step": 187
    },
    {
      "epoch": 1.1009263773768894,
      "grad_norm": 0.4023725986480713,
      "learning_rate": 7.958272862621562e-06,
      "loss": 0.3343,
      "step": 188
    },
    {
      "epoch": 1.106777181862506,
      "grad_norm": 0.3514081537723541,
      "learning_rate": 7.930614183426074e-06,
      "loss": 0.2959,
      "step": 189
    },
    {
      "epoch": 1.1126279863481228,
      "grad_norm": 0.40648946166038513,
      "learning_rate": 7.902818216626446e-06,
      "loss": 0.3529,
      "step": 190
    },
    {
      "epoch": 1.1184787908337397,
      "grad_norm": 0.38296204805374146,
      "learning_rate": 7.874886264353035e-06,
      "loss": 0.2988,
      "step": 191
    },
    {
      "epoch": 1.1243295953193564,
      "grad_norm": 0.4062958061695099,
      "learning_rate": 7.846819635106569e-06,
      "loss": 0.3344,
      "step": 192
    },
    {
      "epoch": 1.130180399804973,
      "grad_norm": 0.3408312499523163,
      "learning_rate": 7.818619643696863e-06,
      "loss": 0.2857,
      "step": 193
    },
    {
      "epoch": 1.13603120429059,
      "grad_norm": 0.3789331316947937,
      "learning_rate": 7.790287611181217e-06,
      "loss": 0.3077,
      "step": 194
    },
    {
      "epoch": 1.1418820087762067,
      "grad_norm": 0.38520050048828125,
      "learning_rate": 7.76182486480253e-06,
      "loss": 0.3025,
      "step": 195
    },
    {
      "epoch": 1.1477328132618234,
      "grad_norm": 0.3634053170681,
      "learning_rate": 7.733232737927123e-06,
      "loss": 0.3037,
      "step": 196
    },
    {
      "epoch": 1.1535836177474403,
      "grad_norm": 0.42052581906318665,
      "learning_rate": 7.70451256998228e-06,
      "loss": 0.304,
      "step": 197
    },
    {
      "epoch": 1.159434422233057,
      "grad_norm": 0.3758928179740906,
      "learning_rate": 7.675665706393502e-06,
      "loss": 0.2755,
      "step": 198
    },
    {
      "epoch": 1.1652852267186737,
      "grad_norm": 0.35784485936164856,
      "learning_rate": 7.646693498521472e-06,
      "loss": 0.2876,
      "step": 199
    },
    {
      "epoch": 1.1711360312042907,
      "grad_norm": 0.38650694489479065,
      "learning_rate": 7.617597303598754e-06,
      "loss": 0.288,
      "step": 200
    },
    {
      "epoch": 1.1769868356899074,
      "grad_norm": 0.3944965898990631,
      "learning_rate": 7.588378484666214e-06,
      "loss": 0.3211,
      "step": 201
    },
    {
      "epoch": 1.182837640175524,
      "grad_norm": 0.3851556181907654,
      "learning_rate": 7.559038410509161e-06,
      "loss": 0.3389,
      "step": 202
    },
    {
      "epoch": 1.188688444661141,
      "grad_norm": 0.3507968783378601,
      "learning_rate": 7.529578455593232e-06,
      "loss": 0.2943,
      "step": 203
    },
    {
      "epoch": 1.1945392491467577,
      "grad_norm": 0.3462185561656952,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.3112,
      "step": 204
    },
    {
      "epoch": 1.2003900536323744,
      "grad_norm": 0.3465600609779358,
      "learning_rate": 7.47030442936232e-06,
      "loss": 0.3165,
      "step": 205
    },
    {
      "epoch": 1.2062408581179913,
      "grad_norm": 0.3432478904724121,
      "learning_rate": 7.440493134799425e-06,
      "loss": 0.2977,
      "step": 206
    },
    {
      "epoch": 1.212091662603608,
      "grad_norm": 0.3325629234313965,
      "learning_rate": 7.4105675128517456e-06,
      "loss": 0.2809,
      "step": 207
    },
    {
      "epoch": 1.2179424670892247,
      "grad_norm": 0.37305665016174316,
      "learning_rate": 7.380528965415501e-06,
      "loss": 0.3494,
      "step": 208
    },
    {
      "epoch": 1.2237932715748416,
      "grad_norm": 0.3855370283126831,
      "learning_rate": 7.35037889967702e-06,
      "loss": 0.331,
      "step": 209
    },
    {
      "epoch": 1.2296440760604583,
      "grad_norm": 0.38624921441078186,
      "learning_rate": 7.320118728046818e-06,
      "loss": 0.3249,
      "step": 210
    },
    {
      "epoch": 1.235494880546075,
      "grad_norm": 0.339275985956192,
      "learning_rate": 7.289749868093432e-06,
      "loss": 0.2979,
      "step": 211
    },
    {
      "epoch": 1.2413456850316917,
      "grad_norm": 0.362403929233551,
      "learning_rate": 7.259273742477017e-06,
      "loss": 0.3071,
      "step": 212
    },
    {
      "epoch": 1.2471964895173087,
      "grad_norm": 0.331527978181839,
      "learning_rate": 7.2286917788826926e-06,
      "loss": 0.2959,
      "step": 213
    },
    {
      "epoch": 1.2530472940029254,
      "grad_norm": 0.34029752016067505,
      "learning_rate": 7.19800540995367e-06,
      "loss": 0.2873,
      "step": 214
    },
    {
      "epoch": 1.258898098488542,
      "grad_norm": 0.38359367847442627,
      "learning_rate": 7.167216073224136e-06,
      "loss": 0.3215,
      "step": 215
    },
    {
      "epoch": 1.264748902974159,
      "grad_norm": 0.3701342046260834,
      "learning_rate": 7.136325211051905e-06,
      "loss": 0.2931,
      "step": 216
    },
    {
      "epoch": 1.2705997074597757,
      "grad_norm": 0.3997856080532074,
      "learning_rate": 7.1053342705508564e-06,
      "loss": 0.319,
      "step": 217
    },
    {
      "epoch": 1.2764505119453924,
      "grad_norm": 0.3141786456108093,
      "learning_rate": 7.074244703523137e-06,
      "loss": 0.2628,
      "step": 218
    },
    {
      "epoch": 1.2823013164310093,
      "grad_norm": 0.363447368144989,
      "learning_rate": 7.043057966391158e-06,
      "loss": 0.3079,
      "step": 219
    },
    {
      "epoch": 1.288152120916626,
      "grad_norm": 0.3675538897514343,
      "learning_rate": 7.011775520129363e-06,
      "loss": 0.2912,
      "step": 220
    },
    {
      "epoch": 1.2940029254022427,
      "grad_norm": 0.3745831251144409,
      "learning_rate": 6.980398830195785e-06,
      "loss": 0.287,
      "step": 221
    },
    {
      "epoch": 1.2998537298878596,
      "grad_norm": 0.34273862838745117,
      "learning_rate": 6.948929366463397e-06,
      "loss": 0.2739,
      "step": 222
    },
    {
      "epoch": 1.3057045343734763,
      "grad_norm": 0.38599085807800293,
      "learning_rate": 6.9173686031512595e-06,
      "loss": 0.3386,
      "step": 223
    },
    {
      "epoch": 1.311555338859093,
      "grad_norm": 0.35338225960731506,
      "learning_rate": 6.885718018755448e-06,
      "loss": 0.3034,
      "step": 224
    },
    {
      "epoch": 1.31740614334471,
      "grad_norm": 0.35684457421302795,
      "learning_rate": 6.8539790959798045e-06,
      "loss": 0.3159,
      "step": 225
    },
    {
      "epoch": 1.3232569478303267,
      "grad_norm": 0.342815101146698,
      "learning_rate": 6.822153321666469e-06,
      "loss": 0.3237,
      "step": 226
    },
    {
      "epoch": 1.3291077523159434,
      "grad_norm": 0.36875948309898376,
      "learning_rate": 6.790242186726231e-06,
      "loss": 0.3084,
      "step": 227
    },
    {
      "epoch": 1.3349585568015603,
      "grad_norm": 0.37179967761039734,
      "learning_rate": 6.758247186068684e-06,
      "loss": 0.3171,
      "step": 228
    },
    {
      "epoch": 1.340809361287177,
      "grad_norm": 0.35630038380622864,
      "learning_rate": 6.7261698185322e-06,
      "loss": 0.3041,
      "step": 229
    },
    {
      "epoch": 1.3466601657727937,
      "grad_norm": 0.39249274134635925,
      "learning_rate": 6.6940115868137065e-06,
      "loss": 0.2953,
      "step": 230
    },
    {
      "epoch": 1.3525109702584106,
      "grad_norm": 0.3363463878631592,
      "learning_rate": 6.6617739973982985e-06,
      "loss": 0.3005,
      "step": 231
    },
    {
      "epoch": 1.3583617747440273,
      "grad_norm": 0.36309415102005005,
      "learning_rate": 6.629458560488664e-06,
      "loss": 0.3415,
      "step": 232
    },
    {
      "epoch": 1.364212579229644,
      "grad_norm": 0.3635103106498718,
      "learning_rate": 6.597066789934336e-06,
      "loss": 0.3117,
      "step": 233
    },
    {
      "epoch": 1.370063383715261,
      "grad_norm": 0.3717254102230072,
      "learning_rate": 6.5646002031607726e-06,
      "loss": 0.3336,
      "step": 234
    },
    {
      "epoch": 1.3759141882008776,
      "grad_norm": 0.3539208173751831,
      "learning_rate": 6.5320603210982745e-06,
      "loss": 0.3335,
      "step": 235
    },
    {
      "epoch": 1.3817649926864943,
      "grad_norm": 0.3605196475982666,
      "learning_rate": 6.499448668110735e-06,
      "loss": 0.319,
      "step": 236
    },
    {
      "epoch": 1.3876157971721113,
      "grad_norm": 0.39067190885543823,
      "learning_rate": 6.466766771924231e-06,
      "loss": 0.3104,
      "step": 237
    },
    {
      "epoch": 1.393466601657728,
      "grad_norm": 0.3777407705783844,
      "learning_rate": 6.434016163555452e-06,
      "loss": 0.3069,
      "step": 238
    },
    {
      "epoch": 1.3993174061433447,
      "grad_norm": 0.34741804003715515,
      "learning_rate": 6.401198377239979e-06,
      "loss": 0.2852,
      "step": 239
    },
    {
      "epoch": 1.4051682106289616,
      "grad_norm": 0.3834282457828522,
      "learning_rate": 6.368314950360416e-06,
      "loss": 0.3474,
      "step": 240
    },
    {
      "epoch": 1.4110190151145783,
      "grad_norm": 0.3760935664176941,
      "learning_rate": 6.3353674233743585e-06,
      "loss": 0.3136,
      "step": 241
    },
    {
      "epoch": 1.416869819600195,
      "grad_norm": 0.3629906475543976,
      "learning_rate": 6.302357339742245e-06,
      "loss": 0.3403,
      "step": 242
    },
    {
      "epoch": 1.422720624085812,
      "grad_norm": 0.342675119638443,
      "learning_rate": 6.269286245855039e-06,
      "loss": 0.2915,
      "step": 243
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.3933790326118469,
      "learning_rate": 6.236155690961795e-06,
      "loss": 0.3048,
      "step": 244
    },
    {
      "epoch": 1.4344222330570453,
      "grad_norm": 0.35148119926452637,
      "learning_rate": 6.202967227097073e-06,
      "loss": 0.3072,
      "step": 245
    },
    {
      "epoch": 1.4402730375426622,
      "grad_norm": 0.3553239405155182,
      "learning_rate": 6.169722409008244e-06,
      "loss": 0.2988,
      "step": 246
    },
    {
      "epoch": 1.446123842028279,
      "grad_norm": 0.39217159152030945,
      "learning_rate": 6.136422794082645e-06,
      "loss": 0.2945,
      "step": 247
    },
    {
      "epoch": 1.4519746465138956,
      "grad_norm": 0.39117711782455444,
      "learning_rate": 6.10306994227463e-06,
      "loss": 0.3038,
      "step": 248
    },
    {
      "epoch": 1.4578254509995126,
      "grad_norm": 0.3591575026512146,
      "learning_rate": 6.0696654160324875e-06,
      "loss": 0.3136,
      "step": 249
    },
    {
      "epoch": 1.4636762554851293,
      "grad_norm": 0.4656267464160919,
      "learning_rate": 6.0362107802252486e-06,
      "loss": 0.3496,
      "step": 250
    },
    {
      "epoch": 1.469527059970746,
      "grad_norm": 0.3674546778202057,
      "learning_rate": 6.002707602069377e-06,
      "loss": 0.3121,
      "step": 251
    },
    {
      "epoch": 1.4753778644563629,
      "grad_norm": 0.4174729585647583,
      "learning_rate": 5.9691574510553505e-06,
      "loss": 0.3121,
      "step": 252
    },
    {
      "epoch": 1.4812286689419796,
      "grad_norm": 0.3748752176761627,
      "learning_rate": 5.935561898874142e-06,
      "loss": 0.3125,
      "step": 253
    },
    {
      "epoch": 1.4870794734275963,
      "grad_norm": 0.3187505006790161,
      "learning_rate": 5.901922519343586e-06,
      "loss": 0.3013,
      "step": 254
    },
    {
      "epoch": 1.4929302779132132,
      "grad_norm": 0.34686118364334106,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.3099,
      "step": 255
    },
    {
      "epoch": 1.49878108239883,
      "grad_norm": 0.38693419098854065,
      "learning_rate": 5.834518583697628e-06,
      "loss": 0.343,
      "step": 256
    },
    {
      "epoch": 1.5046318868844466,
      "grad_norm": 0.38468196988105774,
      "learning_rate": 5.800757185188195e-06,
      "loss": 0.3152,
      "step": 257
    },
    {
      "epoch": 1.5104826913700635,
      "grad_norm": 0.3720076084136963,
      "learning_rate": 5.766958274393428e-06,
      "loss": 0.3289,
      "step": 258
    },
    {
      "epoch": 1.51633349585568,
      "grad_norm": 0.3495715260505676,
      "learning_rate": 5.733123434657704e-06,
      "loss": 0.3268,
      "step": 259
    },
    {
      "epoch": 1.522184300341297,
      "grad_norm": 0.33257222175598145,
      "learning_rate": 5.699254251008524e-06,
      "loss": 0.306,
      "step": 260
    },
    {
      "epoch": 1.5280351048269138,
      "grad_norm": 0.35938987135887146,
      "learning_rate": 5.66535231008227e-06,
      "loss": 0.3221,
      "step": 261
    },
    {
      "epoch": 1.5338859093125303,
      "grad_norm": 0.3358217477798462,
      "learning_rate": 5.631419200049867e-06,
      "loss": 0.3109,
      "step": 262
    },
    {
      "epoch": 1.5397367137981472,
      "grad_norm": 0.3260052502155304,
      "learning_rate": 5.597456510542395e-06,
      "loss": 0.2735,
      "step": 263
    },
    {
      "epoch": 1.5455875182837642,
      "grad_norm": 0.3558763861656189,
      "learning_rate": 5.5634658325766066e-06,
      "loss": 0.3133,
      "step": 264
    },
    {
      "epoch": 1.5514383227693807,
      "grad_norm": 0.34226661920547485,
      "learning_rate": 5.529448758480408e-06,
      "loss": 0.301,
      "step": 265
    },
    {
      "epoch": 1.5572891272549976,
      "grad_norm": 0.40270325541496277,
      "learning_rate": 5.495406881818256e-06,
      "loss": 0.3427,
      "step": 266
    },
    {
      "epoch": 1.5631399317406145,
      "grad_norm": 0.3240657150745392,
      "learning_rate": 5.46134179731651e-06,
      "loss": 0.2948,
      "step": 267
    },
    {
      "epoch": 1.568990736226231,
      "grad_norm": 0.36010023951530457,
      "learning_rate": 5.427255100788726e-06,
      "loss": 0.2869,
      "step": 268
    },
    {
      "epoch": 1.574841540711848,
      "grad_norm": 0.3521655797958374,
      "learning_rate": 5.393148389060893e-06,
      "loss": 0.2908,
      "step": 269
    },
    {
      "epoch": 1.5806923451974646,
      "grad_norm": 0.3522508442401886,
      "learning_rate": 5.359023259896638e-06,
      "loss": 0.3222,
      "step": 270
    },
    {
      "epoch": 1.5865431496830813,
      "grad_norm": 0.358254075050354,
      "learning_rate": 5.3248813119223665e-06,
      "loss": 0.3191,
      "step": 271
    },
    {
      "epoch": 1.5923939541686982,
      "grad_norm": 0.36198315024375916,
      "learning_rate": 5.290724144552379e-06,
      "loss": 0.315,
      "step": 272
    },
    {
      "epoch": 1.598244758654315,
      "grad_norm": 0.353097528219223,
      "learning_rate": 5.2565533579139484e-06,
      "loss": 0.3015,
      "step": 273
    },
    {
      "epoch": 1.6040955631399316,
      "grad_norm": 0.35641244053840637,
      "learning_rate": 5.222370552772353e-06,
      "loss": 0.3108,
      "step": 274
    },
    {
      "epoch": 1.6099463676255485,
      "grad_norm": 0.35300660133361816,
      "learning_rate": 5.188177330455886e-06,
      "loss": 0.3443,
      "step": 275
    },
    {
      "epoch": 1.6157971721111652,
      "grad_norm": 0.33080846071243286,
      "learning_rate": 5.153975292780852e-06,
      "loss": 0.2871,
      "step": 276
    },
    {
      "epoch": 1.621647976596782,
      "grad_norm": 0.33396315574645996,
      "learning_rate": 5.119766041976516e-06,
      "loss": 0.3089,
      "step": 277
    },
    {
      "epoch": 1.6274987810823989,
      "grad_norm": 0.34597212076187134,
      "learning_rate": 5.085551180610046e-06,
      "loss": 0.2817,
      "step": 278
    },
    {
      "epoch": 1.6333495855680156,
      "grad_norm": 0.3279144763946533,
      "learning_rate": 5.05133231151145e-06,
      "loss": 0.2944,
      "step": 279
    },
    {
      "epoch": 1.6392003900536323,
      "grad_norm": 0.3529197871685028,
      "learning_rate": 5.017111037698477e-06,
      "loss": 0.3195,
      "step": 280
    },
    {
      "epoch": 1.6450511945392492,
      "grad_norm": 0.36540284752845764,
      "learning_rate": 4.9828889623015265e-06,
      "loss": 0.3282,
      "step": 281
    },
    {
      "epoch": 1.650901999024866,
      "grad_norm": 0.33339953422546387,
      "learning_rate": 4.948667688488552e-06,
      "loss": 0.2907,
      "step": 282
    },
    {
      "epoch": 1.6567528035104826,
      "grad_norm": 0.32981109619140625,
      "learning_rate": 4.9144488193899546e-06,
      "loss": 0.2982,
      "step": 283
    },
    {
      "epoch": 1.6626036079960995,
      "grad_norm": 0.33798947930336,
      "learning_rate": 4.880233958023486e-06,
      "loss": 0.2964,
      "step": 284
    },
    {
      "epoch": 1.6684544124817162,
      "grad_norm": 0.3474103808403015,
      "learning_rate": 4.846024707219149e-06,
      "loss": 0.3301,
      "step": 285
    },
    {
      "epoch": 1.674305216967333,
      "grad_norm": 0.3323943316936493,
      "learning_rate": 4.811822669544115e-06,
      "loss": 0.3014,
      "step": 286
    },
    {
      "epoch": 1.6801560214529498,
      "grad_norm": 0.38225099444389343,
      "learning_rate": 4.777629447227649e-06,
      "loss": 0.3389,
      "step": 287
    },
    {
      "epoch": 1.6860068259385665,
      "grad_norm": 0.3148108720779419,
      "learning_rate": 4.7434466420860515e-06,
      "loss": 0.298,
      "step": 288
    },
    {
      "epoch": 1.6918576304241832,
      "grad_norm": 0.3262878656387329,
      "learning_rate": 4.7092758554476215e-06,
      "loss": 0.29,
      "step": 289
    },
    {
      "epoch": 1.6977084349098002,
      "grad_norm": 0.3702300190925598,
      "learning_rate": 4.675118688077634e-06,
      "loss": 0.327,
      "step": 290
    },
    {
      "epoch": 1.7035592393954169,
      "grad_norm": 0.3070249855518341,
      "learning_rate": 4.640976740103363e-06,
      "loss": 0.2918,
      "step": 291
    },
    {
      "epoch": 1.7094100438810336,
      "grad_norm": 0.3508608937263489,
      "learning_rate": 4.606851610939108e-06,
      "loss": 0.3251,
      "step": 292
    },
    {
      "epoch": 1.7152608483666505,
      "grad_norm": 0.3425685465335846,
      "learning_rate": 4.572744899211275e-06,
      "loss": 0.3039,
      "step": 293
    },
    {
      "epoch": 1.7211116528522672,
      "grad_norm": 0.33032500743865967,
      "learning_rate": 4.53865820268349e-06,
      "loss": 0.2874,
      "step": 294
    },
    {
      "epoch": 1.726962457337884,
      "grad_norm": 0.34354081749916077,
      "learning_rate": 4.504593118181745e-06,
      "loss": 0.293,
      "step": 295
    },
    {
      "epoch": 1.7328132618235008,
      "grad_norm": 0.35744139552116394,
      "learning_rate": 4.470551241519594e-06,
      "loss": 0.3136,
      "step": 296
    },
    {
      "epoch": 1.7386640663091175,
      "grad_norm": 0.34493860602378845,
      "learning_rate": 4.436534167423395e-06,
      "loss": 0.2967,
      "step": 297
    },
    {
      "epoch": 1.7445148707947342,
      "grad_norm": 0.35344043374061584,
      "learning_rate": 4.402543489457607e-06,
      "loss": 0.3073,
      "step": 298
    },
    {
      "epoch": 1.7503656752803511,
      "grad_norm": 0.3236096203327179,
      "learning_rate": 4.368580799950133e-06,
      "loss": 0.3045,
      "step": 299
    },
    {
      "epoch": 1.7562164797659678,
      "grad_norm": 0.32016465067863464,
      "learning_rate": 4.334647689917734e-06,
      "loss": 0.2846,
      "step": 300
    }
  ],
  "logging_steps": 1,
  "max_steps": 510,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.564276434660229e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}