{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1711360312042907, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005850804485616773, "grad_norm": 4.189145565032959, "learning_rate": 1.9607843137254904e-07, "loss": 0.6022, "step": 1 }, { "epoch": 0.011701608971233545, "grad_norm": 4.088385105133057, "learning_rate": 3.921568627450981e-07, "loss": 0.6105, "step": 2 }, { "epoch": 0.017552413456850317, "grad_norm": 4.105137348175049, "learning_rate": 5.882352941176471e-07, "loss": 0.6234, "step": 3 }, { "epoch": 0.02340321794246709, "grad_norm": 4.010756015777588, "learning_rate": 7.843137254901962e-07, "loss": 0.5629, "step": 4 }, { "epoch": 0.02925402242808386, "grad_norm": 4.201730728149414, "learning_rate": 9.80392156862745e-07, "loss": 0.6236, "step": 5 }, { "epoch": 0.035104826913700635, "grad_norm": 4.13097620010376, "learning_rate": 1.1764705882352942e-06, "loss": 0.6058, "step": 6 }, { "epoch": 0.040955631399317405, "grad_norm": 3.753781318664551, "learning_rate": 1.3725490196078434e-06, "loss": 0.5798, "step": 7 }, { "epoch": 0.04680643588493418, "grad_norm": 3.1203114986419678, "learning_rate": 1.5686274509803923e-06, "loss": 0.5575, "step": 8 }, { "epoch": 0.05265724037055095, "grad_norm": 3.1326870918273926, "learning_rate": 1.7647058823529414e-06, "loss": 0.5794, "step": 9 }, { "epoch": 0.05850804485616772, "grad_norm": 3.01350736618042, "learning_rate": 1.96078431372549e-06, "loss": 0.5721, "step": 10 }, { "epoch": 0.0643588493417845, "grad_norm": 2.0586817264556885, "learning_rate": 2.1568627450980393e-06, "loss": 0.5389, "step": 11 }, { "epoch": 0.07020965382740127, "grad_norm": 2.056138753890991, "learning_rate": 2.3529411764705885e-06, "loss": 0.5578, "step": 12 }, { "epoch": 0.07606045831301804, "grad_norm": 1.8458319902420044, "learning_rate": 2.549019607843137e-06, "loss": 0.5432, "step": 13 }, { "epoch": 0.08191126279863481, "grad_norm": 1.3385547399520874, "learning_rate": 2.7450980392156867e-06, "loss": 0.5375, "step": 14 }, { "epoch": 0.08776206728425158, "grad_norm": 2.10184383392334, "learning_rate": 2.9411764705882355e-06, "loss": 0.4834, "step": 15 }, { "epoch": 0.09361287176986836, "grad_norm": 2.354717254638672, "learning_rate": 3.1372549019607846e-06, "loss": 0.5087, "step": 16 }, { "epoch": 0.09946367625548513, "grad_norm": 2.4186935424804688, "learning_rate": 3.3333333333333333e-06, "loss": 0.5408, "step": 17 }, { "epoch": 0.1053144807411019, "grad_norm": 2.02093243598938, "learning_rate": 3.529411764705883e-06, "loss": 0.4967, "step": 18 }, { "epoch": 0.11116528522671867, "grad_norm": 1.9769740104675293, "learning_rate": 3.7254901960784316e-06, "loss": 0.5429, "step": 19 }, { "epoch": 0.11701608971233544, "grad_norm": 1.4087600708007812, "learning_rate": 3.92156862745098e-06, "loss": 0.4855, "step": 20 }, { "epoch": 0.12286689419795221, "grad_norm": 1.4071195125579834, "learning_rate": 4.11764705882353e-06, "loss": 0.4956, "step": 21 }, { "epoch": 0.128717698683569, "grad_norm": 1.4400174617767334, "learning_rate": 4.313725490196079e-06, "loss": 0.4966, "step": 22 }, { "epoch": 0.13456850316918575, "grad_norm": 1.2176562547683716, "learning_rate": 4.509803921568628e-06, "loss": 0.4892, "step": 23 }, { "epoch": 0.14041930765480254, "grad_norm": 1.0557763576507568, "learning_rate": 4.705882352941177e-06, "loss": 0.4664, "step": 24 }, { "epoch": 0.1462701121404193, "grad_norm": 1.0654219388961792, "learning_rate": 4.901960784313726e-06, "loss": 0.4427, "step": 25 }, { "epoch": 0.15212091662603608, "grad_norm": 0.8639155626296997, "learning_rate": 5.098039215686274e-06, "loss": 0.4676, "step": 26 }, { "epoch": 0.15797172111165286, "grad_norm": 0.8091264963150024, "learning_rate": 5.294117647058824e-06, "loss": 0.4339, "step": 27 }, { "epoch": 0.16382252559726962, "grad_norm": 0.7697594165802002, "learning_rate": 5.4901960784313735e-06, "loss": 0.4164, "step": 28 }, { "epoch": 0.1696733300828864, "grad_norm": 0.8522382378578186, "learning_rate": 5.686274509803922e-06, "loss": 0.4512, "step": 29 }, { "epoch": 0.17552413456850316, "grad_norm": 0.7640376687049866, "learning_rate": 5.882352941176471e-06, "loss": 0.432, "step": 30 }, { "epoch": 0.18137493905411994, "grad_norm": 0.6247867941856384, "learning_rate": 6.07843137254902e-06, "loss": 0.408, "step": 31 }, { "epoch": 0.18722574353973673, "grad_norm": 0.6288900971412659, "learning_rate": 6.274509803921569e-06, "loss": 0.4611, "step": 32 }, { "epoch": 0.19307654802535348, "grad_norm": 0.6182562708854675, "learning_rate": 6.470588235294119e-06, "loss": 0.4257, "step": 33 }, { "epoch": 0.19892735251097027, "grad_norm": 0.6193389892578125, "learning_rate": 6.666666666666667e-06, "loss": 0.4063, "step": 34 }, { "epoch": 0.20477815699658702, "grad_norm": 0.6892727017402649, "learning_rate": 6.862745098039216e-06, "loss": 0.3967, "step": 35 }, { "epoch": 0.2106289614822038, "grad_norm": 0.6725057363510132, "learning_rate": 7.058823529411766e-06, "loss": 0.4428, "step": 36 }, { "epoch": 0.21647976596782056, "grad_norm": 0.5203535556793213, "learning_rate": 7.2549019607843145e-06, "loss": 0.4151, "step": 37 }, { "epoch": 0.22233057045343735, "grad_norm": 0.45232418179512024, "learning_rate": 7.450980392156863e-06, "loss": 0.3666, "step": 38 }, { "epoch": 0.22818137493905413, "grad_norm": 0.5872768759727478, "learning_rate": 7.647058823529411e-06, "loss": 0.4144, "step": 39 }, { "epoch": 0.2340321794246709, "grad_norm": 0.526172399520874, "learning_rate": 7.84313725490196e-06, "loss": 0.4346, "step": 40 }, { "epoch": 0.23988298391028767, "grad_norm": 0.5474228858947754, "learning_rate": 8.03921568627451e-06, "loss": 0.3965, "step": 41 }, { "epoch": 0.24573378839590443, "grad_norm": 0.46727877855300903, "learning_rate": 8.23529411764706e-06, "loss": 0.4417, "step": 42 }, { "epoch": 0.2515845928815212, "grad_norm": 0.40532198548316956, "learning_rate": 8.43137254901961e-06, "loss": 0.3851, "step": 43 }, { "epoch": 0.257435397367138, "grad_norm": 0.4897397458553314, "learning_rate": 8.627450980392157e-06, "loss": 0.4013, "step": 44 }, { "epoch": 0.26328620185275475, "grad_norm": 0.4565890431404114, "learning_rate": 8.823529411764707e-06, "loss": 0.3745, "step": 45 }, { "epoch": 0.2691370063383715, "grad_norm": 0.38417261838912964, "learning_rate": 9.019607843137256e-06, "loss": 0.3783, "step": 46 }, { "epoch": 0.2749878108239883, "grad_norm": 0.40912356972694397, "learning_rate": 9.215686274509804e-06, "loss": 0.3879, "step": 47 }, { "epoch": 0.2808386153096051, "grad_norm": 0.42792415618896484, "learning_rate": 9.411764705882354e-06, "loss": 0.3837, "step": 48 }, { "epoch": 0.28668941979522183, "grad_norm": 0.4394405484199524, "learning_rate": 9.607843137254903e-06, "loss": 0.4004, "step": 49 }, { "epoch": 0.2925402242808386, "grad_norm": 0.4622238576412201, "learning_rate": 9.803921568627451e-06, "loss": 0.409, "step": 50 }, { "epoch": 0.2983910287664554, "grad_norm": 0.3894466757774353, "learning_rate": 1e-05, "loss": 0.3766, "step": 51 }, { "epoch": 0.30424183325207216, "grad_norm": 0.39314836263656616, "learning_rate": 9.999882884955554e-06, "loss": 0.3418, "step": 52 }, { "epoch": 0.3100926377376889, "grad_norm": 0.44764766097068787, "learning_rate": 9.999531545308584e-06, "loss": 0.3909, "step": 53 }, { "epoch": 0.3159434422233057, "grad_norm": 0.403144896030426, "learning_rate": 9.998945997517957e-06, "loss": 0.3716, "step": 54 }, { "epoch": 0.3217942467089225, "grad_norm": 0.4303280711174011, "learning_rate": 9.998126269014255e-06, "loss": 0.4026, "step": 55 }, { "epoch": 0.32764505119453924, "grad_norm": 0.4083136022090912, "learning_rate": 9.997072398198492e-06, "loss": 0.3842, "step": 56 }, { "epoch": 0.333495855680156, "grad_norm": 0.3750261664390564, "learning_rate": 9.99578443444032e-06, "loss": 0.3605, "step": 57 }, { "epoch": 0.3393466601657728, "grad_norm": 0.43343302607536316, "learning_rate": 9.994262438075713e-06, "loss": 0.4119, "step": 58 }, { "epoch": 0.34519746465138956, "grad_norm": 0.3778004050254822, "learning_rate": 9.992506480404137e-06, "loss": 0.3616, "step": 59 }, { "epoch": 0.3510482691370063, "grad_norm": 0.36973798274993896, "learning_rate": 9.990516643685222e-06, "loss": 0.3793, "step": 60 }, { "epoch": 0.35689907362262313, "grad_norm": 0.3836229145526886, "learning_rate": 9.988293021134888e-06, "loss": 0.3492, "step": 61 }, { "epoch": 0.3627498781082399, "grad_norm": 0.3700697720050812, "learning_rate": 9.985835716921e-06, "loss": 0.3583, "step": 62 }, { "epoch": 0.36860068259385664, "grad_norm": 0.4023352861404419, "learning_rate": 9.983144846158472e-06, "loss": 0.3697, "step": 63 }, { "epoch": 0.37445148707947346, "grad_norm": 0.38035494089126587, "learning_rate": 9.980220534903889e-06, "loss": 0.3772, "step": 64 }, { "epoch": 0.3803022915650902, "grad_norm": 0.3641819953918457, "learning_rate": 9.977062920149583e-06, "loss": 0.3562, "step": 65 }, { "epoch": 0.38615309605070697, "grad_norm": 0.39018484950065613, "learning_rate": 9.973672149817232e-06, "loss": 0.3377, "step": 66 }, { "epoch": 0.3920039005363237, "grad_norm": 0.351622998714447, "learning_rate": 9.970048382750925e-06, "loss": 0.351, "step": 67 }, { "epoch": 0.39785470502194054, "grad_norm": 0.40039461851119995, "learning_rate": 9.966191788709716e-06, "loss": 0.3775, "step": 68 }, { "epoch": 0.4037055095075573, "grad_norm": 0.3892274796962738, "learning_rate": 9.96210254835968e-06, "loss": 0.4034, "step": 69 }, { "epoch": 0.40955631399317405, "grad_norm": 0.4052744507789612, "learning_rate": 9.957780853265441e-06, "loss": 0.4079, "step": 70 }, { "epoch": 0.41540711847879086, "grad_norm": 0.3877456486225128, "learning_rate": 9.953226905881208e-06, "loss": 0.3342, "step": 71 }, { "epoch": 0.4212579229644076, "grad_norm": 0.4107078015804291, "learning_rate": 9.948440919541277e-06, "loss": 0.358, "step": 72 }, { "epoch": 0.4271087274500244, "grad_norm": 0.37597158551216125, "learning_rate": 9.943423118450051e-06, "loss": 0.3948, "step": 73 }, { "epoch": 0.43295953193564113, "grad_norm": 0.4590906798839569, "learning_rate": 9.938173737671531e-06, "loss": 0.3847, "step": 74 }, { "epoch": 0.43881033642125794, "grad_norm": 0.48799118399620056, "learning_rate": 9.932693023118299e-06, "loss": 0.3845, "step": 75 }, { "epoch": 0.4446611409068747, "grad_norm": 0.39222586154937744, "learning_rate": 9.926981231540007e-06, "loss": 0.3872, "step": 76 }, { "epoch": 0.45051194539249145, "grad_norm": 0.4158020615577698, "learning_rate": 9.921038630511345e-06, "loss": 0.388, "step": 77 }, { "epoch": 0.45636274987810826, "grad_norm": 0.40331101417541504, "learning_rate": 9.91486549841951e-06, "loss": 0.3705, "step": 78 }, { "epoch": 0.462213554363725, "grad_norm": 0.4275971055030823, "learning_rate": 9.908462124451152e-06, "loss": 0.3849, "step": 79 }, { "epoch": 0.4680643588493418, "grad_norm": 0.3466413915157318, "learning_rate": 9.901828808578846e-06, "loss": 0.347, "step": 80 }, { "epoch": 0.47391516333495853, "grad_norm": 0.44375771284103394, "learning_rate": 9.894965861547023e-06, "loss": 0.373, "step": 81 }, { "epoch": 0.47976596782057535, "grad_norm": 0.38661712408065796, "learning_rate": 9.887873604857424e-06, "loss": 0.3702, "step": 82 }, { "epoch": 0.4856167723061921, "grad_norm": 0.41488274931907654, "learning_rate": 9.88055237075403e-06, "loss": 0.3574, "step": 83 }, { "epoch": 0.49146757679180886, "grad_norm": 0.41137149930000305, "learning_rate": 9.873002502207502e-06, "loss": 0.3901, "step": 84 }, { "epoch": 0.49731838127742567, "grad_norm": 0.39136987924575806, "learning_rate": 9.86522435289912e-06, "loss": 0.38, "step": 85 }, { "epoch": 0.5031691857630424, "grad_norm": 0.37086671590805054, "learning_rate": 9.857218287204204e-06, "loss": 0.3541, "step": 86 }, { "epoch": 0.5090199902486592, "grad_norm": 0.43105342984199524, "learning_rate": 9.848984680175049e-06, "loss": 0.4087, "step": 87 }, { "epoch": 0.514870794734276, "grad_norm": 0.36811238527297974, "learning_rate": 9.840523917523354e-06, "loss": 0.3639, "step": 88 }, { "epoch": 0.5207215992198927, "grad_norm": 0.378967821598053, "learning_rate": 9.831836395602164e-06, "loss": 0.3251, "step": 89 }, { "epoch": 0.5265724037055095, "grad_norm": 0.36341214179992676, "learning_rate": 9.822922521387277e-06, "loss": 0.3705, "step": 90 }, { "epoch": 0.5324232081911263, "grad_norm": 0.37682002782821655, "learning_rate": 9.813782712458206e-06, "loss": 0.3513, "step": 91 }, { "epoch": 0.538274012676743, "grad_norm": 0.4142582416534424, "learning_rate": 9.804417396978605e-06, "loss": 0.3716, "step": 92 }, { "epoch": 0.5441248171623598, "grad_norm": 0.4432157278060913, "learning_rate": 9.794827013676206e-06, "loss": 0.4126, "step": 93 }, { "epoch": 0.5499756216479766, "grad_norm": 0.47457224130630493, "learning_rate": 9.78501201182228e-06, "loss": 0.3941, "step": 94 }, { "epoch": 0.5558264261335933, "grad_norm": 0.35374128818511963, "learning_rate": 9.774972851210572e-06, "loss": 0.3893, "step": 95 }, { "epoch": 0.5616772306192102, "grad_norm": 0.37110310792922974, "learning_rate": 9.764710002135784e-06, "loss": 0.3453, "step": 96 }, { "epoch": 0.567528035104827, "grad_norm": 0.4286816716194153, "learning_rate": 9.754223945371524e-06, "loss": 0.3674, "step": 97 }, { "epoch": 0.5733788395904437, "grad_norm": 0.3735758662223816, "learning_rate": 9.743515172147793e-06, "loss": 0.3572, "step": 98 }, { "epoch": 0.5792296440760605, "grad_norm": 0.3784080445766449, "learning_rate": 9.732584184127973e-06, "loss": 0.3864, "step": 99 }, { "epoch": 0.5850804485616772, "grad_norm": 0.40882179141044617, "learning_rate": 9.721431493385322e-06, "loss": 0.3458, "step": 100 }, { "epoch": 0.590931253047294, "grad_norm": 0.3924429416656494, "learning_rate": 9.710057622378992e-06, "loss": 0.3497, "step": 101 }, { "epoch": 0.5967820575329108, "grad_norm": 0.41799789667129517, "learning_rate": 9.698463103929542e-06, "loss": 0.3915, "step": 102 }, { "epoch": 0.6026328620185275, "grad_norm": 0.4201458990573883, "learning_rate": 9.686648481193994e-06, "loss": 0.3797, "step": 103 }, { "epoch": 0.6084836665041443, "grad_norm": 0.3876160979270935, "learning_rate": 9.674614307640368e-06, "loss": 0.3667, "step": 104 }, { "epoch": 0.6143344709897611, "grad_norm": 0.39733994007110596, "learning_rate": 9.66236114702178e-06, "loss": 0.3746, "step": 105 }, { "epoch": 0.6201852754753778, "grad_norm": 0.4422380030155182, "learning_rate": 9.649889573350006e-06, "loss": 0.3657, "step": 106 }, { "epoch": 0.6260360799609946, "grad_norm": 0.34534451365470886, "learning_rate": 9.637200170868607e-06, "loss": 0.3173, "step": 107 }, { "epoch": 0.6318868844466115, "grad_norm": 0.49448907375335693, "learning_rate": 9.62429353402556e-06, "loss": 0.3528, "step": 108 }, { "epoch": 0.6377376889322282, "grad_norm": 0.4157074987888336, "learning_rate": 9.611170267445401e-06, "loss": 0.3647, "step": 109 }, { "epoch": 0.643588493417845, "grad_norm": 0.3649308383464813, "learning_rate": 9.597830985900913e-06, "loss": 0.3592, "step": 110 }, { "epoch": 0.6494392979034618, "grad_norm": 0.38802069425582886, "learning_rate": 9.584276314284316e-06, "loss": 0.3749, "step": 111 }, { "epoch": 0.6552901023890785, "grad_norm": 0.41905415058135986, "learning_rate": 9.570506887577994e-06, "loss": 0.3761, "step": 112 }, { "epoch": 0.6611409068746953, "grad_norm": 0.34973040223121643, "learning_rate": 9.556523350824759e-06, "loss": 0.3377, "step": 113 }, { "epoch": 0.666991711360312, "grad_norm": 0.42152735590934753, "learning_rate": 9.542326359097619e-06, "loss": 0.3758, "step": 114 }, { "epoch": 0.6728425158459288, "grad_norm": 0.34654316306114197, "learning_rate": 9.527916577469104e-06, "loss": 0.3612, "step": 115 }, { "epoch": 0.6786933203315456, "grad_norm": 0.3440297842025757, "learning_rate": 9.5132946809801e-06, "loss": 0.37, "step": 116 }, { "epoch": 0.6845441248171623, "grad_norm": 0.36565279960632324, "learning_rate": 9.498461354608228e-06, "loss": 0.352, "step": 117 }, { "epoch": 0.6903949293027791, "grad_norm": 0.3970431983470917, "learning_rate": 9.483417293235759e-06, "loss": 0.3694, "step": 118 }, { "epoch": 0.6962457337883959, "grad_norm": 0.3433384895324707, "learning_rate": 9.468163201617063e-06, "loss": 0.3657, "step": 119 }, { "epoch": 0.7020965382740126, "grad_norm": 0.39245930314064026, "learning_rate": 9.452699794345583e-06, "loss": 0.362, "step": 120 }, { "epoch": 0.7079473427596294, "grad_norm": 0.38453614711761475, "learning_rate": 9.437027795820373e-06, "loss": 0.3675, "step": 121 }, { "epoch": 0.7137981472452463, "grad_norm": 0.369517058134079, "learning_rate": 9.421147940212152e-06, "loss": 0.3634, "step": 122 }, { "epoch": 0.719648951730863, "grad_norm": 0.38849949836730957, "learning_rate": 9.405060971428924e-06, "loss": 0.3387, "step": 123 }, { "epoch": 0.7254997562164798, "grad_norm": 0.4063083231449127, "learning_rate": 9.388767643081109e-06, "loss": 0.3719, "step": 124 }, { "epoch": 0.7313505607020966, "grad_norm": 0.40234676003456116, "learning_rate": 9.372268718446259e-06, "loss": 0.3939, "step": 125 }, { "epoch": 0.7372013651877133, "grad_norm": 0.3845783770084381, "learning_rate": 9.355564970433288e-06, "loss": 0.3699, "step": 126 }, { "epoch": 0.7430521696733301, "grad_norm": 0.3887750506401062, "learning_rate": 9.338657181546277e-06, "loss": 0.3686, "step": 127 }, { "epoch": 0.7489029741589469, "grad_norm": 0.3700850307941437, "learning_rate": 9.321546143847802e-06, "loss": 0.3431, "step": 128 }, { "epoch": 0.7547537786445636, "grad_norm": 0.44235607981681824, "learning_rate": 9.30423265892184e-06, "loss": 0.3836, "step": 129 }, { "epoch": 0.7606045831301804, "grad_norm": 0.39945074915885925, "learning_rate": 9.286717537836211e-06, "loss": 0.3706, "step": 130 }, { "epoch": 0.7664553876157971, "grad_norm": 0.42615601420402527, "learning_rate": 9.269001601104593e-06, "loss": 0.369, "step": 131 }, { "epoch": 0.7723061921014139, "grad_norm": 0.4713898003101349, "learning_rate": 9.251085678648072e-06, "loss": 0.3818, "step": 132 }, { "epoch": 0.7781569965870307, "grad_norm": 0.3744489550590515, "learning_rate": 9.232970609756267e-06, "loss": 0.3542, "step": 133 }, { "epoch": 0.7840078010726474, "grad_norm": 0.3802720308303833, "learning_rate": 9.214657243048021e-06, "loss": 0.3346, "step": 134 }, { "epoch": 0.7898586055582643, "grad_norm": 0.45320552587509155, "learning_rate": 9.196146436431635e-06, "loss": 0.3766, "step": 135 }, { "epoch": 0.7957094100438811, "grad_norm": 0.3729214370250702, "learning_rate": 9.177439057064684e-06, "loss": 0.3694, "step": 136 }, { "epoch": 0.8015602145294978, "grad_norm": 0.3678078055381775, "learning_rate": 9.158535981313395e-06, "loss": 0.3515, "step": 137 }, { "epoch": 0.8074110190151146, "grad_norm": 0.4144746959209442, "learning_rate": 9.13943809471159e-06, "loss": 0.3756, "step": 138 }, { "epoch": 0.8132618235007314, "grad_norm": 0.3548150658607483, "learning_rate": 9.120146291919206e-06, "loss": 0.3494, "step": 139 }, { "epoch": 0.8191126279863481, "grad_norm": 0.3966399133205414, "learning_rate": 9.100661476680379e-06, "loss": 0.3427, "step": 140 }, { "epoch": 0.8249634324719649, "grad_norm": 0.4523519277572632, "learning_rate": 9.08098456178111e-06, "loss": 0.3641, "step": 141 }, { "epoch": 0.8308142369575817, "grad_norm": 0.45737963914871216, "learning_rate": 9.061116469006504e-06, "loss": 0.3643, "step": 142 }, { "epoch": 0.8366650414431984, "grad_norm": 0.34355804324150085, "learning_rate": 9.041058129097586e-06, "loss": 0.3227, "step": 143 }, { "epoch": 0.8425158459288152, "grad_norm": 0.4239197373390198, "learning_rate": 9.020810481707709e-06, "loss": 0.3604, "step": 144 }, { "epoch": 0.8483666504144319, "grad_norm": 0.4363431930541992, "learning_rate": 9.00037447535852e-06, "loss": 0.3785, "step": 145 }, { "epoch": 0.8542174549000487, "grad_norm": 0.383635550737381, "learning_rate": 8.979751067395534e-06, "loss": 0.355, "step": 146 }, { "epoch": 0.8600682593856656, "grad_norm": 0.3972126543521881, "learning_rate": 8.958941223943292e-06, "loss": 0.394, "step": 147 }, { "epoch": 0.8659190638712823, "grad_norm": 0.3762996196746826, "learning_rate": 8.937945919860086e-06, "loss": 0.3779, "step": 148 }, { "epoch": 0.8717698683568991, "grad_norm": 0.40220147371292114, "learning_rate": 8.916766138692303e-06, "loss": 0.3725, "step": 149 }, { "epoch": 0.8776206728425159, "grad_norm": 0.35849395394325256, "learning_rate": 8.895402872628352e-06, "loss": 0.3533, "step": 150 }, { "epoch": 0.8834714773281326, "grad_norm": 0.3301231861114502, "learning_rate": 8.873857122452174e-06, "loss": 0.3156, "step": 151 }, { "epoch": 0.8893222818137494, "grad_norm": 0.39462047815322876, "learning_rate": 8.852129897496367e-06, "loss": 0.3538, "step": 152 }, { "epoch": 0.8951730862993662, "grad_norm": 0.3844425082206726, "learning_rate": 8.83022221559489e-06, "loss": 0.3913, "step": 153 }, { "epoch": 0.9010238907849829, "grad_norm": 0.37792298197746277, "learning_rate": 8.808135103035407e-06, "loss": 0.3495, "step": 154 }, { "epoch": 0.9068746952705997, "grad_norm": 0.39290040731430054, "learning_rate": 8.785869594511182e-06, "loss": 0.3784, "step": 155 }, { "epoch": 0.9127254997562165, "grad_norm": 0.3619037866592407, "learning_rate": 8.763426733072624e-06, "loss": 0.3614, "step": 156 }, { "epoch": 0.9185763042418332, "grad_norm": 0.3633933663368225, "learning_rate": 8.740807570078419e-06, "loss": 0.3902, "step": 157 }, { "epoch": 0.92442710872745, "grad_norm": 0.3714929223060608, "learning_rate": 8.718013165146275e-06, "loss": 0.3274, "step": 158 }, { "epoch": 0.9302779132130667, "grad_norm": 0.38371893763542175, "learning_rate": 8.695044586103297e-06, "loss": 0.3507, "step": 159 }, { "epoch": 0.9361287176986836, "grad_norm": 0.34635236859321594, "learning_rate": 8.671902908935942e-06, "loss": 0.3275, "step": 160 }, { "epoch": 0.9419795221843004, "grad_norm": 0.34420835971832275, "learning_rate": 8.648589217739635e-06, "loss": 0.3461, "step": 161 }, { "epoch": 0.9478303266699171, "grad_norm": 0.3969476819038391, "learning_rate": 8.625104604667965e-06, "loss": 0.3579, "step": 162 }, { "epoch": 0.9536811311555339, "grad_norm": 0.3697619140148163, "learning_rate": 8.601450169881533e-06, "loss": 0.3476, "step": 163 }, { "epoch": 0.9595319356411507, "grad_norm": 0.3809903860092163, "learning_rate": 8.577627021496413e-06, "loss": 0.36, "step": 164 }, { "epoch": 0.9653827401267674, "grad_norm": 0.3934761881828308, "learning_rate": 8.553636275532236e-06, "loss": 0.3704, "step": 165 }, { "epoch": 0.9712335446123842, "grad_norm": 0.3420058786869049, "learning_rate": 8.529479055859918e-06, "loss": 0.3335, "step": 166 }, { "epoch": 0.977084349098001, "grad_norm": 0.3801231384277344, "learning_rate": 8.505156494148997e-06, "loss": 0.3723, "step": 167 }, { "epoch": 0.9829351535836177, "grad_norm": 0.38984423875808716, "learning_rate": 8.480669729814635e-06, "loss": 0.3563, "step": 168 }, { "epoch": 0.9887859580692345, "grad_norm": 0.369872123003006, "learning_rate": 8.456019909964224e-06, "loss": 0.3494, "step": 169 }, { "epoch": 0.9946367625548513, "grad_norm": 0.3835128843784332, "learning_rate": 8.43120818934367e-06, "loss": 0.3672, "step": 170 }, { "epoch": 1.0014627011214041, "grad_norm": 0.4482472538948059, "learning_rate": 8.40623573028327e-06, "loss": 0.4454, "step": 171 }, { "epoch": 1.007313505607021, "grad_norm": 0.45144927501678467, "learning_rate": 8.381103702643295e-06, "loss": 0.3454, "step": 172 }, { "epoch": 1.0131643100926377, "grad_norm": 0.3322243094444275, "learning_rate": 8.35581328375915e-06, "loss": 0.2828, "step": 173 }, { "epoch": 1.0190151145782544, "grad_norm": 0.397659033536911, "learning_rate": 8.330365658386252e-06, "loss": 0.3287, "step": 174 }, { "epoch": 1.0248659190638714, "grad_norm": 0.3485862910747528, "learning_rate": 8.30476201864451e-06, "loss": 0.2744, "step": 175 }, { "epoch": 1.030716723549488, "grad_norm": 0.3832169473171234, "learning_rate": 8.27900356396249e-06, "loss": 0.2868, "step": 176 }, { "epoch": 1.0365675280351048, "grad_norm": 0.4184396266937256, "learning_rate": 8.25309150102121e-06, "loss": 0.3291, "step": 177 }, { "epoch": 1.0424183325207217, "grad_norm": 0.45518970489501953, "learning_rate": 8.227027043697642e-06, "loss": 0.3489, "step": 178 }, { "epoch": 1.0482691370063384, "grad_norm": 0.3730817437171936, "learning_rate": 8.200811413007808e-06, "loss": 0.3055, "step": 179 }, { "epoch": 1.054119941491955, "grad_norm": 0.398185133934021, "learning_rate": 8.174445837049614e-06, "loss": 0.326, "step": 180 }, { "epoch": 1.059970745977572, "grad_norm": 0.4147329032421112, "learning_rate": 8.147931550945301e-06, "loss": 0.2961, "step": 181 }, { "epoch": 1.0658215504631887, "grad_norm": 0.4088496267795563, "learning_rate": 8.121269796783585e-06, "loss": 0.3239, "step": 182 }, { "epoch": 1.0716723549488054, "grad_norm": 0.35450735688209534, "learning_rate": 8.094461823561473e-06, "loss": 0.2851, "step": 183 }, { "epoch": 1.0775231594344223, "grad_norm": 0.4081903100013733, "learning_rate": 8.06750888712576e-06, "loss": 0.3188, "step": 184 }, { "epoch": 1.083373963920039, "grad_norm": 0.3934895396232605, "learning_rate": 8.040412250114184e-06, "loss": 0.2891, "step": 185 }, { "epoch": 1.0892247684056557, "grad_norm": 0.35631951689720154, "learning_rate": 8.013173181896283e-06, "loss": 0.2667, "step": 186 }, { "epoch": 1.0950755728912727, "grad_norm": 0.42703738808631897, "learning_rate": 7.985792958513932e-06, "loss": 0.312, "step": 187 }, { "epoch": 1.1009263773768894, "grad_norm": 0.4023725986480713, "learning_rate": 7.958272862621562e-06, "loss": 0.3343, "step": 188 }, { "epoch": 1.106777181862506, "grad_norm": 0.3514081537723541, "learning_rate": 7.930614183426074e-06, "loss": 0.2959, "step": 189 }, { "epoch": 1.1126279863481228, "grad_norm": 0.40648946166038513, "learning_rate": 7.902818216626446e-06, "loss": 0.3529, "step": 190 }, { "epoch": 1.1184787908337397, "grad_norm": 0.38296204805374146, "learning_rate": 7.874886264353035e-06, "loss": 0.2988, "step": 191 }, { "epoch": 1.1243295953193564, "grad_norm": 0.4062958061695099, "learning_rate": 7.846819635106569e-06, "loss": 0.3344, "step": 192 }, { "epoch": 1.130180399804973, "grad_norm": 0.3408312499523163, "learning_rate": 7.818619643696863e-06, "loss": 0.2857, "step": 193 }, { "epoch": 1.13603120429059, "grad_norm": 0.3789331316947937, "learning_rate": 7.790287611181217e-06, "loss": 0.3077, "step": 194 }, { "epoch": 1.1418820087762067, "grad_norm": 0.38520050048828125, "learning_rate": 7.76182486480253e-06, "loss": 0.3025, "step": 195 }, { "epoch": 1.1477328132618234, "grad_norm": 0.3634053170681, "learning_rate": 7.733232737927123e-06, "loss": 0.3037, "step": 196 }, { "epoch": 1.1535836177474403, "grad_norm": 0.42052581906318665, "learning_rate": 7.70451256998228e-06, "loss": 0.304, "step": 197 }, { "epoch": 1.159434422233057, "grad_norm": 0.3758928179740906, "learning_rate": 7.675665706393502e-06, "loss": 0.2755, "step": 198 }, { "epoch": 1.1652852267186737, "grad_norm": 0.35784485936164856, "learning_rate": 7.646693498521472e-06, "loss": 0.2876, "step": 199 }, { "epoch": 1.1711360312042907, "grad_norm": 0.38650694489479065, "learning_rate": 7.617597303598754e-06, "loss": 0.288, "step": 200 } ], "logging_steps": 1, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.7100142648497275e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }