{ "best_global_step": 380, "best_metric": 0.3527662754058838, "best_model_checkpoint": "miner_id_24/checkpoint-380", "epoch": 0.6734603455914931, "eval_steps": 20, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017722640673460345, "grad_norm": 0.12925809621810913, "learning_rate": 0.0, "loss": 2.4115, "step": 1 }, { "epoch": 0.0017722640673460345, "eval_loss": 2.651242256164551, "eval_runtime": 13.7278, "eval_samples_per_second": 3.351, "eval_steps_per_second": 0.874, "step": 1 }, { "epoch": 0.003544528134692069, "grad_norm": 0.13433989882469177, "learning_rate": 2e-05, "loss": 2.2713, "step": 2 }, { "epoch": 0.005316792202038104, "grad_norm": 0.16423076391220093, "learning_rate": 4e-05, "loss": 2.1567, "step": 3 }, { "epoch": 0.007089056269384138, "grad_norm": 0.14701242744922638, "learning_rate": 6e-05, "loss": 2.4028, "step": 4 }, { "epoch": 0.008861320336730172, "grad_norm": 0.1483909636735916, "learning_rate": 8e-05, "loss": 2.2558, "step": 5 }, { "epoch": 0.010633584404076208, "grad_norm": 0.1745777726173401, "learning_rate": 0.0001, "loss": 2.13, "step": 6 }, { "epoch": 0.012405848471422242, "grad_norm": 0.21620893478393555, "learning_rate": 0.00012, "loss": 2.0816, "step": 7 }, { "epoch": 0.014178112538768276, "grad_norm": 0.24888384342193604, "learning_rate": 0.00014, "loss": 2.087, "step": 8 }, { "epoch": 0.01595037660611431, "grad_norm": 0.27732551097869873, "learning_rate": 0.00016, "loss": 2.1129, "step": 9 }, { "epoch": 0.017722640673460344, "grad_norm": 0.23451898992061615, "learning_rate": 0.00018, "loss": 1.6034, "step": 10 }, { "epoch": 0.01949490474080638, "grad_norm": 0.2714381814002991, "learning_rate": 0.0002, "loss": 1.7197, "step": 11 }, { "epoch": 0.021267168808152416, "grad_norm": 0.23261328041553497, "learning_rate": 0.0001999998255714219, "loss": 1.2375, "step": 12 }, { "epoch": 0.02303943287549845, "grad_norm": 0.22171521186828613, "learning_rate": 0.0001999993022862961, "loss": 1.0814, "step": 13 }, { "epoch": 0.024811696942844484, "grad_norm": 0.2599828541278839, "learning_rate": 0.00019999843014644814, "loss": 1.213, "step": 14 }, { "epoch": 0.02658396101019052, "grad_norm": 0.30285561084747314, "learning_rate": 0.0001999972091549205, "loss": 1.2193, "step": 15 }, { "epoch": 0.028356225077536552, "grad_norm": 0.36278098821640015, "learning_rate": 0.0001999956393159727, "loss": 1.0557, "step": 16 }, { "epoch": 0.030128489144882586, "grad_norm": 0.29912176728248596, "learning_rate": 0.0001999937206350813, "loss": 0.8909, "step": 17 }, { "epoch": 0.03190075321222862, "grad_norm": 0.3200126886367798, "learning_rate": 0.0001999914531189397, "loss": 0.9609, "step": 18 }, { "epoch": 0.033673017279574655, "grad_norm": 0.3217049539089203, "learning_rate": 0.0001999888367754583, "loss": 0.7435, "step": 19 }, { "epoch": 0.03544528134692069, "grad_norm": 0.28469687700271606, "learning_rate": 0.00019998587161376442, "loss": 0.7353, "step": 20 }, { "epoch": 0.03544528134692069, "eval_loss": 1.0812681913375854, "eval_runtime": 12.5141, "eval_samples_per_second": 3.676, "eval_steps_per_second": 0.959, "step": 20 }, { "epoch": 0.03721754541426672, "grad_norm": 0.2957800328731537, "learning_rate": 0.00019998255764420216, "loss": 0.7687, "step": 21 }, { "epoch": 0.03898980948161276, "grad_norm": 0.2632061839103699, "learning_rate": 0.00019997889487833267, "loss": 0.6937, "step": 22 }, { "epoch": 0.0407620735489588, "grad_norm": 0.3017753064632416, "learning_rate": 0.00019997488332893371, "loss": 0.7435, "step": 23 }, { "epoch": 0.04253433761630483, "grad_norm": 0.362894207239151, "learning_rate": 0.00019997052300999978, "loss": 0.8498, "step": 24 }, { "epoch": 0.044306601683650866, "grad_norm": 0.35318875312805176, "learning_rate": 0.0001999658139367423, "loss": 0.7093, "step": 25 }, { "epoch": 0.0460788657509969, "grad_norm": 0.28998640179634094, "learning_rate": 0.00019996075612558915, "loss": 0.6885, "step": 26 }, { "epoch": 0.047851129818342934, "grad_norm": 0.32814618945121765, "learning_rate": 0.00019995534959418482, "loss": 0.6796, "step": 27 }, { "epoch": 0.04962339388568897, "grad_norm": 0.3003632128238678, "learning_rate": 0.00019994959436139045, "loss": 0.7224, "step": 28 }, { "epoch": 0.051395657953035, "grad_norm": 0.3441886901855469, "learning_rate": 0.00019994349044728354, "loss": 0.7491, "step": 29 }, { "epoch": 0.05316792202038104, "grad_norm": 0.28324809670448303, "learning_rate": 0.00019993703787315803, "loss": 0.742, "step": 30 }, { "epoch": 0.05494018608772707, "grad_norm": 0.3303508162498474, "learning_rate": 0.00019993023666152425, "loss": 0.7068, "step": 31 }, { "epoch": 0.056712450155073105, "grad_norm": 0.26269403100013733, "learning_rate": 0.0001999230868361086, "loss": 0.6117, "step": 32 }, { "epoch": 0.05848471422241914, "grad_norm": 0.33218932151794434, "learning_rate": 0.00019991558842185388, "loss": 0.671, "step": 33 }, { "epoch": 0.06025697828976517, "grad_norm": 0.2783183157444, "learning_rate": 0.00019990774144491876, "loss": 0.5227, "step": 34 }, { "epoch": 0.06202924235711121, "grad_norm": 0.30541640520095825, "learning_rate": 0.00019989954593267804, "loss": 0.5839, "step": 35 }, { "epoch": 0.06380150642445724, "grad_norm": 0.3339098393917084, "learning_rate": 0.0001998910019137223, "loss": 0.6583, "step": 36 }, { "epoch": 0.06557377049180328, "grad_norm": 0.3580738306045532, "learning_rate": 0.00019988210941785796, "loss": 0.7033, "step": 37 }, { "epoch": 0.06734603455914931, "grad_norm": 0.371333509683609, "learning_rate": 0.00019987286847610716, "loss": 0.761, "step": 38 }, { "epoch": 0.06911829862649535, "grad_norm": 1.2145196199417114, "learning_rate": 0.0001998632791207076, "loss": 1.2967, "step": 39 }, { "epoch": 0.07089056269384138, "grad_norm": 2.6015830039978027, "learning_rate": 0.00019985334138511237, "loss": 2.3005, "step": 40 }, { "epoch": 0.07089056269384138, "eval_loss": 0.7685866355895996, "eval_runtime": 12.4989, "eval_samples_per_second": 3.68, "eval_steps_per_second": 0.96, "step": 40 }, { "epoch": 0.07266282676118742, "grad_norm": 2.216989040374756, "learning_rate": 0.00019984305530399007, "loss": 1.3945, "step": 41 }, { "epoch": 0.07443509082853345, "grad_norm": 1.9432921409606934, "learning_rate": 0.00019983242091322433, "loss": 0.9692, "step": 42 }, { "epoch": 0.07620735489587949, "grad_norm": 1.4889724254608154, "learning_rate": 0.00019982143824991402, "loss": 0.5717, "step": 43 }, { "epoch": 0.07797961896322551, "grad_norm": 0.9190578460693359, "learning_rate": 0.00019981010735237295, "loss": 0.4295, "step": 44 }, { "epoch": 0.07975188303057155, "grad_norm": 0.33360040187835693, "learning_rate": 0.00019979842826012978, "loss": 0.0673, "step": 45 }, { "epoch": 0.0815241470979176, "grad_norm": 0.1340532898902893, "learning_rate": 0.00019978640101392785, "loss": 0.0286, "step": 46 }, { "epoch": 0.08329641116526362, "grad_norm": 0.8504681587219238, "learning_rate": 0.00019977402565572505, "loss": 0.0683, "step": 47 }, { "epoch": 0.08506867523260966, "grad_norm": 0.054726023226976395, "learning_rate": 0.00019976130222869375, "loss": 0.008, "step": 48 }, { "epoch": 0.08684093929995569, "grad_norm": 0.32191044092178345, "learning_rate": 0.00019974823077722044, "loss": 0.019, "step": 49 }, { "epoch": 0.08861320336730173, "grad_norm": 0.02536635659635067, "learning_rate": 0.00019973481134690592, "loss": 0.0029, "step": 50 }, { "epoch": 0.09038546743464776, "grad_norm": 4.455417633056641, "learning_rate": 0.0001997210439845648, "loss": 4.1137, "step": 51 }, { "epoch": 0.0921577315019938, "grad_norm": 3.4098949432373047, "learning_rate": 0.0001997069287382255, "loss": 3.0123, "step": 52 }, { "epoch": 0.09392999556933983, "grad_norm": 2.0529322624206543, "learning_rate": 0.00019969246565713005, "loss": 2.0137, "step": 53 }, { "epoch": 0.09570225963668587, "grad_norm": 1.2987276315689087, "learning_rate": 0.00019967765479173398, "loss": 1.4734, "step": 54 }, { "epoch": 0.0974745237040319, "grad_norm": 0.5781848430633545, "learning_rate": 0.000199662496193706, "loss": 0.6889, "step": 55 }, { "epoch": 0.09924678777137794, "grad_norm": 0.44919276237487793, "learning_rate": 0.00019964698991592803, "loss": 0.6298, "step": 56 }, { "epoch": 0.10101905183872396, "grad_norm": 0.4295942783355713, "learning_rate": 0.00019963113601249478, "loss": 0.6922, "step": 57 }, { "epoch": 0.10279131590607, "grad_norm": 0.39820319414138794, "learning_rate": 0.00019961493453871375, "loss": 0.5131, "step": 58 }, { "epoch": 0.10456357997341603, "grad_norm": 0.44660472869873047, "learning_rate": 0.00019959838555110493, "loss": 0.7165, "step": 59 }, { "epoch": 0.10633584404076207, "grad_norm": 0.5147534012794495, "learning_rate": 0.00019958148910740063, "loss": 0.8805, "step": 60 }, { "epoch": 0.10633584404076207, "eval_loss": 0.7933574914932251, "eval_runtime": 12.53, "eval_samples_per_second": 3.671, "eval_steps_per_second": 0.958, "step": 60 }, { "epoch": 0.10810810810810811, "grad_norm": 0.6109986305236816, "learning_rate": 0.00019956424526654538, "loss": 0.9824, "step": 61 }, { "epoch": 0.10988037217545414, "grad_norm": 0.35513007640838623, "learning_rate": 0.00019954665408869547, "loss": 0.6339, "step": 62 }, { "epoch": 0.11165263624280018, "grad_norm": 0.37850314378738403, "learning_rate": 0.000199528715635219, "loss": 0.7067, "step": 63 }, { "epoch": 0.11342490031014621, "grad_norm": 0.4014244079589844, "learning_rate": 0.00019951042996869557, "loss": 0.5827, "step": 64 }, { "epoch": 0.11519716437749225, "grad_norm": 0.4545835852622986, "learning_rate": 0.00019949179715291604, "loss": 0.6512, "step": 65 }, { "epoch": 0.11696942844483828, "grad_norm": 0.3380361795425415, "learning_rate": 0.00019947281725288227, "loss": 0.5706, "step": 66 }, { "epoch": 0.11874169251218432, "grad_norm": 0.3664691746234894, "learning_rate": 0.00019945349033480706, "loss": 0.6035, "step": 67 }, { "epoch": 0.12051395657953035, "grad_norm": 0.35849088430404663, "learning_rate": 0.0001994338164661137, "loss": 0.7432, "step": 68 }, { "epoch": 0.12228622064687639, "grad_norm": 0.2919897437095642, "learning_rate": 0.00019941379571543596, "loss": 0.5266, "step": 69 }, { "epoch": 0.12405848471422241, "grad_norm": 0.27844852209091187, "learning_rate": 0.0001993934281526176, "loss": 0.5018, "step": 70 }, { "epoch": 0.12583074878156844, "grad_norm": 0.22602231800556183, "learning_rate": 0.00019937271384871233, "loss": 0.3981, "step": 71 }, { "epoch": 0.12760301284891448, "grad_norm": 0.29444751143455505, "learning_rate": 0.00019935165287598348, "loss": 0.5292, "step": 72 }, { "epoch": 0.12937527691626052, "grad_norm": 0.3249082565307617, "learning_rate": 0.00019933024530790377, "loss": 0.5284, "step": 73 }, { "epoch": 0.13114754098360656, "grad_norm": 0.2540333867073059, "learning_rate": 0.000199308491219155, "loss": 0.4385, "step": 74 }, { "epoch": 0.1329198050509526, "grad_norm": 0.35392552614212036, "learning_rate": 0.00019928639068562794, "loss": 0.6445, "step": 75 }, { "epoch": 0.13469206911829862, "grad_norm": 0.2971354126930237, "learning_rate": 0.00019926394378442182, "loss": 0.5594, "step": 76 }, { "epoch": 0.13646433318564466, "grad_norm": 0.32821521162986755, "learning_rate": 0.00019924115059384424, "loss": 0.5882, "step": 77 }, { "epoch": 0.1382365972529907, "grad_norm": 0.3232933282852173, "learning_rate": 0.000199218011193411, "loss": 0.6179, "step": 78 }, { "epoch": 0.14000886132033674, "grad_norm": 0.2667105793952942, "learning_rate": 0.0001991945256638454, "loss": 0.3496, "step": 79 }, { "epoch": 0.14178112538768275, "grad_norm": 0.34408506751060486, "learning_rate": 0.00019917069408707848, "loss": 0.5075, "step": 80 }, { "epoch": 0.14178112538768275, "eval_loss": 0.6197513341903687, "eval_runtime": 12.5353, "eval_samples_per_second": 3.67, "eval_steps_per_second": 0.957, "step": 80 }, { "epoch": 0.1435533894550288, "grad_norm": 0.355451375246048, "learning_rate": 0.00019914651654624838, "loss": 0.5298, "step": 81 }, { "epoch": 0.14532565352237484, "grad_norm": 0.3298553228378296, "learning_rate": 0.0001991219931257002, "loss": 0.6274, "step": 82 }, { "epoch": 0.14709791758972088, "grad_norm": 0.2740597426891327, "learning_rate": 0.0001990971239109856, "loss": 0.463, "step": 83 }, { "epoch": 0.1488701816570669, "grad_norm": 0.2838585376739502, "learning_rate": 0.0001990719089888627, "loss": 0.3991, "step": 84 }, { "epoch": 0.15064244572441293, "grad_norm": 0.31955984234809875, "learning_rate": 0.00019904634844729544, "loss": 0.5019, "step": 85 }, { "epoch": 0.15241470979175897, "grad_norm": 0.3846657872200012, "learning_rate": 0.00019902044237545368, "loss": 0.6655, "step": 86 }, { "epoch": 0.15418697385910501, "grad_norm": 0.4838159680366516, "learning_rate": 0.0001989941908637126, "loss": 0.4693, "step": 87 }, { "epoch": 0.15595923792645103, "grad_norm": 1.4374454021453857, "learning_rate": 0.00019896759400365248, "loss": 0.6474, "step": 88 }, { "epoch": 0.15773150199379707, "grad_norm": 1.0277749300003052, "learning_rate": 0.00019894065188805833, "loss": 0.3156, "step": 89 }, { "epoch": 0.1595037660611431, "grad_norm": 0.7107067108154297, "learning_rate": 0.00019891336461091966, "loss": 0.1806, "step": 90 }, { "epoch": 0.16127603012848915, "grad_norm": 0.2835214138031006, "learning_rate": 0.00019888573226743014, "loss": 0.0499, "step": 91 }, { "epoch": 0.1630482941958352, "grad_norm": 0.09006321430206299, "learning_rate": 0.00019885775495398714, "loss": 0.0199, "step": 92 }, { "epoch": 0.1648205582631812, "grad_norm": 0.07613609731197357, "learning_rate": 0.00019882943276819154, "loss": 0.0139, "step": 93 }, { "epoch": 0.16659282233052725, "grad_norm": 0.9178115129470825, "learning_rate": 0.00019880076580884722, "loss": 0.1661, "step": 94 }, { "epoch": 0.1683650863978733, "grad_norm": 0.10496717691421509, "learning_rate": 0.00019877175417596106, "loss": 0.0139, "step": 95 }, { "epoch": 0.17013735046521933, "grad_norm": 0.40430745482444763, "learning_rate": 0.00019874239797074212, "loss": 0.0534, "step": 96 }, { "epoch": 0.17190961453256534, "grad_norm": 0.3484821021556854, "learning_rate": 0.00019871269729560166, "loss": 0.0429, "step": 97 }, { "epoch": 0.17368187859991138, "grad_norm": 0.3421931862831116, "learning_rate": 0.00019868265225415265, "loss": 0.0237, "step": 98 }, { "epoch": 0.17545414266725742, "grad_norm": 0.6225730180740356, "learning_rate": 0.00019865226295120926, "loss": 0.0507, "step": 99 }, { "epoch": 0.17722640673460346, "grad_norm": 1.2726138830184937, "learning_rate": 0.0001986215294927868, "loss": 0.1253, "step": 100 }, { "epoch": 0.17722640673460346, "eval_loss": 0.8289081454277039, "eval_runtime": 12.5178, "eval_samples_per_second": 3.675, "eval_steps_per_second": 0.959, "step": 100 }, { "epoch": 0.17899867080194948, "grad_norm": 0.7516783475875854, "learning_rate": 0.00019859045198610117, "loss": 0.8659, "step": 101 }, { "epoch": 0.18077093486929552, "grad_norm": 0.5679604411125183, "learning_rate": 0.00019855903053956847, "loss": 0.8206, "step": 102 }, { "epoch": 0.18254319893664156, "grad_norm": 0.4812873601913452, "learning_rate": 0.00019852726526280466, "loss": 0.6633, "step": 103 }, { "epoch": 0.1843154630039876, "grad_norm": 0.41497766971588135, "learning_rate": 0.0001984951562666251, "loss": 0.6336, "step": 104 }, { "epoch": 0.18608772707133364, "grad_norm": 0.31270650029182434, "learning_rate": 0.00019846270366304447, "loss": 0.4493, "step": 105 }, { "epoch": 0.18785999113867966, "grad_norm": 0.3731766641139984, "learning_rate": 0.00019842990756527588, "loss": 0.5743, "step": 106 }, { "epoch": 0.1896322552060257, "grad_norm": 0.4123268127441406, "learning_rate": 0.0001983967680877309, "loss": 0.6132, "step": 107 }, { "epoch": 0.19140451927337174, "grad_norm": 0.3724026679992676, "learning_rate": 0.000198363285346019, "loss": 0.5653, "step": 108 }, { "epoch": 0.19317678334071778, "grad_norm": 0.3632831573486328, "learning_rate": 0.00019832945945694706, "loss": 0.4145, "step": 109 }, { "epoch": 0.1949490474080638, "grad_norm": 0.31893858313560486, "learning_rate": 0.00019829529053851919, "loss": 0.5278, "step": 110 }, { "epoch": 0.19672131147540983, "grad_norm": 0.3031080365180969, "learning_rate": 0.000198260778709936, "loss": 0.4561, "step": 111 }, { "epoch": 0.19849357554275587, "grad_norm": 0.3131203055381775, "learning_rate": 0.00019822592409159462, "loss": 0.5189, "step": 112 }, { "epoch": 0.20026583961010191, "grad_norm": 0.32728081941604614, "learning_rate": 0.00019819072680508776, "loss": 0.5995, "step": 113 }, { "epoch": 0.20203810367744793, "grad_norm": 0.341036856174469, "learning_rate": 0.00019815518697320373, "loss": 0.5, "step": 114 }, { "epoch": 0.20381036774479397, "grad_norm": 0.3471624255180359, "learning_rate": 0.00019811930471992572, "loss": 0.621, "step": 115 }, { "epoch": 0.20558263181214, "grad_norm": 0.32070469856262207, "learning_rate": 0.0001980830801704316, "loss": 0.4149, "step": 116 }, { "epoch": 0.20735489587948605, "grad_norm": 0.27312496304512024, "learning_rate": 0.00019804651345109328, "loss": 0.4613, "step": 117 }, { "epoch": 0.20912715994683206, "grad_norm": 0.3245198130607605, "learning_rate": 0.00019800960468947638, "loss": 0.4486, "step": 118 }, { "epoch": 0.2108994240141781, "grad_norm": 0.3386727273464203, "learning_rate": 0.00019797235401433977, "loss": 0.5543, "step": 119 }, { "epoch": 0.21267168808152415, "grad_norm": 0.3053520917892456, "learning_rate": 0.00019793476155563507, "loss": 0.5017, "step": 120 }, { "epoch": 0.21267168808152415, "eval_loss": 0.5166746973991394, "eval_runtime": 12.5246, "eval_samples_per_second": 3.673, "eval_steps_per_second": 0.958, "step": 120 }, { "epoch": 0.2144439521488702, "grad_norm": 0.2705133259296417, "learning_rate": 0.00019789682744450628, "loss": 0.4048, "step": 121 }, { "epoch": 0.21621621621621623, "grad_norm": 0.3043995201587677, "learning_rate": 0.00019785855181328924, "loss": 0.4654, "step": 122 }, { "epoch": 0.21798848028356224, "grad_norm": 0.2664417326450348, "learning_rate": 0.00019781993479551125, "loss": 0.4206, "step": 123 }, { "epoch": 0.21976074435090828, "grad_norm": 0.27439233660697937, "learning_rate": 0.00019778097652589053, "loss": 0.4231, "step": 124 }, { "epoch": 0.22153300841825432, "grad_norm": 0.3255537450313568, "learning_rate": 0.0001977416771403358, "loss": 0.5605, "step": 125 }, { "epoch": 0.22330527248560036, "grad_norm": 0.2588534951210022, "learning_rate": 0.00019770203677594577, "loss": 0.3125, "step": 126 }, { "epoch": 0.22507753655294638, "grad_norm": 0.3110398054122925, "learning_rate": 0.00019766205557100868, "loss": 0.4537, "step": 127 }, { "epoch": 0.22684980062029242, "grad_norm": 0.2798526883125305, "learning_rate": 0.00019762173366500184, "loss": 0.4938, "step": 128 }, { "epoch": 0.22862206468763846, "grad_norm": 0.23593056201934814, "learning_rate": 0.0001975810711985911, "loss": 0.375, "step": 129 }, { "epoch": 0.2303943287549845, "grad_norm": 0.3059161603450775, "learning_rate": 0.00019754006831363042, "loss": 0.543, "step": 130 }, { "epoch": 0.23216659282233051, "grad_norm": 0.31934845447540283, "learning_rate": 0.0001974987251531612, "loss": 0.3901, "step": 131 }, { "epoch": 0.23393885688967656, "grad_norm": 0.2853747010231018, "learning_rate": 0.00019745704186141213, "loss": 0.4604, "step": 132 }, { "epoch": 0.2357111209570226, "grad_norm": 0.32849520444869995, "learning_rate": 0.00019741501858379828, "loss": 0.4805, "step": 133 }, { "epoch": 0.23748338502436864, "grad_norm": 0.2687203586101532, "learning_rate": 0.0001973726554669209, "loss": 0.3265, "step": 134 }, { "epoch": 0.23925564909171468, "grad_norm": 0.24127566814422607, "learning_rate": 0.00019732995265856672, "loss": 0.2944, "step": 135 }, { "epoch": 0.2410279131590607, "grad_norm": 0.2910301089286804, "learning_rate": 0.00019728691030770757, "loss": 0.4817, "step": 136 }, { "epoch": 0.24280017722640673, "grad_norm": 0.37831345200538635, "learning_rate": 0.00019724352856449975, "loss": 0.433, "step": 137 }, { "epoch": 0.24457244129375277, "grad_norm": 0.0021937908604741096, "learning_rate": 0.00019719980758028358, "loss": 0.0002, "step": 138 }, { "epoch": 0.24634470536109881, "grad_norm": 0.01243460550904274, "learning_rate": 0.00019715574750758287, "loss": 0.0003, "step": 139 }, { "epoch": 0.24811696942844483, "grad_norm": 0.0029907238204032183, "learning_rate": 0.00019711134850010432, "loss": 0.0002, "step": 140 }, { "epoch": 0.24811696942844483, "eval_loss": 0.46935102343559265, "eval_runtime": 12.5116, "eval_samples_per_second": 3.677, "eval_steps_per_second": 0.959, "step": 140 }, { "epoch": 0.24988923349579087, "grad_norm": 0.006767922081053257, "learning_rate": 0.00019706661071273703, "loss": 0.0003, "step": 141 }, { "epoch": 0.2516614975631369, "grad_norm": 0.06532242149114609, "learning_rate": 0.000197021534301552, "loss": 0.0031, "step": 142 }, { "epoch": 0.2534337616304829, "grad_norm": 1.6721391677856445, "learning_rate": 0.0001969761194238015, "loss": 0.1677, "step": 143 }, { "epoch": 0.25520602569782896, "grad_norm": 0.00030420508119277656, "learning_rate": 0.00019693036623791857, "loss": 0.0, "step": 144 }, { "epoch": 0.256978289765175, "grad_norm": 0.0002608789363875985, "learning_rate": 0.00019688427490351652, "loss": 0.0, "step": 145 }, { "epoch": 0.25875055383252105, "grad_norm": 0.01210006233304739, "learning_rate": 0.0001968378455813882, "loss": 0.0007, "step": 146 }, { "epoch": 0.2605228178998671, "grad_norm": 0.00028047978412359953, "learning_rate": 0.00019679107843350563, "loss": 0.0, "step": 147 }, { "epoch": 0.26229508196721313, "grad_norm": 0.011316346935927868, "learning_rate": 0.00019674397362301943, "loss": 0.0007, "step": 148 }, { "epoch": 0.26406734603455917, "grad_norm": 0.015622918494045734, "learning_rate": 0.00019669653131425803, "loss": 0.0009, "step": 149 }, { "epoch": 0.2658396101019052, "grad_norm": 0.0004477797483559698, "learning_rate": 0.00019664875167272735, "loss": 0.0001, "step": 150 }, { "epoch": 0.2676118741692512, "grad_norm": 0.40891826152801514, "learning_rate": 0.00019660063486511006, "loss": 0.5748, "step": 151 }, { "epoch": 0.26938413823659724, "grad_norm": 0.4348830580711365, "learning_rate": 0.0001965521810592651, "loss": 0.6122, "step": 152 }, { "epoch": 0.2711564023039433, "grad_norm": 0.4001963138580322, "learning_rate": 0.00019650339042422707, "loss": 0.5396, "step": 153 }, { "epoch": 0.2729286663712893, "grad_norm": 0.3244207501411438, "learning_rate": 0.00019645426313020557, "loss": 0.4277, "step": 154 }, { "epoch": 0.27470093043863536, "grad_norm": 0.2711174190044403, "learning_rate": 0.00019640479934858465, "loss": 0.4096, "step": 155 }, { "epoch": 0.2764731945059814, "grad_norm": 0.26809030771255493, "learning_rate": 0.0001963549992519223, "loss": 0.3406, "step": 156 }, { "epoch": 0.27824545857332744, "grad_norm": 0.39266160130500793, "learning_rate": 0.0001963048630139497, "loss": 0.5504, "step": 157 }, { "epoch": 0.2800177226406735, "grad_norm": 0.30174747109413147, "learning_rate": 0.0001962543908095707, "loss": 0.5131, "step": 158 }, { "epoch": 0.28178998670801947, "grad_norm": 0.3103567659854889, "learning_rate": 0.0001962035828148612, "loss": 0.398, "step": 159 }, { "epoch": 0.2835622507753655, "grad_norm": 0.36974287033081055, "learning_rate": 0.00019615243920706853, "loss": 0.5365, "step": 160 }, { "epoch": 0.2835622507753655, "eval_loss": 0.4812617897987366, "eval_runtime": 13.1676, "eval_samples_per_second": 3.493, "eval_steps_per_second": 0.911, "step": 160 }, { "epoch": 0.28533451484271155, "grad_norm": 0.3254373073577881, "learning_rate": 0.00019610096016461085, "loss": 0.452, "step": 161 }, { "epoch": 0.2871067789100576, "grad_norm": 0.28305336833000183, "learning_rate": 0.0001960491458670764, "loss": 0.3274, "step": 162 }, { "epoch": 0.28887904297740363, "grad_norm": 0.25707581639289856, "learning_rate": 0.00019599699649522316, "loss": 0.3662, "step": 163 }, { "epoch": 0.2906513070447497, "grad_norm": 0.3143969476222992, "learning_rate": 0.00019594451223097788, "loss": 0.4637, "step": 164 }, { "epoch": 0.2924235711120957, "grad_norm": 0.31218788027763367, "learning_rate": 0.00019589169325743574, "loss": 0.4821, "step": 165 }, { "epoch": 0.29419583517944176, "grad_norm": 0.3597668409347534, "learning_rate": 0.0001958385397588594, "loss": 0.4708, "step": 166 }, { "epoch": 0.2959680992467878, "grad_norm": 0.2988709509372711, "learning_rate": 0.00019578505192067874, "loss": 0.4491, "step": 167 }, { "epoch": 0.2977403633141338, "grad_norm": 0.3274686932563782, "learning_rate": 0.0001957312299294899, "loss": 0.4829, "step": 168 }, { "epoch": 0.2995126273814798, "grad_norm": 0.27369508147239685, "learning_rate": 0.00019567707397305474, "loss": 0.4035, "step": 169 }, { "epoch": 0.30128489144882586, "grad_norm": 0.2339191734790802, "learning_rate": 0.00019562258424030016, "loss": 0.2611, "step": 170 }, { "epoch": 0.3030571555161719, "grad_norm": 0.28207650780677795, "learning_rate": 0.00019556776092131753, "loss": 0.3826, "step": 171 }, { "epoch": 0.30482941958351795, "grad_norm": 0.2762835621833801, "learning_rate": 0.00019551260420736189, "loss": 0.4305, "step": 172 }, { "epoch": 0.306601683650864, "grad_norm": 0.3020841181278229, "learning_rate": 0.00019545711429085137, "loss": 0.41, "step": 173 }, { "epoch": 0.30837394771821003, "grad_norm": 0.2975738048553467, "learning_rate": 0.0001954012913653666, "loss": 0.5253, "step": 174 }, { "epoch": 0.31014621178555607, "grad_norm": 0.252259224653244, "learning_rate": 0.00019534513562564972, "loss": 0.3272, "step": 175 }, { "epoch": 0.31191847585290206, "grad_norm": 0.2440161108970642, "learning_rate": 0.00019528864726760416, "loss": 0.3537, "step": 176 }, { "epoch": 0.3136907399202481, "grad_norm": 0.28864967823028564, "learning_rate": 0.00019523182648829358, "loss": 0.3547, "step": 177 }, { "epoch": 0.31546300398759414, "grad_norm": 0.2466517984867096, "learning_rate": 0.00019517467348594125, "loss": 0.3194, "step": 178 }, { "epoch": 0.3172352680549402, "grad_norm": 0.3009297847747803, "learning_rate": 0.00019511718845992962, "loss": 0.3482, "step": 179 }, { "epoch": 0.3190075321222862, "grad_norm": 0.32632288336753845, "learning_rate": 0.00019505937161079927, "loss": 0.5656, "step": 180 }, { "epoch": 0.3190075321222862, "eval_loss": 0.42666807770729065, "eval_runtime": 13.1321, "eval_samples_per_second": 3.503, "eval_steps_per_second": 0.914, "step": 180 }, { "epoch": 0.32077979618963226, "grad_norm": 0.29848650097846985, "learning_rate": 0.0001950012231402484, "loss": 0.4143, "step": 181 }, { "epoch": 0.3225520602569783, "grad_norm": 0.26230475306510925, "learning_rate": 0.0001949427432511321, "loss": 0.3482, "step": 182 }, { "epoch": 0.32432432432432434, "grad_norm": 0.2788306474685669, "learning_rate": 0.00019488393214746173, "loss": 0.3834, "step": 183 }, { "epoch": 0.3260965883916704, "grad_norm": 0.23439691960811615, "learning_rate": 0.00019482479003440394, "loss": 0.2863, "step": 184 }, { "epoch": 0.32786885245901637, "grad_norm": 0.27671945095062256, "learning_rate": 0.00019476531711828027, "loss": 0.4385, "step": 185 }, { "epoch": 0.3296411165263624, "grad_norm": 0.2779659926891327, "learning_rate": 0.00019470551360656623, "loss": 0.3883, "step": 186 }, { "epoch": 0.33141338059370845, "grad_norm": 0.3389156460762024, "learning_rate": 0.00019464537970789066, "loss": 0.4089, "step": 187 }, { "epoch": 0.3331856446610545, "grad_norm": 0.2997131049633026, "learning_rate": 0.00019458491563203493, "loss": 0.4321, "step": 188 }, { "epoch": 0.33495790872840053, "grad_norm": 0.17764919996261597, "learning_rate": 0.00019452412158993236, "loss": 0.0348, "step": 189 }, { "epoch": 0.3367301727957466, "grad_norm": 0.00036286766408011317, "learning_rate": 0.0001944629977936673, "loss": 0.0, "step": 190 }, { "epoch": 0.3385024368630926, "grad_norm": 0.00023168908955994993, "learning_rate": 0.00019440154445647445, "loss": 0.0, "step": 191 }, { "epoch": 0.34027470093043866, "grad_norm": 0.004144479986280203, "learning_rate": 0.0001943397617927382, "loss": 0.0003, "step": 192 }, { "epoch": 0.3420469649977847, "grad_norm": 0.003571633715182543, "learning_rate": 0.0001942776500179918, "loss": 0.0003, "step": 193 }, { "epoch": 0.3438192290651307, "grad_norm": 0.5097905993461609, "learning_rate": 0.0001942152093489166, "loss": 0.0525, "step": 194 }, { "epoch": 0.3455914931324767, "grad_norm": 0.00029317947337403893, "learning_rate": 0.0001941524400033414, "loss": 0.0, "step": 195 }, { "epoch": 0.34736375719982276, "grad_norm": 0.00027563507319428027, "learning_rate": 0.00019408934220024144, "loss": 0.0, "step": 196 }, { "epoch": 0.3491360212671688, "grad_norm": 0.025055795907974243, "learning_rate": 0.000194025916159738, "loss": 0.0015, "step": 197 }, { "epoch": 0.35090828533451485, "grad_norm": 0.02219068631529808, "learning_rate": 0.0001939621621030974, "loss": 0.0013, "step": 198 }, { "epoch": 0.3526805494018609, "grad_norm": 0.0035792894195765257, "learning_rate": 0.00019389808025273015, "loss": 0.0003, "step": 199 }, { "epoch": 0.35445281346920693, "grad_norm": 0.01858345977962017, "learning_rate": 0.0001938336708321904, "loss": 0.0013, "step": 200 }, { "epoch": 0.35445281346920693, "eval_loss": 0.4978634715080261, "eval_runtime": 13.1814, "eval_samples_per_second": 3.49, "eval_steps_per_second": 0.91, "step": 200 }, { "epoch": 0.35622507753655297, "grad_norm": 0.43667736649513245, "learning_rate": 0.00019376893406617503, "loss": 0.6103, "step": 201 }, { "epoch": 0.35799734160389896, "grad_norm": 0.31308457255363464, "learning_rate": 0.0001937038701805229, "loss": 0.4278, "step": 202 }, { "epoch": 0.359769605671245, "grad_norm": 0.3520476818084717, "learning_rate": 0.00019363847940221396, "loss": 0.4417, "step": 203 }, { "epoch": 0.36154186973859104, "grad_norm": 0.3142838776111603, "learning_rate": 0.00019357276195936868, "loss": 0.5054, "step": 204 }, { "epoch": 0.3633141338059371, "grad_norm": 0.35768458247184753, "learning_rate": 0.00019350671808124705, "loss": 0.565, "step": 205 }, { "epoch": 0.3650863978732831, "grad_norm": 0.2697443962097168, "learning_rate": 0.00019344034799824789, "loss": 0.4118, "step": 206 }, { "epoch": 0.36685866194062916, "grad_norm": 0.272493451833725, "learning_rate": 0.0001933736519419079, "loss": 0.472, "step": 207 }, { "epoch": 0.3686309260079752, "grad_norm": 0.3051384389400482, "learning_rate": 0.00019330663014490115, "loss": 0.4262, "step": 208 }, { "epoch": 0.37040319007532124, "grad_norm": 0.37801337242126465, "learning_rate": 0.00019323928284103791, "loss": 0.4006, "step": 209 }, { "epoch": 0.3721754541426673, "grad_norm": 0.3074350953102112, "learning_rate": 0.0001931716102652641, "loss": 0.4329, "step": 210 }, { "epoch": 0.37394771821001327, "grad_norm": 0.28358665108680725, "learning_rate": 0.00019310361265366033, "loss": 0.3733, "step": 211 }, { "epoch": 0.3757199822773593, "grad_norm": 0.2780790627002716, "learning_rate": 0.0001930352902434411, "loss": 0.362, "step": 212 }, { "epoch": 0.37749224634470535, "grad_norm": 0.2990495562553406, "learning_rate": 0.00019296664327295412, "loss": 0.4836, "step": 213 }, { "epoch": 0.3792645104120514, "grad_norm": 0.25120624899864197, "learning_rate": 0.00019289767198167916, "loss": 0.3802, "step": 214 }, { "epoch": 0.38103677447939743, "grad_norm": 0.2883426249027252, "learning_rate": 0.00019282837661022755, "loss": 0.4385, "step": 215 }, { "epoch": 0.3828090385467435, "grad_norm": 0.28411924839019775, "learning_rate": 0.00019275875740034116, "loss": 0.4392, "step": 216 }, { "epoch": 0.3845813026140895, "grad_norm": 0.24637405574321747, "learning_rate": 0.00019268881459489156, "loss": 0.3272, "step": 217 }, { "epoch": 0.38635356668143556, "grad_norm": 0.3365507125854492, "learning_rate": 0.00019261854843787926, "loss": 0.5247, "step": 218 }, { "epoch": 0.38812583074878154, "grad_norm": 0.23533238470554352, "learning_rate": 0.00019254795917443275, "loss": 0.2877, "step": 219 }, { "epoch": 0.3898980948161276, "grad_norm": 0.38877180218696594, "learning_rate": 0.00019247704705080773, "loss": 0.5153, "step": 220 }, { "epoch": 0.3898980948161276, "eval_loss": 0.42870044708251953, "eval_runtime": 13.1885, "eval_samples_per_second": 3.488, "eval_steps_per_second": 0.91, "step": 220 }, { "epoch": 0.3916703588834736, "grad_norm": 0.28654739260673523, "learning_rate": 0.00019240581231438626, "loss": 0.3613, "step": 221 }, { "epoch": 0.39344262295081966, "grad_norm": 0.30965128540992737, "learning_rate": 0.00019233425521367577, "loss": 0.457, "step": 222 }, { "epoch": 0.3952148870181657, "grad_norm": 0.2371031641960144, "learning_rate": 0.00019226237599830834, "loss": 0.2824, "step": 223 }, { "epoch": 0.39698715108551175, "grad_norm": 0.3060905337333679, "learning_rate": 0.00019219017491903974, "loss": 0.4131, "step": 224 }, { "epoch": 0.3987594151528578, "grad_norm": 0.2877762019634247, "learning_rate": 0.00019211765222774865, "loss": 0.3575, "step": 225 }, { "epoch": 0.40053167922020383, "grad_norm": 0.2885982394218445, "learning_rate": 0.0001920448081774356, "loss": 0.4164, "step": 226 }, { "epoch": 0.40230394328754987, "grad_norm": 0.22618408501148224, "learning_rate": 0.00019197164302222234, "loss": 0.2818, "step": 227 }, { "epoch": 0.40407620735489586, "grad_norm": 0.31908485293388367, "learning_rate": 0.00019189815701735073, "loss": 0.3765, "step": 228 }, { "epoch": 0.4058484714222419, "grad_norm": 0.24042654037475586, "learning_rate": 0.0001918243504191819, "loss": 0.3042, "step": 229 }, { "epoch": 0.40762073548958794, "grad_norm": 0.3048646152019501, "learning_rate": 0.00019175022348519554, "loss": 0.3434, "step": 230 }, { "epoch": 0.409392999556934, "grad_norm": 0.27876347303390503, "learning_rate": 0.00019167577647398871, "loss": 0.3549, "step": 231 }, { "epoch": 0.41116526362428, "grad_norm": 0.24614454805850983, "learning_rate": 0.00019160100964527512, "loss": 0.3694, "step": 232 }, { "epoch": 0.41293752769162606, "grad_norm": 0.26314517855644226, "learning_rate": 0.00019152592325988424, "loss": 0.3506, "step": 233 }, { "epoch": 0.4147097917589721, "grad_norm": 0.2168036252260208, "learning_rate": 0.0001914505175797603, "loss": 0.3882, "step": 234 }, { "epoch": 0.41648205582631814, "grad_norm": 0.35566362738609314, "learning_rate": 0.0001913747928679614, "loss": 0.4505, "step": 235 }, { "epoch": 0.41825431989366413, "grad_norm": 0.24027441442012787, "learning_rate": 0.00019129874938865861, "loss": 0.3372, "step": 236 }, { "epoch": 0.42002658396101017, "grad_norm": 0.3028562366962433, "learning_rate": 0.00019122238740713505, "loss": 0.2804, "step": 237 }, { "epoch": 0.4217988480283562, "grad_norm": 0.2627307176589966, "learning_rate": 0.00019114570718978496, "loss": 0.2863, "step": 238 }, { "epoch": 0.42357111209570225, "grad_norm": 0.5339404940605164, "learning_rate": 0.00019106870900411277, "loss": 0.3219, "step": 239 }, { "epoch": 0.4253433761630483, "grad_norm": 0.0003258722135797143, "learning_rate": 0.00019099139311873215, "loss": 0.0, "step": 240 }, { "epoch": 0.4253433761630483, "eval_loss": 0.41977012157440186, "eval_runtime": 13.1902, "eval_samples_per_second": 3.487, "eval_steps_per_second": 0.91, "step": 240 }, { "epoch": 0.42711564023039433, "grad_norm": 0.00027717245393432677, "learning_rate": 0.00019091375980336513, "loss": 0.0, "step": 241 }, { "epoch": 0.4288879042977404, "grad_norm": 0.004714862443506718, "learning_rate": 0.00019083580932884104, "loss": 0.0003, "step": 242 }, { "epoch": 0.4306601683650864, "grad_norm": 0.001995272934436798, "learning_rate": 0.00019075754196709572, "loss": 0.0002, "step": 243 }, { "epoch": 0.43243243243243246, "grad_norm": 0.0018995344871655107, "learning_rate": 0.00019067895799117045, "loss": 0.0002, "step": 244 }, { "epoch": 0.43420469649977844, "grad_norm": 0.0036318032070994377, "learning_rate": 0.00019060005767521104, "loss": 0.0003, "step": 245 }, { "epoch": 0.4359769605671245, "grad_norm": 0.0004436742456164211, "learning_rate": 0.00019052084129446694, "loss": 0.0001, "step": 246 }, { "epoch": 0.4377492246344705, "grad_norm": 0.001414547092281282, "learning_rate": 0.00019044130912529013, "loss": 0.0002, "step": 247 }, { "epoch": 0.43952148870181657, "grad_norm": 0.0006285074632614851, "learning_rate": 0.00019036146144513425, "loss": 0.0001, "step": 248 }, { "epoch": 0.4412937527691626, "grad_norm": 0.002389760920777917, "learning_rate": 0.00019028129853255367, "loss": 0.0002, "step": 249 }, { "epoch": 0.44306601683650865, "grad_norm": 0.0036429688334465027, "learning_rate": 0.00019020082066720245, "loss": 0.0001, "step": 250 }, { "epoch": 0.4448382809038547, "grad_norm": 0.4008120596408844, "learning_rate": 0.00019012002812983337, "loss": 0.6168, "step": 251 }, { "epoch": 0.44661054497120073, "grad_norm": 0.30179068446159363, "learning_rate": 0.000190038921202297, "loss": 0.4365, "step": 252 }, { "epoch": 0.44838280903854677, "grad_norm": 0.2941570580005646, "learning_rate": 0.00018995750016754067, "loss": 0.3508, "step": 253 }, { "epoch": 0.45015507310589276, "grad_norm": 0.3080598711967468, "learning_rate": 0.00018987576530960743, "loss": 0.4263, "step": 254 }, { "epoch": 0.4519273371732388, "grad_norm": 0.28178784251213074, "learning_rate": 0.00018979371691363523, "loss": 0.3681, "step": 255 }, { "epoch": 0.45369960124058484, "grad_norm": 0.33791857957839966, "learning_rate": 0.00018971135526585573, "loss": 0.4315, "step": 256 }, { "epoch": 0.4554718653079309, "grad_norm": 0.23317955434322357, "learning_rate": 0.0001896286806535935, "loss": 0.316, "step": 257 }, { "epoch": 0.4572441293752769, "grad_norm": 0.317516565322876, "learning_rate": 0.00018954569336526478, "loss": 0.4158, "step": 258 }, { "epoch": 0.45901639344262296, "grad_norm": 0.3018972873687744, "learning_rate": 0.00018946239369037668, "loss": 0.4386, "step": 259 }, { "epoch": 0.460788657509969, "grad_norm": 0.3114071190357208, "learning_rate": 0.00018937878191952606, "loss": 0.4839, "step": 260 }, { "epoch": 0.460788657509969, "eval_loss": 0.3883247673511505, "eval_runtime": 13.1715, "eval_samples_per_second": 3.492, "eval_steps_per_second": 0.911, "step": 260 }, { "epoch": 0.46256092157731504, "grad_norm": 0.3128843605518341, "learning_rate": 0.00018929485834439863, "loss": 0.4418, "step": 261 }, { "epoch": 0.46433318564466103, "grad_norm": 0.25437384843826294, "learning_rate": 0.00018921062325776773, "loss": 0.3405, "step": 262 }, { "epoch": 0.46610544971200707, "grad_norm": 0.35694077610969543, "learning_rate": 0.00018912607695349348, "loss": 0.6361, "step": 263 }, { "epoch": 0.4678777137793531, "grad_norm": 0.32367607951164246, "learning_rate": 0.00018904121972652176, "loss": 0.4257, "step": 264 }, { "epoch": 0.46964997784669915, "grad_norm": 0.2708199620246887, "learning_rate": 0.00018895605187288305, "loss": 0.3654, "step": 265 }, { "epoch": 0.4714222419140452, "grad_norm": 0.23354637622833252, "learning_rate": 0.0001888705736896915, "loss": 0.3189, "step": 266 }, { "epoch": 0.47319450598139123, "grad_norm": 0.3102363049983978, "learning_rate": 0.00018878478547514389, "loss": 0.536, "step": 267 }, { "epoch": 0.4749667700487373, "grad_norm": 0.26287028193473816, "learning_rate": 0.00018869868752851852, "loss": 0.4657, "step": 268 }, { "epoch": 0.4767390341160833, "grad_norm": 0.26888734102249146, "learning_rate": 0.00018861228015017425, "loss": 0.4643, "step": 269 }, { "epoch": 0.47851129818342936, "grad_norm": 0.2884310185909271, "learning_rate": 0.0001885255636415494, "loss": 0.4784, "step": 270 }, { "epoch": 0.48028356225077534, "grad_norm": 0.27578142285346985, "learning_rate": 0.0001884385383051607, "loss": 0.4067, "step": 271 }, { "epoch": 0.4820558263181214, "grad_norm": 0.24176117777824402, "learning_rate": 0.0001883512044446023, "loss": 0.3361, "step": 272 }, { "epoch": 0.4838280903854674, "grad_norm": 0.20125697553157806, "learning_rate": 0.00018826356236454462, "loss": 0.247, "step": 273 }, { "epoch": 0.48560035445281347, "grad_norm": 0.3126257061958313, "learning_rate": 0.00018817561237073326, "loss": 0.4648, "step": 274 }, { "epoch": 0.4873726185201595, "grad_norm": 0.23726914823055267, "learning_rate": 0.00018808735476998818, "loss": 0.3678, "step": 275 }, { "epoch": 0.48914488258750555, "grad_norm": 0.27817875146865845, "learning_rate": 0.00018799878987020224, "loss": 0.4532, "step": 276 }, { "epoch": 0.4909171466548516, "grad_norm": 0.2772604525089264, "learning_rate": 0.00018790991798034047, "loss": 0.5133, "step": 277 }, { "epoch": 0.49268941072219763, "grad_norm": 0.22860132157802582, "learning_rate": 0.0001878207394104388, "loss": 0.3466, "step": 278 }, { "epoch": 0.4944616747895436, "grad_norm": 0.3124736249446869, "learning_rate": 0.0001877312544716031, "loss": 0.3893, "step": 279 }, { "epoch": 0.49623393885688966, "grad_norm": 0.23166291415691376, "learning_rate": 0.00018764146347600793, "loss": 0.3673, "step": 280 }, { "epoch": 0.49623393885688966, "eval_loss": 0.4022579491138458, "eval_runtime": 13.1631, "eval_samples_per_second": 3.495, "eval_steps_per_second": 0.912, "step": 280 }, { "epoch": 0.4980062029242357, "grad_norm": 0.33012211322784424, "learning_rate": 0.00018755136673689562, "loss": 0.494, "step": 281 }, { "epoch": 0.49977846699158174, "grad_norm": 0.2126472443342209, "learning_rate": 0.0001874609645685751, "loss": 0.3049, "step": 282 }, { "epoch": 0.5015507310589278, "grad_norm": 0.2131752222776413, "learning_rate": 0.00018737025728642078, "loss": 0.301, "step": 283 }, { "epoch": 0.5033229951262738, "grad_norm": 0.2322147935628891, "learning_rate": 0.00018727924520687154, "loss": 0.347, "step": 284 }, { "epoch": 0.5050952591936199, "grad_norm": 0.23549233376979828, "learning_rate": 0.00018718792864742953, "loss": 0.3884, "step": 285 }, { "epoch": 0.5068675232609658, "grad_norm": 0.2643439471721649, "learning_rate": 0.00018709630792665906, "loss": 0.2832, "step": 286 }, { "epoch": 0.5086397873283119, "grad_norm": 0.22141581773757935, "learning_rate": 0.0001870043833641856, "loss": 0.2574, "step": 287 }, { "epoch": 0.5104120513956579, "grad_norm": 0.34856167435646057, "learning_rate": 0.0001869121552806946, "loss": 0.3827, "step": 288 }, { "epoch": 0.512184315463004, "grad_norm": 0.002351484727114439, "learning_rate": 0.0001868196239979303, "loss": 0.0003, "step": 289 }, { "epoch": 0.51395657953035, "grad_norm": 0.0005069540929980576, "learning_rate": 0.00018672678983869463, "loss": 0.0001, "step": 290 }, { "epoch": 0.5157288435976961, "grad_norm": 0.00036229719989933074, "learning_rate": 0.00018663365312684632, "loss": 0.0, "step": 291 }, { "epoch": 0.5175011076650421, "grad_norm": 0.0026155211962759495, "learning_rate": 0.00018654021418729937, "loss": 0.0002, "step": 292 }, { "epoch": 0.5192733717323881, "grad_norm": 0.0028819674625992775, "learning_rate": 0.00018644647334602223, "loss": 0.0002, "step": 293 }, { "epoch": 0.5210456357997342, "grad_norm": 0.0042349896393716335, "learning_rate": 0.00018635243093003654, "loss": 0.0003, "step": 294 }, { "epoch": 0.5228178998670802, "grad_norm": 0.0011076276423409581, "learning_rate": 0.000186258087267416, "loss": 0.0001, "step": 295 }, { "epoch": 0.5245901639344263, "grad_norm": 0.00015828786126803607, "learning_rate": 0.00018616344268728523, "loss": 0.0, "step": 296 }, { "epoch": 0.5263624280017722, "grad_norm": 0.00010309902427252382, "learning_rate": 0.00018606849751981862, "loss": 0.0, "step": 297 }, { "epoch": 0.5281346920691183, "grad_norm": 0.0009718142100609839, "learning_rate": 0.00018597325209623918, "loss": 0.0001, "step": 298 }, { "epoch": 0.5299069561364643, "grad_norm": 0.000575641926843673, "learning_rate": 0.00018587770674881738, "loss": 0.0001, "step": 299 }, { "epoch": 0.5316792202038104, "grad_norm": 0.0004926875117234886, "learning_rate": 0.00018578186181086997, "loss": 0.0001, "step": 300 }, { "epoch": 0.5316792202038104, "eval_loss": 0.4220569133758545, "eval_runtime": 13.1669, "eval_samples_per_second": 3.494, "eval_steps_per_second": 0.911, "step": 300 }, { "epoch": 0.5334514842711564, "grad_norm": 0.4100307524204254, "learning_rate": 0.00018568571761675893, "loss": 0.6124, "step": 301 }, { "epoch": 0.5352237483385024, "grad_norm": 0.2929050326347351, "learning_rate": 0.00018558927450189013, "loss": 0.4304, "step": 302 }, { "epoch": 0.5369960124058485, "grad_norm": 0.28687912225723267, "learning_rate": 0.00018549253280271231, "loss": 0.4497, "step": 303 }, { "epoch": 0.5387682764731945, "grad_norm": 0.25559526681900024, "learning_rate": 0.0001853954928567158, "loss": 0.3621, "step": 304 }, { "epoch": 0.5405405405405406, "grad_norm": 0.2866135835647583, "learning_rate": 0.00018529815500243135, "loss": 0.4047, "step": 305 }, { "epoch": 0.5423128046078866, "grad_norm": 0.2470010221004486, "learning_rate": 0.00018520051957942907, "loss": 0.3199, "step": 306 }, { "epoch": 0.5440850686752327, "grad_norm": 0.216833233833313, "learning_rate": 0.00018510258692831712, "loss": 0.305, "step": 307 }, { "epoch": 0.5458573327425786, "grad_norm": 0.26033347845077515, "learning_rate": 0.00018500435739074057, "loss": 0.3308, "step": 308 }, { "epoch": 0.5476295968099246, "grad_norm": 0.2934284806251526, "learning_rate": 0.00018490583130938016, "loss": 0.3925, "step": 309 }, { "epoch": 0.5494018608772707, "grad_norm": 0.2791336178779602, "learning_rate": 0.0001848070090279512, "loss": 0.3914, "step": 310 }, { "epoch": 0.5511741249446167, "grad_norm": 0.19181470572948456, "learning_rate": 0.00018470789089120227, "loss": 0.2361, "step": 311 }, { "epoch": 0.5529463890119628, "grad_norm": 0.2595193386077881, "learning_rate": 0.00018460847724491414, "loss": 0.3149, "step": 312 }, { "epoch": 0.5547186530793088, "grad_norm": 0.34042033553123474, "learning_rate": 0.00018450876843589836, "loss": 0.4902, "step": 313 }, { "epoch": 0.5564909171466549, "grad_norm": 0.22444982826709747, "learning_rate": 0.00018440876481199626, "loss": 0.3216, "step": 314 }, { "epoch": 0.5582631812140009, "grad_norm": 0.26198506355285645, "learning_rate": 0.00018430846672207768, "loss": 0.3799, "step": 315 }, { "epoch": 0.560035445281347, "grad_norm": 0.2633836567401886, "learning_rate": 0.00018420787451603961, "loss": 0.4304, "step": 316 }, { "epoch": 0.561807709348693, "grad_norm": 0.28047966957092285, "learning_rate": 0.00018410698854480525, "loss": 0.3364, "step": 317 }, { "epoch": 0.5635799734160389, "grad_norm": 0.23388344049453735, "learning_rate": 0.00018400580916032246, "loss": 0.2959, "step": 318 }, { "epoch": 0.565352237483385, "grad_norm": 0.2674112021923065, "learning_rate": 0.0001839043367155628, "loss": 0.4, "step": 319 }, { "epoch": 0.567124501550731, "grad_norm": 0.28098639845848083, "learning_rate": 0.00018380257156452013, "loss": 0.3772, "step": 320 }, { "epoch": 0.567124501550731, "eval_loss": 0.3923526108264923, "eval_runtime": 13.1912, "eval_samples_per_second": 3.487, "eval_steps_per_second": 0.91, "step": 320 }, { "epoch": 0.5688967656180771, "grad_norm": 0.22875545918941498, "learning_rate": 0.0001837005140622095, "loss": 0.3477, "step": 321 }, { "epoch": 0.5706690296854231, "grad_norm": 0.24979935586452484, "learning_rate": 0.00018359816456466575, "loss": 0.3476, "step": 322 }, { "epoch": 0.5724412937527692, "grad_norm": 0.2524240016937256, "learning_rate": 0.0001834955234289425, "loss": 0.319, "step": 323 }, { "epoch": 0.5742135578201152, "grad_norm": 0.21320028603076935, "learning_rate": 0.00018339259101311063, "loss": 0.2503, "step": 324 }, { "epoch": 0.5759858218874613, "grad_norm": 0.28563594818115234, "learning_rate": 0.00018328936767625728, "loss": 0.3751, "step": 325 }, { "epoch": 0.5777580859548073, "grad_norm": 0.26557430624961853, "learning_rate": 0.00018318585377848442, "loss": 0.4099, "step": 326 }, { "epoch": 0.5795303500221533, "grad_norm": 0.31162795424461365, "learning_rate": 0.00018308204968090774, "loss": 0.3998, "step": 327 }, { "epoch": 0.5813026140894993, "grad_norm": 0.2938039302825928, "learning_rate": 0.0001829779557456552, "loss": 0.3495, "step": 328 }, { "epoch": 0.5830748781568453, "grad_norm": 0.293608695268631, "learning_rate": 0.00018287357233586597, "loss": 0.42, "step": 329 }, { "epoch": 0.5848471422241914, "grad_norm": 0.24747273325920105, "learning_rate": 0.00018276889981568906, "loss": 0.2803, "step": 330 }, { "epoch": 0.5866194062915374, "grad_norm": 0.23101884126663208, "learning_rate": 0.00018266393855028204, "loss": 0.2968, "step": 331 }, { "epoch": 0.5883916703588835, "grad_norm": 0.2644041180610657, "learning_rate": 0.00018255868890580977, "loss": 0.3806, "step": 332 }, { "epoch": 0.5901639344262295, "grad_norm": 0.3414515554904938, "learning_rate": 0.00018245315124944318, "loss": 0.3432, "step": 333 }, { "epoch": 0.5919361984935756, "grad_norm": 0.30226683616638184, "learning_rate": 0.00018234732594935794, "loss": 0.4051, "step": 334 }, { "epoch": 0.5937084625609216, "grad_norm": 0.19498959183692932, "learning_rate": 0.0001822412133747332, "loss": 0.2675, "step": 335 }, { "epoch": 0.5954807266282676, "grad_norm": 0.20967549085617065, "learning_rate": 0.00018213481389575025, "loss": 0.2579, "step": 336 }, { "epoch": 0.5972529906956137, "grad_norm": 0.2511061728000641, "learning_rate": 0.0001820281278835913, "loss": 0.2448, "step": 337 }, { "epoch": 0.5990252547629596, "grad_norm": 0.0006951466202735901, "learning_rate": 0.0001819211557104381, "loss": 0.0001, "step": 338 }, { "epoch": 0.6007975188303057, "grad_norm": 0.0004717277188319713, "learning_rate": 0.00018181389774947078, "loss": 0.0001, "step": 339 }, { "epoch": 0.6025697828976517, "grad_norm": 0.00043875849223695695, "learning_rate": 0.0001817063543748664, "loss": 0.0, "step": 340 }, { "epoch": 0.6025697828976517, "eval_loss": 0.4012752175331116, "eval_runtime": 13.2312, "eval_samples_per_second": 3.477, "eval_steps_per_second": 0.907, "step": 340 }, { "epoch": 0.6043420469649978, "grad_norm": 0.006495294161140919, "learning_rate": 0.0001815985259617977, "loss": 0.0006, "step": 341 }, { "epoch": 0.6061143110323438, "grad_norm": 0.008487917482852936, "learning_rate": 0.00018149041288643183, "loss": 0.0008, "step": 342 }, { "epoch": 0.6078865750996899, "grad_norm": 0.0007697686087340117, "learning_rate": 0.00018138201552592896, "loss": 0.0001, "step": 343 }, { "epoch": 0.6096588391670359, "grad_norm": 0.003934359177947044, "learning_rate": 0.00018127333425844107, "loss": 0.0004, "step": 344 }, { "epoch": 0.6114311032343819, "grad_norm": 0.0005663599004037678, "learning_rate": 0.00018116436946311057, "loss": 0.0001, "step": 345 }, { "epoch": 0.613203367301728, "grad_norm": 0.0012789000757038593, "learning_rate": 0.00018105512152006892, "loss": 0.0001, "step": 346 }, { "epoch": 0.614975631369074, "grad_norm": 0.0008825391996651888, "learning_rate": 0.00018094559081043533, "loss": 0.0001, "step": 347 }, { "epoch": 0.6167478954364201, "grad_norm": 0.0011678257724270225, "learning_rate": 0.00018083577771631562, "loss": 0.0001, "step": 348 }, { "epoch": 0.618520159503766, "grad_norm": 0.003733244724571705, "learning_rate": 0.00018072568262080053, "loss": 0.0003, "step": 349 }, { "epoch": 0.6202924235711121, "grad_norm": 0.001708000316284597, "learning_rate": 0.00018061530590796475, "loss": 0.0001, "step": 350 }, { "epoch": 0.6220646876384581, "grad_norm": 0.27428433299064636, "learning_rate": 0.00018050464796286528, "loss": 0.3947, "step": 351 }, { "epoch": 0.6238369517058041, "grad_norm": 0.2888141870498657, "learning_rate": 0.00018039370917154036, "loss": 0.3801, "step": 352 }, { "epoch": 0.6256092157731502, "grad_norm": 0.25782379508018494, "learning_rate": 0.00018028248992100782, "loss": 0.3265, "step": 353 }, { "epoch": 0.6273814798404962, "grad_norm": 0.2534767687320709, "learning_rate": 0.00018017099059926403, "loss": 0.3201, "step": 354 }, { "epoch": 0.6291537439078423, "grad_norm": 0.39024215936660767, "learning_rate": 0.00018005921159528233, "loss": 0.5622, "step": 355 }, { "epoch": 0.6309260079751883, "grad_norm": 0.23832963407039642, "learning_rate": 0.0001799471532990118, "loss": 0.3344, "step": 356 }, { "epoch": 0.6326982720425344, "grad_norm": 0.28127962350845337, "learning_rate": 0.00017983481610137577, "loss": 0.3471, "step": 357 }, { "epoch": 0.6344705361098804, "grad_norm": 0.300924152135849, "learning_rate": 0.00017972220039427067, "loss": 0.3829, "step": 358 }, { "epoch": 0.6362428001772265, "grad_norm": 0.3358384370803833, "learning_rate": 0.00017960930657056438, "loss": 0.4754, "step": 359 }, { "epoch": 0.6380150642445724, "grad_norm": 0.2575319707393646, "learning_rate": 0.0001794961350240951, "loss": 0.3859, "step": 360 }, { "epoch": 0.6380150642445724, "eval_loss": 0.3793332874774933, "eval_runtime": 13.242, "eval_samples_per_second": 3.474, "eval_steps_per_second": 0.906, "step": 360 }, { "epoch": 0.6397873283119184, "grad_norm": 0.19102731347084045, "learning_rate": 0.00017938268614966994, "loss": 0.2565, "step": 361 }, { "epoch": 0.6415595923792645, "grad_norm": 0.27359429001808167, "learning_rate": 0.00017926896034306332, "loss": 0.3371, "step": 362 }, { "epoch": 0.6433318564466105, "grad_norm": 0.1932365894317627, "learning_rate": 0.00017915495800101594, "loss": 0.2547, "step": 363 }, { "epoch": 0.6451041205139566, "grad_norm": 0.19281119108200073, "learning_rate": 0.00017904067952123303, "loss": 0.218, "step": 364 }, { "epoch": 0.6468763845813026, "grad_norm": 0.3079637289047241, "learning_rate": 0.00017892612530238334, "loss": 0.4615, "step": 365 }, { "epoch": 0.6486486486486487, "grad_norm": 0.23658387362957, "learning_rate": 0.0001788112957440974, "loss": 0.4055, "step": 366 }, { "epoch": 0.6504209127159947, "grad_norm": 0.27093634009361267, "learning_rate": 0.00017869619124696634, "loss": 0.409, "step": 367 }, { "epoch": 0.6521931767833408, "grad_norm": 0.23031426966190338, "learning_rate": 0.00017858081221254048, "loss": 0.3424, "step": 368 }, { "epoch": 0.6539654408506868, "grad_norm": 0.22337500751018524, "learning_rate": 0.00017846515904332782, "loss": 0.3182, "step": 369 }, { "epoch": 0.6557377049180327, "grad_norm": 0.2885172963142395, "learning_rate": 0.00017834923214279268, "loss": 0.3405, "step": 370 }, { "epoch": 0.6575099689853788, "grad_norm": 0.26560068130493164, "learning_rate": 0.00017823303191535442, "loss": 0.3328, "step": 371 }, { "epoch": 0.6592822330527248, "grad_norm": 0.18252065777778625, "learning_rate": 0.00017811655876638578, "loss": 0.2438, "step": 372 }, { "epoch": 0.6610544971200709, "grad_norm": 0.19787971675395966, "learning_rate": 0.00017799981310221173, "loss": 0.2515, "step": 373 }, { "epoch": 0.6628267611874169, "grad_norm": 0.2631565034389496, "learning_rate": 0.00017788279533010785, "loss": 0.3755, "step": 374 }, { "epoch": 0.664599025254763, "grad_norm": 0.24500946700572968, "learning_rate": 0.00017776550585829896, "loss": 0.3366, "step": 375 }, { "epoch": 0.666371289322109, "grad_norm": 0.3467278778553009, "learning_rate": 0.00017764794509595786, "loss": 0.5403, "step": 376 }, { "epoch": 0.6681435533894551, "grad_norm": 0.2552179992198944, "learning_rate": 0.00017753011345320366, "loss": 0.3533, "step": 377 }, { "epoch": 0.6699158174568011, "grad_norm": 0.3037780225276947, "learning_rate": 0.00017741201134110042, "loss": 0.5212, "step": 378 }, { "epoch": 0.671688081524147, "grad_norm": 0.29784807562828064, "learning_rate": 0.0001772936391716559, "loss": 0.4664, "step": 379 }, { "epoch": 0.6734603455914931, "grad_norm": 0.22430609166622162, "learning_rate": 0.00017717499735781983, "loss": 0.2937, "step": 380 }, { "epoch": 0.6734603455914931, "eval_loss": 0.3527662754058838, "eval_runtime": 13.1985, "eval_samples_per_second": 3.485, "eval_steps_per_second": 0.909, "step": 380 } ], "logging_steps": 1, "max_steps": 1692, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.674563755442176e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }