diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,7052 @@ +{ + "best_metric": 0.7326682209968567, + "best_model_checkpoint": "./w2v-bert-2.0-chichewa_34h/checkpoint-1000", + "epoch": 5.619718309859155, + "eval_steps": 1000, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005633802816901409, + "grad_norm": 23.37450408935547, + "learning_rate": 3.0000000000000004e-09, + "loss": 10.0931, + "step": 1 + }, + { + "epoch": 0.011267605633802818, + "grad_norm": 23.055225372314453, + "learning_rate": 6.000000000000001e-09, + "loss": 9.7041, + "step": 2 + }, + { + "epoch": 0.016901408450704224, + "grad_norm": 21.591968536376953, + "learning_rate": 9e-09, + "loss": 9.2937, + "step": 3 + }, + { + "epoch": 0.022535211267605635, + "grad_norm": 19.100778579711914, + "learning_rate": 1.2000000000000002e-08, + "loss": 8.3783, + "step": 4 + }, + { + "epoch": 0.028169014084507043, + "grad_norm": 20.580049514770508, + "learning_rate": 1.5000000000000002e-08, + "loss": 8.7382, + "step": 5 + }, + { + "epoch": 0.03380281690140845, + "grad_norm": 19.053171157836914, + "learning_rate": 1.8e-08, + "loss": 8.1976, + "step": 6 + }, + { + "epoch": 0.03943661971830986, + "grad_norm": 20.025707244873047, + "learning_rate": 2.1e-08, + "loss": 8.5869, + "step": 7 + }, + { + "epoch": 0.04507042253521127, + "grad_norm": 19.60275650024414, + "learning_rate": 2.4000000000000003e-08, + "loss": 8.2925, + "step": 8 + }, + { + "epoch": 0.05070422535211268, + "grad_norm": 19.109025955200195, + "learning_rate": 2.7e-08, + "loss": 8.3882, + "step": 9 + }, + { + "epoch": 0.056338028169014086, + "grad_norm": 18.777395248413086, + "learning_rate": 3.0000000000000004e-08, + "loss": 8.0625, + "step": 10 + }, + { + "epoch": 0.061971830985915494, + "grad_norm": 19.636445999145508, + "learning_rate": 3.3000000000000004e-08, + "loss": 8.4087, + "step": 11 + }, + { + "epoch": 0.0676056338028169, + "grad_norm": 20.13718032836914, + "learning_rate": 3.6e-08, + "loss": 8.6273, + "step": 12 + }, + { + "epoch": 0.07323943661971831, + "grad_norm": 19.168004989624023, + "learning_rate": 3.9e-08, + "loss": 8.1722, + "step": 13 + }, + { + "epoch": 0.07887323943661972, + "grad_norm": 19.736047744750977, + "learning_rate": 4.2e-08, + "loss": 8.4533, + "step": 14 + }, + { + "epoch": 0.08450704225352113, + "grad_norm": 18.998886108398438, + "learning_rate": 4.5e-08, + "loss": 8.145, + "step": 15 + }, + { + "epoch": 0.09014084507042254, + "grad_norm": 19.929027557373047, + "learning_rate": 4.8000000000000006e-08, + "loss": 8.3833, + "step": 16 + }, + { + "epoch": 0.09577464788732394, + "grad_norm": 18.18242645263672, + "learning_rate": 5.1e-08, + "loss": 7.8769, + "step": 17 + }, + { + "epoch": 0.10140845070422536, + "grad_norm": 19.729808807373047, + "learning_rate": 5.4e-08, + "loss": 8.3034, + "step": 18 + }, + { + "epoch": 0.10704225352112676, + "grad_norm": 19.35500144958496, + "learning_rate": 5.7e-08, + "loss": 8.1648, + "step": 19 + }, + { + "epoch": 0.11267605633802817, + "grad_norm": 19.351375579833984, + "learning_rate": 6.000000000000001e-08, + "loss": 8.2531, + "step": 20 + }, + { + "epoch": 0.11830985915492957, + "grad_norm": 20.13310432434082, + "learning_rate": 6.3e-08, + "loss": 8.4955, + "step": 21 + }, + { + "epoch": 0.12394366197183099, + "grad_norm": 19.18810272216797, + "learning_rate": 6.600000000000001e-08, + "loss": 8.0951, + "step": 22 + }, + { + "epoch": 0.1295774647887324, + "grad_norm": 19.997060775756836, + "learning_rate": 6.9e-08, + "loss": 8.3085, + "step": 23 + }, + { + "epoch": 0.1352112676056338, + "grad_norm": 19.73467445373535, + "learning_rate": 7.2e-08, + "loss": 8.0012, + "step": 24 + }, + { + "epoch": 0.14084507042253522, + "grad_norm": 20.199962615966797, + "learning_rate": 7.500000000000001e-08, + "loss": 8.1924, + "step": 25 + }, + { + "epoch": 0.14647887323943662, + "grad_norm": 19.340503692626953, + "learning_rate": 7.8e-08, + "loss": 8.1655, + "step": 26 + }, + { + "epoch": 0.15211267605633802, + "grad_norm": 18.63260269165039, + "learning_rate": 8.100000000000001e-08, + "loss": 7.8248, + "step": 27 + }, + { + "epoch": 0.15774647887323945, + "grad_norm": 18.444799423217773, + "learning_rate": 8.4e-08, + "loss": 7.9568, + "step": 28 + }, + { + "epoch": 0.16338028169014085, + "grad_norm": 19.628917694091797, + "learning_rate": 8.7e-08, + "loss": 7.9813, + "step": 29 + }, + { + "epoch": 0.16901408450704225, + "grad_norm": 18.812955856323242, + "learning_rate": 9e-08, + "loss": 7.8513, + "step": 30 + }, + { + "epoch": 0.17464788732394365, + "grad_norm": 19.53274154663086, + "learning_rate": 9.3e-08, + "loss": 8.0772, + "step": 31 + }, + { + "epoch": 0.18028169014084508, + "grad_norm": 18.708633422851562, + "learning_rate": 9.600000000000001e-08, + "loss": 7.7278, + "step": 32 + }, + { + "epoch": 0.18591549295774648, + "grad_norm": 18.192834854125977, + "learning_rate": 9.9e-08, + "loss": 7.7268, + "step": 33 + }, + { + "epoch": 0.19154929577464788, + "grad_norm": 18.53217887878418, + "learning_rate": 1.02e-07, + "loss": 7.6227, + "step": 34 + }, + { + "epoch": 0.19718309859154928, + "grad_norm": 18.989917755126953, + "learning_rate": 1.05e-07, + "loss": 7.7894, + "step": 35 + }, + { + "epoch": 0.2028169014084507, + "grad_norm": 19.437061309814453, + "learning_rate": 1.08e-07, + "loss": 7.6945, + "step": 36 + }, + { + "epoch": 0.2084507042253521, + "grad_norm": 18.471364974975586, + "learning_rate": 1.11e-07, + "loss": 7.524, + "step": 37 + }, + { + "epoch": 0.2140845070422535, + "grad_norm": 19.86949920654297, + "learning_rate": 1.14e-07, + "loss": 7.8956, + "step": 38 + }, + { + "epoch": 0.21971830985915494, + "grad_norm": 19.169902801513672, + "learning_rate": 1.17e-07, + "loss": 7.6071, + "step": 39 + }, + { + "epoch": 0.22535211267605634, + "grad_norm": 18.673480987548828, + "learning_rate": 1.2000000000000002e-07, + "loss": 7.3223, + "step": 40 + }, + { + "epoch": 0.23098591549295774, + "grad_norm": 18.413576126098633, + "learning_rate": 1.23e-07, + "loss": 7.2614, + "step": 41 + }, + { + "epoch": 0.23661971830985915, + "grad_norm": 18.13873863220215, + "learning_rate": 1.26e-07, + "loss": 7.179, + "step": 42 + }, + { + "epoch": 0.24225352112676057, + "grad_norm": 18.680736541748047, + "learning_rate": 1.29e-07, + "loss": 7.36, + "step": 43 + }, + { + "epoch": 0.24788732394366197, + "grad_norm": 18.37202262878418, + "learning_rate": 1.3200000000000002e-07, + "loss": 7.1083, + "step": 44 + }, + { + "epoch": 0.2535211267605634, + "grad_norm": 28.554738998413086, + "learning_rate": 1.35e-07, + "loss": 10.9576, + "step": 45 + }, + { + "epoch": 0.2591549295774648, + "grad_norm": 23.22928810119629, + "learning_rate": 1.38e-07, + "loss": 9.4035, + "step": 46 + }, + { + "epoch": 0.2647887323943662, + "grad_norm": 22.669530868530273, + "learning_rate": 1.41e-07, + "loss": 9.0631, + "step": 47 + }, + { + "epoch": 0.2704225352112676, + "grad_norm": 21.29054069519043, + "learning_rate": 1.44e-07, + "loss": 8.5767, + "step": 48 + }, + { + "epoch": 0.27605633802816903, + "grad_norm": 21.267587661743164, + "learning_rate": 1.47e-07, + "loss": 8.5628, + "step": 49 + }, + { + "epoch": 0.28169014084507044, + "grad_norm": 20.70184898376465, + "learning_rate": 1.5000000000000002e-07, + "loss": 8.4956, + "step": 50 + }, + { + "epoch": 0.28732394366197184, + "grad_norm": 20.579097747802734, + "learning_rate": 1.53e-07, + "loss": 8.4322, + "step": 51 + }, + { + "epoch": 0.29295774647887324, + "grad_norm": 20.76494026184082, + "learning_rate": 1.56e-07, + "loss": 8.3483, + "step": 52 + }, + { + "epoch": 0.29859154929577464, + "grad_norm": 22.010337829589844, + "learning_rate": 1.59e-07, + "loss": 8.6267, + "step": 53 + }, + { + "epoch": 0.30422535211267604, + "grad_norm": 20.0366268157959, + "learning_rate": 1.6200000000000002e-07, + "loss": 8.0702, + "step": 54 + }, + { + "epoch": 0.30985915492957744, + "grad_norm": 20.99293327331543, + "learning_rate": 1.6499999999999998e-07, + "loss": 8.232, + "step": 55 + }, + { + "epoch": 0.3154929577464789, + "grad_norm": 19.92287254333496, + "learning_rate": 1.68e-07, + "loss": 8.09, + "step": 56 + }, + { + "epoch": 0.3211267605633803, + "grad_norm": 20.472942352294922, + "learning_rate": 1.71e-07, + "loss": 8.0906, + "step": 57 + }, + { + "epoch": 0.3267605633802817, + "grad_norm": 21.892868041992188, + "learning_rate": 1.74e-07, + "loss": 8.4593, + "step": 58 + }, + { + "epoch": 0.3323943661971831, + "grad_norm": 21.296226501464844, + "learning_rate": 1.77e-07, + "loss": 8.2587, + "step": 59 + }, + { + "epoch": 0.3380281690140845, + "grad_norm": 21.45792579650879, + "learning_rate": 1.8e-07, + "loss": 8.2867, + "step": 60 + }, + { + "epoch": 0.3436619718309859, + "grad_norm": 20.300464630126953, + "learning_rate": 1.83e-07, + "loss": 7.7559, + "step": 61 + }, + { + "epoch": 0.3492957746478873, + "grad_norm": 21.698631286621094, + "learning_rate": 1.86e-07, + "loss": 8.199, + "step": 62 + }, + { + "epoch": 0.35492957746478876, + "grad_norm": 19.86581039428711, + "learning_rate": 1.89e-07, + "loss": 7.6167, + "step": 63 + }, + { + "epoch": 0.36056338028169016, + "grad_norm": 21.967016220092773, + "learning_rate": 1.9200000000000003e-07, + "loss": 8.1939, + "step": 64 + }, + { + "epoch": 0.36619718309859156, + "grad_norm": 21.37307357788086, + "learning_rate": 1.9499999999999999e-07, + "loss": 8.1185, + "step": 65 + }, + { + "epoch": 0.37183098591549296, + "grad_norm": 20.290283203125, + "learning_rate": 1.98e-07, + "loss": 7.8508, + "step": 66 + }, + { + "epoch": 0.37746478873239436, + "grad_norm": 21.446462631225586, + "learning_rate": 2.01e-07, + "loss": 7.8273, + "step": 67 + }, + { + "epoch": 0.38309859154929576, + "grad_norm": 21.343917846679688, + "learning_rate": 2.04e-07, + "loss": 7.8103, + "step": 68 + }, + { + "epoch": 0.38873239436619716, + "grad_norm": 20.38936996459961, + "learning_rate": 2.0700000000000001e-07, + "loss": 7.5509, + "step": 69 + }, + { + "epoch": 0.39436619718309857, + "grad_norm": 21.852344512939453, + "learning_rate": 2.1e-07, + "loss": 7.975, + "step": 70 + }, + { + "epoch": 0.4, + "grad_norm": 21.029077529907227, + "learning_rate": 2.1300000000000001e-07, + "loss": 7.7329, + "step": 71 + }, + { + "epoch": 0.4056338028169014, + "grad_norm": 21.277189254760742, + "learning_rate": 2.16e-07, + "loss": 7.697, + "step": 72 + }, + { + "epoch": 0.4112676056338028, + "grad_norm": 20.803091049194336, + "learning_rate": 2.1900000000000002e-07, + "loss": 7.6135, + "step": 73 + }, + { + "epoch": 0.4169014084507042, + "grad_norm": 19.920984268188477, + "learning_rate": 2.22e-07, + "loss": 7.3491, + "step": 74 + }, + { + "epoch": 0.4225352112676056, + "grad_norm": 20.588159561157227, + "learning_rate": 2.25e-07, + "loss": 7.3246, + "step": 75 + }, + { + "epoch": 0.428169014084507, + "grad_norm": 20.864524841308594, + "learning_rate": 2.28e-07, + "loss": 7.4887, + "step": 76 + }, + { + "epoch": 0.43380281690140843, + "grad_norm": 20.144886016845703, + "learning_rate": 2.3100000000000002e-07, + "loss": 7.2445, + "step": 77 + }, + { + "epoch": 0.4394366197183099, + "grad_norm": 20.628171920776367, + "learning_rate": 2.34e-07, + "loss": 7.1465, + "step": 78 + }, + { + "epoch": 0.4450704225352113, + "grad_norm": 21.245912551879883, + "learning_rate": 2.3700000000000002e-07, + "loss": 7.4386, + "step": 79 + }, + { + "epoch": 0.4507042253521127, + "grad_norm": 22.11734390258789, + "learning_rate": 2.4000000000000003e-07, + "loss": 7.3555, + "step": 80 + }, + { + "epoch": 0.4563380281690141, + "grad_norm": 21.49259376525879, + "learning_rate": 2.43e-07, + "loss": 7.2534, + "step": 81 + }, + { + "epoch": 0.4619718309859155, + "grad_norm": 21.531265258789062, + "learning_rate": 2.46e-07, + "loss": 7.2902, + "step": 82 + }, + { + "epoch": 0.4676056338028169, + "grad_norm": 23.177501678466797, + "learning_rate": 2.49e-07, + "loss": 7.5368, + "step": 83 + }, + { + "epoch": 0.4732394366197183, + "grad_norm": 21.143033981323242, + "learning_rate": 2.52e-07, + "loss": 7.0782, + "step": 84 + }, + { + "epoch": 0.4788732394366197, + "grad_norm": 21.302194595336914, + "learning_rate": 2.5500000000000005e-07, + "loss": 6.9088, + "step": 85 + }, + { + "epoch": 0.48450704225352115, + "grad_norm": 20.92340660095215, + "learning_rate": 2.58e-07, + "loss": 6.8607, + "step": 86 + }, + { + "epoch": 0.49014084507042255, + "grad_norm": 21.352828979492188, + "learning_rate": 2.6099999999999997e-07, + "loss": 6.8558, + "step": 87 + }, + { + "epoch": 0.49577464788732395, + "grad_norm": 20.361351013183594, + "learning_rate": 2.6400000000000003e-07, + "loss": 6.574, + "step": 88 + }, + { + "epoch": 0.5014084507042254, + "grad_norm": 32.090736389160156, + "learning_rate": 2.67e-07, + "loss": 9.4598, + "step": 89 + }, + { + "epoch": 0.5070422535211268, + "grad_norm": 29.89952850341797, + "learning_rate": 2.7e-07, + "loss": 8.9964, + "step": 90 + }, + { + "epoch": 0.5126760563380282, + "grad_norm": 26.13751983642578, + "learning_rate": 2.73e-07, + "loss": 8.2028, + "step": 91 + }, + { + "epoch": 0.5183098591549296, + "grad_norm": 24.984628677368164, + "learning_rate": 2.76e-07, + "loss": 7.9064, + "step": 92 + }, + { + "epoch": 0.523943661971831, + "grad_norm": 26.014310836791992, + "learning_rate": 2.79e-07, + "loss": 7.9926, + "step": 93 + }, + { + "epoch": 0.5295774647887324, + "grad_norm": 24.743000030517578, + "learning_rate": 2.82e-07, + "loss": 7.7252, + "step": 94 + }, + { + "epoch": 0.5352112676056338, + "grad_norm": 26.368148803710938, + "learning_rate": 2.85e-07, + "loss": 7.9539, + "step": 95 + }, + { + "epoch": 0.5408450704225352, + "grad_norm": 27.547103881835938, + "learning_rate": 2.88e-07, + "loss": 8.0634, + "step": 96 + }, + { + "epoch": 0.5464788732394367, + "grad_norm": 25.676986694335938, + "learning_rate": 2.91e-07, + "loss": 7.727, + "step": 97 + }, + { + "epoch": 0.5521126760563381, + "grad_norm": 26.837203979492188, + "learning_rate": 2.94e-07, + "loss": 7.8317, + "step": 98 + }, + { + "epoch": 0.5577464788732395, + "grad_norm": 26.280786514282227, + "learning_rate": 2.97e-07, + "loss": 7.6846, + "step": 99 + }, + { + "epoch": 0.5633802816901409, + "grad_norm": 27.732624053955078, + "learning_rate": 3.0000000000000004e-07, + "loss": 7.79, + "step": 100 + }, + { + "epoch": 0.5690140845070423, + "grad_norm": 27.480138778686523, + "learning_rate": 3.03e-07, + "loss": 7.6279, + "step": 101 + }, + { + "epoch": 0.5746478873239437, + "grad_norm": 25.806196212768555, + "learning_rate": 3.06e-07, + "loss": 7.3647, + "step": 102 + }, + { + "epoch": 0.5802816901408451, + "grad_norm": 27.241296768188477, + "learning_rate": 3.0900000000000003e-07, + "loss": 7.6385, + "step": 103 + }, + { + "epoch": 0.5859154929577465, + "grad_norm": 27.93297004699707, + "learning_rate": 3.12e-07, + "loss": 7.6403, + "step": 104 + }, + { + "epoch": 0.5915492957746479, + "grad_norm": 28.791919708251953, + "learning_rate": 3.15e-07, + "loss": 7.6192, + "step": 105 + }, + { + "epoch": 0.5971830985915493, + "grad_norm": 28.215551376342773, + "learning_rate": 3.18e-07, + "loss": 7.4607, + "step": 106 + }, + { + "epoch": 0.6028169014084507, + "grad_norm": 29.93889617919922, + "learning_rate": 3.21e-07, + "loss": 7.6467, + "step": 107 + }, + { + "epoch": 0.6084507042253521, + "grad_norm": 27.232234954833984, + "learning_rate": 3.2400000000000004e-07, + "loss": 7.1521, + "step": 108 + }, + { + "epoch": 0.6140845070422535, + "grad_norm": 28.14197540283203, + "learning_rate": 3.27e-07, + "loss": 7.2471, + "step": 109 + }, + { + "epoch": 0.6197183098591549, + "grad_norm": 27.78325653076172, + "learning_rate": 3.2999999999999996e-07, + "loss": 7.153, + "step": 110 + }, + { + "epoch": 0.6253521126760564, + "grad_norm": 27.265422821044922, + "learning_rate": 3.3300000000000003e-07, + "loss": 7.0405, + "step": 111 + }, + { + "epoch": 0.6309859154929578, + "grad_norm": 29.104612350463867, + "learning_rate": 3.36e-07, + "loss": 7.294, + "step": 112 + }, + { + "epoch": 0.6366197183098592, + "grad_norm": 28.48741912841797, + "learning_rate": 3.39e-07, + "loss": 7.0529, + "step": 113 + }, + { + "epoch": 0.6422535211267606, + "grad_norm": 29.633562088012695, + "learning_rate": 3.42e-07, + "loss": 7.0792, + "step": 114 + }, + { + "epoch": 0.647887323943662, + "grad_norm": 29.809619903564453, + "learning_rate": 3.45e-07, + "loss": 7.1284, + "step": 115 + }, + { + "epoch": 0.6535211267605634, + "grad_norm": 29.871963500976562, + "learning_rate": 3.48e-07, + "loss": 7.0132, + "step": 116 + }, + { + "epoch": 0.6591549295774648, + "grad_norm": 29.897907257080078, + "learning_rate": 3.51e-07, + "loss": 6.9608, + "step": 117 + }, + { + "epoch": 0.6647887323943662, + "grad_norm": 30.40986442565918, + "learning_rate": 3.54e-07, + "loss": 7.0252, + "step": 118 + }, + { + "epoch": 0.6704225352112676, + "grad_norm": 27.947280883789062, + "learning_rate": 3.5700000000000003e-07, + "loss": 6.595, + "step": 119 + }, + { + "epoch": 0.676056338028169, + "grad_norm": 27.550397872924805, + "learning_rate": 3.6e-07, + "loss": 6.506, + "step": 120 + }, + { + "epoch": 0.6816901408450704, + "grad_norm": 29.49580192565918, + "learning_rate": 3.63e-07, + "loss": 6.6519, + "step": 121 + }, + { + "epoch": 0.6873239436619718, + "grad_norm": 26.66489028930664, + "learning_rate": 3.66e-07, + "loss": 6.2367, + "step": 122 + }, + { + "epoch": 0.6929577464788732, + "grad_norm": 27.495731353759766, + "learning_rate": 3.6900000000000004e-07, + "loss": 6.3315, + "step": 123 + }, + { + "epoch": 0.6985915492957746, + "grad_norm": 28.700468063354492, + "learning_rate": 3.72e-07, + "loss": 6.445, + "step": 124 + }, + { + "epoch": 0.704225352112676, + "grad_norm": 29.51987075805664, + "learning_rate": 3.75e-07, + "loss": 6.4283, + "step": 125 + }, + { + "epoch": 0.7098591549295775, + "grad_norm": 28.541399002075195, + "learning_rate": 3.78e-07, + "loss": 6.2563, + "step": 126 + }, + { + "epoch": 0.7154929577464789, + "grad_norm": 30.63127899169922, + "learning_rate": 3.81e-07, + "loss": 6.4155, + "step": 127 + }, + { + "epoch": 0.7211267605633803, + "grad_norm": 28.95149803161621, + "learning_rate": 3.8400000000000005e-07, + "loss": 6.2745, + "step": 128 + }, + { + "epoch": 0.7267605633802817, + "grad_norm": 28.605030059814453, + "learning_rate": 3.87e-07, + "loss": 6.2393, + "step": 129 + }, + { + "epoch": 0.7323943661971831, + "grad_norm": 27.956668853759766, + "learning_rate": 3.8999999999999997e-07, + "loss": 6.0958, + "step": 130 + }, + { + "epoch": 0.7380281690140845, + "grad_norm": 28.11646270751953, + "learning_rate": 3.9300000000000004e-07, + "loss": 5.963, + "step": 131 + }, + { + "epoch": 0.7436619718309859, + "grad_norm": 24.97135353088379, + "learning_rate": 3.96e-07, + "loss": 5.6173, + "step": 132 + }, + { + "epoch": 0.7492957746478873, + "grad_norm": Infinity, + "learning_rate": 3.96e-07, + "loss": 8.1969, + "step": 133 + }, + { + "epoch": 0.7549295774647887, + "grad_norm": 42.452293395996094, + "learning_rate": 3.99e-07, + "loss": 7.3189, + "step": 134 + }, + { + "epoch": 0.7605633802816901, + "grad_norm": 39.87089157104492, + "learning_rate": 4.02e-07, + "loss": 6.9278, + "step": 135 + }, + { + "epoch": 0.7661971830985915, + "grad_norm": 36.937870025634766, + "learning_rate": 4.05e-07, + "loss": 6.6801, + "step": 136 + }, + { + "epoch": 0.7718309859154929, + "grad_norm": 36.147464752197266, + "learning_rate": 4.08e-07, + "loss": 6.6071, + "step": 137 + }, + { + "epoch": 0.7774647887323943, + "grad_norm": 39.3355598449707, + "learning_rate": 4.11e-07, + "loss": 6.7304, + "step": 138 + }, + { + "epoch": 0.7830985915492957, + "grad_norm": 34.039981842041016, + "learning_rate": 4.1400000000000003e-07, + "loss": 6.2375, + "step": 139 + }, + { + "epoch": 0.7887323943661971, + "grad_norm": 36.65914535522461, + "learning_rate": 4.17e-07, + "loss": 6.4087, + "step": 140 + }, + { + "epoch": 0.7943661971830986, + "grad_norm": 37.91633605957031, + "learning_rate": 4.2e-07, + "loss": 6.4484, + "step": 141 + }, + { + "epoch": 0.8, + "grad_norm": 39.105125427246094, + "learning_rate": 4.23e-07, + "loss": 6.4929, + "step": 142 + }, + { + "epoch": 0.8056338028169014, + "grad_norm": 40.3162727355957, + "learning_rate": 4.2600000000000003e-07, + "loss": 6.5174, + "step": 143 + }, + { + "epoch": 0.8112676056338028, + "grad_norm": 35.70503234863281, + "learning_rate": 4.2900000000000004e-07, + "loss": 6.1367, + "step": 144 + }, + { + "epoch": 0.8169014084507042, + "grad_norm": 36.86891174316406, + "learning_rate": 4.32e-07, + "loss": 6.1179, + "step": 145 + }, + { + "epoch": 0.8225352112676056, + "grad_norm": 39.80533218383789, + "learning_rate": 4.35e-07, + "loss": 6.2952, + "step": 146 + }, + { + "epoch": 0.828169014084507, + "grad_norm": 36.694480895996094, + "learning_rate": 4.3800000000000003e-07, + "loss": 5.9916, + "step": 147 + }, + { + "epoch": 0.8338028169014085, + "grad_norm": 38.44818115234375, + "learning_rate": 4.41e-07, + "loss": 6.0445, + "step": 148 + }, + { + "epoch": 0.8394366197183099, + "grad_norm": 38.265995025634766, + "learning_rate": 4.44e-07, + "loss": 6.0292, + "step": 149 + }, + { + "epoch": 0.8450704225352113, + "grad_norm": 36.704952239990234, + "learning_rate": 4.47e-07, + "loss": 5.8879, + "step": 150 + }, + { + "epoch": 0.8507042253521127, + "grad_norm": 37.268917083740234, + "learning_rate": 4.5e-07, + "loss": 5.8587, + "step": 151 + }, + { + "epoch": 0.856338028169014, + "grad_norm": 35.31634521484375, + "learning_rate": 4.5300000000000005e-07, + "loss": 5.7078, + "step": 152 + }, + { + "epoch": 0.8619718309859155, + "grad_norm": 37.19781494140625, + "learning_rate": 4.56e-07, + "loss": 5.7555, + "step": 153 + }, + { + "epoch": 0.8676056338028169, + "grad_norm": 34.77775955200195, + "learning_rate": 4.5899999999999997e-07, + "loss": 5.6057, + "step": 154 + }, + { + "epoch": 0.8732394366197183, + "grad_norm": 33.57127380371094, + "learning_rate": 4.6200000000000003e-07, + "loss": 5.4405, + "step": 155 + }, + { + "epoch": 0.8788732394366198, + "grad_norm": 36.10496139526367, + "learning_rate": 4.65e-07, + "loss": 5.4799, + "step": 156 + }, + { + "epoch": 0.8845070422535212, + "grad_norm": 36.44635009765625, + "learning_rate": 4.68e-07, + "loss": 5.4978, + "step": 157 + }, + { + "epoch": 0.8901408450704226, + "grad_norm": 34.81025314331055, + "learning_rate": 4.7099999999999997e-07, + "loss": 5.351, + "step": 158 + }, + { + "epoch": 0.895774647887324, + "grad_norm": 34.08943557739258, + "learning_rate": 4.7400000000000004e-07, + "loss": 5.2605, + "step": 159 + }, + { + "epoch": 0.9014084507042254, + "grad_norm": 34.23727798461914, + "learning_rate": 4.77e-07, + "loss": 5.2195, + "step": 160 + }, + { + "epoch": 0.9070422535211268, + "grad_norm": 32.607879638671875, + "learning_rate": 4.800000000000001e-07, + "loss": 5.1298, + "step": 161 + }, + { + "epoch": 0.9126760563380282, + "grad_norm": 33.94306564331055, + "learning_rate": 4.83e-07, + "loss": 5.1769, + "step": 162 + }, + { + "epoch": 0.9183098591549296, + "grad_norm": 31.63994026184082, + "learning_rate": 4.86e-07, + "loss": 5.024, + "step": 163 + }, + { + "epoch": 0.923943661971831, + "grad_norm": 33.26905059814453, + "learning_rate": 4.89e-07, + "loss": 4.9806, + "step": 164 + }, + { + "epoch": 0.9295774647887324, + "grad_norm": 32.36957931518555, + "learning_rate": 4.92e-07, + "loss": 4.9891, + "step": 165 + }, + { + "epoch": 0.9352112676056338, + "grad_norm": 28.738908767700195, + "learning_rate": 4.95e-07, + "loss": 4.7682, + "step": 166 + }, + { + "epoch": 0.9408450704225352, + "grad_norm": 28.542316436767578, + "learning_rate": 4.98e-07, + "loss": 4.8045, + "step": 167 + }, + { + "epoch": 0.9464788732394366, + "grad_norm": 25.792383193969727, + "learning_rate": 5.01e-07, + "loss": 4.6496, + "step": 168 + }, + { + "epoch": 0.952112676056338, + "grad_norm": 27.16576385498047, + "learning_rate": 5.04e-07, + "loss": 4.6519, + "step": 169 + }, + { + "epoch": 0.9577464788732394, + "grad_norm": 25.441028594970703, + "learning_rate": 5.07e-07, + "loss": 4.5866, + "step": 170 + }, + { + "epoch": 0.9633802816901409, + "grad_norm": 25.167131423950195, + "learning_rate": 5.100000000000001e-07, + "loss": 4.6062, + "step": 171 + }, + { + "epoch": 0.9690140845070423, + "grad_norm": 23.295936584472656, + "learning_rate": 5.13e-07, + "loss": 4.5423, + "step": 172 + }, + { + "epoch": 0.9746478873239437, + "grad_norm": 21.83075523376465, + "learning_rate": 5.16e-07, + "loss": 4.4349, + "step": 173 + }, + { + "epoch": 0.9802816901408451, + "grad_norm": 18.890296936035156, + "learning_rate": 5.19e-07, + "loss": 4.3618, + "step": 174 + }, + { + "epoch": 0.9859154929577465, + "grad_norm": 17.86733627319336, + "learning_rate": 5.219999999999999e-07, + "loss": 4.3321, + "step": 175 + }, + { + "epoch": 0.9915492957746479, + "grad_norm": 15.089102745056152, + "learning_rate": 5.250000000000001e-07, + "loss": 4.2655, + "step": 176 + }, + { + "epoch": 0.9971830985915493, + "grad_norm": 20.50058937072754, + "learning_rate": 5.280000000000001e-07, + "loss": 4.4262, + "step": 177 + }, + { + "epoch": 1.0, + "grad_norm": 7.529200553894043, + "learning_rate": 5.31e-07, + "loss": 2.1518, + "step": 178 + }, + { + "epoch": 1.0056338028169014, + "grad_norm": 28.378694534301758, + "learning_rate": 5.34e-07, + "loss": 4.7345, + "step": 179 + }, + { + "epoch": 1.0112676056338028, + "grad_norm": 24.153104782104492, + "learning_rate": 5.37e-07, + "loss": 4.617, + "step": 180 + }, + { + "epoch": 1.0169014084507042, + "grad_norm": 16.407052993774414, + "learning_rate": 5.4e-07, + "loss": 4.3972, + "step": 181 + }, + { + "epoch": 1.0225352112676056, + "grad_norm": 13.738978385925293, + "learning_rate": 5.43e-07, + "loss": 4.3588, + "step": 182 + }, + { + "epoch": 1.028169014084507, + "grad_norm": 11.753392219543457, + "learning_rate": 5.46e-07, + "loss": 4.3117, + "step": 183 + }, + { + "epoch": 1.0338028169014084, + "grad_norm": 10.56106185913086, + "learning_rate": 5.490000000000001e-07, + "loss": 4.2987, + "step": 184 + }, + { + "epoch": 1.0394366197183098, + "grad_norm": 8.984735488891602, + "learning_rate": 5.52e-07, + "loss": 4.2585, + "step": 185 + }, + { + "epoch": 1.0450704225352112, + "grad_norm": 7.201539516448975, + "learning_rate": 5.55e-07, + "loss": 4.2362, + "step": 186 + }, + { + "epoch": 1.0507042253521126, + "grad_norm": 6.696246147155762, + "learning_rate": 5.58e-07, + "loss": 4.2512, + "step": 187 + }, + { + "epoch": 1.056338028169014, + "grad_norm": 7.102692604064941, + "learning_rate": 5.61e-07, + "loss": 4.2358, + "step": 188 + }, + { + "epoch": 1.0619718309859154, + "grad_norm": 7.0099616050720215, + "learning_rate": 5.64e-07, + "loss": 4.252, + "step": 189 + }, + { + "epoch": 1.0676056338028168, + "grad_norm": 6.835728645324707, + "learning_rate": 5.67e-07, + "loss": 4.2108, + "step": 190 + }, + { + "epoch": 1.0732394366197182, + "grad_norm": 7.586347579956055, + "learning_rate": 5.7e-07, + "loss": 4.1935, + "step": 191 + }, + { + "epoch": 1.0788732394366196, + "grad_norm": 7.363047122955322, + "learning_rate": 5.73e-07, + "loss": 4.1578, + "step": 192 + }, + { + "epoch": 1.084507042253521, + "grad_norm": 7.040609836578369, + "learning_rate": 5.76e-07, + "loss": 4.136, + "step": 193 + }, + { + "epoch": 1.0901408450704226, + "grad_norm": 6.436159610748291, + "learning_rate": 5.790000000000001e-07, + "loss": 4.1612, + "step": 194 + }, + { + "epoch": 1.095774647887324, + "grad_norm": 7.356907367706299, + "learning_rate": 5.82e-07, + "loss": 4.096, + "step": 195 + }, + { + "epoch": 1.1014084507042254, + "grad_norm": 6.898291110992432, + "learning_rate": 5.85e-07, + "loss": 4.0987, + "step": 196 + }, + { + "epoch": 1.1070422535211268, + "grad_norm": 6.676201343536377, + "learning_rate": 5.88e-07, + "loss": 4.0404, + "step": 197 + }, + { + "epoch": 1.1126760563380282, + "grad_norm": 7.3079352378845215, + "learning_rate": 5.909999999999999e-07, + "loss": 4.0107, + "step": 198 + }, + { + "epoch": 1.1183098591549296, + "grad_norm": 6.069024562835693, + "learning_rate": 5.94e-07, + "loss": 4.0537, + "step": 199 + }, + { + "epoch": 1.123943661971831, + "grad_norm": 6.009862899780273, + "learning_rate": 5.970000000000001e-07, + "loss": 4.0273, + "step": 200 + }, + { + "epoch": 1.1295774647887324, + "grad_norm": 6.165462970733643, + "learning_rate": 6.000000000000001e-07, + "loss": 4.037, + "step": 201 + }, + { + "epoch": 1.1352112676056338, + "grad_norm": 6.048194408416748, + "learning_rate": 6.03e-07, + "loss": 3.9988, + "step": 202 + }, + { + "epoch": 1.1408450704225352, + "grad_norm": 6.066869258880615, + "learning_rate": 6.06e-07, + "loss": 3.9743, + "step": 203 + }, + { + "epoch": 1.1464788732394366, + "grad_norm": 5.647124290466309, + "learning_rate": 6.09e-07, + "loss": 3.9283, + "step": 204 + }, + { + "epoch": 1.152112676056338, + "grad_norm": 6.250226974487305, + "learning_rate": 6.12e-07, + "loss": 4.0008, + "step": 205 + }, + { + "epoch": 1.1577464788732394, + "grad_norm": 5.445071697235107, + "learning_rate": 6.15e-07, + "loss": 3.9175, + "step": 206 + }, + { + "epoch": 1.1633802816901408, + "grad_norm": 5.444154262542725, + "learning_rate": 6.180000000000001e-07, + "loss": 3.9345, + "step": 207 + }, + { + "epoch": 1.1690140845070423, + "grad_norm": 5.232634544372559, + "learning_rate": 6.21e-07, + "loss": 3.9629, + "step": 208 + }, + { + "epoch": 1.1746478873239437, + "grad_norm": 5.474876403808594, + "learning_rate": 6.24e-07, + "loss": 3.8974, + "step": 209 + }, + { + "epoch": 1.180281690140845, + "grad_norm": 5.5575642585754395, + "learning_rate": 6.27e-07, + "loss": 3.8296, + "step": 210 + }, + { + "epoch": 1.1859154929577465, + "grad_norm": 5.940123081207275, + "learning_rate": 6.3e-07, + "loss": 3.8652, + "step": 211 + }, + { + "epoch": 1.1915492957746479, + "grad_norm": 5.474195957183838, + "learning_rate": 6.33e-07, + "loss": 3.8572, + "step": 212 + }, + { + "epoch": 1.1971830985915493, + "grad_norm": 6.2269415855407715, + "learning_rate": 6.36e-07, + "loss": 3.7893, + "step": 213 + }, + { + "epoch": 1.2028169014084507, + "grad_norm": 5.20849609375, + "learning_rate": 6.39e-07, + "loss": 3.8468, + "step": 214 + }, + { + "epoch": 1.208450704225352, + "grad_norm": 6.209558963775635, + "learning_rate": 6.42e-07, + "loss": 3.7182, + "step": 215 + }, + { + "epoch": 1.2140845070422535, + "grad_norm": 5.492266654968262, + "learning_rate": 6.45e-07, + "loss": 3.7881, + "step": 216 + }, + { + "epoch": 1.2197183098591549, + "grad_norm": 4.731509208679199, + "learning_rate": 6.480000000000001e-07, + "loss": 3.7978, + "step": 217 + }, + { + "epoch": 1.2253521126760563, + "grad_norm": 5.000571250915527, + "learning_rate": 6.51e-07, + "loss": 3.7661, + "step": 218 + }, + { + "epoch": 1.2309859154929577, + "grad_norm": 5.254445552825928, + "learning_rate": 6.54e-07, + "loss": 3.7341, + "step": 219 + }, + { + "epoch": 1.236619718309859, + "grad_norm": 4.641867637634277, + "learning_rate": 6.57e-07, + "loss": 3.767, + "step": 220 + }, + { + "epoch": 1.2422535211267607, + "grad_norm": 5.31817626953125, + "learning_rate": 6.599999999999999e-07, + "loss": 3.7103, + "step": 221 + }, + { + "epoch": 1.247887323943662, + "grad_norm": 5.551826477050781, + "learning_rate": 6.63e-07, + "loss": 3.6299, + "step": 222 + }, + { + "epoch": 1.2535211267605635, + "grad_norm": 17.55986213684082, + "learning_rate": 6.660000000000001e-07, + "loss": 4.1343, + "step": 223 + }, + { + "epoch": 1.2591549295774649, + "grad_norm": 12.343875885009766, + "learning_rate": 6.690000000000001e-07, + "loss": 4.0199, + "step": 224 + }, + { + "epoch": 1.2647887323943663, + "grad_norm": 10.083864212036133, + "learning_rate": 6.72e-07, + "loss": 3.9426, + "step": 225 + }, + { + "epoch": 1.2704225352112677, + "grad_norm": 6.6908345222473145, + "learning_rate": 6.75e-07, + "loss": 3.8179, + "step": 226 + }, + { + "epoch": 1.276056338028169, + "grad_norm": 6.976649284362793, + "learning_rate": 6.78e-07, + "loss": 3.8272, + "step": 227 + }, + { + "epoch": 1.2816901408450705, + "grad_norm": 5.452723503112793, + "learning_rate": 6.81e-07, + "loss": 3.7949, + "step": 228 + }, + { + "epoch": 1.287323943661972, + "grad_norm": 4.782120704650879, + "learning_rate": 6.84e-07, + "loss": 3.739, + "step": 229 + }, + { + "epoch": 1.2929577464788733, + "grad_norm": 5.477733612060547, + "learning_rate": 6.87e-07, + "loss": 3.7878, + "step": 230 + }, + { + "epoch": 1.2985915492957747, + "grad_norm": 4.35331916809082, + "learning_rate": 6.9e-07, + "loss": 3.7312, + "step": 231 + }, + { + "epoch": 1.304225352112676, + "grad_norm": 4.688896656036377, + "learning_rate": 6.93e-07, + "loss": 3.6926, + "step": 232 + }, + { + "epoch": 1.3098591549295775, + "grad_norm": 4.093502998352051, + "learning_rate": 6.96e-07, + "loss": 3.7448, + "step": 233 + }, + { + "epoch": 1.315492957746479, + "grad_norm": 4.601402282714844, + "learning_rate": 6.990000000000001e-07, + "loss": 3.733, + "step": 234 + }, + { + "epoch": 1.3211267605633803, + "grad_norm": 5.0725555419921875, + "learning_rate": 7.02e-07, + "loss": 3.6794, + "step": 235 + }, + { + "epoch": 1.3267605633802817, + "grad_norm": 4.2628374099731445, + "learning_rate": 7.05e-07, + "loss": 3.6747, + "step": 236 + }, + { + "epoch": 1.332394366197183, + "grad_norm": 4.159196853637695, + "learning_rate": 7.08e-07, + "loss": 3.6834, + "step": 237 + }, + { + "epoch": 1.3380281690140845, + "grad_norm": 3.971897602081299, + "learning_rate": 7.11e-07, + "loss": 3.6498, + "step": 238 + }, + { + "epoch": 1.343661971830986, + "grad_norm": 4.230648517608643, + "learning_rate": 7.140000000000001e-07, + "loss": 3.6179, + "step": 239 + }, + { + "epoch": 1.3492957746478873, + "grad_norm": 4.850123405456543, + "learning_rate": 7.170000000000001e-07, + "loss": 3.5813, + "step": 240 + }, + { + "epoch": 1.3549295774647887, + "grad_norm": 3.619288206100464, + "learning_rate": 7.2e-07, + "loss": 3.6227, + "step": 241 + }, + { + "epoch": 1.36056338028169, + "grad_norm": 4.144800662994385, + "learning_rate": 7.23e-07, + "loss": 3.5781, + "step": 242 + }, + { + "epoch": 1.3661971830985915, + "grad_norm": 3.449338912963867, + "learning_rate": 7.26e-07, + "loss": 3.5961, + "step": 243 + }, + { + "epoch": 1.371830985915493, + "grad_norm": 3.306309938430786, + "learning_rate": 7.29e-07, + "loss": 3.542, + "step": 244 + }, + { + "epoch": 1.3774647887323943, + "grad_norm": 3.7249999046325684, + "learning_rate": 7.32e-07, + "loss": 3.5631, + "step": 245 + }, + { + "epoch": 1.3830985915492957, + "grad_norm": 3.403693914413452, + "learning_rate": 7.350000000000001e-07, + "loss": 3.4998, + "step": 246 + }, + { + "epoch": 1.388732394366197, + "grad_norm": 3.535109519958496, + "learning_rate": 7.380000000000001e-07, + "loss": 3.5187, + "step": 247 + }, + { + "epoch": 1.3943661971830985, + "grad_norm": 3.3754496574401855, + "learning_rate": 7.41e-07, + "loss": 3.5296, + "step": 248 + }, + { + "epoch": 1.4, + "grad_norm": 3.4589614868164062, + "learning_rate": 7.44e-07, + "loss": 3.5365, + "step": 249 + }, + { + "epoch": 1.4056338028169013, + "grad_norm": 3.0371317863464355, + "learning_rate": 7.47e-07, + "loss": 3.4723, + "step": 250 + }, + { + "epoch": 1.4112676056338027, + "grad_norm": 3.40959095954895, + "learning_rate": 7.5e-07, + "loss": 3.47, + "step": 251 + }, + { + "epoch": 1.4169014084507041, + "grad_norm": 3.2615842819213867, + "learning_rate": 7.53e-07, + "loss": 3.5185, + "step": 252 + }, + { + "epoch": 1.4225352112676055, + "grad_norm": 2.8736321926116943, + "learning_rate": 7.56e-07, + "loss": 3.5047, + "step": 253 + }, + { + "epoch": 1.428169014084507, + "grad_norm": 3.355738639831543, + "learning_rate": 7.59e-07, + "loss": 3.4177, + "step": 254 + }, + { + "epoch": 1.4338028169014083, + "grad_norm": 3.714266300201416, + "learning_rate": 7.62e-07, + "loss": 3.4446, + "step": 255 + }, + { + "epoch": 1.43943661971831, + "grad_norm": 3.5116286277770996, + "learning_rate": 7.65e-07, + "loss": 3.4098, + "step": 256 + }, + { + "epoch": 1.4450704225352113, + "grad_norm": 3.016411542892456, + "learning_rate": 7.680000000000001e-07, + "loss": 3.3968, + "step": 257 + }, + { + "epoch": 1.4507042253521127, + "grad_norm": 3.350151538848877, + "learning_rate": 7.71e-07, + "loss": 3.3943, + "step": 258 + }, + { + "epoch": 1.4563380281690141, + "grad_norm": 3.121488571166992, + "learning_rate": 7.74e-07, + "loss": 3.3664, + "step": 259 + }, + { + "epoch": 1.4619718309859155, + "grad_norm": 2.776261329650879, + "learning_rate": 7.77e-07, + "loss": 3.3606, + "step": 260 + }, + { + "epoch": 1.467605633802817, + "grad_norm": 3.6956863403320312, + "learning_rate": 7.799999999999999e-07, + "loss": 3.3889, + "step": 261 + }, + { + "epoch": 1.4732394366197183, + "grad_norm": 5.630766868591309, + "learning_rate": 7.830000000000001e-07, + "loss": 3.3498, + "step": 262 + }, + { + "epoch": 1.4788732394366197, + "grad_norm": 4.4029693603515625, + "learning_rate": 7.860000000000001e-07, + "loss": 3.3344, + "step": 263 + }, + { + "epoch": 1.4845070422535211, + "grad_norm": 2.447305917739868, + "learning_rate": 7.89e-07, + "loss": 3.3012, + "step": 264 + }, + { + "epoch": 1.4901408450704225, + "grad_norm": 10.303879737854004, + "learning_rate": 7.92e-07, + "loss": 3.3133, + "step": 265 + }, + { + "epoch": 1.495774647887324, + "grad_norm": 3.3926241397857666, + "learning_rate": 7.95e-07, + "loss": 3.3414, + "step": 266 + }, + { + "epoch": 1.5014084507042254, + "grad_norm": 15.229345321655273, + "learning_rate": 7.98e-07, + "loss": 3.6492, + "step": 267 + }, + { + "epoch": 1.5070422535211268, + "grad_norm": 8.38891887664795, + "learning_rate": 8.01e-07, + "loss": 3.4816, + "step": 268 + }, + { + "epoch": 1.5126760563380282, + "grad_norm": 7.57686185836792, + "learning_rate": 8.04e-07, + "loss": 3.4408, + "step": 269 + }, + { + "epoch": 1.5183098591549296, + "grad_norm": 4.836228847503662, + "learning_rate": 8.070000000000001e-07, + "loss": 3.4019, + "step": 270 + }, + { + "epoch": 1.523943661971831, + "grad_norm": 4.142337322235107, + "learning_rate": 8.1e-07, + "loss": 3.3743, + "step": 271 + }, + { + "epoch": 1.5295774647887324, + "grad_norm": 3.1797330379486084, + "learning_rate": 8.13e-07, + "loss": 3.3709, + "step": 272 + }, + { + "epoch": 1.5352112676056338, + "grad_norm": 3.2781105041503906, + "learning_rate": 8.16e-07, + "loss": 3.3235, + "step": 273 + }, + { + "epoch": 1.5408450704225352, + "grad_norm": 2.989748239517212, + "learning_rate": 8.19e-07, + "loss": 3.3489, + "step": 274 + }, + { + "epoch": 1.5464788732394368, + "grad_norm": 3.2133727073669434, + "learning_rate": 8.22e-07, + "loss": 3.3769, + "step": 275 + }, + { + "epoch": 1.5521126760563382, + "grad_norm": 2.8232502937316895, + "learning_rate": 8.25e-07, + "loss": 3.3146, + "step": 276 + }, + { + "epoch": 1.5577464788732396, + "grad_norm": 3.972604751586914, + "learning_rate": 8.280000000000001e-07, + "loss": 3.3137, + "step": 277 + }, + { + "epoch": 1.563380281690141, + "grad_norm": 3.0173697471618652, + "learning_rate": 8.31e-07, + "loss": 3.2774, + "step": 278 + }, + { + "epoch": 1.5690140845070424, + "grad_norm": 2.3883769512176514, + "learning_rate": 8.34e-07, + "loss": 3.3073, + "step": 279 + }, + { + "epoch": 1.5746478873239438, + "grad_norm": 2.2080185413360596, + "learning_rate": 8.370000000000001e-07, + "loss": 3.2611, + "step": 280 + }, + { + "epoch": 1.5802816901408452, + "grad_norm": 3.058180570602417, + "learning_rate": 8.4e-07, + "loss": 3.2561, + "step": 281 + }, + { + "epoch": 1.5859154929577466, + "grad_norm": 3.721339225769043, + "learning_rate": 8.43e-07, + "loss": 3.2693, + "step": 282 + }, + { + "epoch": 1.591549295774648, + "grad_norm": 4.630777359008789, + "learning_rate": 8.46e-07, + "loss": 3.2459, + "step": 283 + }, + { + "epoch": 1.5971830985915494, + "grad_norm": 2.207216501235962, + "learning_rate": 8.489999999999999e-07, + "loss": 3.215, + "step": 284 + }, + { + "epoch": 1.6028169014084508, + "grad_norm": 2.0840320587158203, + "learning_rate": 8.520000000000001e-07, + "loss": 3.2255, + "step": 285 + }, + { + "epoch": 1.6084507042253522, + "grad_norm": 2.489400625228882, + "learning_rate": 8.550000000000001e-07, + "loss": 3.2098, + "step": 286 + }, + { + "epoch": 1.6140845070422536, + "grad_norm": 2.2128732204437256, + "learning_rate": 8.580000000000001e-07, + "loss": 3.236, + "step": 287 + }, + { + "epoch": 1.619718309859155, + "grad_norm": 1.8475383520126343, + "learning_rate": 8.61e-07, + "loss": 3.2143, + "step": 288 + }, + { + "epoch": 1.6253521126760564, + "grad_norm": 3.034592390060425, + "learning_rate": 8.64e-07, + "loss": 3.1599, + "step": 289 + }, + { + "epoch": 1.6309859154929578, + "grad_norm": 3.48130464553833, + "learning_rate": 8.67e-07, + "loss": 3.1656, + "step": 290 + }, + { + "epoch": 1.6366197183098592, + "grad_norm": 2.465686321258545, + "learning_rate": 8.7e-07, + "loss": 3.1281, + "step": 291 + }, + { + "epoch": 1.6422535211267606, + "grad_norm": 2.5849077701568604, + "learning_rate": 8.73e-07, + "loss": 3.1726, + "step": 292 + }, + { + "epoch": 1.647887323943662, + "grad_norm": 2.192965507507324, + "learning_rate": 8.760000000000001e-07, + "loss": 3.1238, + "step": 293 + }, + { + "epoch": 1.6535211267605634, + "grad_norm": 1.818284273147583, + "learning_rate": 8.79e-07, + "loss": 3.1451, + "step": 294 + }, + { + "epoch": 1.6591549295774648, + "grad_norm": 1.8378015756607056, + "learning_rate": 8.82e-07, + "loss": 3.1432, + "step": 295 + }, + { + "epoch": 1.6647887323943662, + "grad_norm": 4.533341407775879, + "learning_rate": 8.85e-07, + "loss": 3.1177, + "step": 296 + }, + { + "epoch": 1.6704225352112676, + "grad_norm": 3.2857859134674072, + "learning_rate": 8.88e-07, + "loss": 3.1139, + "step": 297 + }, + { + "epoch": 1.676056338028169, + "grad_norm": 2.6295394897460938, + "learning_rate": 8.91e-07, + "loss": 3.1375, + "step": 298 + }, + { + "epoch": 1.6816901408450704, + "grad_norm": 2.6304931640625, + "learning_rate": 8.94e-07, + "loss": 3.1008, + "step": 299 + }, + { + "epoch": 1.6873239436619718, + "grad_norm": 2.0864717960357666, + "learning_rate": 8.97e-07, + "loss": 3.0836, + "step": 300 + }, + { + "epoch": 1.6929577464788732, + "grad_norm": 1.9646295309066772, + "learning_rate": 9e-07, + "loss": 3.0995, + "step": 301 + }, + { + "epoch": 1.6985915492957746, + "grad_norm": 2.4765114784240723, + "learning_rate": 9.03e-07, + "loss": 3.0557, + "step": 302 + }, + { + "epoch": 1.704225352112676, + "grad_norm": 1.6692005395889282, + "learning_rate": 9.060000000000001e-07, + "loss": 3.0774, + "step": 303 + }, + { + "epoch": 1.7098591549295774, + "grad_norm": 3.1556854248046875, + "learning_rate": 9.09e-07, + "loss": 3.103, + "step": 304 + }, + { + "epoch": 1.7154929577464788, + "grad_norm": 4.288612365722656, + "learning_rate": 9.12e-07, + "loss": 3.0551, + "step": 305 + }, + { + "epoch": 1.7211267605633802, + "grad_norm": 1.729299783706665, + "learning_rate": 9.15e-07, + "loss": 3.0499, + "step": 306 + }, + { + "epoch": 1.7267605633802816, + "grad_norm": 3.1948986053466797, + "learning_rate": 9.179999999999999e-07, + "loss": 3.0367, + "step": 307 + }, + { + "epoch": 1.732394366197183, + "grad_norm": 2.3002986907958984, + "learning_rate": 9.210000000000001e-07, + "loss": 3.0475, + "step": 308 + }, + { + "epoch": 1.7380281690140844, + "grad_norm": 2.291055917739868, + "learning_rate": 9.240000000000001e-07, + "loss": 3.0386, + "step": 309 + }, + { + "epoch": 1.7436619718309858, + "grad_norm": 3.8127360343933105, + "learning_rate": 9.270000000000001e-07, + "loss": 3.0193, + "step": 310 + }, + { + "epoch": 1.7492957746478872, + "grad_norm": 8.73247241973877, + "learning_rate": 9.3e-07, + "loss": 3.3324, + "step": 311 + }, + { + "epoch": 1.7549295774647886, + "grad_norm": 5.212857723236084, + "learning_rate": 9.33e-07, + "loss": 3.2241, + "step": 312 + }, + { + "epoch": 1.76056338028169, + "grad_norm": 5.405500411987305, + "learning_rate": 9.36e-07, + "loss": 3.1716, + "step": 313 + }, + { + "epoch": 1.7661971830985914, + "grad_norm": 3.3166182041168213, + "learning_rate": 9.39e-07, + "loss": 3.1203, + "step": 314 + }, + { + "epoch": 1.7718309859154928, + "grad_norm": 4.144299507141113, + "learning_rate": 9.419999999999999e-07, + "loss": 3.1726, + "step": 315 + }, + { + "epoch": 1.7774647887323942, + "grad_norm": 3.576324224472046, + "learning_rate": 9.450000000000001e-07, + "loss": 3.1518, + "step": 316 + }, + { + "epoch": 1.7830985915492956, + "grad_norm": 3.0369715690612793, + "learning_rate": 9.480000000000001e-07, + "loss": 3.1077, + "step": 317 + }, + { + "epoch": 1.788732394366197, + "grad_norm": 3.1641204357147217, + "learning_rate": 9.51e-07, + "loss": 3.1223, + "step": 318 + }, + { + "epoch": 1.7943661971830986, + "grad_norm": 2.084167242050171, + "learning_rate": 9.54e-07, + "loss": 3.0577, + "step": 319 + }, + { + "epoch": 1.8, + "grad_norm": 2.145345449447632, + "learning_rate": 9.57e-07, + "loss": 3.1057, + "step": 320 + }, + { + "epoch": 1.8056338028169014, + "grad_norm": 3.1458144187927246, + "learning_rate": 9.600000000000001e-07, + "loss": 3.1221, + "step": 321 + }, + { + "epoch": 1.8112676056338028, + "grad_norm": 2.8545215129852295, + "learning_rate": 9.63e-07, + "loss": 3.092, + "step": 322 + }, + { + "epoch": 1.8169014084507042, + "grad_norm": 15.60283374786377, + "learning_rate": 9.66e-07, + "loss": 3.0654, + "step": 323 + }, + { + "epoch": 1.8225352112676056, + "grad_norm": 2.4702093601226807, + "learning_rate": 9.690000000000002e-07, + "loss": 3.0606, + "step": 324 + }, + { + "epoch": 1.828169014084507, + "grad_norm": 3.160369396209717, + "learning_rate": 9.72e-07, + "loss": 3.0641, + "step": 325 + }, + { + "epoch": 1.8338028169014085, + "grad_norm": 2.9996860027313232, + "learning_rate": 9.75e-07, + "loss": 3.06, + "step": 326 + }, + { + "epoch": 1.8394366197183099, + "grad_norm": 4.691850662231445, + "learning_rate": 9.78e-07, + "loss": 3.0031, + "step": 327 + }, + { + "epoch": 1.8450704225352113, + "grad_norm": 2.0418202877044678, + "learning_rate": 9.81e-07, + "loss": 3.0525, + "step": 328 + }, + { + "epoch": 1.8507042253521127, + "grad_norm": 3.0207626819610596, + "learning_rate": 9.84e-07, + "loss": 3.0289, + "step": 329 + }, + { + "epoch": 1.856338028169014, + "grad_norm": 1.9885998964309692, + "learning_rate": 9.87e-07, + "loss": 3.0429, + "step": 330 + }, + { + "epoch": 1.8619718309859155, + "grad_norm": 2.9951462745666504, + "learning_rate": 9.9e-07, + "loss": 3.0391, + "step": 331 + }, + { + "epoch": 1.8676056338028169, + "grad_norm": 3.579716682434082, + "learning_rate": 9.929999999999999e-07, + "loss": 3.0725, + "step": 332 + }, + { + "epoch": 1.8732394366197183, + "grad_norm": 3.4275898933410645, + "learning_rate": 9.96e-07, + "loss": 3.0552, + "step": 333 + }, + { + "epoch": 1.8788732394366199, + "grad_norm": 3.915097951889038, + "learning_rate": 9.99e-07, + "loss": 3.0403, + "step": 334 + }, + { + "epoch": 1.8845070422535213, + "grad_norm": 3.5404648780822754, + "learning_rate": 1.002e-06, + "loss": 3.0184, + "step": 335 + }, + { + "epoch": 1.8901408450704227, + "grad_norm": 1.993233561515808, + "learning_rate": 1.0050000000000001e-06, + "loss": 3.0271, + "step": 336 + }, + { + "epoch": 1.895774647887324, + "grad_norm": 2.958256483078003, + "learning_rate": 1.008e-06, + "loss": 3.0009, + "step": 337 + }, + { + "epoch": 1.9014084507042255, + "grad_norm": 2.1944117546081543, + "learning_rate": 1.0110000000000001e-06, + "loss": 2.973, + "step": 338 + }, + { + "epoch": 1.9070422535211269, + "grad_norm": 4.562254905700684, + "learning_rate": 1.014e-06, + "loss": 3.0002, + "step": 339 + }, + { + "epoch": 1.9126760563380283, + "grad_norm": 2.5491044521331787, + "learning_rate": 1.017e-06, + "loss": 3.0079, + "step": 340 + }, + { + "epoch": 1.9183098591549297, + "grad_norm": 4.045207977294922, + "learning_rate": 1.0200000000000002e-06, + "loss": 2.9921, + "step": 341 + }, + { + "epoch": 1.923943661971831, + "grad_norm": 4.1953864097595215, + "learning_rate": 1.023e-06, + "loss": 2.9992, + "step": 342 + }, + { + "epoch": 1.9295774647887325, + "grad_norm": 2.195458173751831, + "learning_rate": 1.026e-06, + "loss": 3.0202, + "step": 343 + }, + { + "epoch": 1.935211267605634, + "grad_norm": 6.150205135345459, + "learning_rate": 1.029e-06, + "loss": 2.9895, + "step": 344 + }, + { + "epoch": 1.9408450704225353, + "grad_norm": 2.1709234714508057, + "learning_rate": 1.032e-06, + "loss": 3.026, + "step": 345 + }, + { + "epoch": 1.9464788732394367, + "grad_norm": 3.657919406890869, + "learning_rate": 1.035e-06, + "loss": 2.9522, + "step": 346 + }, + { + "epoch": 1.952112676056338, + "grad_norm": 5.459045886993408, + "learning_rate": 1.038e-06, + "loss": 2.9906, + "step": 347 + }, + { + "epoch": 1.9577464788732395, + "grad_norm": 2.215078592300415, + "learning_rate": 1.041e-06, + "loss": 2.9721, + "step": 348 + }, + { + "epoch": 1.963380281690141, + "grad_norm": 2.1503262519836426, + "learning_rate": 1.0439999999999999e-06, + "loss": 2.9818, + "step": 349 + }, + { + "epoch": 1.9690140845070423, + "grad_norm": 21.535140991210938, + "learning_rate": 1.047e-06, + "loss": 2.9945, + "step": 350 + }, + { + "epoch": 1.9746478873239437, + "grad_norm": 3.492640733718872, + "learning_rate": 1.0500000000000001e-06, + "loss": 2.9585, + "step": 351 + }, + { + "epoch": 1.980281690140845, + "grad_norm": 10.216153144836426, + "learning_rate": 1.053e-06, + "loss": 3.0104, + "step": 352 + }, + { + "epoch": 1.9859154929577465, + "grad_norm": 3.894277334213257, + "learning_rate": 1.0560000000000001e-06, + "loss": 2.9685, + "step": 353 + }, + { + "epoch": 1.991549295774648, + "grad_norm": NaN, + "learning_rate": 1.0560000000000001e-06, + "loss": 3.016, + "step": 354 + }, + { + "epoch": 1.9971830985915493, + "grad_norm": 2.764295816421509, + "learning_rate": 1.059e-06, + "loss": 3.0144, + "step": 355 + }, + { + "epoch": 2.0, + "grad_norm": 2.5533089637756348, + "learning_rate": 1.062e-06, + "loss": 1.4893, + "step": 356 + }, + { + "epoch": 2.0056338028169014, + "grad_norm": 6.689918518066406, + "learning_rate": 1.065e-06, + "loss": 3.156, + "step": 357 + }, + { + "epoch": 2.011267605633803, + "grad_norm": 4.828878879547119, + "learning_rate": 1.068e-06, + "loss": 3.1063, + "step": 358 + }, + { + "epoch": 2.016901408450704, + "grad_norm": 3.7969484329223633, + "learning_rate": 1.0710000000000002e-06, + "loss": 3.0965, + "step": 359 + }, + { + "epoch": 2.0225352112676056, + "grad_norm": 3.629793405532837, + "learning_rate": 1.074e-06, + "loss": 3.0502, + "step": 360 + }, + { + "epoch": 2.028169014084507, + "grad_norm": 4.136383533477783, + "learning_rate": 1.077e-06, + "loss": 3.0603, + "step": 361 + }, + { + "epoch": 2.0338028169014084, + "grad_norm": 4.763708591461182, + "learning_rate": 1.08e-06, + "loss": 3.035, + "step": 362 + }, + { + "epoch": 2.03943661971831, + "grad_norm": 2.662196397781372, + "learning_rate": 1.083e-06, + "loss": 2.979, + "step": 363 + }, + { + "epoch": 2.045070422535211, + "grad_norm": 2.799008369445801, + "learning_rate": 1.086e-06, + "loss": 3.0423, + "step": 364 + }, + { + "epoch": 2.0507042253521126, + "grad_norm": 3.0370540618896484, + "learning_rate": 1.089e-06, + "loss": 3.0191, + "step": 365 + }, + { + "epoch": 2.056338028169014, + "grad_norm": 3.5550408363342285, + "learning_rate": 1.092e-06, + "loss": 2.9998, + "step": 366 + }, + { + "epoch": 2.0619718309859154, + "grad_norm": 2.8191301822662354, + "learning_rate": 1.0949999999999999e-06, + "loss": 3.0373, + "step": 367 + }, + { + "epoch": 2.067605633802817, + "grad_norm": 4.018879413604736, + "learning_rate": 1.0980000000000001e-06, + "loss": 2.9933, + "step": 368 + }, + { + "epoch": 2.073239436619718, + "grad_norm": 3.813567876815796, + "learning_rate": 1.1010000000000001e-06, + "loss": 3.0079, + "step": 369 + }, + { + "epoch": 2.0788732394366196, + "grad_norm": 2.6536049842834473, + "learning_rate": 1.104e-06, + "loss": 2.9907, + "step": 370 + }, + { + "epoch": 2.084507042253521, + "grad_norm": 1.7705937623977661, + "learning_rate": 1.1070000000000002e-06, + "loss": 2.9996, + "step": 371 + }, + { + "epoch": 2.0901408450704224, + "grad_norm": 5.244455814361572, + "learning_rate": 1.11e-06, + "loss": 3.0193, + "step": 372 + }, + { + "epoch": 2.095774647887324, + "grad_norm": 4.140601634979248, + "learning_rate": 1.113e-06, + "loss": 2.9686, + "step": 373 + }, + { + "epoch": 2.101408450704225, + "grad_norm": 2.9329001903533936, + "learning_rate": 1.116e-06, + "loss": 2.9843, + "step": 374 + }, + { + "epoch": 2.1070422535211266, + "grad_norm": 1.8754574060440063, + "learning_rate": 1.119e-06, + "loss": 2.9626, + "step": 375 + }, + { + "epoch": 2.112676056338028, + "grad_norm": 4.379260540008545, + "learning_rate": 1.122e-06, + "loss": 2.9652, + "step": 376 + }, + { + "epoch": 2.1183098591549294, + "grad_norm": 3.4436750411987305, + "learning_rate": 1.125e-06, + "loss": 2.9727, + "step": 377 + }, + { + "epoch": 2.123943661971831, + "grad_norm": 3.883599281311035, + "learning_rate": 1.128e-06, + "loss": 2.9619, + "step": 378 + }, + { + "epoch": 2.129577464788732, + "grad_norm": 3.0950586795806885, + "learning_rate": 1.131e-06, + "loss": 2.9473, + "step": 379 + }, + { + "epoch": 2.1352112676056336, + "grad_norm": 1.735630989074707, + "learning_rate": 1.134e-06, + "loss": 2.9603, + "step": 380 + }, + { + "epoch": 2.140845070422535, + "grad_norm": 5.800061225891113, + "learning_rate": 1.137e-06, + "loss": 3.0003, + "step": 381 + }, + { + "epoch": 2.1464788732394364, + "grad_norm": 3.413560152053833, + "learning_rate": 1.14e-06, + "loss": 2.9284, + "step": 382 + }, + { + "epoch": 2.152112676056338, + "grad_norm": 2.977813482284546, + "learning_rate": 1.1430000000000001e-06, + "loss": 2.9381, + "step": 383 + }, + { + "epoch": 2.1577464788732392, + "grad_norm": 2.820646047592163, + "learning_rate": 1.146e-06, + "loss": 2.9239, + "step": 384 + }, + { + "epoch": 2.1633802816901406, + "grad_norm": 3.4326274394989014, + "learning_rate": 1.1490000000000001e-06, + "loss": 2.9261, + "step": 385 + }, + { + "epoch": 2.169014084507042, + "grad_norm": 6.641375541687012, + "learning_rate": 1.152e-06, + "loss": 2.9679, + "step": 386 + }, + { + "epoch": 2.1746478873239434, + "grad_norm": 5.253774166107178, + "learning_rate": 1.155e-06, + "loss": 2.9745, + "step": 387 + }, + { + "epoch": 2.1802816901408453, + "grad_norm": 3.8959267139434814, + "learning_rate": 1.1580000000000002e-06, + "loss": 2.9117, + "step": 388 + }, + { + "epoch": 2.1859154929577467, + "grad_norm": 1.9529342651367188, + "learning_rate": 1.161e-06, + "loss": 2.9216, + "step": 389 + }, + { + "epoch": 2.191549295774648, + "grad_norm": 6.595493793487549, + "learning_rate": 1.164e-06, + "loss": 2.9176, + "step": 390 + }, + { + "epoch": 2.1971830985915495, + "grad_norm": 3.104062557220459, + "learning_rate": 1.167e-06, + "loss": 2.9234, + "step": 391 + }, + { + "epoch": 2.202816901408451, + "grad_norm": 2.850505828857422, + "learning_rate": 1.17e-06, + "loss": 2.9371, + "step": 392 + }, + { + "epoch": 2.2084507042253523, + "grad_norm": 3.024420738220215, + "learning_rate": 1.173e-06, + "loss": 2.9335, + "step": 393 + }, + { + "epoch": 2.2140845070422537, + "grad_norm": 3.7631072998046875, + "learning_rate": 1.176e-06, + "loss": 2.9129, + "step": 394 + }, + { + "epoch": 2.219718309859155, + "grad_norm": 3.0405235290527344, + "learning_rate": 1.179e-06, + "loss": 2.9242, + "step": 395 + }, + { + "epoch": 2.2253521126760565, + "grad_norm": 4.891364097595215, + "learning_rate": 1.1819999999999999e-06, + "loss": 2.9131, + "step": 396 + }, + { + "epoch": 2.230985915492958, + "grad_norm": 2.57051420211792, + "learning_rate": 1.185e-06, + "loss": 2.8991, + "step": 397 + }, + { + "epoch": 2.2366197183098593, + "grad_norm": 3.683042049407959, + "learning_rate": 1.188e-06, + "loss": 2.9211, + "step": 398 + }, + { + "epoch": 2.2422535211267607, + "grad_norm": 2.8408870697021484, + "learning_rate": 1.191e-06, + "loss": 2.8893, + "step": 399 + }, + { + "epoch": 2.247887323943662, + "grad_norm": 6.567075252532959, + "learning_rate": 1.1940000000000001e-06, + "loss": 2.9126, + "step": 400 + }, + { + "epoch": 2.2535211267605635, + "grad_norm": 4.963144779205322, + "learning_rate": 1.197e-06, + "loss": 3.0861, + "step": 401 + }, + { + "epoch": 2.259154929577465, + "grad_norm": 5.517329692840576, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.0618, + "step": 402 + }, + { + "epoch": 2.2647887323943663, + "grad_norm": 3.461358070373535, + "learning_rate": 1.203e-06, + "loss": 2.9884, + "step": 403 + }, + { + "epoch": 2.2704225352112677, + "grad_norm": 5.536230087280273, + "learning_rate": 1.206e-06, + "loss": 2.9819, + "step": 404 + }, + { + "epoch": 2.276056338028169, + "grad_norm": 3.172144651412964, + "learning_rate": 1.2090000000000002e-06, + "loss": 2.965, + "step": 405 + }, + { + "epoch": 2.2816901408450705, + "grad_norm": 4.672169208526611, + "learning_rate": 1.212e-06, + "loss": 2.9963, + "step": 406 + }, + { + "epoch": 2.287323943661972, + "grad_norm": 2.5767641067504883, + "learning_rate": 1.215e-06, + "loss": 2.9119, + "step": 407 + }, + { + "epoch": 2.2929577464788733, + "grad_norm": 3.546799659729004, + "learning_rate": 1.218e-06, + "loss": 2.9451, + "step": 408 + }, + { + "epoch": 2.2985915492957747, + "grad_norm": 2.6019978523254395, + "learning_rate": 1.221e-06, + "loss": 2.9306, + "step": 409 + }, + { + "epoch": 2.304225352112676, + "grad_norm": 5.824201583862305, + "learning_rate": 1.224e-06, + "loss": 2.9445, + "step": 410 + }, + { + "epoch": 2.3098591549295775, + "grad_norm": 3.3293278217315674, + "learning_rate": 1.227e-06, + "loss": 2.9351, + "step": 411 + }, + { + "epoch": 2.315492957746479, + "grad_norm": 5.109325885772705, + "learning_rate": 1.23e-06, + "loss": 2.9134, + "step": 412 + }, + { + "epoch": 2.3211267605633803, + "grad_norm": 3.5838801860809326, + "learning_rate": 1.2329999999999999e-06, + "loss": 2.9145, + "step": 413 + }, + { + "epoch": 2.3267605633802817, + "grad_norm": 4.243381977081299, + "learning_rate": 1.2360000000000001e-06, + "loss": 2.905, + "step": 414 + }, + { + "epoch": 2.332394366197183, + "grad_norm": 6.325119972229004, + "learning_rate": 1.2390000000000001e-06, + "loss": 2.9102, + "step": 415 + }, + { + "epoch": 2.3380281690140845, + "grad_norm": 3.0729176998138428, + "learning_rate": 1.242e-06, + "loss": 2.8892, + "step": 416 + }, + { + "epoch": 2.343661971830986, + "grad_norm": 5.03809118270874, + "learning_rate": 1.2450000000000002e-06, + "loss": 2.8839, + "step": 417 + }, + { + "epoch": 2.3492957746478873, + "grad_norm": 2.4732346534729004, + "learning_rate": 1.248e-06, + "loss": 2.9064, + "step": 418 + }, + { + "epoch": 2.3549295774647887, + "grad_norm": 3.957566499710083, + "learning_rate": 1.251e-06, + "loss": 2.8872, + "step": 419 + }, + { + "epoch": 2.36056338028169, + "grad_norm": 3.1195902824401855, + "learning_rate": 1.254e-06, + "loss": 2.8886, + "step": 420 + }, + { + "epoch": 2.3661971830985915, + "grad_norm": 4.334029674530029, + "learning_rate": 1.257e-06, + "loss": 2.873, + "step": 421 + }, + { + "epoch": 2.371830985915493, + "grad_norm": 2.1096246242523193, + "learning_rate": 1.26e-06, + "loss": 2.8649, + "step": 422 + }, + { + "epoch": 2.3774647887323943, + "grad_norm": 4.480950355529785, + "learning_rate": 1.263e-06, + "loss": 2.8852, + "step": 423 + }, + { + "epoch": 2.3830985915492957, + "grad_norm": 4.877039909362793, + "learning_rate": 1.266e-06, + "loss": 2.8639, + "step": 424 + }, + { + "epoch": 2.388732394366197, + "grad_norm": 3.921311140060425, + "learning_rate": 1.269e-06, + "loss": 2.8589, + "step": 425 + }, + { + "epoch": 2.3943661971830985, + "grad_norm": 2.2594597339630127, + "learning_rate": 1.272e-06, + "loss": 2.8935, + "step": 426 + }, + { + "epoch": 2.4, + "grad_norm": 4.150793075561523, + "learning_rate": 1.275e-06, + "loss": 2.8623, + "step": 427 + }, + { + "epoch": 2.4056338028169013, + "grad_norm": 2.5821540355682373, + "learning_rate": 1.278e-06, + "loss": 2.8597, + "step": 428 + }, + { + "epoch": 2.4112676056338027, + "grad_norm": 3.32155704498291, + "learning_rate": 1.281e-06, + "loss": 2.8644, + "step": 429 + }, + { + "epoch": 2.416901408450704, + "grad_norm": 2.0583064556121826, + "learning_rate": 1.284e-06, + "loss": 2.8809, + "step": 430 + }, + { + "epoch": 2.4225352112676055, + "grad_norm": 2.4684250354766846, + "learning_rate": 1.2870000000000001e-06, + "loss": 2.8583, + "step": 431 + }, + { + "epoch": 2.428169014084507, + "grad_norm": 2.8934361934661865, + "learning_rate": 1.29e-06, + "loss": 2.8812, + "step": 432 + }, + { + "epoch": 2.4338028169014083, + "grad_norm": 3.3147571086883545, + "learning_rate": 1.293e-06, + "loss": 2.8513, + "step": 433 + }, + { + "epoch": 2.4394366197183097, + "grad_norm": 3.118459701538086, + "learning_rate": 1.2960000000000002e-06, + "loss": 2.8737, + "step": 434 + }, + { + "epoch": 2.445070422535211, + "grad_norm": 5.725314140319824, + "learning_rate": 1.299e-06, + "loss": 2.8238, + "step": 435 + }, + { + "epoch": 2.4507042253521125, + "grad_norm": 2.1813156604766846, + "learning_rate": 1.302e-06, + "loss": 2.827, + "step": 436 + }, + { + "epoch": 2.456338028169014, + "grad_norm": 2.7385337352752686, + "learning_rate": 1.305e-06, + "loss": 2.8563, + "step": 437 + }, + { + "epoch": 2.4619718309859153, + "grad_norm": 4.738112926483154, + "learning_rate": 1.308e-06, + "loss": 2.822, + "step": 438 + }, + { + "epoch": 2.4676056338028167, + "grad_norm": 3.1284008026123047, + "learning_rate": 1.311e-06, + "loss": 2.8628, + "step": 439 + }, + { + "epoch": 2.473239436619718, + "grad_norm": 3.920558452606201, + "learning_rate": 1.314e-06, + "loss": 2.8456, + "step": 440 + }, + { + "epoch": 2.4788732394366195, + "grad_norm": 3.4516639709472656, + "learning_rate": 1.317e-06, + "loss": 2.8238, + "step": 441 + }, + { + "epoch": 2.4845070422535214, + "grad_norm": 2.7140746116638184, + "learning_rate": 1.3199999999999999e-06, + "loss": 2.8164, + "step": 442 + }, + { + "epoch": 2.4901408450704228, + "grad_norm": 3.0408647060394287, + "learning_rate": 1.323e-06, + "loss": 2.8268, + "step": 443 + }, + { + "epoch": 2.495774647887324, + "grad_norm": 5.611374378204346, + "learning_rate": 1.326e-06, + "loss": 2.8166, + "step": 444 + }, + { + "epoch": 2.5014084507042256, + "grad_norm": 4.1387619972229, + "learning_rate": 1.3290000000000001e-06, + "loss": 3.0038, + "step": 445 + }, + { + "epoch": 2.507042253521127, + "grad_norm": 3.045279026031494, + "learning_rate": 1.3320000000000001e-06, + "loss": 2.9354, + "step": 446 + }, + { + "epoch": 2.5126760563380284, + "grad_norm": 2.492701530456543, + "learning_rate": 1.335e-06, + "loss": 2.9068, + "step": 447 + }, + { + "epoch": 2.5183098591549298, + "grad_norm": 3.7552857398986816, + "learning_rate": 1.3380000000000001e-06, + "loss": 2.8962, + "step": 448 + }, + { + "epoch": 2.523943661971831, + "grad_norm": 3.1780288219451904, + "learning_rate": 1.341e-06, + "loss": 2.869, + "step": 449 + }, + { + "epoch": 2.5295774647887326, + "grad_norm": 3.949904680252075, + "learning_rate": 1.344e-06, + "loss": 2.8825, + "step": 450 + }, + { + "epoch": 2.535211267605634, + "grad_norm": 1.9001998901367188, + "learning_rate": 1.3470000000000002e-06, + "loss": 2.8544, + "step": 451 + }, + { + "epoch": 2.5408450704225354, + "grad_norm": 3.131744384765625, + "learning_rate": 1.35e-06, + "loss": 2.8851, + "step": 452 + }, + { + "epoch": 2.546478873239437, + "grad_norm": 3.6800670623779297, + "learning_rate": 1.353e-06, + "loss": 2.8998, + "step": 453 + }, + { + "epoch": 2.552112676056338, + "grad_norm": 3.9882307052612305, + "learning_rate": 1.356e-06, + "loss": 2.8831, + "step": 454 + }, + { + "epoch": 2.5577464788732396, + "grad_norm": 2.457385778427124, + "learning_rate": 1.359e-06, + "loss": 2.8616, + "step": 455 + }, + { + "epoch": 2.563380281690141, + "grad_norm": 2.2192134857177734, + "learning_rate": 1.362e-06, + "loss": 2.8799, + "step": 456 + }, + { + "epoch": 2.5690140845070424, + "grad_norm": 3.6589062213897705, + "learning_rate": 1.365e-06, + "loss": 2.8637, + "step": 457 + }, + { + "epoch": 2.574647887323944, + "grad_norm": 2.2132861614227295, + "learning_rate": 1.368e-06, + "loss": 2.8135, + "step": 458 + }, + { + "epoch": 2.580281690140845, + "grad_norm": 2.988835334777832, + "learning_rate": 1.3709999999999999e-06, + "loss": 2.8356, + "step": 459 + }, + { + "epoch": 2.5859154929577466, + "grad_norm": 2.2689504623413086, + "learning_rate": 1.374e-06, + "loss": 2.8513, + "step": 460 + }, + { + "epoch": 2.591549295774648, + "grad_norm": 2.1941077709198, + "learning_rate": 1.3770000000000001e-06, + "loss": 2.8369, + "step": 461 + }, + { + "epoch": 2.5971830985915494, + "grad_norm": 1.760879635810852, + "learning_rate": 1.38e-06, + "loss": 2.8273, + "step": 462 + }, + { + "epoch": 2.602816901408451, + "grad_norm": 4.897495269775391, + "learning_rate": 1.3830000000000001e-06, + "loss": 2.8198, + "step": 463 + }, + { + "epoch": 2.608450704225352, + "grad_norm": 4.290358543395996, + "learning_rate": 1.386e-06, + "loss": 2.8371, + "step": 464 + }, + { + "epoch": 2.6140845070422536, + "grad_norm": 3.481776714324951, + "learning_rate": 1.389e-06, + "loss": 2.8027, + "step": 465 + }, + { + "epoch": 2.619718309859155, + "grad_norm": 2.979186773300171, + "learning_rate": 1.392e-06, + "loss": 2.8286, + "step": 466 + }, + { + "epoch": 2.6253521126760564, + "grad_norm": 4.169500350952148, + "learning_rate": 1.395e-06, + "loss": 2.8278, + "step": 467 + }, + { + "epoch": 2.630985915492958, + "grad_norm": 5.333034515380859, + "learning_rate": 1.3980000000000002e-06, + "loss": 2.8301, + "step": 468 + }, + { + "epoch": 2.636619718309859, + "grad_norm": 6.221430778503418, + "learning_rate": 1.401e-06, + "loss": 2.8066, + "step": 469 + }, + { + "epoch": 2.6422535211267606, + "grad_norm": 4.619940757751465, + "learning_rate": 1.404e-06, + "loss": 2.7916, + "step": 470 + }, + { + "epoch": 2.647887323943662, + "grad_norm": 5.482257843017578, + "learning_rate": 1.407e-06, + "loss": 2.806, + "step": 471 + }, + { + "epoch": 2.6535211267605634, + "grad_norm": 4.32538366317749, + "learning_rate": 1.41e-06, + "loss": 2.799, + "step": 472 + }, + { + "epoch": 2.659154929577465, + "grad_norm": 5.186101913452148, + "learning_rate": 1.413e-06, + "loss": 2.7999, + "step": 473 + }, + { + "epoch": 2.664788732394366, + "grad_norm": 6.246681213378906, + "learning_rate": 1.416e-06, + "loss": 2.7924, + "step": 474 + }, + { + "epoch": 2.6704225352112676, + "grad_norm": 5.235856056213379, + "learning_rate": 1.419e-06, + "loss": 2.7918, + "step": 475 + }, + { + "epoch": 2.676056338028169, + "grad_norm": 2.9274845123291016, + "learning_rate": 1.422e-06, + "loss": 2.8112, + "step": 476 + }, + { + "epoch": 2.6816901408450704, + "grad_norm": 3.573071241378784, + "learning_rate": 1.4250000000000001e-06, + "loss": 2.7849, + "step": 477 + }, + { + "epoch": 2.687323943661972, + "grad_norm": 4.214164733886719, + "learning_rate": 1.4280000000000001e-06, + "loss": 2.7879, + "step": 478 + }, + { + "epoch": 2.692957746478873, + "grad_norm": 5.562944412231445, + "learning_rate": 1.431e-06, + "loss": 2.7879, + "step": 479 + }, + { + "epoch": 2.6985915492957746, + "grad_norm": 5.23793888092041, + "learning_rate": 1.4340000000000002e-06, + "loss": 2.7858, + "step": 480 + }, + { + "epoch": 2.704225352112676, + "grad_norm": 4.238168716430664, + "learning_rate": 1.437e-06, + "loss": 2.7769, + "step": 481 + }, + { + "epoch": 2.7098591549295774, + "grad_norm": 4.448953628540039, + "learning_rate": 1.44e-06, + "loss": 2.7617, + "step": 482 + }, + { + "epoch": 2.715492957746479, + "grad_norm": 7.470413684844971, + "learning_rate": 1.443e-06, + "loss": 2.7972, + "step": 483 + }, + { + "epoch": 2.72112676056338, + "grad_norm": 3.1278200149536133, + "learning_rate": 1.446e-06, + "loss": 2.7428, + "step": 484 + }, + { + "epoch": 2.7267605633802816, + "grad_norm": 3.5966947078704834, + "learning_rate": 1.449e-06, + "loss": 2.774, + "step": 485 + }, + { + "epoch": 2.732394366197183, + "grad_norm": 3.109821319580078, + "learning_rate": 1.452e-06, + "loss": 2.7867, + "step": 486 + }, + { + "epoch": 2.7380281690140844, + "grad_norm": 4.992013931274414, + "learning_rate": 1.455e-06, + "loss": 2.7909, + "step": 487 + }, + { + "epoch": 2.743661971830986, + "grad_norm": 5.205508232116699, + "learning_rate": 1.458e-06, + "loss": 2.742, + "step": 488 + }, + { + "epoch": 2.749295774647887, + "grad_norm": 3.8282198905944824, + "learning_rate": 1.461e-06, + "loss": 2.9147, + "step": 489 + }, + { + "epoch": 2.7549295774647886, + "grad_norm": 3.4786813259124756, + "learning_rate": 1.464e-06, + "loss": 2.9016, + "step": 490 + }, + { + "epoch": 2.76056338028169, + "grad_norm": 4.716727256774902, + "learning_rate": 1.467e-06, + "loss": 2.8472, + "step": 491 + }, + { + "epoch": 2.7661971830985914, + "grad_norm": 2.4909396171569824, + "learning_rate": 1.4700000000000001e-06, + "loss": 2.8292, + "step": 492 + }, + { + "epoch": 2.771830985915493, + "grad_norm": 2.098987102508545, + "learning_rate": 1.473e-06, + "loss": 2.8354, + "step": 493 + }, + { + "epoch": 2.777464788732394, + "grad_norm": 3.8950672149658203, + "learning_rate": 1.4760000000000001e-06, + "loss": 2.8584, + "step": 494 + }, + { + "epoch": 2.7830985915492956, + "grad_norm": 3.389186382293701, + "learning_rate": 1.479e-06, + "loss": 2.8431, + "step": 495 + }, + { + "epoch": 2.788732394366197, + "grad_norm": 2.1209559440612793, + "learning_rate": 1.482e-06, + "loss": 2.8189, + "step": 496 + }, + { + "epoch": 2.7943661971830984, + "grad_norm": 2.8957386016845703, + "learning_rate": 1.4850000000000002e-06, + "loss": 2.8003, + "step": 497 + }, + { + "epoch": 2.8, + "grad_norm": 1.9849811792373657, + "learning_rate": 1.488e-06, + "loss": 2.7981, + "step": 498 + }, + { + "epoch": 2.8056338028169012, + "grad_norm": 2.376265287399292, + "learning_rate": 1.491e-06, + "loss": 2.8231, + "step": 499 + }, + { + "epoch": 2.8112676056338026, + "grad_norm": 3.4236159324645996, + "learning_rate": 1.494e-06, + "loss": 2.8096, + "step": 500 + }, + { + "epoch": 2.816901408450704, + "grad_norm": 2.7298974990844727, + "learning_rate": 1.497e-06, + "loss": 2.8237, + "step": 501 + }, + { + "epoch": 2.8225352112676054, + "grad_norm": 2.521082639694214, + "learning_rate": 1.5e-06, + "loss": 2.765, + "step": 502 + }, + { + "epoch": 2.828169014084507, + "grad_norm": 2.644693613052368, + "learning_rate": 1.503e-06, + "loss": 2.8049, + "step": 503 + }, + { + "epoch": 2.8338028169014082, + "grad_norm": 3.143251419067383, + "learning_rate": 1.506e-06, + "loss": 2.7773, + "step": 504 + }, + { + "epoch": 2.8394366197183096, + "grad_norm": 2.4421496391296387, + "learning_rate": 1.5089999999999999e-06, + "loss": 2.7613, + "step": 505 + }, + { + "epoch": 2.845070422535211, + "grad_norm": 3.1848864555358887, + "learning_rate": 1.512e-06, + "loss": 2.7795, + "step": 506 + }, + { + "epoch": 2.8507042253521124, + "grad_norm": 2.911860704421997, + "learning_rate": 1.5150000000000001e-06, + "loss": 2.7608, + "step": 507 + }, + { + "epoch": 2.856338028169014, + "grad_norm": 1.8944320678710938, + "learning_rate": 1.518e-06, + "loss": 2.7835, + "step": 508 + }, + { + "epoch": 2.8619718309859152, + "grad_norm": 2.056074380874634, + "learning_rate": 1.5210000000000001e-06, + "loss": 2.7737, + "step": 509 + }, + { + "epoch": 2.8676056338028166, + "grad_norm": 3.7578797340393066, + "learning_rate": 1.524e-06, + "loss": 2.7783, + "step": 510 + }, + { + "epoch": 2.873239436619718, + "grad_norm": 2.587336778640747, + "learning_rate": 1.5270000000000002e-06, + "loss": 2.7797, + "step": 511 + }, + { + "epoch": 2.87887323943662, + "grad_norm": 5.611008644104004, + "learning_rate": 1.53e-06, + "loss": 2.744, + "step": 512 + }, + { + "epoch": 2.8845070422535213, + "grad_norm": 10.416324615478516, + "learning_rate": 1.533e-06, + "loss": 2.7366, + "step": 513 + }, + { + "epoch": 2.8901408450704227, + "grad_norm": 4.064530849456787, + "learning_rate": 1.5360000000000002e-06, + "loss": 2.7416, + "step": 514 + }, + { + "epoch": 2.895774647887324, + "grad_norm": 6.801990032196045, + "learning_rate": 1.539e-06, + "loss": 2.7769, + "step": 515 + }, + { + "epoch": 2.9014084507042255, + "grad_norm": 2.629598379135132, + "learning_rate": 1.542e-06, + "loss": 2.744, + "step": 516 + }, + { + "epoch": 2.907042253521127, + "grad_norm": 4.535959720611572, + "learning_rate": 1.545e-06, + "loss": 2.7681, + "step": 517 + }, + { + "epoch": 2.9126760563380283, + "grad_norm": 5.370257377624512, + "learning_rate": 1.548e-06, + "loss": 2.7789, + "step": 518 + }, + { + "epoch": 2.9183098591549297, + "grad_norm": 5.6404852867126465, + "learning_rate": 1.551e-06, + "loss": 2.7694, + "step": 519 + }, + { + "epoch": 2.923943661971831, + "grad_norm": 3.5447475910186768, + "learning_rate": 1.554e-06, + "loss": 2.7573, + "step": 520 + }, + { + "epoch": 2.9295774647887325, + "grad_norm": 3.844684362411499, + "learning_rate": 1.557e-06, + "loss": 2.7499, + "step": 521 + }, + { + "epoch": 2.935211267605634, + "grad_norm": 39.249507904052734, + "learning_rate": 1.5599999999999999e-06, + "loss": 2.7408, + "step": 522 + }, + { + "epoch": 2.9408450704225353, + "grad_norm": 2.2351744174957275, + "learning_rate": 1.5630000000000001e-06, + "loss": 2.7732, + "step": 523 + }, + { + "epoch": 2.9464788732394367, + "grad_norm": 4.3039870262146, + "learning_rate": 1.5660000000000001e-06, + "loss": 2.7418, + "step": 524 + }, + { + "epoch": 2.952112676056338, + "grad_norm": 8.005955696105957, + "learning_rate": 1.569e-06, + "loss": 2.699, + "step": 525 + }, + { + "epoch": 2.9577464788732395, + "grad_norm": 4.523921489715576, + "learning_rate": 1.5720000000000002e-06, + "loss": 2.7148, + "step": 526 + }, + { + "epoch": 2.963380281690141, + "grad_norm": 8.109228134155273, + "learning_rate": 1.575e-06, + "loss": 2.7359, + "step": 527 + }, + { + "epoch": 2.9690140845070423, + "grad_norm": 54.502098083496094, + "learning_rate": 1.578e-06, + "loss": 2.753, + "step": 528 + }, + { + "epoch": 2.9746478873239437, + "grad_norm": 4.717286586761475, + "learning_rate": 1.581e-06, + "loss": 2.7562, + "step": 529 + }, + { + "epoch": 2.980281690140845, + "grad_norm": 17.204912185668945, + "learning_rate": 1.584e-06, + "loss": 2.8052, + "step": 530 + }, + { + "epoch": 2.9859154929577465, + "grad_norm": 6.698378562927246, + "learning_rate": 1.5870000000000002e-06, + "loss": 2.7627, + "step": 531 + }, + { + "epoch": 2.991549295774648, + "grad_norm": 9.332170486450195, + "learning_rate": 1.59e-06, + "loss": 2.7683, + "step": 532 + }, + { + "epoch": 2.9971830985915493, + "grad_norm": 2.9608309268951416, + "learning_rate": 1.593e-06, + "loss": 2.8084, + "step": 533 + }, + { + "epoch": 3.0, + "grad_norm": 3.3101515769958496, + "learning_rate": 1.596e-06, + "loss": 1.3821, + "step": 534 + }, + { + "epoch": 3.0056338028169014, + "grad_norm": 4.513636112213135, + "learning_rate": 1.599e-06, + "loss": 2.8764, + "step": 535 + }, + { + "epoch": 3.011267605633803, + "grad_norm": 6.628481864929199, + "learning_rate": 1.602e-06, + "loss": 2.8876, + "step": 536 + }, + { + "epoch": 3.016901408450704, + "grad_norm": 2.4313488006591797, + "learning_rate": 1.605e-06, + "loss": 2.8239, + "step": 537 + }, + { + "epoch": 3.0225352112676056, + "grad_norm": 3.9404282569885254, + "learning_rate": 1.608e-06, + "loss": 2.8526, + "step": 538 + }, + { + "epoch": 3.028169014084507, + "grad_norm": 2.381598949432373, + "learning_rate": 1.611e-06, + "loss": 2.8307, + "step": 539 + }, + { + "epoch": 3.0338028169014084, + "grad_norm": 3.1737258434295654, + "learning_rate": 1.6140000000000001e-06, + "loss": 2.8175, + "step": 540 + }, + { + "epoch": 3.03943661971831, + "grad_norm": 2.4474072456359863, + "learning_rate": 1.6170000000000001e-06, + "loss": 2.792, + "step": 541 + }, + { + "epoch": 3.045070422535211, + "grad_norm": 4.117803573608398, + "learning_rate": 1.62e-06, + "loss": 2.7852, + "step": 542 + }, + { + "epoch": 3.0507042253521126, + "grad_norm": 2.7527599334716797, + "learning_rate": 1.6230000000000002e-06, + "loss": 2.7904, + "step": 543 + }, + { + "epoch": 3.056338028169014, + "grad_norm": 1.7446880340576172, + "learning_rate": 1.626e-06, + "loss": 2.7979, + "step": 544 + }, + { + "epoch": 3.0619718309859154, + "grad_norm": 2.72400164604187, + "learning_rate": 1.629e-06, + "loss": 2.7947, + "step": 545 + }, + { + "epoch": 3.067605633802817, + "grad_norm": 2.1043014526367188, + "learning_rate": 1.632e-06, + "loss": 2.7753, + "step": 546 + }, + { + "epoch": 3.073239436619718, + "grad_norm": 2.054772138595581, + "learning_rate": 1.635e-06, + "loss": 2.7834, + "step": 547 + }, + { + "epoch": 3.0788732394366196, + "grad_norm": 2.433123826980591, + "learning_rate": 1.638e-06, + "loss": 2.7527, + "step": 548 + }, + { + "epoch": 3.084507042253521, + "grad_norm": 4.187600135803223, + "learning_rate": 1.641e-06, + "loss": 2.7706, + "step": 549 + }, + { + "epoch": 3.0901408450704224, + "grad_norm": 1.6656733751296997, + "learning_rate": 1.644e-06, + "loss": 2.7446, + "step": 550 + }, + { + "epoch": 3.095774647887324, + "grad_norm": 1.757063388824463, + "learning_rate": 1.6469999999999999e-06, + "loss": 2.729, + "step": 551 + }, + { + "epoch": 3.101408450704225, + "grad_norm": 2.2468183040618896, + "learning_rate": 1.65e-06, + "loss": 2.7629, + "step": 552 + }, + { + "epoch": 3.1070422535211266, + "grad_norm": 2.3012444972991943, + "learning_rate": 1.653e-06, + "loss": 2.7679, + "step": 553 + }, + { + "epoch": 3.112676056338028, + "grad_norm": 3.9080018997192383, + "learning_rate": 1.6560000000000001e-06, + "loss": 2.7438, + "step": 554 + }, + { + "epoch": 3.1183098591549294, + "grad_norm": 2.3420796394348145, + "learning_rate": 1.6590000000000001e-06, + "loss": 2.7693, + "step": 555 + }, + { + "epoch": 3.123943661971831, + "grad_norm": 3.211016893386841, + "learning_rate": 1.662e-06, + "loss": 2.7565, + "step": 556 + }, + { + "epoch": 3.129577464788732, + "grad_norm": 4.163708209991455, + "learning_rate": 1.6650000000000002e-06, + "loss": 2.7658, + "step": 557 + }, + { + "epoch": 3.1352112676056336, + "grad_norm": 3.493562698364258, + "learning_rate": 1.668e-06, + "loss": 2.7368, + "step": 558 + }, + { + "epoch": 3.140845070422535, + "grad_norm": 2.458523988723755, + "learning_rate": 1.671e-06, + "loss": 2.7076, + "step": 559 + }, + { + "epoch": 3.1464788732394364, + "grad_norm": 3.26328182220459, + "learning_rate": 1.6740000000000002e-06, + "loss": 2.6912, + "step": 560 + }, + { + "epoch": 3.152112676056338, + "grad_norm": 2.4104485511779785, + "learning_rate": 1.677e-06, + "loss": 2.7585, + "step": 561 + }, + { + "epoch": 3.1577464788732392, + "grad_norm": 2.657015800476074, + "learning_rate": 1.68e-06, + "loss": 2.7099, + "step": 562 + }, + { + "epoch": 3.1633802816901406, + "grad_norm": 2.3434484004974365, + "learning_rate": 1.683e-06, + "loss": 2.744, + "step": 563 + }, + { + "epoch": 3.169014084507042, + "grad_norm": 3.666388988494873, + "learning_rate": 1.686e-06, + "loss": 2.7251, + "step": 564 + }, + { + "epoch": 3.1746478873239434, + "grad_norm": 3.7017822265625, + "learning_rate": 1.689e-06, + "loss": 2.7218, + "step": 565 + }, + { + "epoch": 3.1802816901408453, + "grad_norm": 6.779127597808838, + "learning_rate": 1.692e-06, + "loss": 2.7156, + "step": 566 + }, + { + "epoch": 3.1859154929577467, + "grad_norm": 4.765157222747803, + "learning_rate": 1.695e-06, + "loss": 2.7168, + "step": 567 + }, + { + "epoch": 3.191549295774648, + "grad_norm": 3.8683316707611084, + "learning_rate": 1.6979999999999999e-06, + "loss": 2.6945, + "step": 568 + }, + { + "epoch": 3.1971830985915495, + "grad_norm": 2.9999983310699463, + "learning_rate": 1.701e-06, + "loss": 2.7323, + "step": 569 + }, + { + "epoch": 3.202816901408451, + "grad_norm": 2.4626176357269287, + "learning_rate": 1.7040000000000001e-06, + "loss": 2.7142, + "step": 570 + }, + { + "epoch": 3.2084507042253523, + "grad_norm": 4.122666358947754, + "learning_rate": 1.707e-06, + "loss": 2.6754, + "step": 571 + }, + { + "epoch": 3.2140845070422537, + "grad_norm": 3.5713374614715576, + "learning_rate": 1.7100000000000001e-06, + "loss": 2.6513, + "step": 572 + }, + { + "epoch": 3.219718309859155, + "grad_norm": 4.797949314117432, + "learning_rate": 1.713e-06, + "loss": 2.688, + "step": 573 + }, + { + "epoch": 3.2253521126760565, + "grad_norm": 3.065605640411377, + "learning_rate": 1.7160000000000002e-06, + "loss": 2.6772, + "step": 574 + }, + { + "epoch": 3.230985915492958, + "grad_norm": 2.238821029663086, + "learning_rate": 1.719e-06, + "loss": 2.6712, + "step": 575 + }, + { + "epoch": 3.2366197183098593, + "grad_norm": 2.4874846935272217, + "learning_rate": 1.722e-06, + "loss": 2.7081, + "step": 576 + }, + { + "epoch": 3.2422535211267607, + "grad_norm": 2.7824885845184326, + "learning_rate": 1.7250000000000002e-06, + "loss": 2.7113, + "step": 577 + }, + { + "epoch": 3.247887323943662, + "grad_norm": 3.1023831367492676, + "learning_rate": 1.728e-06, + "loss": 2.6572, + "step": 578 + }, + { + "epoch": 3.2535211267605635, + "grad_norm": 5.776034832000732, + "learning_rate": 1.731e-06, + "loss": 2.8611, + "step": 579 + }, + { + "epoch": 3.259154929577465, + "grad_norm": 2.806821823120117, + "learning_rate": 1.734e-06, + "loss": 2.8481, + "step": 580 + }, + { + "epoch": 3.2647887323943663, + "grad_norm": 3.0129337310791016, + "learning_rate": 1.737e-06, + "loss": 2.8037, + "step": 581 + }, + { + "epoch": 3.2704225352112677, + "grad_norm": 5.805482387542725, + "learning_rate": 1.74e-06, + "loss": 2.8091, + "step": 582 + }, + { + "epoch": 3.276056338028169, + "grad_norm": 9.397015571594238, + "learning_rate": 1.743e-06, + "loss": 2.8086, + "step": 583 + }, + { + "epoch": 3.2816901408450705, + "grad_norm": 2.8330020904541016, + "learning_rate": 1.746e-06, + "loss": 2.7994, + "step": 584 + }, + { + "epoch": 3.287323943661972, + "grad_norm": 3.208578109741211, + "learning_rate": 1.749e-06, + "loss": 2.7591, + "step": 585 + }, + { + "epoch": 3.2929577464788733, + "grad_norm": 4.422690391540527, + "learning_rate": 1.7520000000000001e-06, + "loss": 2.7969, + "step": 586 + }, + { + "epoch": 3.2985915492957747, + "grad_norm": 2.6905901432037354, + "learning_rate": 1.7550000000000001e-06, + "loss": 2.7611, + "step": 587 + }, + { + "epoch": 3.304225352112676, + "grad_norm": 4.492880821228027, + "learning_rate": 1.758e-06, + "loss": 2.7692, + "step": 588 + }, + { + "epoch": 3.3098591549295775, + "grad_norm": 2.1210246086120605, + "learning_rate": 1.7610000000000002e-06, + "loss": 2.7765, + "step": 589 + }, + { + "epoch": 3.315492957746479, + "grad_norm": 2.7414491176605225, + "learning_rate": 1.764e-06, + "loss": 2.7318, + "step": 590 + }, + { + "epoch": 3.3211267605633803, + "grad_norm": 2.884840965270996, + "learning_rate": 1.767e-06, + "loss": 2.7482, + "step": 591 + }, + { + "epoch": 3.3267605633802817, + "grad_norm": 3.144826650619507, + "learning_rate": 1.77e-06, + "loss": 2.7188, + "step": 592 + }, + { + "epoch": 3.332394366197183, + "grad_norm": 2.618098020553589, + "learning_rate": 1.773e-06, + "loss": 2.7531, + "step": 593 + }, + { + "epoch": 3.3380281690140845, + "grad_norm": 1.7363290786743164, + "learning_rate": 1.776e-06, + "loss": 2.7328, + "step": 594 + }, + { + "epoch": 3.343661971830986, + "grad_norm": 3.7917160987854004, + "learning_rate": 1.779e-06, + "loss": 2.7158, + "step": 595 + }, + { + "epoch": 3.3492957746478873, + "grad_norm": 3.086524248123169, + "learning_rate": 1.782e-06, + "loss": 2.7073, + "step": 596 + }, + { + "epoch": 3.3549295774647887, + "grad_norm": 1.768385887145996, + "learning_rate": 1.785e-06, + "loss": 2.7167, + "step": 597 + }, + { + "epoch": 3.36056338028169, + "grad_norm": 3.622553586959839, + "learning_rate": 1.788e-06, + "loss": 2.7171, + "step": 598 + }, + { + "epoch": 3.3661971830985915, + "grad_norm": 2.9177920818328857, + "learning_rate": 1.791e-06, + "loss": 2.7344, + "step": 599 + }, + { + "epoch": 3.371830985915493, + "grad_norm": 2.422119140625, + "learning_rate": 1.794e-06, + "loss": 2.6836, + "step": 600 + }, + { + "epoch": 3.3774647887323943, + "grad_norm": 3.2521135807037354, + "learning_rate": 1.7970000000000001e-06, + "loss": 2.6564, + "step": 601 + }, + { + "epoch": 3.3830985915492957, + "grad_norm": 3.1682698726654053, + "learning_rate": 1.8e-06, + "loss": 2.6881, + "step": 602 + }, + { + "epoch": 3.388732394366197, + "grad_norm": 2.0565860271453857, + "learning_rate": 1.8030000000000001e-06, + "loss": 2.681, + "step": 603 + }, + { + "epoch": 3.3943661971830985, + "grad_norm": 5.913877964019775, + "learning_rate": 1.806e-06, + "loss": 2.66, + "step": 604 + }, + { + "epoch": 3.4, + "grad_norm": 5.101724624633789, + "learning_rate": 1.809e-06, + "loss": 2.7031, + "step": 605 + }, + { + "epoch": 3.4056338028169013, + "grad_norm": 4.672321319580078, + "learning_rate": 1.8120000000000002e-06, + "loss": 2.6847, + "step": 606 + }, + { + "epoch": 3.4112676056338027, + "grad_norm": 3.59171462059021, + "learning_rate": 1.815e-06, + "loss": 2.6721, + "step": 607 + }, + { + "epoch": 3.416901408450704, + "grad_norm": 4.02622652053833, + "learning_rate": 1.818e-06, + "loss": 2.6493, + "step": 608 + }, + { + "epoch": 3.4225352112676055, + "grad_norm": 6.657413959503174, + "learning_rate": 1.821e-06, + "loss": 2.6754, + "step": 609 + }, + { + "epoch": 3.428169014084507, + "grad_norm": 6.219770908355713, + "learning_rate": 1.824e-06, + "loss": 2.6491, + "step": 610 + }, + { + "epoch": 3.4338028169014083, + "grad_norm": 4.524576187133789, + "learning_rate": 1.827e-06, + "loss": 2.676, + "step": 611 + }, + { + "epoch": 3.4394366197183097, + "grad_norm": 2.754516363143921, + "learning_rate": 1.83e-06, + "loss": 2.6256, + "step": 612 + }, + { + "epoch": 3.445070422535211, + "grad_norm": 7.4485979080200195, + "learning_rate": 1.833e-06, + "loss": 2.6068, + "step": 613 + }, + { + "epoch": 3.4507042253521125, + "grad_norm": 5.208416938781738, + "learning_rate": 1.8359999999999999e-06, + "loss": 2.6381, + "step": 614 + }, + { + "epoch": 3.456338028169014, + "grad_norm": 5.324192047119141, + "learning_rate": 1.839e-06, + "loss": 2.6656, + "step": 615 + }, + { + "epoch": 3.4619718309859153, + "grad_norm": 3.2816479206085205, + "learning_rate": 1.8420000000000001e-06, + "loss": 2.6474, + "step": 616 + }, + { + "epoch": 3.4676056338028167, + "grad_norm": 7.767096519470215, + "learning_rate": 1.8450000000000001e-06, + "loss": 2.6936, + "step": 617 + }, + { + "epoch": 3.473239436619718, + "grad_norm": 3.7725744247436523, + "learning_rate": 1.8480000000000001e-06, + "loss": 2.6174, + "step": 618 + }, + { + "epoch": 3.4788732394366195, + "grad_norm": 5.0832905769348145, + "learning_rate": 1.851e-06, + "loss": 2.6354, + "step": 619 + }, + { + "epoch": 3.4845070422535214, + "grad_norm": 5.80606746673584, + "learning_rate": 1.8540000000000002e-06, + "loss": 2.5779, + "step": 620 + }, + { + "epoch": 3.4901408450704228, + "grad_norm": 3.8659050464630127, + "learning_rate": 1.857e-06, + "loss": 2.5956, + "step": 621 + }, + { + "epoch": 3.495774647887324, + "grad_norm": 5.184934616088867, + "learning_rate": 1.86e-06, + "loss": 2.6341, + "step": 622 + }, + { + "epoch": 3.5014084507042256, + "grad_norm": 4.191617488861084, + "learning_rate": 1.8630000000000002e-06, + "loss": 2.8809, + "step": 623 + }, + { + "epoch": 3.507042253521127, + "grad_norm": 5.942683219909668, + "learning_rate": 1.866e-06, + "loss": 2.7936, + "step": 624 + }, + { + "epoch": 3.5126760563380284, + "grad_norm": 3.73496413230896, + "learning_rate": 1.869e-06, + "loss": 2.7653, + "step": 625 + }, + { + "epoch": 3.5183098591549298, + "grad_norm": 4.427114009857178, + "learning_rate": 1.872e-06, + "loss": 2.7579, + "step": 626 + }, + { + "epoch": 3.523943661971831, + "grad_norm": 7.355522632598877, + "learning_rate": 1.875e-06, + "loss": 2.7391, + "step": 627 + }, + { + "epoch": 3.5295774647887326, + "grad_norm": 5.933565616607666, + "learning_rate": 1.878e-06, + "loss": 2.7299, + "step": 628 + }, + { + "epoch": 3.535211267605634, + "grad_norm": 2.801149845123291, + "learning_rate": 1.8810000000000003e-06, + "loss": 2.7234, + "step": 629 + }, + { + "epoch": 3.5408450704225354, + "grad_norm": 3.7464067935943604, + "learning_rate": 1.8839999999999999e-06, + "loss": 2.7146, + "step": 630 + }, + { + "epoch": 3.546478873239437, + "grad_norm": 4.548725605010986, + "learning_rate": 1.8869999999999999e-06, + "loss": 2.7071, + "step": 631 + }, + { + "epoch": 3.552112676056338, + "grad_norm": 5.175673961639404, + "learning_rate": 1.8900000000000001e-06, + "loss": 2.7026, + "step": 632 + }, + { + "epoch": 3.5577464788732396, + "grad_norm": 2.5320608615875244, + "learning_rate": 1.8930000000000001e-06, + "loss": 2.7358, + "step": 633 + }, + { + "epoch": 3.563380281690141, + "grad_norm": 2.2658591270446777, + "learning_rate": 1.8960000000000001e-06, + "loss": 2.693, + "step": 634 + }, + { + "epoch": 3.5690140845070424, + "grad_norm": 2.9923315048217773, + "learning_rate": 1.899e-06, + "loss": 2.6586, + "step": 635 + }, + { + "epoch": 3.574647887323944, + "grad_norm": 1.8636302947998047, + "learning_rate": 1.902e-06, + "loss": 2.6489, + "step": 636 + }, + { + "epoch": 3.580281690140845, + "grad_norm": 2.93691349029541, + "learning_rate": 1.905e-06, + "loss": 2.6609, + "step": 637 + }, + { + "epoch": 3.5859154929577466, + "grad_norm": 3.5617032051086426, + "learning_rate": 1.908e-06, + "loss": 2.659, + "step": 638 + }, + { + "epoch": 3.591549295774648, + "grad_norm": 3.435441732406616, + "learning_rate": 1.9110000000000004e-06, + "loss": 2.6154, + "step": 639 + }, + { + "epoch": 3.5971830985915494, + "grad_norm": 4.196032524108887, + "learning_rate": 1.914e-06, + "loss": 2.616, + "step": 640 + }, + { + "epoch": 3.602816901408451, + "grad_norm": 3.23492169380188, + "learning_rate": 1.917e-06, + "loss": 2.6048, + "step": 641 + }, + { + "epoch": 3.608450704225352, + "grad_norm": 2.949294328689575, + "learning_rate": 1.9200000000000003e-06, + "loss": 2.598, + "step": 642 + }, + { + "epoch": 3.6140845070422536, + "grad_norm": 2.844964027404785, + "learning_rate": 1.923e-06, + "loss": 2.6041, + "step": 643 + }, + { + "epoch": 3.619718309859155, + "grad_norm": 3.5030791759490967, + "learning_rate": 1.926e-06, + "loss": 2.5685, + "step": 644 + }, + { + "epoch": 3.6253521126760564, + "grad_norm": 10.395556449890137, + "learning_rate": 1.929e-06, + "loss": 2.576, + "step": 645 + }, + { + "epoch": 3.630985915492958, + "grad_norm": 2.640410900115967, + "learning_rate": 1.932e-06, + "loss": 2.5623, + "step": 646 + }, + { + "epoch": 3.636619718309859, + "grad_norm": 3.267350673675537, + "learning_rate": 1.935e-06, + "loss": 2.5574, + "step": 647 + }, + { + "epoch": 3.6422535211267606, + "grad_norm": 9.421785354614258, + "learning_rate": 1.9380000000000003e-06, + "loss": 2.539, + "step": 648 + }, + { + "epoch": 3.647887323943662, + "grad_norm": 5.227574825286865, + "learning_rate": 1.9409999999999997e-06, + "loss": 2.5801, + "step": 649 + }, + { + "epoch": 3.6535211267605634, + "grad_norm": 2.5911121368408203, + "learning_rate": 1.944e-06, + "loss": 2.4754, + "step": 650 + }, + { + "epoch": 3.659154929577465, + "grad_norm": 3.5224828720092773, + "learning_rate": 1.947e-06, + "loss": 2.5001, + "step": 651 + }, + { + "epoch": 3.664788732394366, + "grad_norm": 4.783766269683838, + "learning_rate": 1.95e-06, + "loss": 2.5012, + "step": 652 + }, + { + "epoch": 3.6704225352112676, + "grad_norm": 3.3553149700164795, + "learning_rate": 1.953e-06, + "loss": 2.531, + "step": 653 + }, + { + "epoch": 3.676056338028169, + "grad_norm": 2.931241512298584, + "learning_rate": 1.956e-06, + "loss": 2.497, + "step": 654 + }, + { + "epoch": 3.6816901408450704, + "grad_norm": 4.4694695472717285, + "learning_rate": 1.959e-06, + "loss": 2.4942, + "step": 655 + }, + { + "epoch": 3.687323943661972, + "grad_norm": 3.122934103012085, + "learning_rate": 1.962e-06, + "loss": 2.4754, + "step": 656 + }, + { + "epoch": 3.692957746478873, + "grad_norm": 6.193077087402344, + "learning_rate": 1.9650000000000002e-06, + "loss": 2.4694, + "step": 657 + }, + { + "epoch": 3.6985915492957746, + "grad_norm": 2.414370059967041, + "learning_rate": 1.968e-06, + "loss": 2.4517, + "step": 658 + }, + { + "epoch": 3.704225352112676, + "grad_norm": 3.34991717338562, + "learning_rate": 1.971e-06, + "loss": 2.4296, + "step": 659 + }, + { + "epoch": 3.7098591549295774, + "grad_norm": 6.847357749938965, + "learning_rate": 1.974e-06, + "loss": 2.4321, + "step": 660 + }, + { + "epoch": 3.715492957746479, + "grad_norm": 3.4056034088134766, + "learning_rate": 1.977e-06, + "loss": 2.4373, + "step": 661 + }, + { + "epoch": 3.72112676056338, + "grad_norm": 2.5987486839294434, + "learning_rate": 1.98e-06, + "loss": 2.4231, + "step": 662 + }, + { + "epoch": 3.7267605633802816, + "grad_norm": 2.899972915649414, + "learning_rate": 1.9830000000000003e-06, + "loss": 2.4102, + "step": 663 + }, + { + "epoch": 3.732394366197183, + "grad_norm": 4.216456890106201, + "learning_rate": 1.9859999999999997e-06, + "loss": 2.4324, + "step": 664 + }, + { + "epoch": 3.7380281690140844, + "grad_norm": 3.7976484298706055, + "learning_rate": 1.989e-06, + "loss": 2.3899, + "step": 665 + }, + { + "epoch": 3.743661971830986, + "grad_norm": 13.57735538482666, + "learning_rate": 1.992e-06, + "loss": 2.3653, + "step": 666 + }, + { + "epoch": 3.749295774647887, + "grad_norm": 5.536102771759033, + "learning_rate": 1.995e-06, + "loss": 2.6115, + "step": 667 + }, + { + "epoch": 3.7549295774647886, + "grad_norm": 3.1471431255340576, + "learning_rate": 1.998e-06, + "loss": 2.6113, + "step": 668 + }, + { + "epoch": 3.76056338028169, + "grad_norm": 2.3586504459381104, + "learning_rate": 2.001e-06, + "loss": 2.5347, + "step": 669 + }, + { + "epoch": 3.7661971830985914, + "grad_norm": 3.342392683029175, + "learning_rate": 2.004e-06, + "loss": 2.5331, + "step": 670 + }, + { + "epoch": 3.771830985915493, + "grad_norm": 3.824932098388672, + "learning_rate": 2.007e-06, + "loss": 2.4711, + "step": 671 + }, + { + "epoch": 3.777464788732394, + "grad_norm": 2.8359029293060303, + "learning_rate": 2.0100000000000002e-06, + "loss": 2.454, + "step": 672 + }, + { + "epoch": 3.7830985915492956, + "grad_norm": 3.8589389324188232, + "learning_rate": 2.0130000000000005e-06, + "loss": 2.3785, + "step": 673 + }, + { + "epoch": 3.788732394366197, + "grad_norm": 4.179661750793457, + "learning_rate": 2.016e-06, + "loss": 2.4114, + "step": 674 + }, + { + "epoch": 3.7943661971830984, + "grad_norm": 4.384422302246094, + "learning_rate": 2.019e-06, + "loss": 2.399, + "step": 675 + }, + { + "epoch": 3.8, + "grad_norm": 2.1641438007354736, + "learning_rate": 2.0220000000000003e-06, + "loss": 2.3519, + "step": 676 + }, + { + "epoch": 3.8056338028169012, + "grad_norm": 4.078607082366943, + "learning_rate": 2.025e-06, + "loss": 2.3549, + "step": 677 + }, + { + "epoch": 3.8112676056338026, + "grad_norm": 4.0506181716918945, + "learning_rate": 2.028e-06, + "loss": 2.3168, + "step": 678 + }, + { + "epoch": 3.816901408450704, + "grad_norm": 3.3806586265563965, + "learning_rate": 2.031e-06, + "loss": 2.3383, + "step": 679 + }, + { + "epoch": 3.8225352112676054, + "grad_norm": 2.962216854095459, + "learning_rate": 2.034e-06, + "loss": 2.2476, + "step": 680 + }, + { + "epoch": 3.828169014084507, + "grad_norm": 2.2728936672210693, + "learning_rate": 2.037e-06, + "loss": 2.2852, + "step": 681 + }, + { + "epoch": 3.8338028169014082, + "grad_norm": 2.4136433601379395, + "learning_rate": 2.0400000000000004e-06, + "loss": 2.2914, + "step": 682 + }, + { + "epoch": 3.8394366197183096, + "grad_norm": 2.7648229598999023, + "learning_rate": 2.0429999999999998e-06, + "loss": 2.1966, + "step": 683 + }, + { + "epoch": 3.845070422535211, + "grad_norm": 2.779562473297119, + "learning_rate": 2.046e-06, + "loss": 2.2622, + "step": 684 + }, + { + "epoch": 3.8507042253521124, + "grad_norm": 2.7937114238739014, + "learning_rate": 2.049e-06, + "loss": 2.1844, + "step": 685 + }, + { + "epoch": 3.856338028169014, + "grad_norm": 2.820657730102539, + "learning_rate": 2.052e-06, + "loss": 2.1237, + "step": 686 + }, + { + "epoch": 3.8619718309859152, + "grad_norm": 3.204495668411255, + "learning_rate": 2.0550000000000002e-06, + "loss": 2.1909, + "step": 687 + }, + { + "epoch": 3.8676056338028166, + "grad_norm": 3.271052598953247, + "learning_rate": 2.058e-06, + "loss": 2.1032, + "step": 688 + }, + { + "epoch": 3.873239436619718, + "grad_norm": 3.531216621398926, + "learning_rate": 2.061e-06, + "loss": 2.144, + "step": 689 + }, + { + "epoch": 3.87887323943662, + "grad_norm": 2.4491655826568604, + "learning_rate": 2.064e-06, + "loss": 2.0294, + "step": 690 + }, + { + "epoch": 3.8845070422535213, + "grad_norm": 4.347051620483398, + "learning_rate": 2.0670000000000003e-06, + "loss": 2.0617, + "step": 691 + }, + { + "epoch": 3.8901408450704227, + "grad_norm": 4.764668941497803, + "learning_rate": 2.07e-06, + "loss": 2.0337, + "step": 692 + }, + { + "epoch": 3.895774647887324, + "grad_norm": 7.034329891204834, + "learning_rate": 2.073e-06, + "loss": 1.9692, + "step": 693 + }, + { + "epoch": 3.9014084507042255, + "grad_norm": 2.9240710735321045, + "learning_rate": 2.076e-06, + "loss": 1.9619, + "step": 694 + }, + { + "epoch": 3.907042253521127, + "grad_norm": 4.533735752105713, + "learning_rate": 2.079e-06, + "loss": 1.9579, + "step": 695 + }, + { + "epoch": 3.9126760563380283, + "grad_norm": 2.520022392272949, + "learning_rate": 2.082e-06, + "loss": 2.0109, + "step": 696 + }, + { + "epoch": 3.9183098591549297, + "grad_norm": 2.678165912628174, + "learning_rate": 2.0850000000000004e-06, + "loss": 1.9549, + "step": 697 + }, + { + "epoch": 3.923943661971831, + "grad_norm": 3.2894513607025146, + "learning_rate": 2.0879999999999997e-06, + "loss": 1.8676, + "step": 698 + }, + { + "epoch": 3.9295774647887325, + "grad_norm": 2.6271660327911377, + "learning_rate": 2.091e-06, + "loss": 1.9041, + "step": 699 + }, + { + "epoch": 3.935211267605634, + "grad_norm": 4.272736549377441, + "learning_rate": 2.094e-06, + "loss": 1.8641, + "step": 700 + }, + { + "epoch": 3.9408450704225353, + "grad_norm": 3.6323680877685547, + "learning_rate": 2.097e-06, + "loss": 1.7602, + "step": 701 + }, + { + "epoch": 3.9464788732394367, + "grad_norm": 3.2892682552337646, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.8483, + "step": 702 + }, + { + "epoch": 3.952112676056338, + "grad_norm": 3.96871018409729, + "learning_rate": 2.103e-06, + "loss": 1.7396, + "step": 703 + }, + { + "epoch": 3.9577464788732395, + "grad_norm": 3.023526668548584, + "learning_rate": 2.106e-06, + "loss": 1.7653, + "step": 704 + }, + { + "epoch": 3.963380281690141, + "grad_norm": 3.367448091506958, + "learning_rate": 2.109e-06, + "loss": 1.7519, + "step": 705 + }, + { + "epoch": 3.9690140845070423, + "grad_norm": 3.7302796840667725, + "learning_rate": 2.1120000000000003e-06, + "loss": 1.6711, + "step": 706 + }, + { + "epoch": 3.9746478873239437, + "grad_norm": 3.7805285453796387, + "learning_rate": 2.1149999999999997e-06, + "loss": 1.7193, + "step": 707 + }, + { + "epoch": 3.980281690140845, + "grad_norm": 3.4666497707366943, + "learning_rate": 2.118e-06, + "loss": 1.669, + "step": 708 + }, + { + "epoch": 3.9859154929577465, + "grad_norm": 4.103410243988037, + "learning_rate": 2.121e-06, + "loss": 1.7194, + "step": 709 + }, + { + "epoch": 3.991549295774648, + "grad_norm": NaN, + "learning_rate": 2.121e-06, + "loss": 1.7002, + "step": 710 + }, + { + "epoch": 3.9971830985915493, + "grad_norm": 5.339576244354248, + "learning_rate": 2.124e-06, + "loss": 1.8009, + "step": 711 + }, + { + "epoch": 4.0, + "grad_norm": 6.8820624351501465, + "learning_rate": 2.127e-06, + "loss": 0.8197, + "step": 712 + }, + { + "epoch": 4.005633802816901, + "grad_norm": 8.731258392333984, + "learning_rate": 2.13e-06, + "loss": 2.1075, + "step": 713 + }, + { + "epoch": 4.011267605633803, + "grad_norm": 4.658092498779297, + "learning_rate": 2.133e-06, + "loss": 2.0035, + "step": 714 + }, + { + "epoch": 4.016901408450704, + "grad_norm": 3.997706890106201, + "learning_rate": 2.136e-06, + "loss": 1.9042, + "step": 715 + }, + { + "epoch": 4.022535211267606, + "grad_norm": 5.904595375061035, + "learning_rate": 2.139e-06, + "loss": 1.8859, + "step": 716 + }, + { + "epoch": 4.028169014084507, + "grad_norm": 4.508280277252197, + "learning_rate": 2.1420000000000004e-06, + "loss": 1.7883, + "step": 717 + }, + { + "epoch": 4.033802816901408, + "grad_norm": 3.468778371810913, + "learning_rate": 2.145e-06, + "loss": 1.7626, + "step": 718 + }, + { + "epoch": 4.03943661971831, + "grad_norm": 3.4685027599334717, + "learning_rate": 2.148e-06, + "loss": 1.7001, + "step": 719 + }, + { + "epoch": 4.045070422535211, + "grad_norm": 3.296505928039551, + "learning_rate": 2.1510000000000002e-06, + "loss": 1.669, + "step": 720 + }, + { + "epoch": 4.050704225352113, + "grad_norm": 4.874464988708496, + "learning_rate": 2.154e-06, + "loss": 1.705, + "step": 721 + }, + { + "epoch": 4.056338028169014, + "grad_norm": 4.497097969055176, + "learning_rate": 2.1570000000000003e-06, + "loss": 1.6824, + "step": 722 + }, + { + "epoch": 4.061971830985915, + "grad_norm": 3.7446484565734863, + "learning_rate": 2.16e-06, + "loss": 1.7157, + "step": 723 + }, + { + "epoch": 4.067605633802817, + "grad_norm": 3.5548088550567627, + "learning_rate": 2.163e-06, + "loss": 1.6035, + "step": 724 + }, + { + "epoch": 4.073239436619718, + "grad_norm": 3.665282964706421, + "learning_rate": 2.166e-06, + "loss": 1.5725, + "step": 725 + }, + { + "epoch": 4.07887323943662, + "grad_norm": 3.388298749923706, + "learning_rate": 2.1690000000000003e-06, + "loss": 1.5237, + "step": 726 + }, + { + "epoch": 4.084507042253521, + "grad_norm": 2.3221206665039062, + "learning_rate": 2.172e-06, + "loss": 1.5083, + "step": 727 + }, + { + "epoch": 4.090140845070422, + "grad_norm": 14.295064926147461, + "learning_rate": 2.175e-06, + "loss": 1.4483, + "step": 728 + }, + { + "epoch": 4.095774647887324, + "grad_norm": 4.429758548736572, + "learning_rate": 2.178e-06, + "loss": 1.4727, + "step": 729 + }, + { + "epoch": 4.101408450704225, + "grad_norm": 4.3126959800720215, + "learning_rate": 2.181e-06, + "loss": 1.4766, + "step": 730 + }, + { + "epoch": 4.107042253521127, + "grad_norm": 2.687000036239624, + "learning_rate": 2.184e-06, + "loss": 1.4647, + "step": 731 + }, + { + "epoch": 4.112676056338028, + "grad_norm": 3.530883550643921, + "learning_rate": 2.1870000000000004e-06, + "loss": 1.4935, + "step": 732 + }, + { + "epoch": 4.118309859154929, + "grad_norm": 4.200030326843262, + "learning_rate": 2.1899999999999998e-06, + "loss": 1.4652, + "step": 733 + }, + { + "epoch": 4.123943661971831, + "grad_norm": 7.423676013946533, + "learning_rate": 2.193e-06, + "loss": 1.3852, + "step": 734 + }, + { + "epoch": 4.129577464788732, + "grad_norm": 2.8539798259735107, + "learning_rate": 2.1960000000000002e-06, + "loss": 1.3247, + "step": 735 + }, + { + "epoch": 4.135211267605634, + "grad_norm": 3.968510627746582, + "learning_rate": 2.199e-06, + "loss": 1.2985, + "step": 736 + }, + { + "epoch": 4.140845070422535, + "grad_norm": 2.972294807434082, + "learning_rate": 2.2020000000000003e-06, + "loss": 1.3183, + "step": 737 + }, + { + "epoch": 4.146478873239436, + "grad_norm": 3.018853187561035, + "learning_rate": 2.205e-06, + "loss": 1.2738, + "step": 738 + }, + { + "epoch": 4.152112676056338, + "grad_norm": 2.274994134902954, + "learning_rate": 2.208e-06, + "loss": 1.3257, + "step": 739 + }, + { + "epoch": 4.157746478873239, + "grad_norm": 2.3417112827301025, + "learning_rate": 2.211e-06, + "loss": 1.2046, + "step": 740 + }, + { + "epoch": 4.163380281690141, + "grad_norm": 3.256418228149414, + "learning_rate": 2.2140000000000003e-06, + "loss": 1.2625, + "step": 741 + }, + { + "epoch": 4.169014084507042, + "grad_norm": 2.557704210281372, + "learning_rate": 2.2169999999999997e-06, + "loss": 1.2124, + "step": 742 + }, + { + "epoch": 4.174647887323943, + "grad_norm": 3.7538537979125977, + "learning_rate": 2.22e-06, + "loss": 1.1492, + "step": 743 + }, + { + "epoch": 4.180281690140845, + "grad_norm": 2.9859628677368164, + "learning_rate": 2.223e-06, + "loss": 1.2012, + "step": 744 + }, + { + "epoch": 4.185915492957746, + "grad_norm": 3.2278072834014893, + "learning_rate": 2.226e-06, + "loss": 1.2356, + "step": 745 + }, + { + "epoch": 4.191549295774648, + "grad_norm": 3.32592511177063, + "learning_rate": 2.229e-06, + "loss": 1.1824, + "step": 746 + }, + { + "epoch": 4.197183098591549, + "grad_norm": 4.771190643310547, + "learning_rate": 2.232e-06, + "loss": 1.1087, + "step": 747 + }, + { + "epoch": 4.20281690140845, + "grad_norm": 2.638829231262207, + "learning_rate": 2.2349999999999998e-06, + "loss": 1.1162, + "step": 748 + }, + { + "epoch": 4.208450704225352, + "grad_norm": 6.826143264770508, + "learning_rate": 2.238e-06, + "loss": 1.1157, + "step": 749 + }, + { + "epoch": 4.214084507042253, + "grad_norm": 3.0462405681610107, + "learning_rate": 2.2410000000000002e-06, + "loss": 1.0576, + "step": 750 + }, + { + "epoch": 4.219718309859155, + "grad_norm": 4.8925909996032715, + "learning_rate": 2.244e-06, + "loss": 1.2062, + "step": 751 + }, + { + "epoch": 4.225352112676056, + "grad_norm": 5.137844562530518, + "learning_rate": 2.247e-06, + "loss": 1.1099, + "step": 752 + }, + { + "epoch": 4.230985915492957, + "grad_norm": 4.4226861000061035, + "learning_rate": 2.25e-06, + "loss": 1.0575, + "step": 753 + }, + { + "epoch": 4.236619718309859, + "grad_norm": 8.942727088928223, + "learning_rate": 2.253e-06, + "loss": 1.0234, + "step": 754 + }, + { + "epoch": 4.24225352112676, + "grad_norm": 5.536681652069092, + "learning_rate": 2.256e-06, + "loss": 1.0121, + "step": 755 + }, + { + "epoch": 4.247887323943662, + "grad_norm": 5.978938102722168, + "learning_rate": 2.2590000000000003e-06, + "loss": 0.9913, + "step": 756 + }, + { + "epoch": 4.253521126760563, + "grad_norm": 9.773181915283203, + "learning_rate": 2.262e-06, + "loss": 1.4302, + "step": 757 + }, + { + "epoch": 4.259154929577464, + "grad_norm": 8.560917854309082, + "learning_rate": 2.265e-06, + "loss": 1.4571, + "step": 758 + }, + { + "epoch": 4.264788732394366, + "grad_norm": 5.962185382843018, + "learning_rate": 2.268e-06, + "loss": 1.3484, + "step": 759 + }, + { + "epoch": 4.270422535211267, + "grad_norm": 3.354548454284668, + "learning_rate": 2.2710000000000004e-06, + "loss": 1.2123, + "step": 760 + }, + { + "epoch": 4.276056338028169, + "grad_norm": 3.654059410095215, + "learning_rate": 2.274e-06, + "loss": 1.3055, + "step": 761 + }, + { + "epoch": 4.28169014084507, + "grad_norm": 4.993838787078857, + "learning_rate": 2.277e-06, + "loss": 1.3121, + "step": 762 + }, + { + "epoch": 4.2873239436619714, + "grad_norm": 3.195439577102661, + "learning_rate": 2.28e-06, + "loss": 1.1986, + "step": 763 + }, + { + "epoch": 4.292957746478873, + "grad_norm": 2.968320608139038, + "learning_rate": 2.283e-06, + "loss": 1.2231, + "step": 764 + }, + { + "epoch": 4.298591549295774, + "grad_norm": 3.53824520111084, + "learning_rate": 2.2860000000000002e-06, + "loss": 1.2196, + "step": 765 + }, + { + "epoch": 4.304225352112676, + "grad_norm": 5.174617767333984, + "learning_rate": 2.2890000000000004e-06, + "loss": 1.1367, + "step": 766 + }, + { + "epoch": 4.309859154929577, + "grad_norm": 8.613746643066406, + "learning_rate": 2.292e-06, + "loss": 1.0579, + "step": 767 + }, + { + "epoch": 4.3154929577464785, + "grad_norm": 3.378936529159546, + "learning_rate": 2.295e-06, + "loss": 1.077, + "step": 768 + }, + { + "epoch": 4.321126760563381, + "grad_norm": 3.4128663539886475, + "learning_rate": 2.2980000000000003e-06, + "loss": 1.0403, + "step": 769 + }, + { + "epoch": 4.326760563380281, + "grad_norm": 4.564036846160889, + "learning_rate": 2.301e-06, + "loss": 1.0029, + "step": 770 + }, + { + "epoch": 4.3323943661971835, + "grad_norm": 4.310069561004639, + "learning_rate": 2.304e-06, + "loss": 1.0173, + "step": 771 + }, + { + "epoch": 4.338028169014084, + "grad_norm": 4.131349086761475, + "learning_rate": 2.307e-06, + "loss": 1.0233, + "step": 772 + }, + { + "epoch": 4.343661971830986, + "grad_norm": 2.9398694038391113, + "learning_rate": 2.31e-06, + "loss": 0.9647, + "step": 773 + }, + { + "epoch": 4.349295774647887, + "grad_norm": 2.1672401428222656, + "learning_rate": 2.313e-06, + "loss": 0.9424, + "step": 774 + }, + { + "epoch": 4.354929577464789, + "grad_norm": 3.017409324645996, + "learning_rate": 2.3160000000000004e-06, + "loss": 0.9195, + "step": 775 + }, + { + "epoch": 4.3605633802816905, + "grad_norm": 7.53600549697876, + "learning_rate": 2.3189999999999997e-06, + "loss": 0.9747, + "step": 776 + }, + { + "epoch": 4.366197183098592, + "grad_norm": 4.035458564758301, + "learning_rate": 2.322e-06, + "loss": 0.9055, + "step": 777 + }, + { + "epoch": 4.371830985915493, + "grad_norm": 2.679673671722412, + "learning_rate": 2.325e-06, + "loss": 0.911, + "step": 778 + }, + { + "epoch": 4.377464788732395, + "grad_norm": 2.053769588470459, + "learning_rate": 2.328e-06, + "loss": 0.8975, + "step": 779 + }, + { + "epoch": 4.383098591549296, + "grad_norm": 3.9188501834869385, + "learning_rate": 2.3310000000000002e-06, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 4.3887323943661976, + "grad_norm": 2.785120725631714, + "learning_rate": 2.334e-06, + "loss": 0.8571, + "step": 781 + }, + { + "epoch": 4.394366197183099, + "grad_norm": 2.4356346130371094, + "learning_rate": 2.337e-06, + "loss": 0.8674, + "step": 782 + }, + { + "epoch": 4.4, + "grad_norm": 2.8814375400543213, + "learning_rate": 2.34e-06, + "loss": 0.8873, + "step": 783 + }, + { + "epoch": 4.405633802816902, + "grad_norm": 5.882245063781738, + "learning_rate": 2.3430000000000003e-06, + "loss": 0.846, + "step": 784 + }, + { + "epoch": 4.411267605633803, + "grad_norm": 4.481248378753662, + "learning_rate": 2.346e-06, + "loss": 0.7242, + "step": 785 + }, + { + "epoch": 4.416901408450705, + "grad_norm": 3.376279592514038, + "learning_rate": 2.349e-06, + "loss": 0.8909, + "step": 786 + }, + { + "epoch": 4.422535211267606, + "grad_norm": 5.917909622192383, + "learning_rate": 2.352e-06, + "loss": 0.8524, + "step": 787 + }, + { + "epoch": 4.428169014084507, + "grad_norm": 2.9090683460235596, + "learning_rate": 2.355e-06, + "loss": 0.7465, + "step": 788 + }, + { + "epoch": 4.433802816901409, + "grad_norm": 4.003406524658203, + "learning_rate": 2.358e-06, + "loss": 0.8354, + "step": 789 + }, + { + "epoch": 4.43943661971831, + "grad_norm": 14.42676067352295, + "learning_rate": 2.3610000000000003e-06, + "loss": 0.7535, + "step": 790 + }, + { + "epoch": 4.445070422535212, + "grad_norm": 3.3684017658233643, + "learning_rate": 2.3639999999999997e-06, + "loss": 0.8171, + "step": 791 + }, + { + "epoch": 4.450704225352113, + "grad_norm": 3.4548377990722656, + "learning_rate": 2.367e-06, + "loss": 0.6947, + "step": 792 + }, + { + "epoch": 4.456338028169014, + "grad_norm": 3.2732181549072266, + "learning_rate": 2.37e-06, + "loss": 0.7211, + "step": 793 + }, + { + "epoch": 4.461971830985916, + "grad_norm": 3.2089741230010986, + "learning_rate": 2.373e-06, + "loss": 0.656, + "step": 794 + }, + { + "epoch": 4.467605633802817, + "grad_norm": 3.5135998725891113, + "learning_rate": 2.376e-06, + "loss": 0.764, + "step": 795 + }, + { + "epoch": 4.473239436619719, + "grad_norm": 2.727151393890381, + "learning_rate": 2.379e-06, + "loss": 0.6935, + "step": 796 + }, + { + "epoch": 4.47887323943662, + "grad_norm": 6.873143196105957, + "learning_rate": 2.382e-06, + "loss": 0.6877, + "step": 797 + }, + { + "epoch": 4.484507042253521, + "grad_norm": 3.446615695953369, + "learning_rate": 2.385e-06, + "loss": 0.6804, + "step": 798 + }, + { + "epoch": 4.490140845070423, + "grad_norm": 5.030722618103027, + "learning_rate": 2.3880000000000003e-06, + "loss": 0.7444, + "step": 799 + }, + { + "epoch": 4.495774647887324, + "grad_norm": 6.555557727813721, + "learning_rate": 2.391e-06, + "loss": 0.7532, + "step": 800 + }, + { + "epoch": 4.501408450704226, + "grad_norm": 11.13542652130127, + "learning_rate": 2.394e-06, + "loss": 1.2519, + "step": 801 + }, + { + "epoch": 4.507042253521127, + "grad_norm": 6.424833297729492, + "learning_rate": 2.397e-06, + "loss": 1.1184, + "step": 802 + }, + { + "epoch": 4.512676056338028, + "grad_norm": 5.684906959533691, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.1603, + "step": 803 + }, + { + "epoch": 4.51830985915493, + "grad_norm": 6.032043933868408, + "learning_rate": 2.403e-06, + "loss": 0.9755, + "step": 804 + }, + { + "epoch": 4.523943661971831, + "grad_norm": 7.522682189941406, + "learning_rate": 2.406e-06, + "loss": 1.0374, + "step": 805 + }, + { + "epoch": 4.529577464788733, + "grad_norm": 4.559472560882568, + "learning_rate": 2.409e-06, + "loss": 0.963, + "step": 806 + }, + { + "epoch": 4.535211267605634, + "grad_norm": 8.044770240783691, + "learning_rate": 2.412e-06, + "loss": 0.8639, + "step": 807 + }, + { + "epoch": 4.540845070422535, + "grad_norm": 4.376523017883301, + "learning_rate": 2.415e-06, + "loss": 0.8814, + "step": 808 + }, + { + "epoch": 4.546478873239437, + "grad_norm": 5.2792534828186035, + "learning_rate": 2.4180000000000004e-06, + "loss": 0.9314, + "step": 809 + }, + { + "epoch": 4.552112676056338, + "grad_norm": 6.261806964874268, + "learning_rate": 2.4209999999999998e-06, + "loss": 0.8478, + "step": 810 + }, + { + "epoch": 4.55774647887324, + "grad_norm": 5.189519882202148, + "learning_rate": 2.424e-06, + "loss": 0.9977, + "step": 811 + }, + { + "epoch": 4.563380281690141, + "grad_norm": 5.815868854522705, + "learning_rate": 2.4270000000000002e-06, + "loss": 0.8007, + "step": 812 + }, + { + "epoch": 4.569014084507042, + "grad_norm": 5.150207042694092, + "learning_rate": 2.43e-06, + "loss": 0.9216, + "step": 813 + }, + { + "epoch": 4.574647887323944, + "grad_norm": 3.9940452575683594, + "learning_rate": 2.4330000000000003e-06, + "loss": 0.8655, + "step": 814 + }, + { + "epoch": 4.580281690140845, + "grad_norm": 2.9555556774139404, + "learning_rate": 2.436e-06, + "loss": 0.8565, + "step": 815 + }, + { + "epoch": 4.585915492957747, + "grad_norm": 3.326611280441284, + "learning_rate": 2.439e-06, + "loss": 0.7621, + "step": 816 + }, + { + "epoch": 4.591549295774648, + "grad_norm": 3.882078170776367, + "learning_rate": 2.442e-06, + "loss": 0.7741, + "step": 817 + }, + { + "epoch": 4.597183098591549, + "grad_norm": 11.13386344909668, + "learning_rate": 2.4450000000000003e-06, + "loss": 0.7245, + "step": 818 + }, + { + "epoch": 4.602816901408451, + "grad_norm": 4.292423248291016, + "learning_rate": 2.448e-06, + "loss": 0.7295, + "step": 819 + }, + { + "epoch": 4.608450704225352, + "grad_norm": 10.06823444366455, + "learning_rate": 2.451e-06, + "loss": 0.6212, + "step": 820 + }, + { + "epoch": 4.614084507042254, + "grad_norm": 4.313129901885986, + "learning_rate": 2.454e-06, + "loss": 0.6961, + "step": 821 + }, + { + "epoch": 4.619718309859155, + "grad_norm": 6.2645182609558105, + "learning_rate": 2.457e-06, + "loss": 0.7328, + "step": 822 + }, + { + "epoch": 4.625352112676056, + "grad_norm": 2.457139015197754, + "learning_rate": 2.46e-06, + "loss": 0.7715, + "step": 823 + }, + { + "epoch": 4.630985915492958, + "grad_norm": 3.2750065326690674, + "learning_rate": 2.4630000000000004e-06, + "loss": 0.7052, + "step": 824 + }, + { + "epoch": 4.636619718309859, + "grad_norm": 4.161436557769775, + "learning_rate": 2.4659999999999998e-06, + "loss": 0.6182, + "step": 825 + }, + { + "epoch": 4.642253521126761, + "grad_norm": 6.171960353851318, + "learning_rate": 2.469e-06, + "loss": 0.6678, + "step": 826 + }, + { + "epoch": 4.647887323943662, + "grad_norm": 4.594521999359131, + "learning_rate": 2.4720000000000002e-06, + "loss": 0.6498, + "step": 827 + }, + { + "epoch": 4.653521126760563, + "grad_norm": 5.677289009094238, + "learning_rate": 2.475e-06, + "loss": 0.6358, + "step": 828 + }, + { + "epoch": 4.659154929577465, + "grad_norm": 7.616566181182861, + "learning_rate": 2.4780000000000002e-06, + "loss": 0.6303, + "step": 829 + }, + { + "epoch": 4.664788732394366, + "grad_norm": 3.6935417652130127, + "learning_rate": 2.481e-06, + "loss": 0.695, + "step": 830 + }, + { + "epoch": 4.670422535211268, + "grad_norm": 6.680566310882568, + "learning_rate": 2.484e-06, + "loss": 0.6622, + "step": 831 + }, + { + "epoch": 4.676056338028169, + "grad_norm": 4.233198165893555, + "learning_rate": 2.487e-06, + "loss": 0.5588, + "step": 832 + }, + { + "epoch": 4.68169014084507, + "grad_norm": 2.316086769104004, + "learning_rate": 2.4900000000000003e-06, + "loss": 0.6056, + "step": 833 + }, + { + "epoch": 4.687323943661972, + "grad_norm": 4.981112480163574, + "learning_rate": 2.4929999999999997e-06, + "loss": 0.615, + "step": 834 + }, + { + "epoch": 4.692957746478873, + "grad_norm": 2.577280044555664, + "learning_rate": 2.496e-06, + "loss": 0.6051, + "step": 835 + }, + { + "epoch": 4.698591549295775, + "grad_norm": 3.450861930847168, + "learning_rate": 2.499e-06, + "loss": 0.6528, + "step": 836 + }, + { + "epoch": 4.704225352112676, + "grad_norm": 2.7186598777770996, + "learning_rate": 2.502e-06, + "loss": 0.6306, + "step": 837 + }, + { + "epoch": 4.709859154929577, + "grad_norm": 2.859384298324585, + "learning_rate": 2.505e-06, + "loss": 0.6805, + "step": 838 + }, + { + "epoch": 4.715492957746479, + "grad_norm": 3.7891767024993896, + "learning_rate": 2.508e-06, + "loss": 0.6214, + "step": 839 + }, + { + "epoch": 4.72112676056338, + "grad_norm": 3.8955564498901367, + "learning_rate": 2.5109999999999998e-06, + "loss": 0.5786, + "step": 840 + }, + { + "epoch": 4.726760563380282, + "grad_norm": 6.762790203094482, + "learning_rate": 2.514e-06, + "loss": 0.5551, + "step": 841 + }, + { + "epoch": 4.732394366197183, + "grad_norm": 3.186650276184082, + "learning_rate": 2.517e-06, + "loss": 0.5718, + "step": 842 + }, + { + "epoch": 4.738028169014084, + "grad_norm": 4.856934547424316, + "learning_rate": 2.52e-06, + "loss": 0.5954, + "step": 843 + }, + { + "epoch": 4.743661971830986, + "grad_norm": 5.6527791023254395, + "learning_rate": 2.523e-06, + "loss": 0.6891, + "step": 844 + }, + { + "epoch": 4.749295774647887, + "grad_norm": 6.431458473205566, + "learning_rate": 2.526e-06, + "loss": 1.0238, + "step": 845 + }, + { + "epoch": 4.754929577464789, + "grad_norm": 3.19467830657959, + "learning_rate": 2.5290000000000003e-06, + "loss": 0.9253, + "step": 846 + }, + { + "epoch": 4.76056338028169, + "grad_norm": 3.688904285430908, + "learning_rate": 2.532e-06, + "loss": 0.8768, + "step": 847 + }, + { + "epoch": 4.766197183098591, + "grad_norm": 3.788217067718506, + "learning_rate": 2.5350000000000003e-06, + "loss": 0.9304, + "step": 848 + }, + { + "epoch": 4.771830985915493, + "grad_norm": 3.510573625564575, + "learning_rate": 2.538e-06, + "loss": 0.8295, + "step": 849 + }, + { + "epoch": 4.777464788732394, + "grad_norm": 3.672971248626709, + "learning_rate": 2.541e-06, + "loss": 0.8227, + "step": 850 + }, + { + "epoch": 4.783098591549296, + "grad_norm": 9.765240669250488, + "learning_rate": 2.544e-06, + "loss": 0.8331, + "step": 851 + }, + { + "epoch": 4.788732394366197, + "grad_norm": 4.7542924880981445, + "learning_rate": 2.5470000000000003e-06, + "loss": 0.8178, + "step": 852 + }, + { + "epoch": 4.794366197183098, + "grad_norm": 2.981187105178833, + "learning_rate": 2.55e-06, + "loss": 0.8533, + "step": 853 + }, + { + "epoch": 4.8, + "grad_norm": 5.025519847869873, + "learning_rate": 2.553e-06, + "loss": 0.8352, + "step": 854 + }, + { + "epoch": 4.805633802816901, + "grad_norm": 3.1089770793914795, + "learning_rate": 2.556e-06, + "loss": 0.7018, + "step": 855 + }, + { + "epoch": 4.811267605633803, + "grad_norm": 2.9379541873931885, + "learning_rate": 2.559e-06, + "loss": 0.7346, + "step": 856 + }, + { + "epoch": 4.816901408450704, + "grad_norm": 3.395998001098633, + "learning_rate": 2.562e-06, + "loss": 0.7934, + "step": 857 + }, + { + "epoch": 4.822535211267605, + "grad_norm": 3.98203182220459, + "learning_rate": 2.5650000000000004e-06, + "loss": 0.7462, + "step": 858 + }, + { + "epoch": 4.828169014084507, + "grad_norm": 2.7102365493774414, + "learning_rate": 2.568e-06, + "loss": 0.6715, + "step": 859 + }, + { + "epoch": 4.833802816901408, + "grad_norm": 6.256322860717773, + "learning_rate": 2.571e-06, + "loss": 0.8342, + "step": 860 + }, + { + "epoch": 4.83943661971831, + "grad_norm": 2.3925929069519043, + "learning_rate": 2.5740000000000003e-06, + "loss": 0.6078, + "step": 861 + }, + { + "epoch": 4.845070422535211, + "grad_norm": 2.342928647994995, + "learning_rate": 2.577e-06, + "loss": 0.6536, + "step": 862 + }, + { + "epoch": 4.850704225352112, + "grad_norm": 2.7884795665740967, + "learning_rate": 2.58e-06, + "loss": 0.6163, + "step": 863 + }, + { + "epoch": 4.856338028169014, + "grad_norm": 2.60550856590271, + "learning_rate": 2.583e-06, + "loss": 0.6322, + "step": 864 + }, + { + "epoch": 4.861971830985915, + "grad_norm": 3.2051239013671875, + "learning_rate": 2.586e-06, + "loss": 0.5733, + "step": 865 + }, + { + "epoch": 4.867605633802817, + "grad_norm": 2.349964141845703, + "learning_rate": 2.589e-06, + "loss": 0.5361, + "step": 866 + }, + { + "epoch": 4.873239436619718, + "grad_norm": 7.408186435699463, + "learning_rate": 2.5920000000000003e-06, + "loss": 0.6847, + "step": 867 + }, + { + "epoch": 4.878873239436619, + "grad_norm": 108.75530242919922, + "learning_rate": 2.5949999999999997e-06, + "loss": 0.5442, + "step": 868 + }, + { + "epoch": 4.884507042253521, + "grad_norm": 3.8446199893951416, + "learning_rate": 2.598e-06, + "loss": 0.6033, + "step": 869 + }, + { + "epoch": 4.890140845070422, + "grad_norm": 2.7630155086517334, + "learning_rate": 2.601e-06, + "loss": 0.5898, + "step": 870 + }, + { + "epoch": 4.895774647887324, + "grad_norm": 2.9433813095092773, + "learning_rate": 2.604e-06, + "loss": 0.6187, + "step": 871 + }, + { + "epoch": 4.901408450704225, + "grad_norm": 3.2574610710144043, + "learning_rate": 2.607e-06, + "loss": 0.4792, + "step": 872 + }, + { + "epoch": 4.907042253521126, + "grad_norm": 2.240130662918091, + "learning_rate": 2.61e-06, + "loss": 0.4657, + "step": 873 + }, + { + "epoch": 4.912676056338028, + "grad_norm": 3.9430322647094727, + "learning_rate": 2.613e-06, + "loss": 0.6533, + "step": 874 + }, + { + "epoch": 4.918309859154929, + "grad_norm": 2.186457633972168, + "learning_rate": 2.616e-06, + "loss": 0.4592, + "step": 875 + }, + { + "epoch": 4.923943661971831, + "grad_norm": 6.3133955001831055, + "learning_rate": 2.6190000000000003e-06, + "loss": 0.6135, + "step": 876 + }, + { + "epoch": 4.929577464788732, + "grad_norm": 2.4026143550872803, + "learning_rate": 2.622e-06, + "loss": 0.6044, + "step": 877 + }, + { + "epoch": 4.9352112676056334, + "grad_norm": 2.5374703407287598, + "learning_rate": 2.625e-06, + "loss": 0.5245, + "step": 878 + }, + { + "epoch": 4.940845070422535, + "grad_norm": 2.3137059211730957, + "learning_rate": 2.628e-06, + "loss": 0.6356, + "step": 879 + }, + { + "epoch": 4.946478873239436, + "grad_norm": 3.0200490951538086, + "learning_rate": 2.631e-06, + "loss": 0.5608, + "step": 880 + }, + { + "epoch": 4.952112676056338, + "grad_norm": 2.407966375350952, + "learning_rate": 2.634e-06, + "loss": 0.5017, + "step": 881 + }, + { + "epoch": 4.957746478873239, + "grad_norm": 4.552791118621826, + "learning_rate": 2.6370000000000003e-06, + "loss": 0.5556, + "step": 882 + }, + { + "epoch": 4.9633802816901404, + "grad_norm": 2.422546863555908, + "learning_rate": 2.6399999999999997e-06, + "loss": 0.4539, + "step": 883 + }, + { + "epoch": 4.969014084507043, + "grad_norm": 2.9516642093658447, + "learning_rate": 2.643e-06, + "loss": 0.5157, + "step": 884 + }, + { + "epoch": 4.974647887323943, + "grad_norm": 3.9323694705963135, + "learning_rate": 2.646e-06, + "loss": 0.5258, + "step": 885 + }, + { + "epoch": 4.9802816901408455, + "grad_norm": 4.032447338104248, + "learning_rate": 2.649e-06, + "loss": 0.6014, + "step": 886 + }, + { + "epoch": 4.985915492957746, + "grad_norm": 4.838427543640137, + "learning_rate": 2.652e-06, + "loss": 0.4939, + "step": 887 + }, + { + "epoch": 4.991549295774648, + "grad_norm": 7.549025535583496, + "learning_rate": 2.655e-06, + "loss": 0.5463, + "step": 888 + }, + { + "epoch": 4.997183098591549, + "grad_norm": 2.98043155670166, + "learning_rate": 2.6580000000000002e-06, + "loss": 0.6438, + "step": 889 + }, + { + "epoch": 5.0, + "grad_norm": 3.0263454914093018, + "learning_rate": 2.661e-06, + "loss": 0.2365, + "step": 890 + }, + { + "epoch": 5.005633802816901, + "grad_norm": 3.7286951541900635, + "learning_rate": 2.6640000000000002e-06, + "loss": 0.8897, + "step": 891 + }, + { + "epoch": 5.011267605633803, + "grad_norm": 3.884514570236206, + "learning_rate": 2.6670000000000005e-06, + "loss": 0.7991, + "step": 892 + }, + { + "epoch": 5.016901408450704, + "grad_norm": 3.591808319091797, + "learning_rate": 2.67e-06, + "loss": 0.7984, + "step": 893 + }, + { + "epoch": 5.022535211267606, + "grad_norm": 2.895336866378784, + "learning_rate": 2.673e-06, + "loss": 0.8905, + "step": 894 + }, + { + "epoch": 5.028169014084507, + "grad_norm": 4.619030952453613, + "learning_rate": 2.6760000000000003e-06, + "loss": 0.7814, + "step": 895 + }, + { + "epoch": 5.033802816901408, + "grad_norm": 3.87629771232605, + "learning_rate": 2.679e-06, + "loss": 0.7244, + "step": 896 + }, + { + "epoch": 5.03943661971831, + "grad_norm": 3.3396666049957275, + "learning_rate": 2.682e-06, + "loss": 0.681, + "step": 897 + }, + { + "epoch": 5.045070422535211, + "grad_norm": 4.312685489654541, + "learning_rate": 2.685e-06, + "loss": 0.6109, + "step": 898 + }, + { + "epoch": 5.050704225352113, + "grad_norm": 3.3607585430145264, + "learning_rate": 2.688e-06, + "loss": 0.6679, + "step": 899 + }, + { + "epoch": 5.056338028169014, + "grad_norm": 4.933312892913818, + "learning_rate": 2.691e-06, + "loss": 0.6408, + "step": 900 + }, + { + "epoch": 5.061971830985915, + "grad_norm": 7.953579425811768, + "learning_rate": 2.6940000000000004e-06, + "loss": 0.704, + "step": 901 + }, + { + "epoch": 5.067605633802817, + "grad_norm": 2.636009931564331, + "learning_rate": 2.6969999999999998e-06, + "loss": 0.6687, + "step": 902 + }, + { + "epoch": 5.073239436619718, + "grad_norm": 2.501701831817627, + "learning_rate": 2.7e-06, + "loss": 0.6241, + "step": 903 + }, + { + "epoch": 5.07887323943662, + "grad_norm": 3.6819188594818115, + "learning_rate": 2.703e-06, + "loss": 0.7167, + "step": 904 + }, + { + "epoch": 5.084507042253521, + "grad_norm": 2.071209192276001, + "learning_rate": 2.706e-06, + "loss": 0.5678, + "step": 905 + }, + { + "epoch": 5.090140845070422, + "grad_norm": 2.5370724201202393, + "learning_rate": 2.7090000000000002e-06, + "loss": 0.6386, + "step": 906 + }, + { + "epoch": 5.095774647887324, + "grad_norm": 5.760916233062744, + "learning_rate": 2.712e-06, + "loss": 0.6145, + "step": 907 + }, + { + "epoch": 5.101408450704225, + "grad_norm": 2.104323148727417, + "learning_rate": 2.715e-06, + "loss": 0.5532, + "step": 908 + }, + { + "epoch": 5.107042253521127, + "grad_norm": 2.2896480560302734, + "learning_rate": 2.718e-06, + "loss": 0.5573, + "step": 909 + }, + { + "epoch": 5.112676056338028, + "grad_norm": 4.796744346618652, + "learning_rate": 2.7210000000000003e-06, + "loss": 0.4646, + "step": 910 + }, + { + "epoch": 5.118309859154929, + "grad_norm": 2.22436785697937, + "learning_rate": 2.724e-06, + "loss": 0.5938, + "step": 911 + }, + { + "epoch": 5.123943661971831, + "grad_norm": 5.623243808746338, + "learning_rate": 2.727e-06, + "loss": 0.5198, + "step": 912 + }, + { + "epoch": 5.129577464788732, + "grad_norm": 2.2928428649902344, + "learning_rate": 2.73e-06, + "loss": 0.5933, + "step": 913 + }, + { + "epoch": 5.135211267605634, + "grad_norm": 3.24739408493042, + "learning_rate": 2.733e-06, + "loss": 0.4768, + "step": 914 + }, + { + "epoch": 5.140845070422535, + "grad_norm": 5.295236110687256, + "learning_rate": 2.736e-06, + "loss": 0.5845, + "step": 915 + }, + { + "epoch": 5.146478873239436, + "grad_norm": 5.906332015991211, + "learning_rate": 2.7390000000000004e-06, + "loss": 0.536, + "step": 916 + }, + { + "epoch": 5.152112676056338, + "grad_norm": 2.0323636531829834, + "learning_rate": 2.7419999999999998e-06, + "loss": 0.598, + "step": 917 + }, + { + "epoch": 5.157746478873239, + "grad_norm": 2.347740650177002, + "learning_rate": 2.745e-06, + "loss": 0.4687, + "step": 918 + }, + { + "epoch": 5.163380281690141, + "grad_norm": 5.7692999839782715, + "learning_rate": 2.748e-06, + "loss": 0.4441, + "step": 919 + }, + { + "epoch": 5.169014084507042, + "grad_norm": 2.8650996685028076, + "learning_rate": 2.751e-06, + "loss": 0.6155, + "step": 920 + }, + { + "epoch": 5.174647887323943, + "grad_norm": 2.7199292182922363, + "learning_rate": 2.7540000000000002e-06, + "loss": 0.5006, + "step": 921 + }, + { + "epoch": 5.180281690140845, + "grad_norm": 2.181318521499634, + "learning_rate": 2.757e-06, + "loss": 0.4766, + "step": 922 + }, + { + "epoch": 5.185915492957746, + "grad_norm": 2.543658971786499, + "learning_rate": 2.76e-06, + "loss": 0.5543, + "step": 923 + }, + { + "epoch": 5.191549295774648, + "grad_norm": 7.329205513000488, + "learning_rate": 2.763e-06, + "loss": 0.4265, + "step": 924 + }, + { + "epoch": 5.197183098591549, + "grad_norm": 2.5346627235412598, + "learning_rate": 2.7660000000000003e-06, + "loss": 0.4808, + "step": 925 + }, + { + "epoch": 5.20281690140845, + "grad_norm": 2.384472370147705, + "learning_rate": 2.7689999999999997e-06, + "loss": 0.4395, + "step": 926 + }, + { + "epoch": 5.208450704225352, + "grad_norm": 3.4939169883728027, + "learning_rate": 2.772e-06, + "loss": 0.4119, + "step": 927 + }, + { + "epoch": 5.214084507042253, + "grad_norm": 5.23539924621582, + "learning_rate": 2.775e-06, + "loss": 0.4431, + "step": 928 + }, + { + "epoch": 5.219718309859155, + "grad_norm": 2.3811333179473877, + "learning_rate": 2.778e-06, + "loss": 0.4725, + "step": 929 + }, + { + "epoch": 5.225352112676056, + "grad_norm": 6.978809356689453, + "learning_rate": 2.781e-06, + "loss": 0.4696, + "step": 930 + }, + { + "epoch": 5.230985915492957, + "grad_norm": 5.870415687561035, + "learning_rate": 2.784e-06, + "loss": 0.381, + "step": 931 + }, + { + "epoch": 5.236619718309859, + "grad_norm": 4.045934677124023, + "learning_rate": 2.787e-06, + "loss": 0.6097, + "step": 932 + }, + { + "epoch": 5.24225352112676, + "grad_norm": 21.88624382019043, + "learning_rate": 2.79e-06, + "loss": 0.3777, + "step": 933 + }, + { + "epoch": 5.247887323943662, + "grad_norm": 3.4917380809783936, + "learning_rate": 2.793e-06, + "loss": 0.523, + "step": 934 + }, + { + "epoch": 5.253521126760563, + "grad_norm": 2.8486669063568115, + "learning_rate": 2.7960000000000004e-06, + "loss": 0.7939, + "step": 935 + }, + { + "epoch": 5.259154929577464, + "grad_norm": 3.822283983230591, + "learning_rate": 2.799e-06, + "loss": 0.8049, + "step": 936 + }, + { + "epoch": 5.264788732394366, + "grad_norm": 3.4523532390594482, + "learning_rate": 2.802e-06, + "loss": 0.7975, + "step": 937 + }, + { + "epoch": 5.270422535211267, + "grad_norm": 2.5025782585144043, + "learning_rate": 2.8050000000000002e-06, + "loss": 0.6506, + "step": 938 + }, + { + "epoch": 5.276056338028169, + "grad_norm": 2.210494041442871, + "learning_rate": 2.808e-06, + "loss": 0.6887, + "step": 939 + }, + { + "epoch": 5.28169014084507, + "grad_norm": 2.1525866985321045, + "learning_rate": 2.8110000000000003e-06, + "loss": 0.7456, + "step": 940 + }, + { + "epoch": 5.2873239436619714, + "grad_norm": 6.6180338859558105, + "learning_rate": 2.814e-06, + "loss": 0.6736, + "step": 941 + }, + { + "epoch": 5.292957746478873, + "grad_norm": 2.8995516300201416, + "learning_rate": 2.817e-06, + "loss": 0.6474, + "step": 942 + }, + { + "epoch": 5.298591549295774, + "grad_norm": 2.2757349014282227, + "learning_rate": 2.82e-06, + "loss": 0.6859, + "step": 943 + }, + { + "epoch": 5.304225352112676, + "grad_norm": 2.54069185256958, + "learning_rate": 2.8230000000000003e-06, + "loss": 0.5716, + "step": 944 + }, + { + "epoch": 5.309859154929577, + "grad_norm": 3.2585108280181885, + "learning_rate": 2.826e-06, + "loss": 0.5769, + "step": 945 + }, + { + "epoch": 5.3154929577464785, + "grad_norm": 2.3954012393951416, + "learning_rate": 2.829e-06, + "loss": 0.5716, + "step": 946 + }, + { + "epoch": 5.321126760563381, + "grad_norm": 2.4710357189178467, + "learning_rate": 2.832e-06, + "loss": 0.5858, + "step": 947 + }, + { + "epoch": 5.326760563380281, + "grad_norm": 2.4542315006256104, + "learning_rate": 2.835e-06, + "loss": 0.5595, + "step": 948 + }, + { + "epoch": 5.3323943661971835, + "grad_norm": 4.040088176727295, + "learning_rate": 2.838e-06, + "loss": 0.4993, + "step": 949 + }, + { + "epoch": 5.338028169014084, + "grad_norm": 2.6751508712768555, + "learning_rate": 2.8410000000000004e-06, + "loss": 0.5521, + "step": 950 + }, + { + "epoch": 5.343661971830986, + "grad_norm": 1.8362388610839844, + "learning_rate": 2.844e-06, + "loss": 0.4828, + "step": 951 + }, + { + "epoch": 5.349295774647887, + "grad_norm": 3.870666980743408, + "learning_rate": 2.847e-06, + "loss": 0.5741, + "step": 952 + }, + { + "epoch": 5.354929577464789, + "grad_norm": 2.98319411277771, + "learning_rate": 2.8500000000000002e-06, + "loss": 0.4582, + "step": 953 + }, + { + "epoch": 5.3605633802816905, + "grad_norm": 2.085663080215454, + "learning_rate": 2.853e-06, + "loss": 0.4475, + "step": 954 + }, + { + "epoch": 5.366197183098592, + "grad_norm": 2.988067626953125, + "learning_rate": 2.8560000000000003e-06, + "loss": 0.5461, + "step": 955 + }, + { + "epoch": 5.371830985915493, + "grad_norm": 2.501213312149048, + "learning_rate": 2.859e-06, + "loss": 0.5009, + "step": 956 + }, + { + "epoch": 5.377464788732395, + "grad_norm": 5.369483947753906, + "learning_rate": 2.862e-06, + "loss": 0.4742, + "step": 957 + }, + { + "epoch": 5.383098591549296, + "grad_norm": 3.1307785511016846, + "learning_rate": 2.865e-06, + "loss": 0.4614, + "step": 958 + }, + { + "epoch": 5.3887323943661976, + "grad_norm": 2.3280138969421387, + "learning_rate": 2.8680000000000003e-06, + "loss": 0.4165, + "step": 959 + }, + { + "epoch": 5.394366197183099, + "grad_norm": 3.9419243335723877, + "learning_rate": 2.8709999999999997e-06, + "loss": 0.4646, + "step": 960 + }, + { + "epoch": 5.4, + "grad_norm": 1.6847516298294067, + "learning_rate": 2.874e-06, + "loss": 0.4, + "step": 961 + }, + { + "epoch": 5.405633802816902, + "grad_norm": 2.56376576423645, + "learning_rate": 2.877e-06, + "loss": 0.4022, + "step": 962 + }, + { + "epoch": 5.411267605633803, + "grad_norm": 2.88046932220459, + "learning_rate": 2.88e-06, + "loss": 0.4373, + "step": 963 + }, + { + "epoch": 5.416901408450705, + "grad_norm": 2.5140233039855957, + "learning_rate": 2.883e-06, + "loss": 0.476, + "step": 964 + }, + { + "epoch": 5.422535211267606, + "grad_norm": 3.400726079940796, + "learning_rate": 2.886e-06, + "loss": 0.3556, + "step": 965 + }, + { + "epoch": 5.428169014084507, + "grad_norm": 2.35408091545105, + "learning_rate": 2.8889999999999998e-06, + "loss": 0.3892, + "step": 966 + }, + { + "epoch": 5.433802816901409, + "grad_norm": 2.032426357269287, + "learning_rate": 2.892e-06, + "loss": 0.4039, + "step": 967 + }, + { + "epoch": 5.43943661971831, + "grad_norm": 2.5404016971588135, + "learning_rate": 2.8950000000000002e-06, + "loss": 0.3942, + "step": 968 + }, + { + "epoch": 5.445070422535212, + "grad_norm": 2.5330841541290283, + "learning_rate": 2.898e-06, + "loss": 0.3797, + "step": 969 + }, + { + "epoch": 5.450704225352113, + "grad_norm": 6.841852188110352, + "learning_rate": 2.901e-06, + "loss": 0.4562, + "step": 970 + }, + { + "epoch": 5.456338028169014, + "grad_norm": 2.308804988861084, + "learning_rate": 2.904e-06, + "loss": 0.5075, + "step": 971 + }, + { + "epoch": 5.461971830985916, + "grad_norm": 3.835463523864746, + "learning_rate": 2.907e-06, + "loss": 0.3515, + "step": 972 + }, + { + "epoch": 5.467605633802817, + "grad_norm": 8.132750511169434, + "learning_rate": 2.91e-06, + "loss": 0.4574, + "step": 973 + }, + { + "epoch": 5.473239436619719, + "grad_norm": 2.29156756401062, + "learning_rate": 2.9130000000000003e-06, + "loss": 0.3985, + "step": 974 + }, + { + "epoch": 5.47887323943662, + "grad_norm": 3.6257643699645996, + "learning_rate": 2.916e-06, + "loss": 0.4139, + "step": 975 + }, + { + "epoch": 5.484507042253521, + "grad_norm": 2.5936882495880127, + "learning_rate": 2.919e-06, + "loss": 0.4021, + "step": 976 + }, + { + "epoch": 5.490140845070423, + "grad_norm": 3.8019628524780273, + "learning_rate": 2.922e-06, + "loss": 0.476, + "step": 977 + }, + { + "epoch": 5.495774647887324, + "grad_norm": 4.725766658782959, + "learning_rate": 2.9250000000000004e-06, + "loss": 0.4575, + "step": 978 + }, + { + "epoch": 5.501408450704226, + "grad_norm": 5.978580951690674, + "learning_rate": 2.928e-06, + "loss": 0.8134, + "step": 979 + }, + { + "epoch": 5.507042253521127, + "grad_norm": 2.873779773712158, + "learning_rate": 2.931e-06, + "loss": 0.74, + "step": 980 + }, + { + "epoch": 5.512676056338028, + "grad_norm": 2.560568332672119, + "learning_rate": 2.934e-06, + "loss": 0.7099, + "step": 981 + }, + { + "epoch": 5.51830985915493, + "grad_norm": 2.4004578590393066, + "learning_rate": 2.937e-06, + "loss": 0.6652, + "step": 982 + }, + { + "epoch": 5.523943661971831, + "grad_norm": 3.904620409011841, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.7448, + "step": 983 + }, + { + "epoch": 5.529577464788733, + "grad_norm": 1.8310728073120117, + "learning_rate": 2.9430000000000005e-06, + "loss": 0.6161, + "step": 984 + }, + { + "epoch": 5.535211267605634, + "grad_norm": 8.018976211547852, + "learning_rate": 2.946e-06, + "loss": 0.5445, + "step": 985 + }, + { + "epoch": 5.540845070422535, + "grad_norm": 2.1463332176208496, + "learning_rate": 2.949e-06, + "loss": 0.6716, + "step": 986 + }, + { + "epoch": 5.546478873239437, + "grad_norm": 2.329373359680176, + "learning_rate": 2.9520000000000003e-06, + "loss": 0.5893, + "step": 987 + }, + { + "epoch": 5.552112676056338, + "grad_norm": 2.2067439556121826, + "learning_rate": 2.955e-06, + "loss": 0.5582, + "step": 988 + }, + { + "epoch": 5.55774647887324, + "grad_norm": 2.535163164138794, + "learning_rate": 2.958e-06, + "loss": 0.6627, + "step": 989 + }, + { + "epoch": 5.563380281690141, + "grad_norm": 2.639993190765381, + "learning_rate": 2.961e-06, + "loss": 0.4677, + "step": 990 + }, + { + "epoch": 5.569014084507042, + "grad_norm": 2.374748945236206, + "learning_rate": 2.964e-06, + "loss": 0.5139, + "step": 991 + }, + { + "epoch": 5.574647887323944, + "grad_norm": 7.6541314125061035, + "learning_rate": 2.967e-06, + "loss": 0.4792, + "step": 992 + }, + { + "epoch": 5.580281690140845, + "grad_norm": 1.8786181211471558, + "learning_rate": 2.9700000000000004e-06, + "loss": 0.4939, + "step": 993 + }, + { + "epoch": 5.585915492957747, + "grad_norm": 2.8175878524780273, + "learning_rate": 2.9729999999999997e-06, + "loss": 0.5425, + "step": 994 + }, + { + "epoch": 5.591549295774648, + "grad_norm": 4.114474773406982, + "learning_rate": 2.976e-06, + "loss": 0.4222, + "step": 995 + }, + { + "epoch": 5.597183098591549, + "grad_norm": 7.6624860763549805, + "learning_rate": 2.979e-06, + "loss": 0.4513, + "step": 996 + }, + { + "epoch": 5.602816901408451, + "grad_norm": 1.9664232730865479, + "learning_rate": 2.982e-06, + "loss": 0.4046, + "step": 997 + }, + { + "epoch": 5.608450704225352, + "grad_norm": 2.2487237453460693, + "learning_rate": 2.9850000000000002e-06, + "loss": 0.4825, + "step": 998 + }, + { + "epoch": 5.614084507042254, + "grad_norm": 2.0302340984344482, + "learning_rate": 2.988e-06, + "loss": 0.5438, + "step": 999 + }, + { + "epoch": 5.619718309859155, + "grad_norm": 2.2003347873687744, + "learning_rate": 2.991e-06, + "loss": 0.4609, + "step": 1000 + }, + { + "epoch": 5.619718309859155, + "eval_cer": 0.1952717940741893, + "eval_loss": 0.7326682209968567, + "eval_runtime": 17.4904, + "eval_samples_per_second": 17.381, + "eval_steps_per_second": 0.572, + "eval_wer": 0.6745970836531082, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 565, + "save_steps": 1000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.055486320296336e+19, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}