{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9812889812889813, "eval_steps": 500, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008316008316008316, "grad_norm": 8.161552429199219, "learning_rate": 0.0, "loss": 4.8011, "step": 1 }, { "epoch": 0.016632016632016633, "grad_norm": 8.788352966308594, "learning_rate": 2.0000000000000003e-06, "loss": 4.8042, "step": 2 }, { "epoch": 0.02494802494802495, "grad_norm": 7.977554798126221, "learning_rate": 4.000000000000001e-06, "loss": 4.8113, "step": 3 }, { "epoch": 0.033264033264033266, "grad_norm": 8.319157600402832, "learning_rate": 6e-06, "loss": 4.8424, "step": 4 }, { "epoch": 0.04158004158004158, "grad_norm": 6.344866752624512, "learning_rate": 8.000000000000001e-06, "loss": 4.7736, "step": 5 }, { "epoch": 0.0498960498960499, "grad_norm": 5.9034647941589355, "learning_rate": 1e-05, "loss": 4.7142, "step": 6 }, { "epoch": 0.058212058212058215, "grad_norm": 5.377430438995361, "learning_rate": 1.2e-05, "loss": 4.6148, "step": 7 }, { "epoch": 0.06652806652806653, "grad_norm": 6.9711222648620605, "learning_rate": 1.4000000000000001e-05, "loss": 4.5419, "step": 8 }, { "epoch": 0.07484407484407485, "grad_norm": 6.142094612121582, "learning_rate": 1.6000000000000003e-05, "loss": 4.4492, "step": 9 }, { "epoch": 0.08316008316008316, "grad_norm": 4.155233383178711, "learning_rate": 1.8e-05, "loss": 4.2954, "step": 10 }, { "epoch": 0.09147609147609148, "grad_norm": 3.6919031143188477, "learning_rate": 2e-05, "loss": 4.208, "step": 11 }, { "epoch": 0.0997920997920998, "grad_norm": 3.5758368968963623, "learning_rate": 2.2000000000000003e-05, "loss": 4.0939, "step": 12 }, { "epoch": 0.10810810810810811, "grad_norm": 2.7365787029266357, "learning_rate": 2.4e-05, "loss": 4.0046, "step": 13 }, { "epoch": 0.11642411642411643, "grad_norm": 2.0502889156341553, "learning_rate": 2.6000000000000002e-05, "loss": 3.9557, "step": 14 }, { "epoch": 0.12474012474012475, "grad_norm": 2.2007663249969482, "learning_rate": 2.8000000000000003e-05, "loss": 3.8579, "step": 15 }, { "epoch": 0.13305613305613306, "grad_norm": 2.462459087371826, "learning_rate": 3e-05, "loss": 3.8167, "step": 16 }, { "epoch": 0.14137214137214138, "grad_norm": 2.7364518642425537, "learning_rate": 3.2000000000000005e-05, "loss": 3.8004, "step": 17 }, { "epoch": 0.1496881496881497, "grad_norm": 2.85400128364563, "learning_rate": 3.4000000000000007e-05, "loss": 3.6991, "step": 18 }, { "epoch": 0.158004158004158, "grad_norm": 2.499318838119507, "learning_rate": 3.6e-05, "loss": 3.6456, "step": 19 }, { "epoch": 0.16632016632016633, "grad_norm": 2.214195728302002, "learning_rate": 3.8e-05, "loss": 3.6195, "step": 20 }, { "epoch": 0.17463617463617465, "grad_norm": 2.195711135864258, "learning_rate": 4e-05, "loss": 3.5519, "step": 21 }, { "epoch": 0.18295218295218296, "grad_norm": 1.992714762687683, "learning_rate": 4.2e-05, "loss": 3.5034, "step": 22 }, { "epoch": 0.19126819126819128, "grad_norm": 2.6501753330230713, "learning_rate": 4.4000000000000006e-05, "loss": 3.4722, "step": 23 }, { "epoch": 0.1995841995841996, "grad_norm": 2.2270214557647705, "learning_rate": 4.600000000000001e-05, "loss": 3.4126, "step": 24 }, { "epoch": 0.2079002079002079, "grad_norm": 3.029968023300171, "learning_rate": 4.8e-05, "loss": 3.3517, "step": 25 }, { "epoch": 0.21621621621621623, "grad_norm": 1.7878334522247314, "learning_rate": 5e-05, "loss": 3.3321, "step": 26 }, { "epoch": 0.22453222453222454, "grad_norm": 1.6116269826889038, "learning_rate": 5.2000000000000004e-05, "loss": 3.2636, "step": 27 }, { "epoch": 0.23284823284823286, "grad_norm": 2.6376936435699463, "learning_rate": 5.4000000000000005e-05, "loss": 3.2541, "step": 28 }, { "epoch": 0.24116424116424118, "grad_norm": 10.313434600830078, "learning_rate": 5.6000000000000006e-05, "loss": 3.2762, "step": 29 }, { "epoch": 0.2494802494802495, "grad_norm": 5.4037251472473145, "learning_rate": 5.8e-05, "loss": 3.2097, "step": 30 }, { "epoch": 0.2577962577962578, "grad_norm": 15.105055809020996, "learning_rate": 6e-05, "loss": 3.2342, "step": 31 }, { "epoch": 0.2661122661122661, "grad_norm": 15.099345207214355, "learning_rate": 6.2e-05, "loss": 3.2316, "step": 32 }, { "epoch": 0.27442827442827444, "grad_norm": 1.7807673215866089, "learning_rate": 6.400000000000001e-05, "loss": 3.1318, "step": 33 }, { "epoch": 0.28274428274428276, "grad_norm": 7.983642101287842, "learning_rate": 6.6e-05, "loss": 3.1442, "step": 34 }, { "epoch": 0.2910602910602911, "grad_norm": 4.913079738616943, "learning_rate": 6.800000000000001e-05, "loss": 3.1082, "step": 35 }, { "epoch": 0.2993762993762994, "grad_norm": 12.353167533874512, "learning_rate": 7e-05, "loss": 3.1656, "step": 36 }, { "epoch": 0.3076923076923077, "grad_norm": 10.911690711975098, "learning_rate": 7.2e-05, "loss": 3.1361, "step": 37 }, { "epoch": 0.316008316008316, "grad_norm": 1.3475183248519897, "learning_rate": 7.4e-05, "loss": 3.057, "step": 38 }, { "epoch": 0.32432432432432434, "grad_norm": 7.743471145629883, "learning_rate": 7.6e-05, "loss": 3.0549, "step": 39 }, { "epoch": 0.33264033264033266, "grad_norm": 4.499805450439453, "learning_rate": 7.800000000000001e-05, "loss": 2.9976, "step": 40 }, { "epoch": 0.340956340956341, "grad_norm": 4.716672420501709, "learning_rate": 8e-05, "loss": 2.9922, "step": 41 }, { "epoch": 0.3492723492723493, "grad_norm": 5.240478038787842, "learning_rate": 8.2e-05, "loss": 3.0115, "step": 42 }, { "epoch": 0.3575883575883576, "grad_norm": 1.5897458791732788, "learning_rate": 8.4e-05, "loss": 2.9899, "step": 43 }, { "epoch": 0.3659043659043659, "grad_norm": 2.463665723800659, "learning_rate": 8.6e-05, "loss": 2.9799, "step": 44 }, { "epoch": 0.37422037422037424, "grad_norm": 1.827469825744629, "learning_rate": 8.800000000000001e-05, "loss": 2.928, "step": 45 }, { "epoch": 0.38253638253638256, "grad_norm": 1.878302812576294, "learning_rate": 9e-05, "loss": 2.9717, "step": 46 }, { "epoch": 0.3908523908523909, "grad_norm": 2.1960086822509766, "learning_rate": 9.200000000000001e-05, "loss": 2.9037, "step": 47 }, { "epoch": 0.3991683991683992, "grad_norm": 3.8804755210876465, "learning_rate": 9.4e-05, "loss": 2.9174, "step": 48 }, { "epoch": 0.4074844074844075, "grad_norm": 1.1289819478988647, "learning_rate": 9.6e-05, "loss": 2.8997, "step": 49 }, { "epoch": 0.4158004158004158, "grad_norm": 4.365386009216309, "learning_rate": 9.8e-05, "loss": 2.8705, "step": 50 }, { "epoch": 0.42411642411642414, "grad_norm": 4.703887462615967, "learning_rate": 0.0001, "loss": 2.876, "step": 51 }, { "epoch": 0.43243243243243246, "grad_norm": 2.8481767177581787, "learning_rate": 0.00010200000000000001, "loss": 2.8617, "step": 52 }, { "epoch": 0.4407484407484408, "grad_norm": 10.58674144744873, "learning_rate": 0.00010400000000000001, "loss": 2.8998, "step": 53 }, { "epoch": 0.4490644490644491, "grad_norm": 6.807188510894775, "learning_rate": 0.00010600000000000002, "loss": 2.8795, "step": 54 }, { "epoch": 0.4573804573804574, "grad_norm": 2.453004837036133, "learning_rate": 0.00010800000000000001, "loss": 2.8707, "step": 55 }, { "epoch": 0.4656964656964657, "grad_norm": 1.8895411491394043, "learning_rate": 0.00011000000000000002, "loss": 2.8367, "step": 56 }, { "epoch": 0.47401247401247404, "grad_norm": 3.585893154144287, "learning_rate": 0.00011200000000000001, "loss": 2.8564, "step": 57 }, { "epoch": 0.48232848232848236, "grad_norm": 2.117868661880493, "learning_rate": 0.00011399999999999999, "loss": 2.8326, "step": 58 }, { "epoch": 0.49064449064449067, "grad_norm": 1.4010989665985107, "learning_rate": 0.000116, "loss": 2.784, "step": 59 }, { "epoch": 0.498960498960499, "grad_norm": 0.8910171985626221, "learning_rate": 0.000118, "loss": 2.7828, "step": 60 }, { "epoch": 0.5072765072765073, "grad_norm": 0.8965553641319275, "learning_rate": 0.00012, "loss": 2.7768, "step": 61 }, { "epoch": 0.5155925155925156, "grad_norm": 1.0502556562423706, "learning_rate": 0.000122, "loss": 2.7733, "step": 62 }, { "epoch": 0.5239085239085239, "grad_norm": 2.425708532333374, "learning_rate": 0.000124, "loss": 2.7897, "step": 63 }, { "epoch": 0.5322245322245323, "grad_norm": 0.8981500864028931, "learning_rate": 0.000126, "loss": 2.7452, "step": 64 }, { "epoch": 0.5405405405405406, "grad_norm": 0.7442967891693115, "learning_rate": 0.00012800000000000002, "loss": 2.7297, "step": 65 }, { "epoch": 0.5488565488565489, "grad_norm": 1.2096165418624878, "learning_rate": 0.00013000000000000002, "loss": 2.7521, "step": 66 }, { "epoch": 0.5571725571725572, "grad_norm": 5.993701457977295, "learning_rate": 0.000132, "loss": 2.7633, "step": 67 }, { "epoch": 0.5654885654885655, "grad_norm": 1.6413137912750244, "learning_rate": 0.000134, "loss": 2.7269, "step": 68 }, { "epoch": 0.5738045738045738, "grad_norm": 2.8430962562561035, "learning_rate": 0.00013600000000000003, "loss": 2.7717, "step": 69 }, { "epoch": 0.5821205821205822, "grad_norm": 2.3216440677642822, "learning_rate": 0.000138, "loss": 2.8099, "step": 70 }, { "epoch": 0.5904365904365905, "grad_norm": 1.4732354879379272, "learning_rate": 0.00014, "loss": 2.7525, "step": 71 }, { "epoch": 0.5987525987525988, "grad_norm": 1.524367332458496, "learning_rate": 0.000142, "loss": 2.7422, "step": 72 }, { "epoch": 0.6070686070686071, "grad_norm": 1.230338215827942, "learning_rate": 0.000144, "loss": 2.7555, "step": 73 }, { "epoch": 0.6153846153846154, "grad_norm": 0.9941631555557251, "learning_rate": 0.000146, "loss": 2.7042, "step": 74 }, { "epoch": 0.6237006237006237, "grad_norm": 1.3642252683639526, "learning_rate": 0.000148, "loss": 2.7088, "step": 75 }, { "epoch": 0.632016632016632, "grad_norm": 0.681107223033905, "learning_rate": 0.00015000000000000001, "loss": 2.7168, "step": 76 }, { "epoch": 0.6403326403326404, "grad_norm": 0.8406685590744019, "learning_rate": 0.000152, "loss": 2.6938, "step": 77 }, { "epoch": 0.6486486486486487, "grad_norm": 0.6661787033081055, "learning_rate": 0.000154, "loss": 2.7035, "step": 78 }, { "epoch": 0.656964656964657, "grad_norm": 0.5472131967544556, "learning_rate": 0.00015600000000000002, "loss": 2.6913, "step": 79 }, { "epoch": 0.6652806652806653, "grad_norm": 0.5465010404586792, "learning_rate": 0.00015800000000000002, "loss": 2.6913, "step": 80 }, { "epoch": 0.6735966735966736, "grad_norm": 0.6352857351303101, "learning_rate": 0.00016, "loss": 2.6834, "step": 81 }, { "epoch": 0.681912681912682, "grad_norm": 0.7992680668830872, "learning_rate": 0.000162, "loss": 2.6984, "step": 82 }, { "epoch": 0.6902286902286903, "grad_norm": 0.543773889541626, "learning_rate": 0.000164, "loss": 2.6631, "step": 83 }, { "epoch": 0.6985446985446986, "grad_norm": 0.4968494474887848, "learning_rate": 0.000166, "loss": 2.6707, "step": 84 }, { "epoch": 0.7068607068607069, "grad_norm": 0.5793298482894897, "learning_rate": 0.000168, "loss": 2.6776, "step": 85 }, { "epoch": 0.7151767151767152, "grad_norm": 1.137745976448059, "learning_rate": 0.00017, "loss": 2.6852, "step": 86 }, { "epoch": 0.7234927234927235, "grad_norm": 1.2862417697906494, "learning_rate": 0.000172, "loss": 2.6967, "step": 87 }, { "epoch": 0.7318087318087318, "grad_norm": 0.5603763461112976, "learning_rate": 0.000174, "loss": 2.6928, "step": 88 }, { "epoch": 0.7401247401247402, "grad_norm": 0.9993265867233276, "learning_rate": 0.00017600000000000002, "loss": 2.667, "step": 89 }, { "epoch": 0.7484407484407485, "grad_norm": 0.982528567314148, "learning_rate": 0.00017800000000000002, "loss": 2.6673, "step": 90 }, { "epoch": 0.7567567567567568, "grad_norm": 0.6244588494300842, "learning_rate": 0.00018, "loss": 2.6803, "step": 91 }, { "epoch": 0.7650727650727651, "grad_norm": 0.5460201501846313, "learning_rate": 0.000182, "loss": 2.6341, "step": 92 }, { "epoch": 0.7733887733887734, "grad_norm": 0.618466854095459, "learning_rate": 0.00018400000000000003, "loss": 2.6338, "step": 93 }, { "epoch": 0.7817047817047817, "grad_norm": 0.5517793893814087, "learning_rate": 0.00018600000000000002, "loss": 2.6288, "step": 94 }, { "epoch": 0.7900207900207901, "grad_norm": 0.5810732841491699, "learning_rate": 0.000188, "loss": 2.6575, "step": 95 }, { "epoch": 0.7983367983367984, "grad_norm": 0.5090439915657043, "learning_rate": 0.00019, "loss": 2.6646, "step": 96 }, { "epoch": 0.8066528066528067, "grad_norm": 0.5052196979522705, "learning_rate": 0.000192, "loss": 2.6515, "step": 97 }, { "epoch": 0.814968814968815, "grad_norm": 0.4681943953037262, "learning_rate": 0.000194, "loss": 2.6507, "step": 98 }, { "epoch": 0.8232848232848233, "grad_norm": 0.47244778275489807, "learning_rate": 0.000196, "loss": 2.6259, "step": 99 }, { "epoch": 0.8316008316008316, "grad_norm": 0.6378790140151978, "learning_rate": 0.00019800000000000002, "loss": 2.6371, "step": 100 }, { "epoch": 0.83991683991684, "grad_norm": 0.5825613737106323, "learning_rate": 0.0002, "loss": 2.6397, "step": 101 }, { "epoch": 0.8482328482328483, "grad_norm": 0.4540935754776001, "learning_rate": 0.00019999270008556108, "loss": 2.6295, "step": 102 }, { "epoch": 0.8565488565488566, "grad_norm": 0.4203695058822632, "learning_rate": 0.00019997080140801932, "loss": 2.6144, "step": 103 }, { "epoch": 0.8648648648648649, "grad_norm": 0.4233225882053375, "learning_rate": 0.00019993430716454413, "loss": 2.6378, "step": 104 }, { "epoch": 0.8731808731808732, "grad_norm": 0.4105020761489868, "learning_rate": 0.00019988322268323268, "loss": 2.6502, "step": 105 }, { "epoch": 0.8814968814968815, "grad_norm": 0.38928887248039246, "learning_rate": 0.00019981755542233177, "loss": 2.6154, "step": 106 }, { "epoch": 0.8898128898128899, "grad_norm": 0.4148002862930298, "learning_rate": 0.00019973731496914914, "loss": 2.6405, "step": 107 }, { "epoch": 0.8981288981288982, "grad_norm": 0.39650681614875793, "learning_rate": 0.00019964251303865362, "loss": 2.6215, "step": 108 }, { "epoch": 0.9064449064449065, "grad_norm": 0.4093710780143738, "learning_rate": 0.00019953316347176488, "loss": 2.6382, "step": 109 }, { "epoch": 0.9147609147609148, "grad_norm": 0.43350887298583984, "learning_rate": 0.00019940928223333252, "loss": 2.6281, "step": 110 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5919923782348633, "learning_rate": 0.0001992708874098054, "loss": 2.6348, "step": 111 }, { "epoch": 0.9313929313929314, "grad_norm": 2.143718957901001, "learning_rate": 0.00019911799920659093, "loss": 2.6295, "step": 112 }, { "epoch": 0.9397089397089398, "grad_norm": 0.8955861330032349, "learning_rate": 0.0001989506399451051, "loss": 2.5963, "step": 113 }, { "epoch": 0.9480249480249481, "grad_norm": 0.7444430589675903, "learning_rate": 0.00019876883405951377, "loss": 2.6131, "step": 114 }, { "epoch": 0.9563409563409564, "grad_norm": 0.6534228324890137, "learning_rate": 0.0001985726080931651, "loss": 2.621, "step": 115 }, { "epoch": 0.9646569646569647, "grad_norm": 0.6738382577896118, "learning_rate": 0.00019836199069471437, "loss": 2.6123, "step": 116 }, { "epoch": 0.972972972972973, "grad_norm": 0.46829137206077576, "learning_rate": 0.00019813701261394136, "loss": 2.6216, "step": 117 }, { "epoch": 0.9812889812889813, "grad_norm": 0.5436323881149292, "learning_rate": 0.00019789770669726087, "loss": 2.6364, "step": 118 }, { "epoch": 0.9896049896049897, "grad_norm": 0.3988193869590759, "learning_rate": 0.00019764410788292722, "loss": 2.6036, "step": 119 }, { "epoch": 0.997920997920998, "grad_norm": 0.43007829785346985, "learning_rate": 0.00019737625319593335, "loss": 2.6067, "step": 120 }, { "epoch": 1.0, "grad_norm": 0.6972622871398926, "learning_rate": 0.0001970941817426052, "loss": 2.5875, "step": 121 }, { "epoch": 1.0083160083160083, "grad_norm": 0.7436782121658325, "learning_rate": 0.00019679793470489228, "loss": 2.6127, "step": 122 }, { "epoch": 1.0166320166320166, "grad_norm": 2.4193379878997803, "learning_rate": 0.00019648755533435518, "loss": 2.575, "step": 123 }, { "epoch": 1.024948024948025, "grad_norm": 0.6707261204719543, "learning_rate": 0.00019616308894585078, "loss": 2.5704, "step": 124 }, { "epoch": 1.0332640332640333, "grad_norm": 0.46323344111442566, "learning_rate": 0.00019582458291091663, "loss": 2.5438, "step": 125 }, { "epoch": 1.0415800415800416, "grad_norm": 0.5376307964324951, "learning_rate": 0.00019547208665085457, "loss": 2.5629, "step": 126 }, { "epoch": 1.04989604989605, "grad_norm": 0.4404616951942444, "learning_rate": 0.00019510565162951537, "loss": 2.5539, "step": 127 }, { "epoch": 1.0582120582120582, "grad_norm": 0.45081833004951477, "learning_rate": 0.00019472533134578507, "loss": 2.558, "step": 128 }, { "epoch": 1.0665280665280665, "grad_norm": 0.6300471425056458, "learning_rate": 0.0001943311813257743, "loss": 2.5688, "step": 129 }, { "epoch": 1.0748440748440748, "grad_norm": 0.49277371168136597, "learning_rate": 0.00019392325911471155, "loss": 2.5545, "step": 130 }, { "epoch": 1.0831600831600832, "grad_norm": 0.4726185202598572, "learning_rate": 0.0001935016242685415, "loss": 2.569, "step": 131 }, { "epoch": 1.0914760914760915, "grad_norm": 0.5317509174346924, "learning_rate": 0.00019306633834523024, "loss": 2.5738, "step": 132 }, { "epoch": 1.0997920997920998, "grad_norm": 0.4991399645805359, "learning_rate": 0.00019261746489577765, "loss": 2.5491, "step": 133 }, { "epoch": 1.1081081081081081, "grad_norm": 0.6071000099182129, "learning_rate": 0.0001921550694549393, "loss": 2.5667, "step": 134 }, { "epoch": 1.1164241164241164, "grad_norm": 0.4629747271537781, "learning_rate": 0.00019167921953165825, "loss": 2.5514, "step": 135 }, { "epoch": 1.1247401247401247, "grad_norm": 0.41420885920524597, "learning_rate": 0.00019118998459920902, "loss": 2.5432, "step": 136 }, { "epoch": 1.133056133056133, "grad_norm": 0.41042062640190125, "learning_rate": 0.00019068743608505455, "loss": 2.5783, "step": 137 }, { "epoch": 1.1413721413721414, "grad_norm": 0.40009400248527527, "learning_rate": 0.00019017164736041795, "loss": 2.5598, "step": 138 }, { "epoch": 1.1496881496881497, "grad_norm": 0.44067704677581787, "learning_rate": 0.00018964269372957038, "loss": 2.56, "step": 139 }, { "epoch": 1.158004158004158, "grad_norm": 0.4079325795173645, "learning_rate": 0.0001891006524188368, "loss": 2.5748, "step": 140 }, { "epoch": 1.1663201663201663, "grad_norm": 0.4190751314163208, "learning_rate": 0.000188545602565321, "loss": 2.5569, "step": 141 }, { "epoch": 1.1746361746361746, "grad_norm": 0.39320653676986694, "learning_rate": 0.00018797762520535177, "loss": 2.5689, "step": 142 }, { "epoch": 1.182952182952183, "grad_norm": 0.3822016716003418, "learning_rate": 0.0001873968032626518, "loss": 2.543, "step": 143 }, { "epoch": 1.1912681912681913, "grad_norm": 0.37677645683288574, "learning_rate": 0.00018680322153623075, "loss": 2.5315, "step": 144 }, { "epoch": 1.1995841995841996, "grad_norm": 0.3912084996700287, "learning_rate": 0.00018619696668800492, "loss": 2.5344, "step": 145 }, { "epoch": 1.207900207900208, "grad_norm": 0.3857469856739044, "learning_rate": 0.00018557812723014476, "loss": 2.5631, "step": 146 }, { "epoch": 1.2162162162162162, "grad_norm": 0.38990500569343567, "learning_rate": 0.0001849467935121521, "loss": 2.5472, "step": 147 }, { "epoch": 1.2245322245322245, "grad_norm": 0.38198038935661316, "learning_rate": 0.00018430305770766948, "loss": 2.5287, "step": 148 }, { "epoch": 1.2328482328482329, "grad_norm": 0.5182828307151794, "learning_rate": 0.00018364701380102266, "loss": 2.5404, "step": 149 }, { "epoch": 1.2411642411642412, "grad_norm": 0.4881569445133209, "learning_rate": 0.00018297875757349952, "loss": 2.5547, "step": 150 }, { "epoch": 1.2494802494802495, "grad_norm": 0.4347280263900757, "learning_rate": 0.00018229838658936564, "loss": 2.5338, "step": 151 }, { "epoch": 1.2577962577962578, "grad_norm": 0.47588396072387695, "learning_rate": 0.0001816060001816205, "loss": 2.5507, "step": 152 }, { "epoch": 1.2661122661122661, "grad_norm": 0.3899973928928375, "learning_rate": 0.00018090169943749476, "loss": 2.5596, "step": 153 }, { "epoch": 1.2744282744282744, "grad_norm": 0.39740344882011414, "learning_rate": 0.00018018558718369186, "loss": 2.539, "step": 154 }, { "epoch": 1.2827442827442828, "grad_norm": 0.42894020676612854, "learning_rate": 0.00017945776797137543, "loss": 2.5211, "step": 155 }, { "epoch": 1.291060291060291, "grad_norm": 0.42803955078125, "learning_rate": 0.00017871834806090501, "loss": 2.5267, "step": 156 }, { "epoch": 1.2993762993762994, "grad_norm": 0.4332488477230072, "learning_rate": 0.00017796743540632223, "loss": 2.5422, "step": 157 }, { "epoch": 1.3076923076923077, "grad_norm": 0.5245593786239624, "learning_rate": 0.00017720513963958968, "loss": 2.5275, "step": 158 }, { "epoch": 1.316008316008316, "grad_norm": 0.4242333173751831, "learning_rate": 0.00017643157205458483, "loss": 2.5284, "step": 159 }, { "epoch": 1.3243243243243243, "grad_norm": 0.4454910457134247, "learning_rate": 0.00017564684559085136, "loss": 2.5204, "step": 160 }, { "epoch": 1.3326403326403327, "grad_norm": 0.5117389559745789, "learning_rate": 0.00017485107481711012, "loss": 2.5484, "step": 161 }, { "epoch": 1.340956340956341, "grad_norm": 0.5533547401428223, "learning_rate": 0.00017404437591453235, "loss": 2.5256, "step": 162 }, { "epoch": 1.3492723492723493, "grad_norm": 0.4423222541809082, "learning_rate": 0.00017322686665977737, "loss": 2.5516, "step": 163 }, { "epoch": 1.3575883575883576, "grad_norm": 0.6984363198280334, "learning_rate": 0.00017239866640779745, "loss": 2.5392, "step": 164 }, { "epoch": 1.365904365904366, "grad_norm": 0.46573567390441895, "learning_rate": 0.00017155989607441213, "loss": 2.5215, "step": 165 }, { "epoch": 1.3742203742203742, "grad_norm": 0.43190667033195496, "learning_rate": 0.00017071067811865476, "loss": 2.5109, "step": 166 }, { "epoch": 1.3825363825363826, "grad_norm": 0.6366004943847656, "learning_rate": 0.00016985113652489374, "loss": 2.5607, "step": 167 }, { "epoch": 1.3908523908523909, "grad_norm": 0.4087092876434326, "learning_rate": 0.00016898139678473076, "loss": 2.5446, "step": 168 }, { "epoch": 1.3991683991683992, "grad_norm": 0.40001702308654785, "learning_rate": 0.00016810158587867973, "loss": 2.5087, "step": 169 }, { "epoch": 1.4074844074844075, "grad_norm": 0.40865230560302734, "learning_rate": 0.00016721183225762727, "loss": 2.5256, "step": 170 }, { "epoch": 1.4158004158004158, "grad_norm": 0.41592201590538025, "learning_rate": 0.00016631226582407952, "loss": 2.5457, "step": 171 }, { "epoch": 1.4241164241164241, "grad_norm": 0.47380271553993225, "learning_rate": 0.00016540301791319645, "loss": 2.5239, "step": 172 }, { "epoch": 1.4324324324324325, "grad_norm": 0.3751722574234009, "learning_rate": 0.00016448422127361706, "loss": 2.5277, "step": 173 }, { "epoch": 1.4407484407484408, "grad_norm": 0.531104326248169, "learning_rate": 0.00016355601004807856, "loss": 2.5095, "step": 174 }, { "epoch": 1.449064449064449, "grad_norm": 0.3729030191898346, "learning_rate": 0.00016261851975383137, "loss": 2.508, "step": 175 }, { "epoch": 1.4573804573804574, "grad_norm": 0.4520551562309265, "learning_rate": 0.00016167188726285434, "loss": 2.5198, "step": 176 }, { "epoch": 1.4656964656964657, "grad_norm": 0.4487551152706146, "learning_rate": 0.00016071625078187114, "loss": 2.5153, "step": 177 }, { "epoch": 1.474012474012474, "grad_norm": 0.41815704107284546, "learning_rate": 0.00015975174983217275, "loss": 2.5218, "step": 178 }, { "epoch": 1.4823284823284824, "grad_norm": 0.4343583583831787, "learning_rate": 0.00015877852522924732, "loss": 2.518, "step": 179 }, { "epoch": 1.4906444906444907, "grad_norm": 0.4115164279937744, "learning_rate": 0.0001577967190622215, "loss": 2.5137, "step": 180 }, { "epoch": 1.498960498960499, "grad_norm": 0.366964191198349, "learning_rate": 0.00015680647467311557, "loss": 2.5379, "step": 181 }, { "epoch": 1.5072765072765073, "grad_norm": 0.42494460940361023, "learning_rate": 0.00015580793663591585, "loss": 2.5471, "step": 182 }, { "epoch": 1.5155925155925156, "grad_norm": 0.39186251163482666, "learning_rate": 0.00015480125073546704, "loss": 2.5131, "step": 183 }, { "epoch": 1.523908523908524, "grad_norm": 0.3860762119293213, "learning_rate": 0.00015378656394618787, "loss": 2.5355, "step": 184 }, { "epoch": 1.5322245322245323, "grad_norm": 0.4578627943992615, "learning_rate": 0.0001527640244106133, "loss": 2.5119, "step": 185 }, { "epoch": 1.5405405405405406, "grad_norm": 0.3482917547225952, "learning_rate": 0.00015173378141776568, "loss": 2.512, "step": 186 }, { "epoch": 1.5488565488565489, "grad_norm": 0.3526865541934967, "learning_rate": 0.00015069598538135906, "loss": 2.5142, "step": 187 }, { "epoch": 1.5571725571725572, "grad_norm": 0.3888881206512451, "learning_rate": 0.0001496507878178388, "loss": 2.5202, "step": 188 }, { "epoch": 1.5654885654885655, "grad_norm": 0.39839354157447815, "learning_rate": 0.0001485983413242606, "loss": 2.5178, "step": 189 }, { "epoch": 1.5738045738045738, "grad_norm": 0.42406460642814636, "learning_rate": 0.00014753879955601163, "loss": 2.4996, "step": 190 }, { "epoch": 1.5821205821205822, "grad_norm": 0.4432888627052307, "learning_rate": 0.00014647231720437686, "loss": 2.5232, "step": 191 }, { "epoch": 1.5904365904365905, "grad_norm": 0.43413785099983215, "learning_rate": 0.00014539904997395468, "loss": 2.5243, "step": 192 }, { "epoch": 1.5987525987525988, "grad_norm": 0.3551112413406372, "learning_rate": 0.00014431915455992414, "loss": 2.5134, "step": 193 }, { "epoch": 1.607068607068607, "grad_norm": 0.36720749735832214, "learning_rate": 0.00014323278862516775, "loss": 2.502, "step": 194 }, { "epoch": 1.6153846153846154, "grad_norm": 0.35447460412979126, "learning_rate": 0.00014214011077725292, "loss": 2.5036, "step": 195 }, { "epoch": 1.6237006237006237, "grad_norm": 0.3743102550506592, "learning_rate": 0.0001410412805452757, "loss": 2.4936, "step": 196 }, { "epoch": 1.632016632016632, "grad_norm": 0.36030158400535583, "learning_rate": 0.00013993645835656953, "loss": 2.4851, "step": 197 }, { "epoch": 1.6403326403326404, "grad_norm": 0.3682660162448883, "learning_rate": 0.0001388258055132835, "loss": 2.5107, "step": 198 }, { "epoch": 1.6486486486486487, "grad_norm": 0.37054452300071716, "learning_rate": 0.00013770948416883205, "loss": 2.4875, "step": 199 }, { "epoch": 1.656964656964657, "grad_norm": 0.4111086428165436, "learning_rate": 0.00013658765730422125, "loss": 2.5055, "step": 200 }, { "epoch": 1.6652806652806653, "grad_norm": 0.36359384655952454, "learning_rate": 0.00013546048870425356, "loss": 2.5099, "step": 201 }, { "epoch": 1.6735966735966736, "grad_norm": 0.40381795167922974, "learning_rate": 0.00013432814293361584, "loss": 2.5162, "step": 202 }, { "epoch": 1.681912681912682, "grad_norm": 0.3458056151866913, "learning_rate": 0.00013319078531285285, "loss": 2.4827, "step": 203 }, { "epoch": 1.6902286902286903, "grad_norm": 0.35546690225601196, "learning_rate": 0.00013204858189423097, "loss": 2.5162, "step": 204 }, { "epoch": 1.6985446985446986, "grad_norm": 0.37076064944267273, "learning_rate": 0.00013090169943749476, "loss": 2.5201, "step": 205 }, { "epoch": 1.706860706860707, "grad_norm": 0.3626950681209564, "learning_rate": 0.00012975030538552032, "loss": 2.5196, "step": 206 }, { "epoch": 1.7151767151767152, "grad_norm": 0.38436341285705566, "learning_rate": 0.00012859456783986893, "loss": 2.5157, "step": 207 }, { "epoch": 1.7234927234927235, "grad_norm": 0.38007652759552, "learning_rate": 0.0001274346555362446, "loss": 2.4935, "step": 208 }, { "epoch": 1.7318087318087318, "grad_norm": 0.38916251063346863, "learning_rate": 0.0001262707378198587, "loss": 2.5083, "step": 209 }, { "epoch": 1.7401247401247402, "grad_norm": 0.37072858214378357, "learning_rate": 0.00012510298462070619, "loss": 2.4915, "step": 210 }, { "epoch": 1.7484407484407485, "grad_norm": 0.4798509478569031, "learning_rate": 0.0001239315664287558, "loss": 2.5189, "step": 211 }, { "epoch": 1.7567567567567568, "grad_norm": 0.4451827108860016, "learning_rate": 0.000122756654269059, "loss": 2.5231, "step": 212 }, { "epoch": 1.7650727650727651, "grad_norm": 0.35244473814964294, "learning_rate": 0.00012157841967678063, "loss": 2.5061, "step": 213 }, { "epoch": 1.7733887733887734, "grad_norm": 0.34157419204711914, "learning_rate": 0.00012039703467215488, "loss": 2.5044, "step": 214 }, { "epoch": 1.7817047817047817, "grad_norm": 0.40339285135269165, "learning_rate": 0.00011921267173537086, "loss": 2.506, "step": 215 }, { "epoch": 1.79002079002079, "grad_norm": 0.3662133514881134, "learning_rate": 0.0001180255037813906, "loss": 2.5012, "step": 216 }, { "epoch": 1.7983367983367984, "grad_norm": 0.3495447039604187, "learning_rate": 0.00011683570413470383, "loss": 2.5, "step": 217 }, { "epoch": 1.8066528066528067, "grad_norm": 0.36315813660621643, "learning_rate": 0.0001156434465040231, "loss": 2.4923, "step": 218 }, { "epoch": 1.814968814968815, "grad_norm": 0.3807941973209381, "learning_rate": 0.00011444890495692213, "loss": 2.5123, "step": 219 }, { "epoch": 1.8232848232848233, "grad_norm": 0.36616984009742737, "learning_rate": 0.00011325225389442277, "loss": 2.4954, "step": 220 }, { "epoch": 1.8316008316008316, "grad_norm": 0.372138112783432, "learning_rate": 0.0001120536680255323, "loss": 2.488, "step": 221 }, { "epoch": 1.83991683991684, "grad_norm": 0.37164467573165894, "learning_rate": 0.00011085332234173664, "loss": 2.4883, "step": 222 }, { "epoch": 1.8482328482328483, "grad_norm": 0.34948042035102844, "learning_rate": 0.00010965139209145152, "loss": 2.4864, "step": 223 }, { "epoch": 1.8565488565488566, "grad_norm": 0.35938721895217896, "learning_rate": 0.00010844805275443673, "loss": 2.4928, "step": 224 }, { "epoch": 1.864864864864865, "grad_norm": 0.3879775404930115, "learning_rate": 0.00010724348001617625, "loss": 2.4938, "step": 225 }, { "epoch": 1.8731808731808732, "grad_norm": 0.36195841431617737, "learning_rate": 0.00010603784974222861, "loss": 2.4925, "step": 226 }, { "epoch": 1.8814968814968815, "grad_norm": 0.35239464044570923, "learning_rate": 0.00010483133795255071, "loss": 2.4973, "step": 227 }, { "epoch": 1.8898128898128899, "grad_norm": 0.3456074297428131, "learning_rate": 0.00010362412079579924, "loss": 2.4966, "step": 228 }, { "epoch": 1.8981288981288982, "grad_norm": 0.3808579444885254, "learning_rate": 0.00010241637452361323, "loss": 2.5087, "step": 229 }, { "epoch": 1.9064449064449065, "grad_norm": 0.39099177718162537, "learning_rate": 0.00010120827546488174, "loss": 2.4894, "step": 230 }, { "epoch": 1.9147609147609148, "grad_norm": 0.3821989595890045, "learning_rate": 0.0001, "loss": 2.4971, "step": 231 }, { "epoch": 1.9230769230769231, "grad_norm": 0.3640426695346832, "learning_rate": 9.879172453511827e-05, "loss": 2.4893, "step": 232 }, { "epoch": 1.9313929313929314, "grad_norm": 0.3754567801952362, "learning_rate": 9.75836254763868e-05, "loss": 2.4871, "step": 233 }, { "epoch": 1.9397089397089398, "grad_norm": 0.3793177902698517, "learning_rate": 9.63758792042008e-05, "loss": 2.4929, "step": 234 }, { "epoch": 1.948024948024948, "grad_norm": 0.35912153124809265, "learning_rate": 9.516866204744931e-05, "loss": 2.503, "step": 235 }, { "epoch": 1.9563409563409564, "grad_norm": 0.3523949384689331, "learning_rate": 9.396215025777139e-05, "loss": 2.4869, "step": 236 }, { "epoch": 1.9646569646569647, "grad_norm": 0.38750869035720825, "learning_rate": 9.275651998382377e-05, "loss": 2.4957, "step": 237 }, { "epoch": 1.972972972972973, "grad_norm": 0.3791241943836212, "learning_rate": 9.155194724556331e-05, "loss": 2.506, "step": 238 }, { "epoch": 1.9812889812889813, "grad_norm": 0.34388408064842224, "learning_rate": 9.034860790854849e-05, "loss": 2.5003, "step": 239 }, { "epoch": 1.9896049896049897, "grad_norm": 0.3853475749492645, "learning_rate": 8.914667765826338e-05, "loss": 2.4585, "step": 240 }, { "epoch": 1.997920997920998, "grad_norm": 0.3552019000053406, "learning_rate": 8.79463319744677e-05, "loss": 2.4799, "step": 241 }, { "epoch": 2.0, "grad_norm": 0.6879647374153137, "learning_rate": 8.674774610557728e-05, "loss": 2.485, "step": 242 }, { "epoch": 2.008316008316008, "grad_norm": 0.4841165542602539, "learning_rate": 8.55510950430779e-05, "loss": 2.3719, "step": 243 }, { "epoch": 2.0166320166320166, "grad_norm": 0.40582990646362305, "learning_rate": 8.435655349597689e-05, "loss": 2.3914, "step": 244 }, { "epoch": 2.024948024948025, "grad_norm": 0.44056642055511475, "learning_rate": 8.316429586529615e-05, "loss": 2.3862, "step": 245 }, { "epoch": 2.0332640332640333, "grad_norm": 0.5893064141273499, "learning_rate": 8.197449621860943e-05, "loss": 2.3759, "step": 246 }, { "epoch": 2.0415800415800414, "grad_norm": 0.5335010290145874, "learning_rate": 8.078732826462915e-05, "loss": 2.3966, "step": 247 }, { "epoch": 2.04989604989605, "grad_norm": 0.40315303206443787, "learning_rate": 7.960296532784515e-05, "loss": 2.3886, "step": 248 }, { "epoch": 2.0582120582120584, "grad_norm": 0.45403429865837097, "learning_rate": 7.84215803232194e-05, "loss": 2.3664, "step": 249 }, { "epoch": 2.0665280665280665, "grad_norm": 0.44575178623199463, "learning_rate": 7.7243345730941e-05, "loss": 2.3705, "step": 250 }, { "epoch": 2.0748440748440746, "grad_norm": 0.41231396794319153, "learning_rate": 7.606843357124426e-05, "loss": 2.3417, "step": 251 }, { "epoch": 2.083160083160083, "grad_norm": 0.45150724053382874, "learning_rate": 7.489701537929384e-05, "loss": 2.3467, "step": 252 }, { "epoch": 2.0914760914760917, "grad_norm": 0.45270782709121704, "learning_rate": 7.372926218014131e-05, "loss": 2.3589, "step": 253 }, { "epoch": 2.0997920997921, "grad_norm": 0.49504727125167847, "learning_rate": 7.256534446375542e-05, "loss": 2.3821, "step": 254 }, { "epoch": 2.108108108108108, "grad_norm": 0.40953919291496277, "learning_rate": 7.14054321601311e-05, "loss": 2.3754, "step": 255 }, { "epoch": 2.1164241164241164, "grad_norm": 0.429299533367157, "learning_rate": 7.024969461447972e-05, "loss": 2.3582, "step": 256 }, { "epoch": 2.124740124740125, "grad_norm": 0.4189452826976776, "learning_rate": 6.909830056250527e-05, "loss": 2.3636, "step": 257 }, { "epoch": 2.133056133056133, "grad_norm": 0.4161284267902374, "learning_rate": 6.795141810576906e-05, "loss": 2.3543, "step": 258 }, { "epoch": 2.141372141372141, "grad_norm": 0.4045678675174713, "learning_rate": 6.680921468714719e-05, "loss": 2.362, "step": 259 }, { "epoch": 2.1496881496881497, "grad_norm": 0.47194382548332214, "learning_rate": 6.567185706638417e-05, "loss": 2.377, "step": 260 }, { "epoch": 2.1580041580041582, "grad_norm": 0.47539058327674866, "learning_rate": 6.453951129574644e-05, "loss": 2.3645, "step": 261 }, { "epoch": 2.1663201663201663, "grad_norm": 0.43471434712409973, "learning_rate": 6.341234269577879e-05, "loss": 2.3441, "step": 262 }, { "epoch": 2.1746361746361744, "grad_norm": 0.45001259446144104, "learning_rate": 6.229051583116796e-05, "loss": 2.3124, "step": 263 }, { "epoch": 2.182952182952183, "grad_norm": 0.40879690647125244, "learning_rate": 6.117419448671651e-05, "loss": 2.3396, "step": 264 }, { "epoch": 2.1912681912681915, "grad_norm": 0.42216163873672485, "learning_rate": 6.006354164343046e-05, "loss": 2.3724, "step": 265 }, { "epoch": 2.1995841995841996, "grad_norm": 0.41845160722732544, "learning_rate": 5.8958719454724346e-05, "loss": 2.3541, "step": 266 }, { "epoch": 2.2079002079002077, "grad_norm": 0.423265665769577, "learning_rate": 5.785988922274711e-05, "loss": 2.3594, "step": 267 }, { "epoch": 2.2162162162162162, "grad_norm": 0.4284776747226715, "learning_rate": 5.676721137483225e-05, "loss": 2.3676, "step": 268 }, { "epoch": 2.2245322245322248, "grad_norm": 0.4213842749595642, "learning_rate": 5.568084544007588e-05, "loss": 2.3392, "step": 269 }, { "epoch": 2.232848232848233, "grad_norm": 0.4187883138656616, "learning_rate": 5.4600950026045326e-05, "loss": 2.3501, "step": 270 }, { "epoch": 2.241164241164241, "grad_norm": 0.41858911514282227, "learning_rate": 5.3527682795623146e-05, "loss": 2.3563, "step": 271 }, { "epoch": 2.2494802494802495, "grad_norm": 0.4647063910961151, "learning_rate": 5.246120044398839e-05, "loss": 2.3575, "step": 272 }, { "epoch": 2.257796257796258, "grad_norm": 0.41441211104393005, "learning_rate": 5.14016586757394e-05, "loss": 2.3534, "step": 273 }, { "epoch": 2.266112266112266, "grad_norm": 0.4543509781360626, "learning_rate": 5.0349212182161254e-05, "loss": 2.3562, "step": 274 }, { "epoch": 2.274428274428274, "grad_norm": 0.4140624403953552, "learning_rate": 4.9304014618640995e-05, "loss": 2.3488, "step": 275 }, { "epoch": 2.2827442827442828, "grad_norm": 0.4368607699871063, "learning_rate": 4.826621858223431e-05, "loss": 2.3532, "step": 276 }, { "epoch": 2.2910602910602913, "grad_norm": 0.41528797149658203, "learning_rate": 4.723597558938672e-05, "loss": 2.3591, "step": 277 }, { "epoch": 2.2993762993762994, "grad_norm": 0.43311697244644165, "learning_rate": 4.6213436053812144e-05, "loss": 2.3347, "step": 278 }, { "epoch": 2.3076923076923075, "grad_norm": 0.42939522862434387, "learning_rate": 4.519874926453302e-05, "loss": 2.3505, "step": 279 }, { "epoch": 2.316008316008316, "grad_norm": 0.42948731780052185, "learning_rate": 4.419206336408418e-05, "loss": 2.3377, "step": 280 }, { "epoch": 2.3243243243243246, "grad_norm": 0.44148650765419006, "learning_rate": 4.3193525326884435e-05, "loss": 2.3413, "step": 281 }, { "epoch": 2.3326403326403327, "grad_norm": 0.424143522977829, "learning_rate": 4.220328093777851e-05, "loss": 2.3377, "step": 282 }, { "epoch": 2.3409563409563408, "grad_norm": 0.4698318541049957, "learning_rate": 4.12214747707527e-05, "loss": 2.3692, "step": 283 }, { "epoch": 2.3492723492723493, "grad_norm": 0.42582592368125916, "learning_rate": 4.0248250167827275e-05, "loss": 2.3572, "step": 284 }, { "epoch": 2.357588357588358, "grad_norm": 0.4405396282672882, "learning_rate": 3.9283749218128885e-05, "loss": 2.352, "step": 285 }, { "epoch": 2.365904365904366, "grad_norm": 0.4368214011192322, "learning_rate": 3.832811273714569e-05, "loss": 2.3425, "step": 286 }, { "epoch": 2.374220374220374, "grad_norm": 0.42838993668556213, "learning_rate": 3.738148024616863e-05, "loss": 2.3472, "step": 287 }, { "epoch": 2.3825363825363826, "grad_norm": 0.4351613223552704, "learning_rate": 3.644398995192147e-05, "loss": 2.3441, "step": 288 }, { "epoch": 2.390852390852391, "grad_norm": 0.4328824281692505, "learning_rate": 3.5515778726382966e-05, "loss": 2.3573, "step": 289 }, { "epoch": 2.399168399168399, "grad_norm": 0.42902839183807373, "learning_rate": 3.459698208680359e-05, "loss": 2.3672, "step": 290 }, { "epoch": 2.4074844074844073, "grad_norm": 0.44229575991630554, "learning_rate": 3.36877341759205e-05, "loss": 2.3371, "step": 291 }, { "epoch": 2.415800415800416, "grad_norm": 0.4500284492969513, "learning_rate": 3.2788167742372725e-05, "loss": 2.329, "step": 292 }, { "epoch": 2.4241164241164244, "grad_norm": 0.6344565749168396, "learning_rate": 3.1898414121320276e-05, "loss": 2.3618, "step": 293 }, { "epoch": 2.4324324324324325, "grad_norm": 0.42112040519714355, "learning_rate": 3.101860321526924e-05, "loss": 2.3349, "step": 294 }, { "epoch": 2.4407484407484406, "grad_norm": 0.49494612216949463, "learning_rate": 3.0148863475106314e-05, "loss": 2.3389, "step": 295 }, { "epoch": 2.449064449064449, "grad_norm": 0.4607335925102234, "learning_rate": 2.9289321881345254e-05, "loss": 2.3245, "step": 296 }, { "epoch": 2.4573804573804576, "grad_norm": 0.4578121602535248, "learning_rate": 2.84401039255879e-05, "loss": 2.3573, "step": 297 }, { "epoch": 2.4656964656964657, "grad_norm": 0.4579460620880127, "learning_rate": 2.7601333592202583e-05, "loss": 2.3372, "step": 298 }, { "epoch": 2.474012474012474, "grad_norm": 0.4420205354690552, "learning_rate": 2.677313334022268e-05, "loss": 2.3221, "step": 299 }, { "epoch": 2.4823284823284824, "grad_norm": 0.4563603103160858, "learning_rate": 2.59556240854677e-05, "loss": 2.3446, "step": 300 }, { "epoch": 2.490644490644491, "grad_norm": 0.44042691588401794, "learning_rate": 2.514892518288988e-05, "loss": 2.3279, "step": 301 }, { "epoch": 2.498960498960499, "grad_norm": 0.4372948408126831, "learning_rate": 2.4353154409148637e-05, "loss": 2.3629, "step": 302 }, { "epoch": 2.507276507276507, "grad_norm": 0.4709435701370239, "learning_rate": 2.356842794541516e-05, "loss": 2.353, "step": 303 }, { "epoch": 2.5155925155925156, "grad_norm": 0.42820531129837036, "learning_rate": 2.2794860360410342e-05, "loss": 2.3419, "step": 304 }, { "epoch": 2.523908523908524, "grad_norm": 0.4281867742538452, "learning_rate": 2.2032564593677774e-05, "loss": 2.3369, "step": 305 }, { "epoch": 2.5322245322245323, "grad_norm": 0.4326595664024353, "learning_rate": 2.1281651939094992e-05, "loss": 2.3215, "step": 306 }, { "epoch": 2.5405405405405403, "grad_norm": 0.4328608810901642, "learning_rate": 2.0542232028624586e-05, "loss": 2.3094, "step": 307 }, { "epoch": 2.548856548856549, "grad_norm": 0.42794665694236755, "learning_rate": 1.981441281630816e-05, "loss": 2.3303, "step": 308 }, { "epoch": 2.5571725571725574, "grad_norm": 0.43002575635910034, "learning_rate": 1.9098300562505266e-05, "loss": 2.3516, "step": 309 }, { "epoch": 2.5654885654885655, "grad_norm": 0.43052583932876587, "learning_rate": 1.8393999818379525e-05, "loss": 2.3592, "step": 310 }, { "epoch": 2.5738045738045736, "grad_norm": 0.41541633009910583, "learning_rate": 1.7701613410634365e-05, "loss": 2.333, "step": 311 }, { "epoch": 2.582120582120582, "grad_norm": 0.44588619470596313, "learning_rate": 1.7021242426500493e-05, "loss": 2.3445, "step": 312 }, { "epoch": 2.5904365904365907, "grad_norm": 0.4385538399219513, "learning_rate": 1.6352986198977325e-05, "loss": 2.3406, "step": 313 }, { "epoch": 2.598752598752599, "grad_norm": 0.43057897686958313, "learning_rate": 1.5696942292330576e-05, "loss": 2.3426, "step": 314 }, { "epoch": 2.607068607068607, "grad_norm": 0.4492436945438385, "learning_rate": 1.5053206487847914e-05, "loss": 2.33, "step": 315 }, { "epoch": 2.6153846153846154, "grad_norm": 0.43555155396461487, "learning_rate": 1.442187276985526e-05, "loss": 2.342, "step": 316 }, { "epoch": 2.623700623700624, "grad_norm": 0.4379512071609497, "learning_rate": 1.3803033311995072e-05, "loss": 2.3202, "step": 317 }, { "epoch": 2.632016632016632, "grad_norm": 0.42732083797454834, "learning_rate": 1.3196778463769255e-05, "loss": 2.3104, "step": 318 }, { "epoch": 2.64033264033264, "grad_norm": 0.43830588459968567, "learning_rate": 1.260319673734821e-05, "loss": 2.3252, "step": 319 }, { "epoch": 2.6486486486486487, "grad_norm": 0.43504393100738525, "learning_rate": 1.2022374794648228e-05, "loss": 2.3416, "step": 320 }, { "epoch": 2.6569646569646572, "grad_norm": 0.43797558546066284, "learning_rate": 1.1454397434679021e-05, "loss": 2.3361, "step": 321 }, { "epoch": 2.6652806652806653, "grad_norm": 0.4464263916015625, "learning_rate": 1.0899347581163221e-05, "loss": 2.3515, "step": 322 }, { "epoch": 2.6735966735966734, "grad_norm": 0.43725448846817017, "learning_rate": 1.0357306270429624e-05, "loss": 2.3236, "step": 323 }, { "epoch": 2.681912681912682, "grad_norm": 0.4281315803527832, "learning_rate": 9.828352639582072e-06, "loss": 2.3296, "step": 324 }, { "epoch": 2.6902286902286905, "grad_norm": 0.4342389702796936, "learning_rate": 9.31256391494546e-06, "loss": 2.3097, "step": 325 }, { "epoch": 2.6985446985446986, "grad_norm": 0.44206634163856506, "learning_rate": 8.810015400790994e-06, "loss": 2.3303, "step": 326 }, { "epoch": 2.7068607068607067, "grad_norm": 0.4294271767139435, "learning_rate": 8.32078046834176e-06, "loss": 2.3183, "step": 327 }, { "epoch": 2.715176715176715, "grad_norm": 0.43030035495758057, "learning_rate": 7.844930545060703e-06, "loss": 2.3368, "step": 328 }, { "epoch": 2.7234927234927238, "grad_norm": 0.435197651386261, "learning_rate": 7.382535104222366e-06, "loss": 2.3312, "step": 329 }, { "epoch": 2.731808731808732, "grad_norm": 0.4398477375507355, "learning_rate": 6.9336616547697965e-06, "loss": 2.3146, "step": 330 }, { "epoch": 2.74012474012474, "grad_norm": 0.43261393904685974, "learning_rate": 6.498375731458528e-06, "loss": 2.3277, "step": 331 }, { "epoch": 2.7484407484407485, "grad_norm": 0.46077948808670044, "learning_rate": 6.076740885288479e-06, "loss": 2.3493, "step": 332 }, { "epoch": 2.756756756756757, "grad_norm": 0.4285019636154175, "learning_rate": 5.668818674225685e-06, "loss": 2.3537, "step": 333 }, { "epoch": 2.765072765072765, "grad_norm": 0.43278998136520386, "learning_rate": 5.274668654214932e-06, "loss": 2.3504, "step": 334 }, { "epoch": 2.773388773388773, "grad_norm": 0.43840229511260986, "learning_rate": 4.8943483704846475e-06, "loss": 2.333, "step": 335 }, { "epoch": 2.7817047817047817, "grad_norm": 0.4274958670139313, "learning_rate": 4.527913349145441e-06, "loss": 2.3467, "step": 336 }, { "epoch": 2.7900207900207903, "grad_norm": 0.4174635410308838, "learning_rate": 4.175417089083378e-06, "loss": 2.3248, "step": 337 }, { "epoch": 2.7983367983367984, "grad_norm": 0.4231819808483124, "learning_rate": 3.836911054149239e-06, "loss": 2.3169, "step": 338 }, { "epoch": 2.8066528066528065, "grad_norm": 0.4251345098018646, "learning_rate": 3.512444665644865e-06, "loss": 2.333, "step": 339 }, { "epoch": 2.814968814968815, "grad_norm": 0.42021313309669495, "learning_rate": 3.202065295107726e-06, "loss": 2.3475, "step": 340 }, { "epoch": 2.8232848232848236, "grad_norm": 0.4197041094303131, "learning_rate": 2.905818257394799e-06, "loss": 2.3128, "step": 341 }, { "epoch": 2.8316008316008316, "grad_norm": 0.4266311228275299, "learning_rate": 2.6237468040666512e-06, "loss": 2.3249, "step": 342 }, { "epoch": 2.8399168399168397, "grad_norm": 0.442434698343277, "learning_rate": 2.3558921170727888e-06, "loss": 2.3422, "step": 343 }, { "epoch": 2.8482328482328483, "grad_norm": 0.4329405725002289, "learning_rate": 2.1022933027391555e-06, "loss": 2.3188, "step": 344 }, { "epoch": 2.856548856548857, "grad_norm": 0.43024739623069763, "learning_rate": 1.8629873860586566e-06, "loss": 2.3365, "step": 345 }, { "epoch": 2.864864864864865, "grad_norm": 0.43431299924850464, "learning_rate": 1.6380093052856483e-06, "loss": 2.3476, "step": 346 }, { "epoch": 2.873180873180873, "grad_norm": 0.43452492356300354, "learning_rate": 1.4273919068349184e-06, "loss": 2.3327, "step": 347 }, { "epoch": 2.8814968814968815, "grad_norm": 0.4286072552204132, "learning_rate": 1.231165940486234e-06, "loss": 2.334, "step": 348 }, { "epoch": 2.88981288981289, "grad_norm": 0.4313005805015564, "learning_rate": 1.0493600548948878e-06, "loss": 2.3216, "step": 349 }, { "epoch": 2.898128898128898, "grad_norm": 0.4270586371421814, "learning_rate": 8.820007934090879e-07, "loss": 2.3438, "step": 350 }, { "epoch": 2.9064449064449063, "grad_norm": 0.4391648769378662, "learning_rate": 7.291125901946027e-07, "loss": 2.3528, "step": 351 }, { "epoch": 2.914760914760915, "grad_norm": 0.4252782166004181, "learning_rate": 5.907177666674812e-07, "loss": 2.3422, "step": 352 }, { "epoch": 2.9230769230769234, "grad_norm": 0.4223313629627228, "learning_rate": 4.668365282351372e-07, "loss": 2.3233, "step": 353 }, { "epoch": 2.9313929313929314, "grad_norm": 0.43984875082969666, "learning_rate": 3.5748696134639825e-07, "loss": 2.3437, "step": 354 }, { "epoch": 2.9397089397089395, "grad_norm": 0.4258709251880646, "learning_rate": 2.6268503085089547e-07, "loss": 2.3406, "step": 355 }, { "epoch": 2.948024948024948, "grad_norm": 0.42968347668647766, "learning_rate": 1.824445776682504e-07, "loss": 2.3277, "step": 356 }, { "epoch": 2.9563409563409566, "grad_norm": 0.4366348683834076, "learning_rate": 1.1677731676733584e-07, "loss": 2.3263, "step": 357 }, { "epoch": 2.9646569646569647, "grad_norm": 0.42044591903686523, "learning_rate": 6.569283545587724e-08, "loss": 2.3439, "step": 358 }, { "epoch": 2.972972972972973, "grad_norm": 0.43199941515922546, "learning_rate": 2.9198591980705848e-08, "loss": 2.3411, "step": 359 }, { "epoch": 2.9812889812889813, "grad_norm": 0.4222280979156494, "learning_rate": 7.2999144389296335e-09, "loss": 2.3358, "step": 360 } ], "logging_steps": 1, "max_steps": 360, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2413772944716595e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }