{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997679814385151, "eval_steps": 500, "global_step": 6462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007733952049497293, "grad_norm": 6.05774450302124, "learning_rate": 6.172839506172839e-06, "loss": 1.1474, "step": 10 }, { "epoch": 0.015467904098994586, "grad_norm": 4.862592697143555, "learning_rate": 1.2345679012345678e-05, "loss": 0.8677, "step": 20 }, { "epoch": 0.02320185614849188, "grad_norm": 2.659966230392456, "learning_rate": 1.8518518518518518e-05, "loss": 0.5596, "step": 30 }, { "epoch": 0.030935808197989172, "grad_norm": 3.7260355949401855, "learning_rate": 2.4691358024691357e-05, "loss": 0.3624, "step": 40 }, { "epoch": 0.038669760247486466, "grad_norm": 2.9796321392059326, "learning_rate": 3.08641975308642e-05, "loss": 0.2695, "step": 50 }, { "epoch": 0.04640371229698376, "grad_norm": 2.203373670578003, "learning_rate": 3.7037037037037037e-05, "loss": 0.2362, "step": 60 }, { "epoch": 0.054137664346481054, "grad_norm": 1.4973496198654175, "learning_rate": 4.3209876543209875e-05, "loss": 0.179, "step": 70 }, { "epoch": 0.061871616395978345, "grad_norm": 1.632278323173523, "learning_rate": 4.938271604938271e-05, "loss": 0.1675, "step": 80 }, { "epoch": 0.06960556844547564, "grad_norm": 1.761098027229309, "learning_rate": 5.555555555555556e-05, "loss": 0.1611, "step": 90 }, { "epoch": 0.07733952049497293, "grad_norm": 2.3847756385803223, "learning_rate": 6.17283950617284e-05, "loss": 0.1342, "step": 100 }, { "epoch": 0.08507347254447023, "grad_norm": 1.3783845901489258, "learning_rate": 6.790123456790123e-05, "loss": 0.1311, "step": 110 }, { "epoch": 0.09280742459396751, "grad_norm": 1.4245771169662476, "learning_rate": 7.407407407407407e-05, "loss": 0.1389, "step": 120 }, { "epoch": 0.10054137664346481, "grad_norm": 0.8332632780075073, "learning_rate": 8.024691358024692e-05, "loss": 0.1078, "step": 130 }, { "epoch": 0.10827532869296211, "grad_norm": 1.3833789825439453, "learning_rate": 8.641975308641975e-05, "loss": 0.1148, "step": 140 }, { "epoch": 0.11600928074245939, "grad_norm": 1.578161597251892, "learning_rate": 9.25925925925926e-05, "loss": 0.1227, "step": 150 }, { "epoch": 0.12374323279195669, "grad_norm": 1.313278317451477, "learning_rate": 9.876543209876543e-05, "loss": 0.1196, "step": 160 }, { "epoch": 0.131477184841454, "grad_norm": 0.9247048497200012, "learning_rate": 0.00010493827160493828, "loss": 0.1073, "step": 170 }, { "epoch": 0.13921113689095127, "grad_norm": 1.0717965364456177, "learning_rate": 0.00011111111111111112, "loss": 0.105, "step": 180 }, { "epoch": 0.14694508894044858, "grad_norm": 1.2795355319976807, "learning_rate": 0.00011728395061728397, "loss": 0.1, "step": 190 }, { "epoch": 0.15467904098994587, "grad_norm": 1.315598487854004, "learning_rate": 0.0001234567901234568, "loss": 0.0994, "step": 200 }, { "epoch": 0.16241299303944315, "grad_norm": 1.1581859588623047, "learning_rate": 0.00012962962962962963, "loss": 0.0936, "step": 210 }, { "epoch": 0.17014694508894046, "grad_norm": 1.5182558298110962, "learning_rate": 0.00013580246913580247, "loss": 0.0971, "step": 220 }, { "epoch": 0.17788089713843774, "grad_norm": 0.973030149936676, "learning_rate": 0.00014197530864197534, "loss": 0.0842, "step": 230 }, { "epoch": 0.18561484918793503, "grad_norm": 1.0252668857574463, "learning_rate": 0.00014814814814814815, "loss": 0.0898, "step": 240 }, { "epoch": 0.19334880123743234, "grad_norm": 0.879585862159729, 
"learning_rate": 0.00015432098765432098, "loss": 0.0922, "step": 250 }, { "epoch": 0.20108275328692962, "grad_norm": 0.8261191248893738, "learning_rate": 0.00016049382716049385, "loss": 0.0792, "step": 260 }, { "epoch": 0.2088167053364269, "grad_norm": 0.601457953453064, "learning_rate": 0.0001666666666666667, "loss": 0.0858, "step": 270 }, { "epoch": 0.21655065738592422, "grad_norm": 1.2878525257110596, "learning_rate": 0.0001728395061728395, "loss": 0.0954, "step": 280 }, { "epoch": 0.2242846094354215, "grad_norm": 0.7943984270095825, "learning_rate": 0.00017901234567901234, "loss": 0.0703, "step": 290 }, { "epoch": 0.23201856148491878, "grad_norm": 0.7658869624137878, "learning_rate": 0.0001851851851851852, "loss": 0.0936, "step": 300 }, { "epoch": 0.2397525135344161, "grad_norm": 0.7662453055381775, "learning_rate": 0.00019135802469135804, "loss": 0.0774, "step": 310 }, { "epoch": 0.24748646558391338, "grad_norm": 1.0913536548614502, "learning_rate": 0.00019753086419753085, "loss": 0.0849, "step": 320 }, { "epoch": 0.2552204176334107, "grad_norm": 0.690075695514679, "learning_rate": 0.00019999952846043234, "loss": 0.0812, "step": 330 }, { "epoch": 0.262954369682908, "grad_norm": 1.1156948804855347, "learning_rate": 0.00019999664684584523, "loss": 0.0777, "step": 340 }, { "epoch": 0.27068832173240526, "grad_norm": 1.4576770067214966, "learning_rate": 0.0001999911456584946, "loss": 0.0799, "step": 350 }, { "epoch": 0.27842227378190254, "grad_norm": 2.7017621994018555, "learning_rate": 0.00019998302504249278, "loss": 0.1402, "step": 360 }, { "epoch": 0.2861562258313998, "grad_norm": 0.8603020310401917, "learning_rate": 0.00019997228521057242, "loss": 0.0999, "step": 370 }, { "epoch": 0.29389017788089716, "grad_norm": 1.2301173210144043, "learning_rate": 0.00019995892644408066, "loss": 0.1248, "step": 380 }, { "epoch": 0.30162412993039445, "grad_norm": 1.230200171470642, "learning_rate": 0.0001999429490929718, "loss": 0.1284, "step": 390 }, { "epoch": 0.30935808197989173, "grad_norm": 0.9165638089179993, "learning_rate": 0.00019992435357579828, "loss": 0.0937, "step": 400 }, { "epoch": 0.317092034029389, "grad_norm": 1.0406192541122437, "learning_rate": 0.00019990314037969948, "loss": 0.1153, "step": 410 }, { "epoch": 0.3248259860788863, "grad_norm": 0.9106414318084717, "learning_rate": 0.00019987931006038915, "loss": 0.1149, "step": 420 }, { "epoch": 0.3325599381283836, "grad_norm": 0.8717958927154541, "learning_rate": 0.0001998528632421408, "loss": 0.0987, "step": 430 }, { "epoch": 0.3402938901778809, "grad_norm": 0.8013294339179993, "learning_rate": 0.00019982380061777134, "loss": 0.1048, "step": 440 }, { "epoch": 0.3480278422273782, "grad_norm": 1.1209100484848022, "learning_rate": 0.0001997921229486228, "loss": 0.1006, "step": 450 }, { "epoch": 0.3557617942768755, "grad_norm": 1.1739383935928345, "learning_rate": 0.00019975783106454266, "loss": 0.102, "step": 460 }, { "epoch": 0.36349574632637277, "grad_norm": 0.5681895017623901, "learning_rate": 0.0001997209258638619, "loss": 0.0996, "step": 470 }, { "epoch": 0.37122969837587005, "grad_norm": 0.9269053936004639, "learning_rate": 0.00019968140831337148, "loss": 0.0963, "step": 480 }, { "epoch": 0.37896365042536734, "grad_norm": 0.6350008249282837, "learning_rate": 0.00019963927944829712, "loss": 0.0847, "step": 490 }, { "epoch": 0.3866976024748647, "grad_norm": 0.7555776238441467, "learning_rate": 0.00019959454037227214, "loss": 0.0948, "step": 500 }, { "epoch": 0.39443155452436196, "grad_norm": 0.8819997310638428, "learning_rate": 
0.00019954719225730847, "loss": 0.0831, "step": 510 }, { "epoch": 0.40216550657385924, "grad_norm": 0.7193794250488281, "learning_rate": 0.00019949723634376601, "loss": 0.0804, "step": 520 }, { "epoch": 0.4098994586233565, "grad_norm": 0.6095755100250244, "learning_rate": 0.00019944467394032015, "loss": 0.0863, "step": 530 }, { "epoch": 0.4176334106728538, "grad_norm": 0.8777745962142944, "learning_rate": 0.00019938950642392746, "loss": 0.0928, "step": 540 }, { "epoch": 0.4253673627223511, "grad_norm": 0.7616790533065796, "learning_rate": 0.00019933173523978967, "loss": 0.0802, "step": 550 }, { "epoch": 0.43310131477184843, "grad_norm": 0.6164016723632812, "learning_rate": 0.0001992713619013157, "loss": 0.0787, "step": 560 }, { "epoch": 0.4408352668213457, "grad_norm": 0.9934972524642944, "learning_rate": 0.00019920838799008213, "loss": 0.1021, "step": 570 }, { "epoch": 0.448569218870843, "grad_norm": 0.5896562933921814, "learning_rate": 0.00019914281515579166, "loss": 0.0899, "step": 580 }, { "epoch": 0.4563031709203403, "grad_norm": 0.8718979358673096, "learning_rate": 0.00019907464511623003, "loss": 0.0874, "step": 590 }, { "epoch": 0.46403712296983757, "grad_norm": 0.9159581065177917, "learning_rate": 0.00019900387965722093, "loss": 0.1059, "step": 600 }, { "epoch": 0.4717710750193349, "grad_norm": 0.6951258182525635, "learning_rate": 0.0001989305206325792, "loss": 0.1011, "step": 610 }, { "epoch": 0.4795050270688322, "grad_norm": 1.1430431604385376, "learning_rate": 0.00019885456996406232, "loss": 0.0951, "step": 620 }, { "epoch": 0.4872389791183295, "grad_norm": 0.7171654105186462, "learning_rate": 0.00019877602964131995, "loss": 0.0874, "step": 630 }, { "epoch": 0.49497293116782676, "grad_norm": 0.5507563352584839, "learning_rate": 0.0001986949017218421, "loss": 0.0752, "step": 640 }, { "epoch": 0.502706883217324, "grad_norm": 0.5208908915519714, "learning_rate": 0.00019861118833090484, "loss": 0.0835, "step": 650 }, { "epoch": 0.5104408352668214, "grad_norm": 0.49363574385643005, "learning_rate": 0.00019852489166151497, "loss": 0.073, "step": 660 }, { "epoch": 0.5181747873163186, "grad_norm": 0.9492205381393433, "learning_rate": 0.00019843601397435224, "loss": 0.0785, "step": 670 }, { "epoch": 0.525908739365816, "grad_norm": 0.6137520670890808, "learning_rate": 0.00019834455759771045, "loss": 0.0893, "step": 680 }, { "epoch": 0.5336426914153132, "grad_norm": 1.3644485473632812, "learning_rate": 0.00019825052492743628, "loss": 0.078, "step": 690 }, { "epoch": 0.5413766434648105, "grad_norm": 0.7063598036766052, "learning_rate": 0.00019815391842686655, "loss": 0.0882, "step": 700 }, { "epoch": 0.5491105955143079, "grad_norm": 0.6659109592437744, "learning_rate": 0.0001980547406267637, "loss": 0.0769, "step": 710 }, { "epoch": 0.5568445475638051, "grad_norm": 0.7407658100128174, "learning_rate": 0.00019795299412524945, "loss": 0.0755, "step": 720 }, { "epoch": 0.5645784996133024, "grad_norm": 0.6697968244552612, "learning_rate": 0.00019784868158773686, "loss": 0.0829, "step": 730 }, { "epoch": 0.5723124516627996, "grad_norm": 0.6077336668968201, "learning_rate": 0.00019774180574686038, "loss": 0.081, "step": 740 }, { "epoch": 0.580046403712297, "grad_norm": 0.9944800138473511, "learning_rate": 0.0001976323694024043, "loss": 0.0845, "step": 750 }, { "epoch": 0.5877803557617943, "grad_norm": 0.6956860423088074, "learning_rate": 0.00019752037542122942, "loss": 0.0837, "step": 760 }, { "epoch": 0.5955143078112916, "grad_norm": 0.8935562372207642, "learning_rate": 0.00019740582673719793, 
"loss": 0.0795, "step": 770 }, { "epoch": 0.6032482598607889, "grad_norm": 0.44260120391845703, "learning_rate": 0.00019728872635109662, "loss": 0.0838, "step": 780 }, { "epoch": 0.6109822119102861, "grad_norm": 1.0964995622634888, "learning_rate": 0.0001971690773305581, "loss": 0.0823, "step": 790 }, { "epoch": 0.6187161639597835, "grad_norm": 0.4430236518383026, "learning_rate": 0.0001970468828099807, "loss": 0.0721, "step": 800 }, { "epoch": 0.6264501160092807, "grad_norm": 0.6705779433250427, "learning_rate": 0.0001969221459904461, "loss": 0.0696, "step": 810 }, { "epoch": 0.634184068058778, "grad_norm": 0.6119517087936401, "learning_rate": 0.00019679487013963564, "loss": 0.0996, "step": 820 }, { "epoch": 0.6419180201082754, "grad_norm": 0.704067051410675, "learning_rate": 0.00019666505859174463, "loss": 0.0817, "step": 830 }, { "epoch": 0.6496519721577726, "grad_norm": 0.5037317872047424, "learning_rate": 0.00019653271474739503, "loss": 0.0725, "step": 840 }, { "epoch": 0.6573859242072699, "grad_norm": 0.6249669790267944, "learning_rate": 0.00019639784207354643, "loss": 0.0735, "step": 850 }, { "epoch": 0.6651198762567672, "grad_norm": 0.4149720370769501, "learning_rate": 0.00019626044410340514, "loss": 0.0704, "step": 860 }, { "epoch": 0.6728538283062645, "grad_norm": 0.7939838767051697, "learning_rate": 0.00019612052443633161, "loss": 0.0705, "step": 870 }, { "epoch": 0.6805877803557618, "grad_norm": 1.0307519435882568, "learning_rate": 0.0001959780867377463, "loss": 0.0886, "step": 880 }, { "epoch": 0.6883217324052591, "grad_norm": 0.7175227999687195, "learning_rate": 0.00019583313473903343, "loss": 0.0821, "step": 890 }, { "epoch": 0.6960556844547564, "grad_norm": 0.6045669913291931, "learning_rate": 0.00019568567223744339, "loss": 0.0823, "step": 900 }, { "epoch": 0.7037896365042536, "grad_norm": 0.5727024674415588, "learning_rate": 0.0001955357030959933, "loss": 0.075, "step": 910 }, { "epoch": 0.711523588553751, "grad_norm": 0.696556806564331, "learning_rate": 0.00019538323124336567, "loss": 0.0869, "step": 920 }, { "epoch": 0.7192575406032483, "grad_norm": 0.5535450577735901, "learning_rate": 0.00019522826067380552, "loss": 0.0933, "step": 930 }, { "epoch": 0.7269914926527455, "grad_norm": 0.7417379021644592, "learning_rate": 0.00019507079544701583, "loss": 0.077, "step": 940 }, { "epoch": 0.7347254447022429, "grad_norm": 0.9103347063064575, "learning_rate": 0.00019491083968805112, "loss": 0.0873, "step": 950 }, { "epoch": 0.7424593967517401, "grad_norm": 0.6831582188606262, "learning_rate": 0.0001947483975872094, "loss": 0.0802, "step": 960 }, { "epoch": 0.7501933488012374, "grad_norm": 0.5629430413246155, "learning_rate": 0.00019458347339992236, "loss": 0.0889, "step": 970 }, { "epoch": 0.7579273008507347, "grad_norm": 0.46935540437698364, "learning_rate": 0.00019441607144664397, "loss": 0.0751, "step": 980 }, { "epoch": 0.765661252900232, "grad_norm": 0.547493040561676, "learning_rate": 0.00019424619611273727, "loss": 0.0748, "step": 990 }, { "epoch": 0.7733952049497294, "grad_norm": 0.535914421081543, "learning_rate": 0.00019407385184835944, "loss": 0.0764, "step": 1000 }, { "epoch": 0.7811291569992266, "grad_norm": 0.45856887102127075, "learning_rate": 0.00019389904316834532, "loss": 0.0805, "step": 1010 }, { "epoch": 0.7888631090487239, "grad_norm": 0.7034686207771301, "learning_rate": 0.00019372177465208897, "loss": 0.0697, "step": 1020 }, { "epoch": 0.7965970610982211, "grad_norm": 0.8264154195785522, "learning_rate": 0.00019354205094342396, "loss": 0.0654, "step": 
1030 }, { "epoch": 0.8043310131477185, "grad_norm": 0.5145363211631775, "learning_rate": 0.00019335987675050137, "loss": 0.0638, "step": 1040 }, { "epoch": 0.8120649651972158, "grad_norm": 0.8528919816017151, "learning_rate": 0.00019317525684566685, "loss": 0.0748, "step": 1050 }, { "epoch": 0.819798917246713, "grad_norm": 0.4077950119972229, "learning_rate": 0.0001929881960653353, "loss": 0.0701, "step": 1060 }, { "epoch": 0.8275328692962104, "grad_norm": 0.515906035900116, "learning_rate": 0.00019279869930986427, "loss": 0.0686, "step": 1070 }, { "epoch": 0.8352668213457076, "grad_norm": 0.4109879434108734, "learning_rate": 0.00019260677154342564, "loss": 0.08, "step": 1080 }, { "epoch": 0.843000773395205, "grad_norm": 0.4209107458591461, "learning_rate": 0.00019241241779387544, "loss": 0.0661, "step": 1090 }, { "epoch": 0.8507347254447022, "grad_norm": 0.3800555169582367, "learning_rate": 0.00019221564315262236, "loss": 0.0652, "step": 1100 }, { "epoch": 0.8584686774941995, "grad_norm": 0.8711904883384705, "learning_rate": 0.0001920164527744941, "loss": 0.0627, "step": 1110 }, { "epoch": 0.8662026295436969, "grad_norm": 0.621431827545166, "learning_rate": 0.00019181485187760256, "loss": 0.0597, "step": 1120 }, { "epoch": 0.8739365815931941, "grad_norm": 0.6950638890266418, "learning_rate": 0.00019161084574320696, "loss": 0.0637, "step": 1130 }, { "epoch": 0.8816705336426914, "grad_norm": 0.6309072971343994, "learning_rate": 0.0001914044397155757, "loss": 0.0708, "step": 1140 }, { "epoch": 0.8894044856921887, "grad_norm": 0.5841305255889893, "learning_rate": 0.00019119563920184614, "loss": 0.0748, "step": 1150 }, { "epoch": 0.897138437741686, "grad_norm": 0.48138627409935, "learning_rate": 0.00019098444967188306, "loss": 0.0653, "step": 1160 }, { "epoch": 0.9048723897911833, "grad_norm": 0.9201337099075317, "learning_rate": 0.00019077087665813545, "loss": 0.0791, "step": 1170 }, { "epoch": 0.9126063418406806, "grad_norm": 0.7514659762382507, "learning_rate": 0.00019055492575549131, "loss": 0.0767, "step": 1180 }, { "epoch": 0.9203402938901779, "grad_norm": 0.6012793779373169, "learning_rate": 0.00019033660262113145, "loss": 0.0748, "step": 1190 }, { "epoch": 0.9280742459396751, "grad_norm": 0.6147593855857849, "learning_rate": 0.00019011591297438097, "loss": 0.0736, "step": 1200 }, { "epoch": 0.9358081979891725, "grad_norm": 0.37907811999320984, "learning_rate": 0.0001898928625965596, "loss": 0.0886, "step": 1210 }, { "epoch": 0.9435421500386698, "grad_norm": 0.7277001142501831, "learning_rate": 0.00018966745733083027, "loss": 0.0842, "step": 1220 }, { "epoch": 0.951276102088167, "grad_norm": 0.8345341086387634, "learning_rate": 0.00018943970308204583, "loss": 0.07, "step": 1230 }, { "epoch": 0.9590100541376644, "grad_norm": 0.654051661491394, "learning_rate": 0.00018920960581659472, "loss": 0.0771, "step": 1240 }, { "epoch": 0.9667440061871616, "grad_norm": 0.637369692325592, "learning_rate": 0.0001889771715622443, "loss": 0.0702, "step": 1250 }, { "epoch": 0.974477958236659, "grad_norm": 0.7220625877380371, "learning_rate": 0.00018874240640798316, "loss": 0.0736, "step": 1260 }, { "epoch": 0.9822119102861562, "grad_norm": 0.5539993047714233, "learning_rate": 0.00018850531650386153, "loss": 0.0693, "step": 1270 }, { "epoch": 0.9899458623356535, "grad_norm": 0.44338443875312805, "learning_rate": 0.00018826590806083032, "loss": 0.0723, "step": 1280 }, { "epoch": 0.9976798143851509, "grad_norm": 0.6246799826622009, "learning_rate": 0.00018802418735057815, "loss": 0.075, "step": 1290 }, { 
"epoch": 1.005413766434648, "grad_norm": 0.6295077204704285, "learning_rate": 0.00018778016070536717, "loss": 0.0593, "step": 1300 }, { "epoch": 1.0131477184841453, "grad_norm": 0.6199585795402527, "learning_rate": 0.0001875338345178674, "loss": 0.076, "step": 1310 }, { "epoch": 1.0208816705336428, "grad_norm": 0.48564451932907104, "learning_rate": 0.0001872852152409888, "loss": 0.072, "step": 1320 }, { "epoch": 1.02861562258314, "grad_norm": 0.7130405306816101, "learning_rate": 0.00018703430938771273, "loss": 0.074, "step": 1330 }, { "epoch": 1.0363495746326372, "grad_norm": 0.5083802342414856, "learning_rate": 0.0001867811235309209, "loss": 0.0709, "step": 1340 }, { "epoch": 1.0440835266821347, "grad_norm": 0.559683620929718, "learning_rate": 0.00018652566430322356, "loss": 0.0769, "step": 1350 }, { "epoch": 1.051817478731632, "grad_norm": 0.8660276532173157, "learning_rate": 0.00018626793839678538, "loss": 0.076, "step": 1360 }, { "epoch": 1.0595514307811291, "grad_norm": 0.8426743745803833, "learning_rate": 0.0001860079525631504, "loss": 0.0671, "step": 1370 }, { "epoch": 1.0672853828306264, "grad_norm": 0.7161004543304443, "learning_rate": 0.0001857457136130651, "loss": 0.0761, "step": 1380 }, { "epoch": 1.0750193348801238, "grad_norm": 0.5542100667953491, "learning_rate": 0.0001854812284163, "loss": 0.0774, "step": 1390 }, { "epoch": 1.082753286929621, "grad_norm": 0.8244419097900391, "learning_rate": 0.00018521450390146947, "loss": 0.0752, "step": 1400 }, { "epoch": 1.0904872389791183, "grad_norm": 0.8143429756164551, "learning_rate": 0.00018494554705585065, "loss": 0.0814, "step": 1410 }, { "epoch": 1.0982211910286157, "grad_norm": 0.49108588695526123, "learning_rate": 0.00018467436492520007, "loss": 0.067, "step": 1420 }, { "epoch": 1.105955143078113, "grad_norm": 0.7407925724983215, "learning_rate": 0.00018440096461356915, "loss": 0.0784, "step": 1430 }, { "epoch": 1.1136890951276102, "grad_norm": 0.43566811084747314, "learning_rate": 0.00018412535328311814, "loss": 0.0675, "step": 1440 }, { "epoch": 1.1214230471771076, "grad_norm": 0.7760009169578552, "learning_rate": 0.0001838475381539285, "loss": 0.0743, "step": 1450 }, { "epoch": 1.1291569992266048, "grad_norm": 0.5270483493804932, "learning_rate": 0.0001835675265038137, "loss": 0.0673, "step": 1460 }, { "epoch": 1.136890951276102, "grad_norm": 0.5115463137626648, "learning_rate": 0.00018328532566812866, "loss": 0.0592, "step": 1470 }, { "epoch": 1.1446249033255993, "grad_norm": 0.9868899583816528, "learning_rate": 0.00018300094303957747, "loss": 0.0671, "step": 1480 }, { "epoch": 1.1523588553750967, "grad_norm": 0.5406227111816406, "learning_rate": 0.00018271438606801986, "loss": 0.0739, "step": 1490 }, { "epoch": 1.160092807424594, "grad_norm": 0.7857775092124939, "learning_rate": 0.0001824256622602759, "loss": 0.0712, "step": 1500 }, { "epoch": 1.1678267594740912, "grad_norm": 0.8214250802993774, "learning_rate": 0.0001821347791799294, "loss": 0.0608, "step": 1510 }, { "epoch": 1.1755607115235884, "grad_norm": 1.1604512929916382, "learning_rate": 0.00018184174444712986, "loss": 0.0603, "step": 1520 }, { "epoch": 1.1832946635730859, "grad_norm": 0.6629369854927063, "learning_rate": 0.00018154656573839275, "loss": 0.065, "step": 1530 }, { "epoch": 1.191028615622583, "grad_norm": 0.5788053870201111, "learning_rate": 0.0001812492507863984, "loss": 0.0639, "step": 1540 }, { "epoch": 1.1987625676720803, "grad_norm": 0.5337408185005188, "learning_rate": 0.00018094980737978945, "loss": 0.0706, "step": 1550 }, { "epoch": 
1.2064965197215778, "grad_norm": 0.42491695284843445, "learning_rate": 0.0001806482433629669, "loss": 0.0667, "step": 1560 }, { "epoch": 1.214230471771075, "grad_norm": 0.5971820950508118, "learning_rate": 0.00018034456663588451, "loss": 0.065, "step": 1570 }, { "epoch": 1.2219644238205722, "grad_norm": 0.4656304717063904, "learning_rate": 0.00018003878515384178, "loss": 0.0708, "step": 1580 }, { "epoch": 1.2296983758700697, "grad_norm": 0.523751437664032, "learning_rate": 0.00017973090692727583, "loss": 0.0672, "step": 1590 }, { "epoch": 1.237432327919567, "grad_norm": 0.284178227186203, "learning_rate": 0.0001794209400215512, "loss": 0.0623, "step": 1600 }, { "epoch": 1.2451662799690641, "grad_norm": 0.6124892830848694, "learning_rate": 0.00017910889255674887, "loss": 0.0748, "step": 1610 }, { "epoch": 1.2529002320185616, "grad_norm": 0.9540546536445618, "learning_rate": 0.00017879477270745328, "loss": 0.0681, "step": 1620 }, { "epoch": 1.2606341840680588, "grad_norm": 0.46374186873435974, "learning_rate": 0.0001784785887025384, "loss": 0.0677, "step": 1630 }, { "epoch": 1.268368136117556, "grad_norm": 0.393171489238739, "learning_rate": 0.0001781603488249521, "loss": 0.0702, "step": 1640 }, { "epoch": 1.2761020881670533, "grad_norm": 0.4648221731185913, "learning_rate": 0.000177840061411499, "loss": 0.0693, "step": 1650 }, { "epoch": 1.2838360402165507, "grad_norm": 0.635676383972168, "learning_rate": 0.00017751773485262233, "loss": 0.0704, "step": 1660 }, { "epoch": 1.291569992266048, "grad_norm": 0.8249944448471069, "learning_rate": 0.00017719337759218394, "loss": 0.067, "step": 1670 }, { "epoch": 1.2993039443155452, "grad_norm": 0.5957729816436768, "learning_rate": 0.00017686699812724326, "loss": 0.062, "step": 1680 }, { "epoch": 1.3070378963650424, "grad_norm": 0.5320748090744019, "learning_rate": 0.0001765386050078345, "loss": 0.0756, "step": 1690 }, { "epoch": 1.3147718484145399, "grad_norm": 0.6222145557403564, "learning_rate": 0.00017620820683674287, "loss": 0.0637, "step": 1700 }, { "epoch": 1.322505800464037, "grad_norm": 0.6644905805587769, "learning_rate": 0.0001758758122692791, "loss": 0.0629, "step": 1710 }, { "epoch": 1.3302397525135343, "grad_norm": 0.4027056396007538, "learning_rate": 0.0001755414300130527, "loss": 0.0699, "step": 1720 }, { "epoch": 1.3379737045630318, "grad_norm": 0.49972447752952576, "learning_rate": 0.00017520506882774393, "loss": 0.0553, "step": 1730 }, { "epoch": 1.345707656612529, "grad_norm": 0.5508860945701599, "learning_rate": 0.00017486673752487424, "loss": 0.0804, "step": 1740 }, { "epoch": 1.3534416086620262, "grad_norm": 0.5580312609672546, "learning_rate": 0.0001745264449675755, "loss": 0.0653, "step": 1750 }, { "epoch": 1.3611755607115237, "grad_norm": 0.6820082068443298, "learning_rate": 0.00017418420007035774, "loss": 0.0582, "step": 1760 }, { "epoch": 1.368909512761021, "grad_norm": 0.7619492411613464, "learning_rate": 0.0001738400117988757, "loss": 0.066, "step": 1770 }, { "epoch": 1.3766434648105181, "grad_norm": 0.4091757833957672, "learning_rate": 0.000173493889169694, "loss": 0.0617, "step": 1780 }, { "epoch": 1.3843774168600156, "grad_norm": 0.48953068256378174, "learning_rate": 0.00017314584125005075, "loss": 0.0733, "step": 1790 }, { "epoch": 1.3921113689095128, "grad_norm": 0.4602923095226288, "learning_rate": 0.00017279587715762022, "loss": 0.0562, "step": 1800 }, { "epoch": 1.39984532095901, "grad_norm": 0.7369881868362427, "learning_rate": 0.00017244400606027381, "loss": 0.0614, "step": 1810 }, { "epoch": 
1.4075792730085073, "grad_norm": 0.3813045024871826, "learning_rate": 0.00017209023717584013, "loss": 0.0565, "step": 1820 }, { "epoch": 1.4153132250580047, "grad_norm": 0.4860456883907318, "learning_rate": 0.00017173457977186316, "loss": 0.0673, "step": 1830 }, { "epoch": 1.423047177107502, "grad_norm": 0.4667312204837799, "learning_rate": 0.00017137704316535989, "loss": 0.065, "step": 1840 }, { "epoch": 1.4307811291569992, "grad_norm": 0.3896380066871643, "learning_rate": 0.00017101763672257593, "loss": 0.065, "step": 1850 }, { "epoch": 1.4385150812064964, "grad_norm": 0.6005688309669495, "learning_rate": 0.00017065636985874027, "loss": 0.0706, "step": 1860 }, { "epoch": 1.4462490332559939, "grad_norm": 0.9429843425750732, "learning_rate": 0.0001702932520378186, "loss": 0.062, "step": 1870 }, { "epoch": 1.453982985305491, "grad_norm": 0.4232138395309448, "learning_rate": 0.00016992829277226546, "loss": 0.0604, "step": 1880 }, { "epoch": 1.4617169373549883, "grad_norm": 0.36831173300743103, "learning_rate": 0.0001695615016227749, "loss": 0.06, "step": 1890 }, { "epoch": 1.4694508894044858, "grad_norm": 0.40997442603111267, "learning_rate": 0.00016919288819803024, "loss": 0.0638, "step": 1900 }, { "epoch": 1.477184841453983, "grad_norm": 0.48790526390075684, "learning_rate": 0.00016882246215445208, "loss": 0.0501, "step": 1910 }, { "epoch": 1.4849187935034802, "grad_norm": 0.5824405550956726, "learning_rate": 0.00016845023319594557, "loss": 0.0608, "step": 1920 }, { "epoch": 1.4926527455529777, "grad_norm": 0.3718617260456085, "learning_rate": 0.00016807621107364613, "loss": 0.0591, "step": 1930 }, { "epoch": 1.500386697602475, "grad_norm": 0.6794911623001099, "learning_rate": 0.00016770040558566394, "loss": 0.0613, "step": 1940 }, { "epoch": 1.5081206496519721, "grad_norm": 0.45372748374938965, "learning_rate": 0.00016732282657682732, "loss": 0.0695, "step": 1950 }, { "epoch": 1.5158546017014696, "grad_norm": 0.4764237701892853, "learning_rate": 0.00016694348393842475, "loss": 0.0565, "step": 1960 }, { "epoch": 1.5235885537509666, "grad_norm": 0.566295325756073, "learning_rate": 0.000166562387607946, "loss": 0.0731, "step": 1970 }, { "epoch": 1.531322505800464, "grad_norm": 0.4176735281944275, "learning_rate": 0.00016617954756882144, "loss": 0.0571, "step": 1980 }, { "epoch": 1.5390564578499615, "grad_norm": 0.5348420143127441, "learning_rate": 0.00016579497385016073, "loss": 0.0566, "step": 1990 }, { "epoch": 1.5467904098994585, "grad_norm": 0.6526331901550293, "learning_rate": 0.00016540867652649013, "loss": 0.0643, "step": 2000 }, { "epoch": 1.554524361948956, "grad_norm": 0.2983779013156891, "learning_rate": 0.00016502066571748842, "loss": 0.0634, "step": 2010 }, { "epoch": 1.5622583139984532, "grad_norm": 0.6886798143386841, "learning_rate": 0.00016463095158772187, "loss": 0.0565, "step": 2020 }, { "epoch": 1.5699922660479504, "grad_norm": 0.6495068073272705, "learning_rate": 0.0001642395443463781, "loss": 0.0664, "step": 2030 }, { "epoch": 1.5777262180974478, "grad_norm": 0.47325339913368225, "learning_rate": 0.00016384645424699835, "loss": 0.0589, "step": 2040 }, { "epoch": 1.585460170146945, "grad_norm": 0.4646705389022827, "learning_rate": 0.0001634516915872091, "loss": 0.057, "step": 2050 }, { "epoch": 1.5931941221964423, "grad_norm": 0.43619847297668457, "learning_rate": 0.00016305526670845226, "loss": 0.058, "step": 2060 }, { "epoch": 1.6009280742459397, "grad_norm": 0.6316474080085754, "learning_rate": 0.00016265718999571415, "loss": 0.0587, "step": 2070 }, { "epoch": 
1.608662026295437, "grad_norm": 0.47097697854042053, "learning_rate": 0.00016225747187725368, "loss": 0.0604, "step": 2080 }, { "epoch": 1.6163959783449342, "grad_norm": 0.6374630331993103, "learning_rate": 0.00016185612282432885, "loss": 0.0514, "step": 2090 }, { "epoch": 1.6241299303944317, "grad_norm": 0.30891451239585876, "learning_rate": 0.0001614531533509227, "loss": 0.0602, "step": 2100 }, { "epoch": 1.6318638824439289, "grad_norm": 0.703156590461731, "learning_rate": 0.0001610485740134678, "loss": 0.0631, "step": 2110 }, { "epoch": 1.639597834493426, "grad_norm": 0.5643807053565979, "learning_rate": 0.00016064239541056964, "loss": 0.0727, "step": 2120 }, { "epoch": 1.6473317865429236, "grad_norm": 0.494285523891449, "learning_rate": 0.00016023462818272907, "loss": 0.059, "step": 2130 }, { "epoch": 1.6550657385924206, "grad_norm": 0.43368974328041077, "learning_rate": 0.0001598252830120636, "loss": 0.0615, "step": 2140 }, { "epoch": 1.662799690641918, "grad_norm": 0.35887664556503296, "learning_rate": 0.0001594143706220273, "loss": 0.06, "step": 2150 }, { "epoch": 1.6705336426914155, "grad_norm": 0.39203256368637085, "learning_rate": 0.00015900190177713016, "loss": 0.0651, "step": 2160 }, { "epoch": 1.6782675947409125, "grad_norm": 0.6477435231208801, "learning_rate": 0.0001585878872826561, "loss": 0.06, "step": 2170 }, { "epoch": 1.68600154679041, "grad_norm": 0.6552746295928955, "learning_rate": 0.00015817233798437968, "loss": 0.0481, "step": 2180 }, { "epoch": 1.6937354988399071, "grad_norm": 0.39788496494293213, "learning_rate": 0.0001577552647682822, "loss": 0.0563, "step": 2190 }, { "epoch": 1.7014694508894044, "grad_norm": 0.456989586353302, "learning_rate": 0.00015733667856026635, "loss": 0.0548, "step": 2200 }, { "epoch": 1.7092034029389018, "grad_norm": 0.4791859984397888, "learning_rate": 0.0001569165903258701, "loss": 0.0621, "step": 2210 }, { "epoch": 1.716937354988399, "grad_norm": 0.33941400051116943, "learning_rate": 0.00015649501106997953, "loss": 0.0628, "step": 2220 }, { "epoch": 1.7246713070378963, "grad_norm": 0.6558396816253662, "learning_rate": 0.00015607195183654025, "loss": 0.0666, "step": 2230 }, { "epoch": 1.7324052590873937, "grad_norm": 0.4829707145690918, "learning_rate": 0.0001556474237082683, "loss": 0.0546, "step": 2240 }, { "epoch": 1.740139211136891, "grad_norm": 0.5314657092094421, "learning_rate": 0.0001552214378063599, "loss": 0.0701, "step": 2250 }, { "epoch": 1.7478731631863882, "grad_norm": 0.45208579301834106, "learning_rate": 0.00015479400529019985, "loss": 0.0696, "step": 2260 }, { "epoch": 1.7556071152358856, "grad_norm": 0.6665327548980713, "learning_rate": 0.0001543651373570694, "loss": 0.0627, "step": 2270 }, { "epoch": 1.7633410672853829, "grad_norm": 0.798313558101654, "learning_rate": 0.00015393484524185288, "loss": 0.06, "step": 2280 }, { "epoch": 1.77107501933488, "grad_norm": 0.451084166765213, "learning_rate": 0.00015350314021674323, "loss": 0.0566, "step": 2290 }, { "epoch": 1.7788089713843775, "grad_norm": 0.36802592873573303, "learning_rate": 0.000153070033590947, "loss": 0.0556, "step": 2300 }, { "epoch": 1.7865429234338746, "grad_norm": 0.3082532286643982, "learning_rate": 0.0001526355367103878, "loss": 0.0564, "step": 2310 }, { "epoch": 1.794276875483372, "grad_norm": 0.41715461015701294, "learning_rate": 0.00015219966095740927, "loss": 0.0576, "step": 2320 }, { "epoch": 1.8020108275328695, "grad_norm": 0.4718796908855438, "learning_rate": 0.0001517624177504768, "loss": 0.0634, "step": 2330 }, { "epoch": 
1.8097447795823665, "grad_norm": 0.4008396565914154, "learning_rate": 0.0001513238185438784, "loss": 0.0623, "step": 2340 }, { "epoch": 1.817478731631864, "grad_norm": 0.4633011221885681, "learning_rate": 0.0001508838748274247, "loss": 0.0667, "step": 2350 }, { "epoch": 1.8252126836813611, "grad_norm": 0.5459713935852051, "learning_rate": 0.00015044259812614793, "loss": 0.067, "step": 2360 }, { "epoch": 1.8329466357308584, "grad_norm": 0.3791009783744812, "learning_rate": 0.00015000000000000001, "loss": 0.0598, "step": 2370 }, { "epoch": 1.8406805877803558, "grad_norm": 0.3739205598831177, "learning_rate": 0.00014955609204354966, "loss": 0.0623, "step": 2380 }, { "epoch": 1.848414539829853, "grad_norm": 0.3235006630420685, "learning_rate": 0.00014911088588567877, "loss": 0.0638, "step": 2390 }, { "epoch": 1.8561484918793503, "grad_norm": 0.39373257756233215, "learning_rate": 0.00014866439318927762, "loss": 0.0511, "step": 2400 }, { "epoch": 1.8638824439288477, "grad_norm": 0.3228203058242798, "learning_rate": 0.0001482166256509395, "loss": 0.0584, "step": 2410 }, { "epoch": 1.871616395978345, "grad_norm": 0.5671100616455078, "learning_rate": 0.00014776759500065428, "loss": 0.0581, "step": 2420 }, { "epoch": 1.8793503480278422, "grad_norm": 0.4199039936065674, "learning_rate": 0.0001473173130015009, "loss": 0.0548, "step": 2430 }, { "epoch": 1.8870843000773396, "grad_norm": 0.6057584285736084, "learning_rate": 0.0001468657914493396, "loss": 0.0559, "step": 2440 }, { "epoch": 1.8948182521268369, "grad_norm": 0.24723778665065765, "learning_rate": 0.00014641304217250252, "loss": 0.0577, "step": 2450 }, { "epoch": 1.902552204176334, "grad_norm": 0.4581877589225769, "learning_rate": 0.0001459590770314841, "loss": 0.0567, "step": 2460 }, { "epoch": 1.9102861562258315, "grad_norm": 0.44455474615097046, "learning_rate": 0.00014550390791863045, "loss": 0.0555, "step": 2470 }, { "epoch": 1.9180201082753285, "grad_norm": 0.5755113363265991, "learning_rate": 0.0001450475467578273, "loss": 0.0565, "step": 2480 }, { "epoch": 1.925754060324826, "grad_norm": 0.30566319823265076, "learning_rate": 0.00014459000550418836, "loss": 0.0584, "step": 2490 }, { "epoch": 1.9334880123743234, "grad_norm": 0.3295874297618866, "learning_rate": 0.00014413129614374148, "loss": 0.0511, "step": 2500 }, { "epoch": 1.9412219644238204, "grad_norm": 0.49240174889564514, "learning_rate": 0.00014367143069311515, "loss": 0.0617, "step": 2510 }, { "epoch": 1.948955916473318, "grad_norm": 0.4429064393043518, "learning_rate": 0.00014321042119922337, "loss": 0.0502, "step": 2520 }, { "epoch": 1.9566898685228151, "grad_norm": 0.8340647220611572, "learning_rate": 0.00014274827973895026, "loss": 0.0583, "step": 2530 }, { "epoch": 1.9644238205723124, "grad_norm": 0.38168779015541077, "learning_rate": 0.0001422850184188336, "loss": 0.0689, "step": 2540 }, { "epoch": 1.9721577726218098, "grad_norm": 0.40917128324508667, "learning_rate": 0.00014182064937474763, "loss": 0.0554, "step": 2550 }, { "epoch": 1.979891724671307, "grad_norm": 0.5609645843505859, "learning_rate": 0.00014135518477158537, "loss": 0.0647, "step": 2560 }, { "epoch": 1.9876256767208043, "grad_norm": 0.3516216576099396, "learning_rate": 0.00014088863680293955, "loss": 0.0638, "step": 2570 }, { "epoch": 1.9953596287703017, "grad_norm": 0.49886783957481384, "learning_rate": 0.00014042101769078355, "loss": 0.0537, "step": 2580 }, { "epoch": 2.0030935808197987, "grad_norm": 0.5100917816162109, "learning_rate": 0.00013995233968515104, "loss": 0.0518, "step": 2590 }, { "epoch": 
2.010827532869296, "grad_norm": 0.48347681760787964, "learning_rate": 0.00013948261506381508, "loss": 0.0702, "step": 2600 }, { "epoch": 2.0185614849187936, "grad_norm": 0.6293825507164001, "learning_rate": 0.00013901185613196654, "loss": 0.0498, "step": 2610 }, { "epoch": 2.0262954369682906, "grad_norm": 0.5591667890548706, "learning_rate": 0.00013854007522189172, "loss": 0.0486, "step": 2620 }, { "epoch": 2.034029389017788, "grad_norm": 0.29898470640182495, "learning_rate": 0.0001380672846926492, "loss": 0.0588, "step": 2630 }, { "epoch": 2.0417633410672855, "grad_norm": 0.5507124066352844, "learning_rate": 0.00013759349692974628, "loss": 0.054, "step": 2640 }, { "epoch": 2.0494972931167825, "grad_norm": 0.3150232434272766, "learning_rate": 0.0001371187243448143, "loss": 0.0521, "step": 2650 }, { "epoch": 2.05723124516628, "grad_norm": 0.6643772721290588, "learning_rate": 0.00013664297937528364, "loss": 0.0548, "step": 2660 }, { "epoch": 2.0649651972157774, "grad_norm": 0.4388512670993805, "learning_rate": 0.0001361662744840579, "loss": 0.0558, "step": 2670 }, { "epoch": 2.0726991492652744, "grad_norm": 0.6543988585472107, "learning_rate": 0.00013568862215918717, "loss": 0.0555, "step": 2680 }, { "epoch": 2.080433101314772, "grad_norm": 0.8125224709510803, "learning_rate": 0.0001352100349135414, "loss": 0.0537, "step": 2690 }, { "epoch": 2.0881670533642693, "grad_norm": 0.41702935099601746, "learning_rate": 0.00013473052528448201, "loss": 0.0513, "step": 2700 }, { "epoch": 2.0959010054137663, "grad_norm": 0.5020561218261719, "learning_rate": 0.00013425010583353392, "loss": 0.0568, "step": 2710 }, { "epoch": 2.103634957463264, "grad_norm": 0.48202699422836304, "learning_rate": 0.0001337687891460562, "loss": 0.0522, "step": 2720 }, { "epoch": 2.111368909512761, "grad_norm": 0.5081518292427063, "learning_rate": 0.0001332865878309125, "loss": 0.0461, "step": 2730 }, { "epoch": 2.1191028615622582, "grad_norm": 0.3508821129798889, "learning_rate": 0.00013280351452014065, "loss": 0.0517, "step": 2740 }, { "epoch": 2.1268368136117557, "grad_norm": 0.4882681965827942, "learning_rate": 0.00013231958186862196, "loss": 0.062, "step": 2750 }, { "epoch": 2.1345707656612527, "grad_norm": 0.5338916778564453, "learning_rate": 0.0001318348025537494, "loss": 0.0554, "step": 2760 }, { "epoch": 2.14230471771075, "grad_norm": 0.36750414967536926, "learning_rate": 0.0001313491892750957, "loss": 0.0519, "step": 2770 }, { "epoch": 2.1500386697602476, "grad_norm": 0.4782717823982239, "learning_rate": 0.00013086275475408064, "loss": 0.0612, "step": 2780 }, { "epoch": 2.1577726218097446, "grad_norm": 0.43543562293052673, "learning_rate": 0.00013037551173363774, "loss": 0.06, "step": 2790 }, { "epoch": 2.165506573859242, "grad_norm": 0.3427865207195282, "learning_rate": 0.0001298874729778804, "loss": 0.056, "step": 2800 }, { "epoch": 2.1732405259087395, "grad_norm": 0.2854020297527313, "learning_rate": 0.0001293986512717677, "loss": 0.0513, "step": 2810 }, { "epoch": 2.1809744779582365, "grad_norm": 0.43441343307495117, "learning_rate": 0.00012890905942076927, "loss": 0.0504, "step": 2820 }, { "epoch": 2.188708430007734, "grad_norm": 0.4922176003456116, "learning_rate": 0.00012841871025052996, "loss": 0.0519, "step": 2830 }, { "epoch": 2.1964423820572314, "grad_norm": 0.6458988785743713, "learning_rate": 0.00012792761660653383, "loss": 0.0555, "step": 2840 }, { "epoch": 2.2041763341067284, "grad_norm": 0.8427999019622803, "learning_rate": 0.0001274357913537676, "loss": 0.0596, "step": 2850 }, { "epoch": 
2.211910286156226, "grad_norm": 0.41042545437812805, "learning_rate": 0.00012694324737638364, "loss": 0.0481, "step": 2860 }, { "epoch": 2.2196442382057233, "grad_norm": 0.609891951084137, "learning_rate": 0.00012644999757736248, "loss": 0.0549, "step": 2870 }, { "epoch": 2.2273781902552203, "grad_norm": 0.7705824375152588, "learning_rate": 0.00012595605487817482, "loss": 0.0601, "step": 2880 }, { "epoch": 2.2351121423047178, "grad_norm": 0.5561114549636841, "learning_rate": 0.00012546143221844295, "loss": 0.0539, "step": 2890 }, { "epoch": 2.2428460943542152, "grad_norm": 0.435332715511322, "learning_rate": 0.00012496614255560183, "loss": 0.0548, "step": 2900 }, { "epoch": 2.2505800464037122, "grad_norm": 0.9199612140655518, "learning_rate": 0.0001244701988645596, "loss": 0.0598, "step": 2910 }, { "epoch": 2.2583139984532097, "grad_norm": 0.46239349246025085, "learning_rate": 0.00012397361413735784, "loss": 0.056, "step": 2920 }, { "epoch": 2.266047950502707, "grad_norm": 0.4475820064544678, "learning_rate": 0.00012347640138283094, "loss": 0.0437, "step": 2930 }, { "epoch": 2.273781902552204, "grad_norm": 0.33769491314888, "learning_rate": 0.0001229785736262656, "loss": 0.0566, "step": 2940 }, { "epoch": 2.2815158546017016, "grad_norm": 0.39393454790115356, "learning_rate": 0.0001224801439090594, "loss": 0.0703, "step": 2950 }, { "epoch": 2.2892498066511986, "grad_norm": 0.336415559053421, "learning_rate": 0.0001219811252883793, "loss": 0.0553, "step": 2960 }, { "epoch": 2.296983758700696, "grad_norm": 1.0814809799194336, "learning_rate": 0.00012148153083681954, "loss": 0.0514, "step": 2970 }, { "epoch": 2.3047177107501935, "grad_norm": 0.36882534623146057, "learning_rate": 0.00012098137364205915, "loss": 0.0517, "step": 2980 }, { "epoch": 2.3124516627996905, "grad_norm": 0.47040390968322754, "learning_rate": 0.00012048066680651908, "loss": 0.0568, "step": 2990 }, { "epoch": 2.320185614849188, "grad_norm": 0.35711705684661865, "learning_rate": 0.00011997942344701906, "loss": 0.0616, "step": 3000 }, { "epoch": 2.3279195668986854, "grad_norm": 0.3350996971130371, "learning_rate": 0.00011947765669443396, "loss": 0.063, "step": 3010 }, { "epoch": 2.3356535189481824, "grad_norm": 0.38386791944503784, "learning_rate": 0.00011897537969334967, "loss": 0.0563, "step": 3020 }, { "epoch": 2.34338747099768, "grad_norm": 0.30716222524642944, "learning_rate": 0.00011847260560171896, "loss": 0.0511, "step": 3030 }, { "epoch": 2.351121423047177, "grad_norm": 0.48992228507995605, "learning_rate": 0.00011796934759051659, "loss": 0.0422, "step": 3040 }, { "epoch": 2.3588553750966743, "grad_norm": 0.3818512260913849, "learning_rate": 0.00011746561884339444, "loss": 0.0545, "step": 3050 }, { "epoch": 2.3665893271461718, "grad_norm": 0.5619115233421326, "learning_rate": 0.00011696143255633607, "loss": 0.042, "step": 3060 }, { "epoch": 2.3743232791956688, "grad_norm": 0.5043448209762573, "learning_rate": 0.00011645680193731103, "loss": 0.0455, "step": 3070 }, { "epoch": 2.382057231245166, "grad_norm": 0.2971482276916504, "learning_rate": 0.00011595174020592878, "loss": 0.0586, "step": 3080 }, { "epoch": 2.3897911832946637, "grad_norm": 0.3022650182247162, "learning_rate": 0.0001154462605930926, "loss": 0.0502, "step": 3090 }, { "epoch": 2.3975251353441607, "grad_norm": 0.5461617708206177, "learning_rate": 0.00011494037634065271, "loss": 0.0568, "step": 3100 }, { "epoch": 2.405259087393658, "grad_norm": 0.5164432525634766, "learning_rate": 0.00011443410070105962, "loss": 0.0451, "step": 3110 }, { "epoch": 
2.4129930394431556, "grad_norm": 0.22362294793128967, "learning_rate": 0.00011392744693701682, "loss": 0.0461, "step": 3120 }, { "epoch": 2.4207269914926526, "grad_norm": 0.6176216006278992, "learning_rate": 0.00011342042832113336, "loss": 0.0572, "step": 3130 }, { "epoch": 2.42846094354215, "grad_norm": 0.48704126477241516, "learning_rate": 0.00011291305813557615, "loss": 0.044, "step": 3140 }, { "epoch": 2.4361948955916475, "grad_norm": 0.6449659466743469, "learning_rate": 0.0001124053496717221, "loss": 0.056, "step": 3150 }, { "epoch": 2.4439288476411445, "grad_norm": 0.4028393626213074, "learning_rate": 0.00011189731622980978, "loss": 0.0479, "step": 3160 }, { "epoch": 2.451662799690642, "grad_norm": 0.30113962292671204, "learning_rate": 0.0001113889711185912, "loss": 0.0548, "step": 3170 }, { "epoch": 2.4593967517401394, "grad_norm": 0.32609930634498596, "learning_rate": 0.00011088032765498291, "loss": 0.0486, "step": 3180 }, { "epoch": 2.4671307037896364, "grad_norm": 0.49029791355133057, "learning_rate": 0.00011037139916371734, "loss": 0.0413, "step": 3190 }, { "epoch": 2.474864655839134, "grad_norm": 0.3671391010284424, "learning_rate": 0.00010986219897699375, "loss": 0.0541, "step": 3200 }, { "epoch": 2.4825986078886313, "grad_norm": 0.2625034749507904, "learning_rate": 0.00010935274043412876, "loss": 0.0453, "step": 3210 }, { "epoch": 2.4903325599381283, "grad_norm": 0.6732146143913269, "learning_rate": 0.00010884303688120714, "loss": 0.0551, "step": 3220 }, { "epoch": 2.4980665119876257, "grad_norm": 0.4208376109600067, "learning_rate": 0.00010833310167073209, "loss": 0.048, "step": 3230 }, { "epoch": 2.505800464037123, "grad_norm": 0.4626869261264801, "learning_rate": 0.0001078229481612754, "loss": 0.0414, "step": 3240 }, { "epoch": 2.51353441608662, "grad_norm": 0.5585875511169434, "learning_rate": 0.00010731258971712761, "loss": 0.0493, "step": 3250 }, { "epoch": 2.5212683681361177, "grad_norm": 0.5375910997390747, "learning_rate": 0.0001068020397079478, "loss": 0.0496, "step": 3260 }, { "epoch": 2.529002320185615, "grad_norm": 0.3865591883659363, "learning_rate": 0.00010629131150841343, "loss": 0.0504, "step": 3270 }, { "epoch": 2.536736272235112, "grad_norm": 0.44638946652412415, "learning_rate": 0.00010578041849786995, "loss": 0.0548, "step": 3280 }, { "epoch": 2.5444702242846096, "grad_norm": 0.3882666826248169, "learning_rate": 0.0001052693740599803, "loss": 0.0467, "step": 3290 }, { "epoch": 2.5522041763341066, "grad_norm": 0.613440752029419, "learning_rate": 0.00010475819158237425, "loss": 0.0574, "step": 3300 }, { "epoch": 2.559938128383604, "grad_norm": 0.4089605510234833, "learning_rate": 0.00010424688445629782, "loss": 0.0508, "step": 3310 }, { "epoch": 2.5676720804331015, "grad_norm": 0.28633350133895874, "learning_rate": 0.00010373546607626236, "loss": 0.0443, "step": 3320 }, { "epoch": 2.5754060324825985, "grad_norm": 0.4307003319263458, "learning_rate": 0.00010322394983969368, "loss": 0.0511, "step": 3330 }, { "epoch": 2.583139984532096, "grad_norm": 0.4238097369670868, "learning_rate": 0.00010271234914658117, "loss": 0.0479, "step": 3340 }, { "epoch": 2.590873936581593, "grad_norm": 0.3680228590965271, "learning_rate": 0.00010220067739912656, "loss": 0.045, "step": 3350 }, { "epoch": 2.5986078886310904, "grad_norm": 0.5732664465904236, "learning_rate": 0.0001016889480013931, "loss": 0.0504, "step": 3360 }, { "epoch": 2.606341840680588, "grad_norm": 0.6249806880950928, "learning_rate": 0.00010117717435895425, "loss": 0.049, "step": 3370 }, { "epoch": 
2.614075792730085, "grad_norm": 0.3082541525363922, "learning_rate": 0.0001006653698785424, "loss": 0.0515, "step": 3380 }, { "epoch": 2.6218097447795823, "grad_norm": 0.41302812099456787, "learning_rate": 0.00010015354796769802, "loss": 0.0497, "step": 3390 }, { "epoch": 2.6295436968290797, "grad_norm": 0.3680625855922699, "learning_rate": 9.964172203441799e-05, "loss": 0.0526, "step": 3400 }, { "epoch": 2.6372776488785767, "grad_norm": 0.62422776222229, "learning_rate": 9.912990548680472e-05, "loss": 0.0508, "step": 3410 }, { "epoch": 2.645011600928074, "grad_norm": 0.37407588958740234, "learning_rate": 9.861811173271459e-05, "loss": 0.0499, "step": 3420 }, { "epoch": 2.6527455529775716, "grad_norm": 0.5591204762458801, "learning_rate": 9.8106354179407e-05, "loss": 0.0543, "step": 3430 }, { "epoch": 2.6604795050270686, "grad_norm": 0.5074604749679565, "learning_rate": 9.759464623319302e-05, "loss": 0.0554, "step": 3440 }, { "epoch": 2.668213457076566, "grad_norm": 0.5068333745002747, "learning_rate": 9.708300129908403e-05, "loss": 0.0502, "step": 3450 }, { "epoch": 2.6759474091260635, "grad_norm": 0.32990241050720215, "learning_rate": 9.657143278044085e-05, "loss": 0.0428, "step": 3460 }, { "epoch": 2.6836813611755606, "grad_norm": 0.38101649284362793, "learning_rate": 9.605995407862247e-05, "loss": 0.0522, "step": 3470 }, { "epoch": 2.691415313225058, "grad_norm": 0.482260137796402, "learning_rate": 9.554857859263486e-05, "loss": 0.0457, "step": 3480 }, { "epoch": 2.6991492652745555, "grad_norm": 0.3293984532356262, "learning_rate": 9.503731971878022e-05, "loss": 0.0452, "step": 3490 }, { "epoch": 2.7068832173240525, "grad_norm": 0.6199944019317627, "learning_rate": 9.452619085030588e-05, "loss": 0.0506, "step": 3500 }, { "epoch": 2.71461716937355, "grad_norm": 0.4337422251701355, "learning_rate": 9.401520537705339e-05, "loss": 0.0484, "step": 3510 }, { "epoch": 2.7223511214230474, "grad_norm": 0.40171828866004944, "learning_rate": 9.350437668510794e-05, "loss": 0.054, "step": 3520 }, { "epoch": 2.7300850734725444, "grad_norm": 0.40027546882629395, "learning_rate": 9.299371815644749e-05, "loss": 0.0455, "step": 3530 }, { "epoch": 2.737819025522042, "grad_norm": 0.612714946269989, "learning_rate": 9.248324316859237e-05, "loss": 0.0502, "step": 3540 }, { "epoch": 2.7455529775715393, "grad_norm": 0.45406460762023926, "learning_rate": 9.197296509425471e-05, "loss": 0.0454, "step": 3550 }, { "epoch": 2.7532869296210363, "grad_norm": 0.703279972076416, "learning_rate": 9.14628973009882e-05, "loss": 0.0494, "step": 3560 }, { "epoch": 2.7610208816705337, "grad_norm": 0.5926344990730286, "learning_rate": 9.095305315083795e-05, "loss": 0.0544, "step": 3570 }, { "epoch": 2.768754833720031, "grad_norm": 0.43660876154899597, "learning_rate": 9.04434459999902e-05, "loss": 0.0459, "step": 3580 }, { "epoch": 2.776488785769528, "grad_norm": 0.310839980840683, "learning_rate": 8.993408919842276e-05, "loss": 0.0462, "step": 3590 }, { "epoch": 2.7842227378190256, "grad_norm": 0.39856886863708496, "learning_rate": 8.942499608955516e-05, "loss": 0.0509, "step": 3600 }, { "epoch": 2.791956689868523, "grad_norm": 0.4131952226161957, "learning_rate": 8.891618000989891e-05, "loss": 0.047, "step": 3610 }, { "epoch": 2.79969064191802, "grad_norm": 0.512795627117157, "learning_rate": 8.840765428870845e-05, "loss": 0.0453, "step": 3620 }, { "epoch": 2.8074245939675175, "grad_norm": 0.5774818658828735, "learning_rate": 8.789943224763182e-05, "loss": 0.0429, "step": 3630 }, { "epoch": 2.8151585460170145, "grad_norm": 
0.5578257441520691, "learning_rate": 8.73915272003615e-05, "loss": 0.0457, "step": 3640 }, { "epoch": 2.822892498066512, "grad_norm": 0.47416234016418457, "learning_rate": 8.6883952452286e-05, "loss": 0.0483, "step": 3650 }, { "epoch": 2.8306264501160094, "grad_norm": 0.42786017060279846, "learning_rate": 8.637672130014105e-05, "loss": 0.0471, "step": 3660 }, { "epoch": 2.8383604021655064, "grad_norm": 0.3835844099521637, "learning_rate": 8.586984703166126e-05, "loss": 0.044, "step": 3670 }, { "epoch": 2.846094354215004, "grad_norm": 0.8271371126174927, "learning_rate": 8.536334292523216e-05, "loss": 0.0458, "step": 3680 }, { "epoch": 2.853828306264501, "grad_norm": 0.5438926815986633, "learning_rate": 8.485722224954237e-05, "loss": 0.0438, "step": 3690 }, { "epoch": 2.8615622583139984, "grad_norm": 0.31754782795906067, "learning_rate": 8.435149826323574e-05, "loss": 0.0432, "step": 3700 }, { "epoch": 2.869296210363496, "grad_norm": 0.3482377827167511, "learning_rate": 8.384618421456436e-05, "loss": 0.0499, "step": 3710 }, { "epoch": 2.877030162412993, "grad_norm": 0.37949004769325256, "learning_rate": 8.33412933410413e-05, "loss": 0.051, "step": 3720 }, { "epoch": 2.8847641144624903, "grad_norm": 0.4423973560333252, "learning_rate": 8.283683886909385e-05, "loss": 0.0438, "step": 3730 }, { "epoch": 2.8924980665119877, "grad_norm": 0.3723476827144623, "learning_rate": 8.23328340137171e-05, "loss": 0.0399, "step": 3740 }, { "epoch": 2.9002320185614847, "grad_norm": 0.4731302261352539, "learning_rate": 8.182929197812769e-05, "loss": 0.048, "step": 3750 }, { "epoch": 2.907965970610982, "grad_norm": 0.45893344283103943, "learning_rate": 8.132622595341792e-05, "loss": 0.0447, "step": 3760 }, { "epoch": 2.9156999226604796, "grad_norm": 0.522000789642334, "learning_rate": 8.08236491182103e-05, "loss": 0.0467, "step": 3770 }, { "epoch": 2.9234338747099766, "grad_norm": 1.1093336343765259, "learning_rate": 8.032157463831216e-05, "loss": 0.0462, "step": 3780 }, { "epoch": 2.931167826759474, "grad_norm": 0.4075047969818115, "learning_rate": 7.982001566637092e-05, "loss": 0.0436, "step": 3790 }, { "epoch": 2.9389017788089715, "grad_norm": 0.38247767090797424, "learning_rate": 7.931898534152928e-05, "loss": 0.049, "step": 3800 }, { "epoch": 2.9466357308584685, "grad_norm": 0.359324187040329, "learning_rate": 7.881849678908132e-05, "loss": 0.0599, "step": 3810 }, { "epoch": 2.954369682907966, "grad_norm": 0.38413822650909424, "learning_rate": 7.831856312012855e-05, "loss": 0.044, "step": 3820 }, { "epoch": 2.9621036349574634, "grad_norm": 0.4635251760482788, "learning_rate": 7.781919743123624e-05, "loss": 0.0482, "step": 3830 }, { "epoch": 2.9698375870069604, "grad_norm": 0.7617440819740295, "learning_rate": 7.732041280409066e-05, "loss": 0.057, "step": 3840 }, { "epoch": 2.977571539056458, "grad_norm": 0.5643038749694824, "learning_rate": 7.682222230515622e-05, "loss": 0.0471, "step": 3850 }, { "epoch": 2.9853054911059553, "grad_norm": 0.6302655935287476, "learning_rate": 7.63246389853331e-05, "loss": 0.0584, "step": 3860 }, { "epoch": 2.9930394431554523, "grad_norm": 0.7743414640426636, "learning_rate": 7.582767587961552e-05, "loss": 0.0589, "step": 3870 }, { "epoch": 3.00077339520495, "grad_norm": 0.5334261655807495, "learning_rate": 7.533134600675024e-05, "loss": 0.0436, "step": 3880 }, { "epoch": 3.0085073472544472, "grad_norm": 0.6000996232032776, "learning_rate": 7.483566236889533e-05, "loss": 0.05, "step": 3890 }, { "epoch": 3.0162412993039442, "grad_norm": 0.3452172577381134, "learning_rate": 
7.43406379512798e-05, "loss": 0.0431, "step": 3900 }, { "epoch": 3.0239752513534417, "grad_norm": 0.5110192894935608, "learning_rate": 7.384628572186333e-05, "loss": 0.0428, "step": 3910 }, { "epoch": 3.0317092034029387, "grad_norm": 0.35554081201553345, "learning_rate": 7.335261863099651e-05, "loss": 0.0359, "step": 3920 }, { "epoch": 3.039443155452436, "grad_norm": 0.2638823986053467, "learning_rate": 7.285964961108163e-05, "loss": 0.0433, "step": 3930 }, { "epoch": 3.0471771075019336, "grad_norm": 0.38675743341445923, "learning_rate": 7.23673915762339e-05, "loss": 0.0511, "step": 3940 }, { "epoch": 3.0549110595514306, "grad_norm": 0.6245094537734985, "learning_rate": 7.187585742194311e-05, "loss": 0.0477, "step": 3950 }, { "epoch": 3.062645011600928, "grad_norm": 0.41278642416000366, "learning_rate": 7.138506002473591e-05, "loss": 0.0515, "step": 3960 }, { "epoch": 3.0703789636504255, "grad_norm": 0.4102723300457001, "learning_rate": 7.089501224183837e-05, "loss": 0.0498, "step": 3970 }, { "epoch": 3.0781129156999225, "grad_norm": 0.3685295283794403, "learning_rate": 7.040572691083913e-05, "loss": 0.037, "step": 3980 }, { "epoch": 3.08584686774942, "grad_norm": 0.3958451747894287, "learning_rate": 6.991721684935328e-05, "loss": 0.0392, "step": 3990 }, { "epoch": 3.0935808197989174, "grad_norm": 0.4311326742172241, "learning_rate": 6.942949485468651e-05, "loss": 0.0403, "step": 4000 }, { "epoch": 3.1013147718484144, "grad_norm": 0.3655391037464142, "learning_rate": 6.89425737034997e-05, "loss": 0.0485, "step": 4010 }, { "epoch": 3.109048723897912, "grad_norm": 0.34593647718429565, "learning_rate": 6.845646615147445e-05, "loss": 0.0459, "step": 4020 }, { "epoch": 3.1167826759474093, "grad_norm": 0.41606053709983826, "learning_rate": 6.797118493297885e-05, "loss": 0.0463, "step": 4030 }, { "epoch": 3.1245166279969063, "grad_norm": 0.5124359130859375, "learning_rate": 6.748674276073371e-05, "loss": 0.0455, "step": 4040 }, { "epoch": 3.1322505800464038, "grad_norm": 0.5297297239303589, "learning_rate": 6.700315232547981e-05, "loss": 0.0484, "step": 4050 }, { "epoch": 3.139984532095901, "grad_norm": 0.27737703919410706, "learning_rate": 6.652042629564528e-05, "loss": 0.0506, "step": 4060 }, { "epoch": 3.1477184841453982, "grad_norm": 0.4292052388191223, "learning_rate": 6.60385773170138e-05, "loss": 0.0496, "step": 4070 }, { "epoch": 3.1554524361948957, "grad_norm": 0.9094840884208679, "learning_rate": 6.555761801239313e-05, "loss": 0.0472, "step": 4080 }, { "epoch": 3.1631863882443927, "grad_norm": 0.5290389657020569, "learning_rate": 6.507756098128475e-05, "loss": 0.0467, "step": 4090 }, { "epoch": 3.17092034029389, "grad_norm": 0.40371954441070557, "learning_rate": 6.459841879955365e-05, "loss": 0.043, "step": 4100 }, { "epoch": 3.1786542923433876, "grad_norm": 0.49757513403892517, "learning_rate": 6.41202040190987e-05, "loss": 0.0405, "step": 4110 }, { "epoch": 3.1863882443928846, "grad_norm": 0.37258026003837585, "learning_rate": 6.364292916752414e-05, "loss": 0.0412, "step": 4120 }, { "epoch": 3.194122196442382, "grad_norm": 0.43795064091682434, "learning_rate": 6.31666067478113e-05, "loss": 0.0437, "step": 4130 }, { "epoch": 3.2018561484918795, "grad_norm": 0.47758275270462036, "learning_rate": 6.26912492379909e-05, "loss": 0.0456, "step": 4140 }, { "epoch": 3.2095901005413765, "grad_norm": 0.433479905128479, "learning_rate": 6.221686909081634e-05, "loss": 0.0388, "step": 4150 }, { "epoch": 3.217324052590874, "grad_norm": 0.36021220684051514, "learning_rate": 6.174347873343749e-05, 
"loss": 0.0404, "step": 4160 }, { "epoch": 3.2250580046403714, "grad_norm": 0.42296263575553894, "learning_rate": 6.127109056707504e-05, "loss": 0.0371, "step": 4170 }, { "epoch": 3.2327919566898684, "grad_norm": 0.664455771446228, "learning_rate": 6.0799716966695674e-05, "loss": 0.0444, "step": 4180 }, { "epoch": 3.240525908739366, "grad_norm": 0.3612815737724304, "learning_rate": 6.032937028068797e-05, "loss": 0.0356, "step": 4190 }, { "epoch": 3.2482598607888633, "grad_norm": 0.6047795414924622, "learning_rate": 5.986006283053866e-05, "loss": 0.0393, "step": 4200 }, { "epoch": 3.2559938128383603, "grad_norm": 0.40483081340789795, "learning_rate": 5.9391806910510185e-05, "loss": 0.0435, "step": 4210 }, { "epoch": 3.2637277648878578, "grad_norm": 0.6035226583480835, "learning_rate": 5.8924614787318446e-05, "loss": 0.0405, "step": 4220 }, { "epoch": 3.271461716937355, "grad_norm": 0.5182496309280396, "learning_rate": 5.845849869981137e-05, "loss": 0.034, "step": 4230 }, { "epoch": 3.279195668986852, "grad_norm": 0.39812296628952026, "learning_rate": 5.799347085864851e-05, "loss": 0.0438, "step": 4240 }, { "epoch": 3.2869296210363497, "grad_norm": 0.434983491897583, "learning_rate": 5.752954344598105e-05, "loss": 0.0433, "step": 4250 }, { "epoch": 3.2946635730858467, "grad_norm": 0.3555368185043335, "learning_rate": 5.706672861513262e-05, "loss": 0.0432, "step": 4260 }, { "epoch": 3.302397525135344, "grad_norm": 0.4039643108844757, "learning_rate": 5.6605038490280995e-05, "loss": 0.0379, "step": 4270 }, { "epoch": 3.3101314771848416, "grad_norm": 0.4258735477924347, "learning_rate": 5.614448516614059e-05, "loss": 0.0466, "step": 4280 }, { "epoch": 3.3178654292343386, "grad_norm": 0.4303387701511383, "learning_rate": 5.5685080707645265e-05, "loss": 0.0525, "step": 4290 }, { "epoch": 3.325599381283836, "grad_norm": 0.7440736293792725, "learning_rate": 5.522683714963275e-05, "loss": 0.0474, "step": 4300 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5000112056732178, "learning_rate": 5.476976649652892e-05, "loss": 0.0437, "step": 4310 }, { "epoch": 3.3410672853828305, "grad_norm": 0.4684523642063141, "learning_rate": 5.431388072203373e-05, "loss": 0.0466, "step": 4320 }, { "epoch": 3.348801237432328, "grad_norm": 0.5402469635009766, "learning_rate": 5.385919176880706e-05, "loss": 0.0482, "step": 4330 }, { "epoch": 3.3565351894818254, "grad_norm": 0.6256360411643982, "learning_rate": 5.340571154815641e-05, "loss": 0.0379, "step": 4340 }, { "epoch": 3.3642691415313224, "grad_norm": 0.4393569529056549, "learning_rate": 5.2953451939724454e-05, "loss": 0.0389, "step": 4350 }, { "epoch": 3.37200309358082, "grad_norm": 0.33346205949783325, "learning_rate": 5.250242479117795e-05, "loss": 0.0398, "step": 4360 }, { "epoch": 3.379737045630317, "grad_norm": 0.37940773367881775, "learning_rate": 5.205264191789741e-05, "loss": 0.0452, "step": 4370 }, { "epoch": 3.3874709976798143, "grad_norm": 0.38195016980171204, "learning_rate": 5.160411510266768e-05, "loss": 0.0456, "step": 4380 }, { "epoch": 3.3952049497293117, "grad_norm": 0.40058159828186035, "learning_rate": 5.115685609536889e-05, "loss": 0.0461, "step": 4390 }, { "epoch": 3.4029389017788088, "grad_norm": 0.41986632347106934, "learning_rate": 5.071087661266918e-05, "loss": 0.0458, "step": 4400 }, { "epoch": 3.410672853828306, "grad_norm": 0.39719370007514954, "learning_rate": 5.02661883377173e-05, "loss": 0.0398, "step": 4410 }, { "epoch": 3.4184068058778037, "grad_norm": 0.4247942566871643, "learning_rate": 4.982280291983683e-05, "loss": 0.0371, 
"step": 4420 }, { "epoch": 3.4261407579273007, "grad_norm": 0.4117302894592285, "learning_rate": 4.938073197422084e-05, "loss": 0.039, "step": 4430 }, { "epoch": 3.433874709976798, "grad_norm": 0.5301624536514282, "learning_rate": 4.89399870816278e-05, "loss": 0.0403, "step": 4440 }, { "epoch": 3.4416086620262956, "grad_norm": 0.4850146770477295, "learning_rate": 4.850057978807803e-05, "loss": 0.0468, "step": 4450 }, { "epoch": 3.4493426140757926, "grad_norm": 0.48304876685142517, "learning_rate": 4.806252160455125e-05, "loss": 0.0498, "step": 4460 }, { "epoch": 3.45707656612529, "grad_norm": 0.4879327714443207, "learning_rate": 4.7625824006685136e-05, "loss": 0.0499, "step": 4470 }, { "epoch": 3.4648105181747875, "grad_norm": 0.2993864119052887, "learning_rate": 4.71904984344746e-05, "loss": 0.0355, "step": 4480 }, { "epoch": 3.4725444702242845, "grad_norm": 0.440640926361084, "learning_rate": 4.675655629197222e-05, "loss": 0.0452, "step": 4490 }, { "epoch": 3.480278422273782, "grad_norm": 0.3782712519168854, "learning_rate": 4.6324008946989314e-05, "loss": 0.049, "step": 4500 }, { "epoch": 3.4880123743232794, "grad_norm": 0.6026772856712341, "learning_rate": 4.589286773079828e-05, "loss": 0.0373, "step": 4510 }, { "epoch": 3.4957463263727764, "grad_norm": 0.6759253144264221, "learning_rate": 4.546314393783567e-05, "loss": 0.045, "step": 4520 }, { "epoch": 3.503480278422274, "grad_norm": 0.34079480171203613, "learning_rate": 4.5034848825406505e-05, "loss": 0.0396, "step": 4530 }, { "epoch": 3.5112142304717713, "grad_norm": 0.5628577470779419, "learning_rate": 4.4607993613388976e-05, "loss": 0.0358, "step": 4540 }, { "epoch": 3.5189481825212683, "grad_norm": 0.40260758996009827, "learning_rate": 4.4182589483941015e-05, "loss": 0.0382, "step": 4550 }, { "epoch": 3.5266821345707657, "grad_norm": 0.3873592019081116, "learning_rate": 4.375864758120696e-05, "loss": 0.0446, "step": 4560 }, { "epoch": 3.534416086620263, "grad_norm": 0.4504919946193695, "learning_rate": 4.333617901102591e-05, "loss": 0.0461, "step": 4570 }, { "epoch": 3.54215003866976, "grad_norm": 0.4575732946395874, "learning_rate": 4.2915194840640426e-05, "loss": 0.0456, "step": 4580 }, { "epoch": 3.5498839907192576, "grad_norm": 0.41531655192375183, "learning_rate": 4.2495706098407085e-05, "loss": 0.0362, "step": 4590 }, { "epoch": 3.557617942768755, "grad_norm": 0.4861404001712799, "learning_rate": 4.207772377350717e-05, "loss": 0.0442, "step": 4600 }, { "epoch": 3.565351894818252, "grad_norm": 0.2354011982679367, "learning_rate": 4.166125881565899e-05, "loss": 0.0495, "step": 4610 }, { "epoch": 3.5730858468677495, "grad_norm": 0.4333617687225342, "learning_rate": 4.124632213483093e-05, "loss": 0.0402, "step": 4620 }, { "epoch": 3.5808197989172466, "grad_norm": 0.42827537655830383, "learning_rate": 4.083292460095587e-05, "loss": 0.0358, "step": 4630 }, { "epoch": 3.588553750966744, "grad_norm": 0.4020684063434601, "learning_rate": 4.0421077043646e-05, "loss": 0.0395, "step": 4640 }, { "epoch": 3.5962877030162415, "grad_norm": 0.2955048680305481, "learning_rate": 4.0010790251909624e-05, "loss": 0.0426, "step": 4650 }, { "epoch": 3.6040216550657385, "grad_norm": 0.6455952525138855, "learning_rate": 3.960207497386818e-05, "loss": 0.05, "step": 4660 }, { "epoch": 3.611755607115236, "grad_norm": 0.27891743183135986, "learning_rate": 3.9194941916474746e-05, "loss": 0.0478, "step": 4670 }, { "epoch": 3.619489559164733, "grad_norm": 0.26489436626434326, "learning_rate": 3.878940174523371e-05, "loss": 0.0373, "step": 4680 }, { 
"epoch": 3.6272235112142304, "grad_norm": 0.4396156966686249, "learning_rate": 3.8385465083921136e-05, "loss": 0.0466, "step": 4690 }, { "epoch": 3.634957463263728, "grad_norm": 0.6993714570999146, "learning_rate": 3.7983142514306615e-05, "loss": 0.0415, "step": 4700 }, { "epoch": 3.642691415313225, "grad_norm": 0.4104761779308319, "learning_rate": 3.7582444575875964e-05, "loss": 0.0376, "step": 4710 }, { "epoch": 3.6504253673627223, "grad_norm": 0.43550264835357666, "learning_rate": 3.7183381765555325e-05, "loss": 0.0391, "step": 4720 }, { "epoch": 3.6581593194122197, "grad_norm": 0.34116363525390625, "learning_rate": 3.678596453743579e-05, "loss": 0.0336, "step": 4730 }, { "epoch": 3.6658932714617167, "grad_norm": 0.4384767711162567, "learning_rate": 3.6390203302500034e-05, "loss": 0.0382, "step": 4740 }, { "epoch": 3.673627223511214, "grad_norm": 0.39377689361572266, "learning_rate": 3.599610842834917e-05, "loss": 0.0334, "step": 4750 }, { "epoch": 3.6813611755607116, "grad_norm": 0.44471830129623413, "learning_rate": 3.560369023893138e-05, "loss": 0.0385, "step": 4760 }, { "epoch": 3.6890951276102086, "grad_norm": 0.5463324189186096, "learning_rate": 3.521295901427132e-05, "loss": 0.0447, "step": 4770 }, { "epoch": 3.696829079659706, "grad_norm": 0.36133643984794617, "learning_rate": 3.4823924990201074e-05, "loss": 0.0439, "step": 4780 }, { "epoch": 3.7045630317092035, "grad_norm": 0.5574067831039429, "learning_rate": 3.443659835809158e-05, "loss": 0.0434, "step": 4790 }, { "epoch": 3.7122969837587005, "grad_norm": 0.4034649729728699, "learning_rate": 3.4050989264586096e-05, "loss": 0.0369, "step": 4800 }, { "epoch": 3.720030935808198, "grad_norm": 0.4505595862865448, "learning_rate": 3.366710781133411e-05, "loss": 0.04, "step": 4810 }, { "epoch": 3.7277648878576954, "grad_norm": 0.3705731928348541, "learning_rate": 3.32849640547269e-05, "loss": 0.04, "step": 4820 }, { "epoch": 3.7354988399071924, "grad_norm": 0.3757650852203369, "learning_rate": 3.290456800563378e-05, "loss": 0.0466, "step": 4830 }, { "epoch": 3.74323279195669, "grad_norm": 0.5580532550811768, "learning_rate": 3.2525929629140294e-05, "loss": 0.0453, "step": 4840 }, { "epoch": 3.7509667440061873, "grad_norm": 0.5846080780029297, "learning_rate": 3.21490588442868e-05, "loss": 0.039, "step": 4850 }, { "epoch": 3.7587006960556844, "grad_norm": 0.7010289430618286, "learning_rate": 3.1773965523808754e-05, "loss": 0.0432, "step": 4860 }, { "epoch": 3.766434648105182, "grad_norm": 0.27942541241645813, "learning_rate": 3.1400659493878105e-05, "loss": 0.0483, "step": 4870 }, { "epoch": 3.7741686001546793, "grad_norm": 0.965315043926239, "learning_rate": 3.1029150533845884e-05, "loss": 0.0407, "step": 4880 }, { "epoch": 3.7819025522041763, "grad_norm": 0.518121600151062, "learning_rate": 3.065944837598596e-05, "loss": 0.0382, "step": 4890 }, { "epoch": 3.7896365042536737, "grad_norm": 0.6056118011474609, "learning_rate": 3.0291562705240105e-05, "loss": 0.0348, "step": 4900 }, { "epoch": 3.797370456303171, "grad_norm": 0.6434788107872009, "learning_rate": 2.9925503158964298e-05, "loss": 0.0364, "step": 4910 }, { "epoch": 3.805104408352668, "grad_norm": 0.3909451961517334, "learning_rate": 2.956127932667625e-05, "loss": 0.035, "step": 4920 }, { "epoch": 3.8128383604021656, "grad_norm": 0.29201436042785645, "learning_rate": 2.9198900749804247e-05, "loss": 0.04, "step": 4930 }, { "epoch": 3.820572312451663, "grad_norm": 0.34561288356781006, "learning_rate": 2.8838376921437103e-05, "loss": 0.0371, "step": 4940 }, { "epoch": 
3.82830626450116, "grad_norm": 0.5528729557991028, "learning_rate": 2.8479717286075502e-05, "loss": 0.0452, "step": 4950 }, { "epoch": 3.8360402165506575, "grad_norm": 0.5122141242027283, "learning_rate": 2.81229312393846e-05, "loss": 0.0391, "step": 4960 }, { "epoch": 3.8437741686001545, "grad_norm": 0.5706027150154114, "learning_rate": 2.7768028127947975e-05, "loss": 0.041, "step": 4970 }, { "epoch": 3.851508120649652, "grad_norm": 0.6069139838218689, "learning_rate": 2.7415017249022524e-05, "loss": 0.0372, "step": 4980 }, { "epoch": 3.8592420726991494, "grad_norm": 0.5668622851371765, "learning_rate": 2.7063907850295257e-05, "loss": 0.0451, "step": 4990 }, { "epoch": 3.8669760247486464, "grad_norm": 0.5618468523025513, "learning_rate": 2.6714709129640735e-05, "loss": 0.0385, "step": 5000 }, { "epoch": 3.874709976798144, "grad_norm": 0.7862294316291809, "learning_rate": 2.6367430234880284e-05, "loss": 0.0387, "step": 5010 }, { "epoch": 3.882443928847641, "grad_norm": 0.3596813976764679, "learning_rate": 2.6022080263542272e-05, "loss": 0.0368, "step": 5020 }, { "epoch": 3.8901778808971383, "grad_norm": 0.3073577582836151, "learning_rate": 2.567866826262393e-05, "loss": 0.0384, "step": 5030 }, { "epoch": 3.897911832946636, "grad_norm": 0.46745383739471436, "learning_rate": 2.5337203228354035e-05, "loss": 0.036, "step": 5040 }, { "epoch": 3.905645784996133, "grad_norm": 0.24336379766464233, "learning_rate": 2.499769410595767e-05, "loss": 0.0407, "step": 5050 }, { "epoch": 3.9133797370456302, "grad_norm": 0.38927024602890015, "learning_rate": 2.4660149789421495e-05, "loss": 0.0351, "step": 5060 }, { "epoch": 3.9211136890951277, "grad_norm": 0.665361225605011, "learning_rate": 2.4324579121261047e-05, "loss": 0.0384, "step": 5070 }, { "epoch": 3.9288476411446247, "grad_norm": 0.48273828625679016, "learning_rate": 2.3990990892288912e-05, "loss": 0.042, "step": 5080 }, { "epoch": 3.936581593194122, "grad_norm": 0.4618639051914215, "learning_rate": 2.365939384138449e-05, "loss": 0.0424, "step": 5090 }, { "epoch": 3.9443155452436196, "grad_norm": 0.48375582695007324, "learning_rate": 2.3329796655265102e-05, "loss": 0.045, "step": 5100 }, { "epoch": 3.9520494972931166, "grad_norm": 0.36676934361457825, "learning_rate": 2.3002207968258348e-05, "loss": 0.0449, "step": 5110 }, { "epoch": 3.959783449342614, "grad_norm": 0.47094133496284485, "learning_rate": 2.2676636362076076e-05, "loss": 0.0366, "step": 5120 }, { "epoch": 3.9675174013921115, "grad_norm": 0.2710976302623749, "learning_rate": 2.2353090365589348e-05, "loss": 0.0344, "step": 5130 }, { "epoch": 3.9752513534416085, "grad_norm": 0.47731488943099976, "learning_rate": 2.2031578454605196e-05, "loss": 0.0377, "step": 5140 }, { "epoch": 3.982985305491106, "grad_norm": 0.33124613761901855, "learning_rate": 2.1712109051644435e-05, "loss": 0.0355, "step": 5150 }, { "epoch": 3.9907192575406034, "grad_norm": 0.3584497272968292, "learning_rate": 2.139469052572127e-05, "loss": 0.0373, "step": 5160 }, { "epoch": 3.9984532095901004, "grad_norm": 0.501616358757019, "learning_rate": 2.1079331192123652e-05, "loss": 0.0345, "step": 5170 }, { "epoch": 4.006187161639597, "grad_norm": 0.26638418436050415, "learning_rate": 2.0766039312195894e-05, "loss": 0.0392, "step": 5180 }, { "epoch": 4.013921113689095, "grad_norm": 0.40235665440559387, "learning_rate": 2.0454823093121924e-05, "loss": 0.0364, "step": 5190 }, { "epoch": 4.021655065738592, "grad_norm": 0.47847551107406616, "learning_rate": 2.014569068771043e-05, "loss": 0.0391, "step": 5200 }, { "epoch": 
4.029389017788089, "grad_norm": 0.8141834139823914, "learning_rate": 1.983865019418122e-05, "loss": 0.0347, "step": 5210 }, { "epoch": 4.037122969837587, "grad_norm": 0.6128751635551453, "learning_rate": 1.9533709655953235e-05, "loss": 0.0363, "step": 5220 }, { "epoch": 4.044856921887084, "grad_norm": 0.4189740717411041, "learning_rate": 1.9230877061433507e-05, "loss": 0.0399, "step": 5230 }, { "epoch": 4.052590873936581, "grad_norm": 0.6825684905052185, "learning_rate": 1.893016034380829e-05, "loss": 0.0352, "step": 5240 }, { "epoch": 4.060324825986079, "grad_norm": 0.2782864570617676, "learning_rate": 1.8631567380834957e-05, "loss": 0.0278, "step": 5250 }, { "epoch": 4.068058778035576, "grad_norm": 0.36507630348205566, "learning_rate": 1.8335105994635716e-05, "loss": 0.044, "step": 5260 }, { "epoch": 4.075792730085073, "grad_norm": 0.49042633175849915, "learning_rate": 1.804078395149269e-05, "loss": 0.0364, "step": 5270 }, { "epoch": 4.083526682134571, "grad_norm": 0.6503674983978271, "learning_rate": 1.774860896164454e-05, "loss": 0.0386, "step": 5280 }, { "epoch": 4.091260634184068, "grad_norm": 0.42561471462249756, "learning_rate": 1.7458588679084342e-05, "loss": 0.036, "step": 5290 }, { "epoch": 4.098994586233565, "grad_norm": 0.45733729004859924, "learning_rate": 1.7170730701359205e-05, "loss": 0.0442, "step": 5300 }, { "epoch": 4.106728538283063, "grad_norm": 0.4612604081630707, "learning_rate": 1.6885042569371146e-05, "loss": 0.0487, "step": 5310 }, { "epoch": 4.11446249033256, "grad_norm": 0.6571356058120728, "learning_rate": 1.6601531767179602e-05, "loss": 0.0383, "step": 5320 }, { "epoch": 4.122196442382057, "grad_norm": 0.487978458404541, "learning_rate": 1.6320205721805405e-05, "loss": 0.0432, "step": 5330 }, { "epoch": 4.129930394431555, "grad_norm": 0.309918612241745, "learning_rate": 1.60410718030361e-05, "loss": 0.0362, "step": 5340 }, { "epoch": 4.137664346481052, "grad_norm": 0.4701076149940491, "learning_rate": 1.5764137323233053e-05, "loss": 0.0323, "step": 5350 }, { "epoch": 4.145398298530549, "grad_norm": 0.33448338508605957, "learning_rate": 1.5489409537139654e-05, "loss": 0.0452, "step": 5360 }, { "epoch": 4.153132250580047, "grad_norm": 0.6301515698432922, "learning_rate": 1.5216895641691542e-05, "loss": 0.0324, "step": 5370 }, { "epoch": 4.160866202629544, "grad_norm": 0.5176683664321899, "learning_rate": 1.4946602775827857e-05, "loss": 0.037, "step": 5380 }, { "epoch": 4.168600154679041, "grad_norm": 0.486626535654068, "learning_rate": 1.4678538020304322e-05, "loss": 0.0366, "step": 5390 }, { "epoch": 4.176334106728539, "grad_norm": 0.37586092948913574, "learning_rate": 1.4412708397507724e-05, "loss": 0.0375, "step": 5400 }, { "epoch": 4.184068058778036, "grad_norm": 0.30214399099349976, "learning_rate": 1.4149120871272026e-05, "loss": 0.0347, "step": 5410 }, { "epoch": 4.191802010827533, "grad_norm": 0.3565197288990021, "learning_rate": 1.388778234669571e-05, "loss": 0.0427, "step": 5420 }, { "epoch": 4.199535962877031, "grad_norm": 0.4674254357814789, "learning_rate": 1.3628699669961243e-05, "loss": 0.0389, "step": 5430 }, { "epoch": 4.207269914926528, "grad_norm": 0.45576655864715576, "learning_rate": 1.3371879628155392e-05, "loss": 0.0437, "step": 5440 }, { "epoch": 4.215003866976025, "grad_norm": 0.40200021862983704, "learning_rate": 1.3117328949091634e-05, "loss": 0.0375, "step": 5450 }, { "epoch": 4.222737819025522, "grad_norm": 0.39354407787323, "learning_rate": 1.2865054301133805e-05, "loss": 0.0313, "step": 5460 }, { "epoch": 4.2304717710750195, 
"grad_norm": 0.5385573506355286, "learning_rate": 1.2615062293021507e-05, "loss": 0.0363, "step": 5470 }, { "epoch": 4.2382057231245165, "grad_norm": 0.43671444058418274, "learning_rate": 1.2367359473696883e-05, "loss": 0.0367, "step": 5480 }, { "epoch": 4.2459396751740135, "grad_norm": 0.43923044204711914, "learning_rate": 1.2121952332133091e-05, "loss": 0.0295, "step": 5490 }, { "epoch": 4.253673627223511, "grad_norm": 0.9461666941642761, "learning_rate": 1.1878847297164365e-05, "loss": 0.0327, "step": 5500 }, { "epoch": 4.261407579273008, "grad_norm": 0.697158932685852, "learning_rate": 1.1638050737317496e-05, "loss": 0.043, "step": 5510 }, { "epoch": 4.269141531322505, "grad_norm": 0.3874329924583435, "learning_rate": 1.1399568960645135e-05, "loss": 0.0347, "step": 5520 }, { "epoch": 4.276875483372003, "grad_norm": 0.38157331943511963, "learning_rate": 1.1163408214560434e-05, "loss": 0.035, "step": 5530 }, { "epoch": 4.2846094354215, "grad_norm": 0.3770166039466858, "learning_rate": 1.0929574685673405e-05, "loss": 0.0359, "step": 5540 }, { "epoch": 4.292343387470997, "grad_norm": 0.27722567319869995, "learning_rate": 1.0698074499628885e-05, "loss": 0.0315, "step": 5550 }, { "epoch": 4.300077339520495, "grad_norm": 0.4084804356098175, "learning_rate": 1.0468913720946084e-05, "loss": 0.0382, "step": 5560 }, { "epoch": 4.307811291569992, "grad_norm": 0.3033837378025055, "learning_rate": 1.0242098352859587e-05, "loss": 0.0397, "step": 5570 }, { "epoch": 4.315545243619489, "grad_norm": 1.1259981393814087, "learning_rate": 1.0017634337162275e-05, "loss": 0.037, "step": 5580 }, { "epoch": 4.323279195668987, "grad_norm": 0.43768250942230225, "learning_rate": 9.795527554049511e-06, "loss": 0.032, "step": 5590 }, { "epoch": 4.331013147718484, "grad_norm": 0.39885565638542175, "learning_rate": 9.575783821965257e-06, "loss": 0.0381, "step": 5600 }, { "epoch": 4.338747099767981, "grad_norm": 0.3133554756641388, "learning_rate": 9.3584088974494e-06, "loss": 0.0373, "step": 5610 }, { "epoch": 4.346481051817479, "grad_norm": 0.3405650556087494, "learning_rate": 9.143408474987281e-06, "loss": 0.0431, "step": 5620 }, { "epoch": 4.354215003866976, "grad_norm": 0.2051086276769638, "learning_rate": 8.930788186860228e-06, "loss": 0.0303, "step": 5630 }, { "epoch": 4.361948955916473, "grad_norm": 0.3833163380622864, "learning_rate": 8.720553602998172e-06, "loss": 0.0395, "step": 5640 }, { "epoch": 4.369682907965971, "grad_norm": 0.48563048243522644, "learning_rate": 8.512710230833688e-06, "loss": 0.0428, "step": 5650 }, { "epoch": 4.377416860015468, "grad_norm": 0.4406111240386963, "learning_rate": 8.307263515157737e-06, "loss": 0.0317, "step": 5660 }, { "epoch": 4.385150812064965, "grad_norm": 0.7994678616523743, "learning_rate": 8.10421883797694e-06, "loss": 0.0349, "step": 5670 }, { "epoch": 4.392884764114463, "grad_norm": 0.3733707368373871, "learning_rate": 7.90358151837277e-06, "loss": 0.0481, "step": 5680 }, { "epoch": 4.40061871616396, "grad_norm": 0.5773397088050842, "learning_rate": 7.705356812362019e-06, "loss": 0.0386, "step": 5690 }, { "epoch": 4.408352668213457, "grad_norm": 0.3579750061035156, "learning_rate": 7.509549912759228e-06, "loss": 0.0406, "step": 5700 }, { "epoch": 4.416086620262955, "grad_norm": 0.31784024834632874, "learning_rate": 7.316165949040599e-06, "loss": 0.0437, "step": 5710 }, { "epoch": 4.423820572312452, "grad_norm": 0.35957804322242737, "learning_rate": 7.1252099872096575e-06, "loss": 0.029, "step": 5720 }, { "epoch": 4.431554524361949, "grad_norm": 0.33093947172164917, 
"learning_rate": 6.936687029664502e-06, "loss": 0.0351, "step": 5730 }, { "epoch": 4.439288476411447, "grad_norm": 0.42664435505867004, "learning_rate": 6.750602015066776e-06, "loss": 0.0323, "step": 5740 }, { "epoch": 4.447022428460944, "grad_norm": 0.38652506470680237, "learning_rate": 6.566959818212326e-06, "loss": 0.0331, "step": 5750 }, { "epoch": 4.454756380510441, "grad_norm": 0.4349561929702759, "learning_rate": 6.3857652499033974e-06, "loss": 0.0319, "step": 5760 }, { "epoch": 4.4624903325599385, "grad_norm": 0.4477129578590393, "learning_rate": 6.2070230568227365e-06, "loss": 0.0332, "step": 5770 }, { "epoch": 4.4702242846094355, "grad_norm": 0.5434504747390747, "learning_rate": 6.030737921409169e-06, "loss": 0.0338, "step": 5780 }, { "epoch": 4.477958236658933, "grad_norm": 0.72066330909729, "learning_rate": 5.856914461734919e-06, "loss": 0.0345, "step": 5790 }, { "epoch": 4.4856921887084305, "grad_norm": 0.3643154501914978, "learning_rate": 5.685557231384675e-06, "loss": 0.0375, "step": 5800 }, { "epoch": 4.4934261407579275, "grad_norm": 0.3324568271636963, "learning_rate": 5.516670719336337e-06, "loss": 0.0372, "step": 5810 }, { "epoch": 4.5011600928074245, "grad_norm": 0.35616499185562134, "learning_rate": 5.350259349843278e-06, "loss": 0.0369, "step": 5820 }, { "epoch": 4.5088940448569215, "grad_norm": 1.1673767566680908, "learning_rate": 5.186327482318609e-06, "loss": 0.0319, "step": 5830 }, { "epoch": 4.516627996906419, "grad_norm": 0.33102595806121826, "learning_rate": 5.024879411220884e-06, "loss": 0.0376, "step": 5840 }, { "epoch": 4.524361948955916, "grad_norm": 0.3244016170501709, "learning_rate": 4.865919365941629e-06, "loss": 0.0326, "step": 5850 }, { "epoch": 4.532095901005414, "grad_norm": 0.46505865454673767, "learning_rate": 4.709451510694496e-06, "loss": 0.0378, "step": 5860 }, { "epoch": 4.539829853054911, "grad_norm": 0.2924868166446686, "learning_rate": 4.555479944406283e-06, "loss": 0.0296, "step": 5870 }, { "epoch": 4.547563805104408, "grad_norm": 0.3826541006565094, "learning_rate": 4.40400870060943e-06, "loss": 0.0362, "step": 5880 }, { "epoch": 4.555297757153905, "grad_norm": 0.4856719672679901, "learning_rate": 4.255041747336452e-06, "loss": 0.0371, "step": 5890 }, { "epoch": 4.563031709203403, "grad_norm": 0.29238465428352356, "learning_rate": 4.1085829870159184e-06, "loss": 0.0341, "step": 5900 }, { "epoch": 4.5707656612529, "grad_norm": 0.3634920120239258, "learning_rate": 3.964636256370302e-06, "loss": 0.0396, "step": 5910 }, { "epoch": 4.578499613302397, "grad_norm": 0.6764735579490662, "learning_rate": 3.823205326315394e-06, "loss": 0.0355, "step": 5920 }, { "epoch": 4.586233565351895, "grad_norm": 0.33399373292922974, "learning_rate": 3.6842939018615352e-06, "loss": 0.0352, "step": 5930 }, { "epoch": 4.593967517401392, "grad_norm": 0.797825038433075, "learning_rate": 3.547905622016601e-06, "loss": 0.0427, "step": 5940 }, { "epoch": 4.601701469450889, "grad_norm": 0.34758442640304565, "learning_rate": 3.414044059690602e-06, "loss": 0.0291, "step": 5950 }, { "epoch": 4.609435421500387, "grad_norm": 0.6090482473373413, "learning_rate": 3.282712721602199e-06, "loss": 0.0388, "step": 5960 }, { "epoch": 4.617169373549884, "grad_norm": 0.2727656662464142, "learning_rate": 3.1539150481866843e-06, "loss": 0.0402, "step": 5970 }, { "epoch": 4.624903325599381, "grad_norm": 0.49124860763549805, "learning_rate": 3.027654413505976e-06, "loss": 0.0315, "step": 5980 }, { "epoch": 4.632637277648879, "grad_norm": 0.38370001316070557, "learning_rate": 
2.9039341251601683e-06, "loss": 0.0404, "step": 5990 }, { "epoch": 4.640371229698376, "grad_norm": 0.34134286642074585, "learning_rate": 2.7827574242009437e-06, "loss": 0.0361, "step": 6000 }, { "epoch": 4.648105181747873, "grad_norm": 0.39743635058403015, "learning_rate": 2.6641274850465746e-06, "loss": 0.0362, "step": 6010 }, { "epoch": 4.655839133797371, "grad_norm": 0.5696566700935364, "learning_rate": 2.5480474153988464e-06, "loss": 0.0336, "step": 6020 }, { "epoch": 4.663573085846868, "grad_norm": 0.2874321937561035, "learning_rate": 2.434520256161632e-06, "loss": 0.0337, "step": 6030 }, { "epoch": 4.671307037896365, "grad_norm": 0.4226866066455841, "learning_rate": 2.3235489813611676e-06, "loss": 0.0433, "step": 6040 }, { "epoch": 4.679040989945863, "grad_norm": 0.4023924171924591, "learning_rate": 2.2151364980682376e-06, "loss": 0.0328, "step": 6050 }, { "epoch": 4.68677494199536, "grad_norm": 0.4275440573692322, "learning_rate": 2.109285646321979e-06, "loss": 0.033, "step": 6060 }, { "epoch": 4.694508894044857, "grad_norm": 0.4322325885295868, "learning_rate": 2.0059991990554083e-06, "loss": 0.0337, "step": 6070 }, { "epoch": 4.702242846094354, "grad_norm": 0.4136297106742859, "learning_rate": 1.9052798620229351e-06, "loss": 0.0322, "step": 6080 }, { "epoch": 4.709976798143852, "grad_norm": 0.6176156401634216, "learning_rate": 1.8071302737293295e-06, "loss": 0.0336, "step": 6090 }, { "epoch": 4.717710750193349, "grad_norm": 0.66657555103302, "learning_rate": 1.711553005360711e-06, "loss": 0.035, "step": 6100 }, { "epoch": 4.7254447022428465, "grad_norm": 0.568735659122467, "learning_rate": 1.6185505607171026e-06, "loss": 0.0399, "step": 6110 }, { "epoch": 4.7331786542923435, "grad_norm": 0.4238165318965912, "learning_rate": 1.5281253761469161e-06, "loss": 0.035, "step": 6120 }, { "epoch": 4.7409126063418405, "grad_norm": 0.4959665536880493, "learning_rate": 1.4402798204831036e-06, "loss": 0.0425, "step": 6130 }, { "epoch": 4.7486465583913375, "grad_norm": 0.6961854100227356, "learning_rate": 1.3550161949810514e-06, "loss": 0.0442, "step": 6140 }, { "epoch": 4.756380510440835, "grad_norm": 0.3563513159751892, "learning_rate": 1.2723367332583946e-06, "loss": 0.0351, "step": 6150 }, { "epoch": 4.764114462490332, "grad_norm": 0.4680556356906891, "learning_rate": 1.192243601236409e-06, "loss": 0.0358, "step": 6160 }, { "epoch": 4.77184841453983, "grad_norm": 0.29293152689933777, "learning_rate": 1.1147388970833227e-06, "loss": 0.0347, "step": 6170 }, { "epoch": 4.779582366589327, "grad_norm": 0.47611597180366516, "learning_rate": 1.0398246511593268e-06, "loss": 0.0391, "step": 6180 }, { "epoch": 4.787316318638824, "grad_norm": 0.6561389565467834, "learning_rate": 9.6750282596344e-07, "loss": 0.0356, "step": 6190 }, { "epoch": 4.795050270688321, "grad_norm": 0.48020797967910767, "learning_rate": 8.977753160819835e-07, "loss": 0.0409, "step": 6200 }, { "epoch": 4.802784222737819, "grad_norm": 0.6973733305931091, "learning_rate": 8.30643948139087e-07, "loss": 0.0406, "step": 6210 }, { "epoch": 4.810518174787316, "grad_norm": 0.3586060702800751, "learning_rate": 7.661104807487607e-07, "loss": 0.0469, "step": 6220 }, { "epoch": 4.818252126836813, "grad_norm": 0.5961159467697144, "learning_rate": 7.041766044688091e-07, "loss": 0.0346, "step": 6230 }, { "epoch": 4.825986078886311, "grad_norm": 0.2577856183052063, "learning_rate": 6.448439417565788e-07, "loss": 0.0364, "step": 6240 }, { "epoch": 4.833720030935808, "grad_norm": 0.8134392499923706, "learning_rate": 5.881140469265023e-07, "loss": 
0.0335, "step": 6250 }, { "epoch": 4.841453982985305, "grad_norm": 0.4075404107570648, "learning_rate": 5.339884061092427e-07, "loss": 0.0407, "step": 6260 }, { "epoch": 4.849187935034803, "grad_norm": 0.3454798460006714, "learning_rate": 4.82468437212913e-07, "loss": 0.0341, "step": 6270 }, { "epoch": 4.8569218870843, "grad_norm": 0.44289132952690125, "learning_rate": 4.335554898858396e-07, "loss": 0.0389, "step": 6280 }, { "epoch": 4.864655839133797, "grad_norm": 0.2241939902305603, "learning_rate": 3.8725084548122406e-07, "loss": 0.033, "step": 6290 }, { "epoch": 4.872389791183295, "grad_norm": 0.43803855776786804, "learning_rate": 3.435557170236026e-07, "loss": 0.0389, "step": 6300 }, { "epoch": 4.880123743232792, "grad_norm": 0.41212889552116394, "learning_rate": 3.0247124917703874e-07, "loss": 0.0378, "step": 6310 }, { "epoch": 4.887857695282289, "grad_norm": 0.6957119703292847, "learning_rate": 2.639985182151583e-07, "loss": 0.0338, "step": 6320 }, { "epoch": 4.895591647331787, "grad_norm": 0.41441500186920166, "learning_rate": 2.2813853199292746e-07, "loss": 0.0392, "step": 6330 }, { "epoch": 4.903325599381284, "grad_norm": 0.2402866631746292, "learning_rate": 1.948922299202849e-07, "loss": 0.0379, "step": 6340 }, { "epoch": 4.911059551430781, "grad_norm": 0.2667694389820099, "learning_rate": 1.6426048293750606e-07, "loss": 0.0379, "step": 6350 }, { "epoch": 4.918793503480279, "grad_norm": 0.41275689005851746, "learning_rate": 1.3624409349239918e-07, "loss": 0.0301, "step": 6360 }, { "epoch": 4.926527455529776, "grad_norm": 0.3666990101337433, "learning_rate": 1.108437955192887e-07, "loss": 0.0319, "step": 6370 }, { "epoch": 4.934261407579273, "grad_norm": 0.9078080654144287, "learning_rate": 8.806025441975286e-08, "loss": 0.041, "step": 6380 }, { "epoch": 4.941995359628771, "grad_norm": 0.6370543837547302, "learning_rate": 6.789406704527102e-08, "loss": 0.0398, "step": 6390 }, { "epoch": 4.949729311678268, "grad_norm": 0.40755051374435425, "learning_rate": 5.0345761681491746e-08, "loss": 0.044, "step": 6400 }, { "epoch": 4.957463263727765, "grad_norm": 0.5913822650909424, "learning_rate": 3.541579803445494e-08, "loss": 0.0367, "step": 6410 }, { "epoch": 4.965197215777263, "grad_norm": 0.47930198907852173, "learning_rate": 2.3104567218545924e-08, "loss": 0.0351, "step": 6420 }, { "epoch": 4.97293116782676, "grad_norm": 0.3852298855781555, "learning_rate": 1.3412391746225884e-08, "loss": 0.0332, "step": 6430 }, { "epoch": 4.980665119876257, "grad_norm": 0.2181631624698639, "learning_rate": 6.3395255195941585e-09, "loss": 0.0355, "step": 6440 }, { "epoch": 4.988399071925754, "grad_norm": 0.3657686710357666, "learning_rate": 1.886153823749126e-09, "loss": 0.0364, "step": 6450 }, { "epoch": 4.9961330239752515, "grad_norm": 0.36002346873283386, "learning_rate": 5.2393321903210224e-11, "loss": 0.0336, "step": 6460 }, { "epoch": 4.997679814385151, "step": 6462, "total_flos": 2.239328575228873e+17, "train_loss": 0.06088957418689083, "train_runtime": 2866.8783, "train_samples_per_second": 36.064, "train_steps_per_second": 2.254 } ], "logging_steps": 10, "max_steps": 6462, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.239328575228873e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }