diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,73876 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.13647698934482, + "eval_steps": 187, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001813647698934482, + "grad_norm": 0.05615234375, + "learning_rate": 0.0002, + "loss": 1.5015, + "step": 1 + }, + { + "epoch": 0.001813647698934482, + "eval_loss": 1.7367299795150757, + "eval_runtime": 151.0712, + "eval_samples_per_second": 6.619, + "eval_steps_per_second": 6.619, + "step": 1 + }, + { + "epoch": 0.001813647698934482, + "mmlu_eval_accuracy": 0.3344444779104109, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.375, + "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, + "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, + "mmlu_eval_accuracy_econometrics": 0.0, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.5, + "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.4666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.391304347826087, + "mmlu_eval_accuracy_high_school_us_history": 0.5, + "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, + "mmlu_eval_accuracy_human_aging": 0.5217391304347826, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.23076923076923078, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, + "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.42424242424242425, + "mmlu_eval_accuracy_philosophy": 0.47058823529411764, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.5185185185185185, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.47368421052631576, + "mmlu_loss": 2.989393788252992, + "step": 1 + }, + { + "epoch": 0.003627295397868964, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.4853, + "step": 2 + }, + { + "epoch": 0.005440943096803446, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.454, + "step": 3 + }, + { + "epoch": 0.007254590795737928, + "grad_norm": 0.1064453125, + "learning_rate": 0.0002, + "loss": 1.277, + "step": 4 + }, + { + "epoch": 0.00906823849467241, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 5 + }, + { + "epoch": 0.010881886193606891, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.2545, + "step": 6 + }, + { + "epoch": 0.012695533892541374, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.2888, + "step": 7 + }, + { + "epoch": 0.014509181591475856, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.3043, + "step": 8 + }, + { + "epoch": 0.01632282929041034, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.3626, + "step": 9 + }, + { + "epoch": 0.01813647698934482, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.3498, + "step": 10 + }, + { + "epoch": 0.0199501246882793, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.492, + "step": 11 + }, + { + "epoch": 0.021763772387213783, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 12 + }, + { + "epoch": 0.023577420086148267, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 13 + }, + { + "epoch": 0.02539106778508275, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.3481, + "step": 14 + }, + { + "epoch": 0.02720471548401723, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 15 + }, + { + "epoch": 0.02901836318295171, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002, + "loss": 1.1388, + "step": 16 + }, + { + "epoch": 0.030832010881886193, + "grad_norm": 0.078125, + "learning_rate": 0.0002, + "loss": 1.202, + "step": 17 + }, + { + "epoch": 0.03264565858082068, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.2191, + "step": 18 + }, + { + "epoch": 0.03445930627975516, + "grad_norm": 0.0771484375, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 19 + }, + { + "epoch": 0.03627295397868964, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 20 + }, + { + "epoch": 0.03808660167762412, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 21 + }, + { + "epoch": 0.0399002493765586, + "grad_norm": 0.0712890625, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 22 + }, + { + "epoch": 0.041713897075493084, + "grad_norm": 0.07275390625, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 23 + }, + { + "epoch": 0.043527544774427565, + "grad_norm": 0.07080078125, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 24 + }, + { + "epoch": 0.045341192473362046, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 25 + }, + { + "epoch": 0.047154840172296535, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 26 + }, + { + "epoch": 0.048968487871231016, + "grad_norm": 0.07666015625, + "learning_rate": 0.0002, + "loss": 1.3871, + "step": 27 + }, + { + "epoch": 0.0507821355701655, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 28 + }, + { + "epoch": 0.05259578326909998, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 29 + }, + { + "epoch": 0.05440943096803446, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 30 + }, + { + "epoch": 0.05622307866696894, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 31 + }, + { + "epoch": 0.05803672636590342, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 32 + }, + { + "epoch": 0.059850374064837904, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 33 + }, + { + "epoch": 0.061664021763772385, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.4733, + "step": 34 + }, + { + "epoch": 0.06347766946270687, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 35 + }, + { + "epoch": 0.06529131716164135, + "grad_norm": 0.1025390625, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 36 + }, + { + "epoch": 0.06710496486057584, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 37 + }, + { + "epoch": 0.06891861255951032, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 38 + }, + { + "epoch": 0.0707322602584448, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.4651, + "step": 39 + }, + { + "epoch": 0.07254590795737928, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 40 + }, + { + "epoch": 0.07435955565631376, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 41 + }, + { + "epoch": 0.07617320335524824, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.5873, + "step": 42 + }, + { + "epoch": 0.07798685105418272, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 43 + }, + { + "epoch": 0.0798004987531172, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 44 + }, + { + "epoch": 0.08161414645205169, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 45 + }, + { + "epoch": 0.08342779415098617, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 1.516, + "step": 46 + }, + { + "epoch": 0.08524144184992065, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 47 + }, + { + "epoch": 0.08705508954885513, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.5579, + "step": 48 + }, + { + "epoch": 0.08886873724778961, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 49 + }, + { + "epoch": 0.09068238494672409, + "grad_norm": 0.4609375, + "learning_rate": 0.0002, + "loss": 1.5965, + "step": 50 + }, + { + "epoch": 0.09249603264565857, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 51 + }, + { + "epoch": 0.09430968034459307, + "grad_norm": 0.07666015625, + "learning_rate": 0.0002, + "loss": 1.2443, + "step": 52 + }, + { + "epoch": 0.09612332804352755, + "grad_norm": 0.0712890625, + "learning_rate": 0.0002, + "loss": 1.3003, + "step": 53 + }, + { + "epoch": 0.09793697574246203, + "grad_norm": 0.072265625, + "learning_rate": 0.0002, + "loss": 1.1665, + "step": 54 + }, + { + "epoch": 0.09975062344139651, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 55 + }, + { + "epoch": 0.101564271140331, + "grad_norm": 0.057861328125, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 56 + }, + { + "epoch": 0.10337791883926548, + "grad_norm": 0.0556640625, + "learning_rate": 0.0002, + "loss": 1.2492, + "step": 57 + }, + { + "epoch": 0.10519156653819996, + "grad_norm": 0.0546875, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 58 + }, + { + "epoch": 0.10700521423713444, + "grad_norm": 0.0546875, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 59 + }, + { + "epoch": 0.10881886193606892, + "grad_norm": 0.0537109375, + "learning_rate": 0.0002, + "loss": 1.2039, + "step": 60 + }, + { + "epoch": 0.1106325096350034, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.1874, + "step": 61 + }, + { + "epoch": 0.11244615733393788, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 62 + }, + { + "epoch": 0.11425980503287236, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 63 + }, + { + "epoch": 0.11607345273180684, + "grad_norm": 0.0556640625, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 64 + }, + { + "epoch": 0.11788710043074133, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002, + "loss": 1.1376, + "step": 65 + }, + { + "epoch": 0.11970074812967581, + "grad_norm": 0.06201171875, + "learning_rate": 0.0002, + "loss": 1.1857, + "step": 66 + }, + { + "epoch": 0.12151439582861029, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.3241, + "step": 67 + }, + { + "epoch": 0.12332804352754477, + "grad_norm": 0.060546875, + "learning_rate": 0.0002, + "loss": 1.1876, + "step": 68 + }, + { + "epoch": 0.12514169122647925, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.1988, + "step": 69 + }, + { + "epoch": 0.12695533892541375, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 70 + }, + { + "epoch": 0.12876898662434821, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.1787, + "step": 71 + }, + { + "epoch": 0.1305826343232827, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 72 + }, + { + "epoch": 0.13239628202221718, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.3174, + "step": 73 + }, + { + "epoch": 0.13420992972115167, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.4282, + "step": 74 + }, + { + "epoch": 0.13602357742008614, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.3765, + "step": 75 + }, + { + "epoch": 0.13783722511902063, + "grad_norm": 0.07275390625, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 76 + }, + { + "epoch": 0.1396508728179551, + "grad_norm": 0.0712890625, + "learning_rate": 0.0002, + "loss": 1.5105, + "step": 77 + }, + { + "epoch": 0.1414645205168896, + "grad_norm": 0.0712890625, + "learning_rate": 0.0002, + "loss": 1.3131, + "step": 78 + }, + { + "epoch": 0.14327816821582406, + "grad_norm": 0.07421875, + "learning_rate": 0.0002, + "loss": 1.2725, + "step": 79 + }, + { + "epoch": 0.14509181591475856, + "grad_norm": 0.07763671875, + "learning_rate": 0.0002, + "loss": 1.288, + "step": 80 + }, + { + "epoch": 0.14690546361369303, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 81 + }, + { + "epoch": 0.14871911131262752, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 82 + }, + { + "epoch": 0.15053275901156202, + "grad_norm": 0.09326171875, + "learning_rate": 0.0002, + "loss": 1.649, + "step": 83 + }, + { + "epoch": 0.15234640671049648, + "grad_norm": 0.1064453125, + "learning_rate": 0.0002, + "loss": 1.3976, + "step": 84 + }, + { + "epoch": 0.15416005440943098, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 85 + }, + { + "epoch": 0.15597370210836545, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 86 + }, + { + "epoch": 0.15778734980729994, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 87 + }, + { + "epoch": 0.1596009975062344, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 88 + }, + { + "epoch": 0.1614146452051689, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 89 + }, + { + "epoch": 0.16322829290410337, + "grad_norm": 0.095703125, + "learning_rate": 0.0002, + "loss": 1.5776, + "step": 90 + }, + { + "epoch": 0.16504194060303787, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002, + "loss": 1.4753, + "step": 91 + }, + { + "epoch": 0.16685558830197234, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 92 + }, + { + "epoch": 0.16866923600090683, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 93 + }, + { + "epoch": 0.1704828836998413, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 94 + }, + { + "epoch": 0.1722965313987758, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 95 + }, + { + "epoch": 0.17411017909771026, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 96 + }, + { + "epoch": 0.17592382679664476, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 97 + }, + { + "epoch": 0.17773747449557922, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 98 + }, + { + "epoch": 0.17955112219451372, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 99 + }, + { + "epoch": 0.18136476989344819, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 100 + }, + { + "epoch": 0.18317841759238268, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.2065, + "step": 101 + }, + { + "epoch": 0.18499206529131715, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.2892, + "step": 102 + }, + { + "epoch": 0.18680571299025164, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.3591, + "step": 103 + }, + { + "epoch": 0.18861936068918614, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 104 + }, + { + "epoch": 0.1904330083881206, + "grad_norm": 0.06884765625, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 105 + }, + { + "epoch": 0.1922466560870551, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 106 + }, + { + "epoch": 0.19406030378598957, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 107 + }, + { + "epoch": 0.19587395148492406, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 108 + }, + { + "epoch": 0.19768759918385853, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002, + "loss": 1.12, + "step": 109 + }, + { + "epoch": 0.19950124688279303, + "grad_norm": 0.05615234375, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 110 + }, + { + "epoch": 0.2013148945817275, + "grad_norm": 0.05712890625, + "learning_rate": 0.0002, + "loss": 1.177, + "step": 111 + }, + { + "epoch": 0.203128542280662, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 112 + }, + { + "epoch": 0.20494218997959646, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.3176, + "step": 113 + }, + { + "epoch": 0.20675583767853095, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.118, + "step": 114 + }, + { + "epoch": 0.20856948537746542, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.1188, + "step": 115 + }, + { + "epoch": 0.21038313307639991, + "grad_norm": 0.0556640625, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 116 + }, + { + "epoch": 0.21219678077533438, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 117 + }, + { + "epoch": 0.21401042847426888, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.3575, + "step": 118 + }, + { + "epoch": 0.21582407617320334, + "grad_norm": 0.0712890625, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 119 + }, + { + "epoch": 0.21763772387213784, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 120 + }, + { + "epoch": 0.2194513715710723, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 121 + }, + { + "epoch": 0.2212650192700068, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 122 + }, + { + "epoch": 0.22307866696894127, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 123 + }, + { + "epoch": 0.22489231466787576, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.3202, + "step": 124 + }, + { + "epoch": 0.22670596236681026, + "grad_norm": 0.07080078125, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 125 + }, + { + "epoch": 0.22851961006574473, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.3013, + "step": 126 + }, + { + "epoch": 0.23033325776467922, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.2434, + "step": 127 + }, + { + "epoch": 0.2321469054636137, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 128 + }, + { + "epoch": 0.23396055316254818, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.2805, + "step": 129 + }, + { + "epoch": 0.23577420086148265, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 130 + }, + { + "epoch": 0.23758784856041715, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.4209, + "step": 131 + }, + { + "epoch": 0.23940149625935161, + "grad_norm": 0.08056640625, + "learning_rate": 0.0002, + "loss": 1.6111, + "step": 132 + }, + { + "epoch": 0.2412151439582861, + "grad_norm": 0.08837890625, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 133 + }, + { + "epoch": 0.24302879165722058, + "grad_norm": 0.07861328125, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 134 + }, + { + "epoch": 0.24484243935615507, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.4316, + "step": 135 + }, + { + "epoch": 0.24665608705508954, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 136 + }, + { + "epoch": 0.24846973475402404, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.435, + "step": 137 + }, + { + "epoch": 0.2502833824529585, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.4653, + "step": 138 + }, + { + "epoch": 0.252097030151893, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.434, + "step": 139 + }, + { + "epoch": 0.2539106778508275, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 140 + }, + { + "epoch": 0.25572432554976193, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 141 + }, + { + "epoch": 0.25753797324869643, + "grad_norm": 0.107421875, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 142 + }, + { + "epoch": 0.2593516209476309, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 143 + }, + { + "epoch": 0.2611652686465654, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 144 + }, + { + "epoch": 0.2629789163454999, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 145 + }, + { + "epoch": 0.26479256404443435, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5643, + "step": 146 + }, + { + "epoch": 0.26660621174336885, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 147 + }, + { + "epoch": 0.26841985944230334, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.6658, + "step": 148 + }, + { + "epoch": 0.27023350714123784, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 149 + }, + { + "epoch": 0.2720471548401723, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 150 + }, + { + "epoch": 0.2738608025391068, + "grad_norm": 0.06884765625, + "learning_rate": 0.0002, + "loss": 1.1208, + "step": 151 + }, + { + "epoch": 0.27567445023804127, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.2796, + "step": 152 + }, + { + "epoch": 0.27748809793697576, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.2558, + "step": 153 + }, + { + "epoch": 0.2793017456359102, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 154 + }, + { + "epoch": 0.2811153933348447, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 155 + }, + { + "epoch": 0.2829290410337792, + "grad_norm": 0.05859375, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 156 + }, + { + "epoch": 0.2847426887327137, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 157 + }, + { + "epoch": 0.28655633643164813, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.037, + "step": 158 + }, + { + "epoch": 0.2883699841305826, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 159 + }, + { + "epoch": 0.2901836318295171, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 160 + }, + { + "epoch": 0.2919972795284516, + "grad_norm": 0.05712890625, + "learning_rate": 0.0002, + "loss": 1.195, + "step": 161 + }, + { + "epoch": 0.29381092722738605, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002, + "loss": 1.1679, + "step": 162 + }, + { + "epoch": 0.29562457492632055, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.2832, + "step": 163 + }, + { + "epoch": 0.29743822262525504, + "grad_norm": 0.05615234375, + "learning_rate": 0.0002, + "loss": 1.3119, + "step": 164 + }, + { + "epoch": 0.29925187032418954, + "grad_norm": 0.060302734375, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 165 + }, + { + "epoch": 0.30106551802312403, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 166 + }, + { + "epoch": 0.3028791657220585, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 167 + }, + { + "epoch": 0.30469281342099297, + "grad_norm": 0.058349609375, + "learning_rate": 0.0002, + "loss": 0.9518, + "step": 168 + }, + { + "epoch": 0.30650646111992746, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 169 + }, + { + "epoch": 0.30832010881886196, + "grad_norm": 0.05810546875, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 170 + }, + { + "epoch": 0.3101337565177964, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 171 + }, + { + "epoch": 0.3119474042167309, + "grad_norm": 0.072265625, + "learning_rate": 0.0002, + "loss": 1.3316, + "step": 172 + }, + { + "epoch": 0.3137610519156654, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 173 + }, + { + "epoch": 0.3155746996145999, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.1993, + "step": 174 + }, + { + "epoch": 0.3173883473135343, + "grad_norm": 0.078125, + "learning_rate": 0.0002, + "loss": 1.3337, + "step": 175 + }, + { + "epoch": 0.3192019950124688, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 176 + }, + { + "epoch": 0.3210156427114033, + "grad_norm": 0.07177734375, + "learning_rate": 0.0002, + "loss": 1.4238, + "step": 177 + }, + { + "epoch": 0.3228292904103378, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 178 + }, + { + "epoch": 0.32464293810927225, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.3389, + "step": 179 + }, + { + "epoch": 0.32645658580820675, + "grad_norm": 0.07421875, + "learning_rate": 0.0002, + "loss": 1.2837, + "step": 180 + }, + { + "epoch": 0.32827023350714124, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 181 + }, + { + "epoch": 0.33008388120607574, + "grad_norm": 0.07763671875, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 182 + }, + { + "epoch": 0.3318975289050102, + "grad_norm": 0.07861328125, + "learning_rate": 0.0002, + "loss": 1.5998, + "step": 183 + }, + { + "epoch": 0.33371117660394467, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 184 + }, + { + "epoch": 0.33552482430287917, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 185 + }, + { + "epoch": 0.33733847200181366, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 186 + }, + { + "epoch": 0.33915211970074816, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 187 + }, + { + "epoch": 0.33915211970074816, + "eval_loss": 1.3207457065582275, + "eval_runtime": 152.4742, + "eval_samples_per_second": 6.558, + "eval_steps_per_second": 6.558, + "step": 187 + }, + { + "epoch": 0.33915211970074816, + "mmlu_eval_accuracy": 0.3300673977372765, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.2727272727272727, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.0, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, + "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, + "mmlu_eval_accuracy_high_school_physics": 0.11764705882352941, + "mmlu_eval_accuracy_high_school_psychology": 0.5166666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, + "mmlu_eval_accuracy_human_aging": 0.4782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.5384615384615384, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, + "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, + "mmlu_eval_accuracy_management": 0.09090909090909091, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.45454545454545453, + "mmlu_eval_accuracy_philosophy": 0.4117647058823529, + "mmlu_eval_accuracy_prehistory": 0.2571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.22580645161290322, + "mmlu_eval_accuracy_professional_law": 0.24705882352941178, + "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, + "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.16666666666666666, + "mmlu_eval_accuracy_world_religions": 0.3684210526315789, + "mmlu_loss": 1.906617237025036, + "step": 187 + }, + { + "epoch": 0.3409657673996826, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 188 + }, + { + "epoch": 0.3427794150986171, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 189 + }, + { + "epoch": 0.3445930627975516, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 190 + }, + { + "epoch": 0.3464067104964861, + "grad_norm": 0.1015625, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 191 + }, + { + "epoch": 0.3482203581954205, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.5437, + "step": 192 + }, + { + "epoch": 0.350034005894355, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 193 + }, + { + "epoch": 0.3518476535932895, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.6973, + "step": 194 + }, + { + "epoch": 0.353661301292224, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 195 + }, + { + "epoch": 0.35547494899115845, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.4448, + "step": 196 + }, + { + "epoch": 0.35728859669009294, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 197 + }, + { + "epoch": 0.35910224438902744, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 198 + }, + { + "epoch": 0.36091589208796193, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 199 + }, + { + "epoch": 0.36272953978689637, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 1.5201, + "step": 200 + }, + { + "epoch": 0.36454318748583087, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.3518, + "step": 201 + }, + { + "epoch": 0.36635683518476536, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.3225, + "step": 202 + }, + { + "epoch": 0.36817048288369986, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.3031, + "step": 203 + }, + { + "epoch": 0.3699841305826343, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.1489, + "step": 204 + }, + { + "epoch": 0.3717977782815688, + "grad_norm": 0.061279296875, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 205 + }, + { + "epoch": 0.3736114259805033, + "grad_norm": 0.059814453125, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 206 + }, + { + "epoch": 0.3754250736794378, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 207 + }, + { + "epoch": 0.3772387213783723, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 208 + }, + { + "epoch": 0.3790523690773067, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.1453, + "step": 209 + }, + { + "epoch": 0.3808660167762412, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002, + "loss": 1.2685, + "step": 210 + }, + { + "epoch": 0.3826796644751757, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 211 + }, + { + "epoch": 0.3844933121741102, + "grad_norm": 0.054443359375, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 212 + }, + { + "epoch": 0.38630695987304464, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.3544, + "step": 213 + }, + { + "epoch": 0.38812060757197914, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 214 + }, + { + "epoch": 0.38993425527091363, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.1304, + "step": 215 + }, + { + "epoch": 0.3917479029698481, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 216 + }, + { + "epoch": 0.39356155066878257, + "grad_norm": 0.060302734375, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 217 + }, + { + "epoch": 0.39537519836771706, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 218 + }, + { + "epoch": 0.39718884606665156, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 219 + }, + { + "epoch": 0.39900249376558605, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.2344, + "step": 220 + }, + { + "epoch": 0.4008161414645205, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 221 + }, + { + "epoch": 0.402629789163455, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 222 + }, + { + "epoch": 0.4044434368623895, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.1531, + "step": 223 + }, + { + "epoch": 0.406257084561324, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 224 + }, + { + "epoch": 0.4080707322602584, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.1252, + "step": 225 + }, + { + "epoch": 0.4098843799591929, + "grad_norm": 0.07275390625, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 226 + }, + { + "epoch": 0.4116980276581274, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.2494, + "step": 227 + }, + { + "epoch": 0.4135116753570619, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 228 + }, + { + "epoch": 0.4153253230559964, + "grad_norm": 0.07861328125, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 229 + }, + { + "epoch": 0.41713897075493084, + "grad_norm": 0.07666015625, + "learning_rate": 0.0002, + "loss": 1.4689, + "step": 230 + }, + { + "epoch": 0.41895261845386533, + "grad_norm": 0.07763671875, + "learning_rate": 0.0002, + "loss": 1.4414, + "step": 231 + }, + { + "epoch": 0.42076626615279983, + "grad_norm": 0.08056640625, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 232 + }, + { + "epoch": 0.4225799138517343, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 233 + }, + { + "epoch": 0.42439356155066876, + "grad_norm": 0.07275390625, + "learning_rate": 0.0002, + "loss": 1.3524, + "step": 234 + }, + { + "epoch": 0.42620720924960326, + "grad_norm": 0.08056640625, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 235 + }, + { + "epoch": 0.42802085694853775, + "grad_norm": 0.0771484375, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 236 + }, + { + "epoch": 0.42983450464747225, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 237 + }, + { + "epoch": 0.4316481523464067, + "grad_norm": 0.091796875, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 238 + }, + { + "epoch": 0.4334618000453412, + "grad_norm": 0.09423828125, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 239 + }, + { + "epoch": 0.4352754477442757, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.5256, + "step": 240 + }, + { + "epoch": 0.4370890954432102, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.4876, + "step": 241 + }, + { + "epoch": 0.4389027431421446, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 242 + }, + { + "epoch": 0.4407163908410791, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 1.7493, + "step": 243 + }, + { + "epoch": 0.4425300385400136, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.4893, + "step": 244 + }, + { + "epoch": 0.4443436862389481, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.7197, + "step": 245 + }, + { + "epoch": 0.44615733393788254, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.8953, + "step": 246 + }, + { + "epoch": 0.44797098163681703, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 247 + }, + { + "epoch": 0.44978462933575153, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 248 + }, + { + "epoch": 0.451598277034686, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 249 + }, + { + "epoch": 0.4534119247336205, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 250 + }, + { + "epoch": 0.45522557243255496, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 251 + }, + { + "epoch": 0.45703922013148945, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.0091, + "step": 252 + }, + { + "epoch": 0.45885286783042395, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 253 + }, + { + "epoch": 0.46066651552935844, + "grad_norm": 0.055908203125, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 254 + }, + { + "epoch": 0.4624801632282929, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.0411, + "step": 255 + }, + { + "epoch": 0.4642938109272274, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.0528, + "step": 256 + }, + { + "epoch": 0.4661074586261619, + "grad_norm": 0.058837890625, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 257 + }, + { + "epoch": 0.46792110632509637, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 258 + }, + { + "epoch": 0.4697347540240308, + "grad_norm": 0.059814453125, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 259 + }, + { + "epoch": 0.4715484017229653, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.1565, + "step": 260 + }, + { + "epoch": 0.4733620494218998, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 261 + }, + { + "epoch": 0.4751756971208343, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 0.9769, + "step": 262 + }, + { + "epoch": 0.47698934481976873, + "grad_norm": 0.059814453125, + "learning_rate": 0.0002, + "loss": 1.0284, + "step": 263 + }, + { + "epoch": 0.47880299251870323, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 264 + }, + { + "epoch": 0.4806166402176377, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.0843, + "step": 265 + }, + { + "epoch": 0.4824302879165722, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 266 + }, + { + "epoch": 0.4842439356155067, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 267 + }, + { + "epoch": 0.48605758331444116, + "grad_norm": 0.061279296875, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 268 + }, + { + "epoch": 0.48787123101337565, + "grad_norm": 0.056396484375, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 269 + }, + { + "epoch": 0.48968487871231015, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 1.076, + "step": 270 + }, + { + "epoch": 0.49149852641124464, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 271 + }, + { + "epoch": 0.4933121741101791, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 272 + }, + { + "epoch": 0.4951258218091136, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 273 + }, + { + "epoch": 0.49693946950804807, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.3556, + "step": 274 + }, + { + "epoch": 0.49875311720698257, + "grad_norm": 0.07470703125, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 275 + }, + { + "epoch": 0.500566764905917, + "grad_norm": 0.07568359375, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 276 + }, + { + "epoch": 0.5023804126048516, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.4151, + "step": 277 + }, + { + "epoch": 0.504194060303786, + "grad_norm": 0.07080078125, + "learning_rate": 0.0002, + "loss": 1.2493, + "step": 278 + }, + { + "epoch": 0.5060077080027204, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002, + "loss": 1.3478, + "step": 279 + }, + { + "epoch": 0.507821355701655, + "grad_norm": 0.07421875, + "learning_rate": 0.0002, + "loss": 1.2862, + "step": 280 + }, + { + "epoch": 0.5096350034005894, + "grad_norm": 0.07177734375, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 281 + }, + { + "epoch": 0.5114486510995239, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 282 + }, + { + "epoch": 0.5132622987984584, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 283 + }, + { + "epoch": 0.5150759464973929, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.2816, + "step": 284 + }, + { + "epoch": 0.5168895941963274, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 285 + }, + { + "epoch": 0.5187032418952618, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 286 + }, + { + "epoch": 0.5205168895941963, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.477, + "step": 287 + }, + { + "epoch": 0.5223305372931308, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 288 + }, + { + "epoch": 0.5241441849920653, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 289 + }, + { + "epoch": 0.5259578326909998, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 290 + }, + { + "epoch": 0.5277714803899343, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.4667, + "step": 291 + }, + { + "epoch": 0.5295851280888687, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.5547, + "step": 292 + }, + { + "epoch": 0.5313987757878033, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.7992, + "step": 293 + }, + { + "epoch": 0.5332124234867377, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 1.6851, + "step": 294 + }, + { + "epoch": 0.5350260711856721, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 295 + }, + { + "epoch": 0.5368397188846067, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 296 + }, + { + "epoch": 0.5386533665835411, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 297 + }, + { + "epoch": 0.5404670142824757, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.5693, + "step": 298 + }, + { + "epoch": 0.5422806619814101, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 299 + }, + { + "epoch": 0.5440943096803446, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 300 + }, + { + "epoch": 0.5459079573792791, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 301 + }, + { + "epoch": 0.5477216050782135, + "grad_norm": 0.05615234375, + "learning_rate": 0.0002, + "loss": 1.2174, + "step": 302 + }, + { + "epoch": 0.549535252777148, + "grad_norm": 0.061279296875, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 303 + }, + { + "epoch": 0.5513489004760825, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.2271, + "step": 304 + }, + { + "epoch": 0.553162548175017, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002, + "loss": 1.1812, + "step": 305 + }, + { + "epoch": 0.5549761958739515, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.0711, + "step": 306 + }, + { + "epoch": 0.556789843572886, + "grad_norm": 0.05859375, + "learning_rate": 0.0002, + "loss": 1.2252, + "step": 307 + }, + { + "epoch": 0.5586034912718204, + "grad_norm": 0.05859375, + "learning_rate": 0.0002, + "loss": 1.0931, + "step": 308 + }, + { + "epoch": 0.560417138970755, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.3787, + "step": 309 + }, + { + "epoch": 0.5622307866696894, + "grad_norm": 0.0712890625, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 310 + }, + { + "epoch": 0.564044434368624, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 311 + }, + { + "epoch": 0.5658580820675584, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002, + "loss": 1.2276, + "step": 312 + }, + { + "epoch": 0.5676717297664928, + "grad_norm": 0.057373046875, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 313 + }, + { + "epoch": 0.5694853774654274, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 314 + }, + { + "epoch": 0.5712990251643618, + "grad_norm": 0.054931640625, + "learning_rate": 0.0002, + "loss": 1.2829, + "step": 315 + }, + { + "epoch": 0.5731126728632963, + "grad_norm": 0.057373046875, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 316 + }, + { + "epoch": 0.5749263205622308, + "grad_norm": 0.059326171875, + "learning_rate": 0.0002, + "loss": 0.9893, + "step": 317 + }, + { + "epoch": 0.5767399682611652, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 318 + }, + { + "epoch": 0.5785536159600998, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.1663, + "step": 319 + }, + { + "epoch": 0.5803672636590342, + "grad_norm": 0.06103515625, + "learning_rate": 0.0002, + "loss": 1.1417, + "step": 320 + }, + { + "epoch": 0.5821809113579687, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 321 + }, + { + "epoch": 0.5839945590569032, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 322 + }, + { + "epoch": 0.5858082067558377, + "grad_norm": 0.07080078125, + "learning_rate": 0.0002, + "loss": 1.0949, + "step": 323 + }, + { + "epoch": 0.5876218544547721, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 324 + }, + { + "epoch": 0.5894355021537067, + "grad_norm": 0.0732421875, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 325 + }, + { + "epoch": 0.5912491498526411, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 326 + }, + { + "epoch": 0.5930627975515756, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 327 + }, + { + "epoch": 0.5948764452505101, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.2896, + "step": 328 + }, + { + "epoch": 0.5966900929494445, + "grad_norm": 0.07763671875, + "learning_rate": 0.0002, + "loss": 1.4599, + "step": 329 + }, + { + "epoch": 0.5985037406483791, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002, + "loss": 1.3801, + "step": 330 + }, + { + "epoch": 0.6003173883473135, + "grad_norm": 0.07470703125, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 331 + }, + { + "epoch": 0.6021310360462481, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.4548, + "step": 332 + }, + { + "epoch": 0.6039446837451825, + "grad_norm": 0.08154296875, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 333 + }, + { + "epoch": 0.605758331444117, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 334 + }, + { + "epoch": 0.6075719791430515, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 335 + }, + { + "epoch": 0.6093856268419859, + "grad_norm": 0.091796875, + "learning_rate": 0.0002, + "loss": 1.5265, + "step": 336 + }, + { + "epoch": 0.6111992745409204, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.5693, + "step": 337 + }, + { + "epoch": 0.6130129222398549, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.6799, + "step": 338 + }, + { + "epoch": 0.6148265699387894, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 339 + }, + { + "epoch": 0.6166402176377239, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.4184, + "step": 340 + }, + { + "epoch": 0.6184538653366584, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 341 + }, + { + "epoch": 0.6202675130355928, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002, + "loss": 1.5172, + "step": 342 + }, + { + "epoch": 0.6220811607345273, + "grad_norm": 0.10791015625, + "learning_rate": 0.0002, + "loss": 1.4815, + "step": 343 + }, + { + "epoch": 0.6238948084334618, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 1.4553, + "step": 344 + }, + { + "epoch": 0.6257084561323962, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 345 + }, + { + "epoch": 0.6275221038313308, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.603, + "step": 346 + }, + { + "epoch": 0.6293357515302652, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 347 + }, + { + "epoch": 0.6311493992291998, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 348 + }, + { + "epoch": 0.6329630469281342, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 349 + }, + { + "epoch": 0.6347766946270686, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 350 + }, + { + "epoch": 0.6365903423260032, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.1438, + "step": 351 + }, + { + "epoch": 0.6384039900249376, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.1838, + "step": 352 + }, + { + "epoch": 0.6402176377238722, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.4029, + "step": 353 + }, + { + "epoch": 0.6420312854228066, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 354 + }, + { + "epoch": 0.6438449331217411, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.2109, + "step": 355 + }, + { + "epoch": 0.6456585808206756, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.1296, + "step": 356 + }, + { + "epoch": 0.6474722285196101, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 357 + }, + { + "epoch": 0.6492858762185445, + "grad_norm": 0.06884765625, + "learning_rate": 0.0002, + "loss": 1.3592, + "step": 358 + }, + { + "epoch": 0.651099523917479, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 359 + }, + { + "epoch": 0.6529131716164135, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 360 + }, + { + "epoch": 0.654726819315348, + "grad_norm": 0.058349609375, + "learning_rate": 0.0002, + "loss": 1.2569, + "step": 361 + }, + { + "epoch": 0.6565404670142825, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002, + "loss": 1.2054, + "step": 362 + }, + { + "epoch": 0.6583541147132169, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.2278, + "step": 363 + }, + { + "epoch": 0.6601677624121515, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.1188, + "step": 364 + }, + { + "epoch": 0.6619814101110859, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 365 + }, + { + "epoch": 0.6637950578100204, + "grad_norm": 0.060302734375, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 366 + }, + { + "epoch": 0.6656087055089549, + "grad_norm": 0.056640625, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 367 + }, + { + "epoch": 0.6674223532078893, + "grad_norm": 0.057861328125, + "learning_rate": 0.0002, + "loss": 1.1775, + "step": 368 + }, + { + "epoch": 0.6692360009068239, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002, + "loss": 1.137, + "step": 369 + }, + { + "epoch": 0.6710496486057583, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 370 + }, + { + "epoch": 0.6728632963046928, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 371 + }, + { + "epoch": 0.6746769440036273, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 372 + }, + { + "epoch": 0.6764905917025618, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 373 + }, + { + "epoch": 0.6783042394014963, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.1391, + "step": 374 + }, + { + "epoch": 0.6783042394014963, + "eval_loss": 1.3085720539093018, + "eval_runtime": 152.7321, + "eval_samples_per_second": 6.547, + "eval_steps_per_second": 6.547, + "step": 374 + }, + { + "epoch": 0.6783042394014963, + "mmlu_eval_accuracy": 0.3316627362592083, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, + "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, + "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, + "mmlu_eval_accuracy_high_school_physics": 0.11764705882352941, + "mmlu_eval_accuracy_high_school_psychology": 0.55, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, + "mmlu_eval_accuracy_human_aging": 0.4782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, + "mmlu_eval_accuracy_management": 0.09090909090909091, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.42424242424242425, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.2571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, + "mmlu_eval_accuracy_professional_law": 0.2411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.42105263157894735, + "mmlu_loss": 1.958029837019688, + "step": 374 + }, + { + "epoch": 0.6801178871004308, + "grad_norm": 0.07080078125, + "learning_rate": 0.0002, + "loss": 1.1288, + "step": 375 + }, + { + "epoch": 0.6819315347993652, + "grad_norm": 0.072265625, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 376 + }, + { + "epoch": 0.6837451824982997, + "grad_norm": 0.06884765625, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 377 + }, + { + "epoch": 0.6855588301972342, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002, + "loss": 1.2172, + "step": 378 + }, + { + "epoch": 0.6873724778961686, + "grad_norm": 0.07861328125, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 379 + }, + { + "epoch": 0.6891861255951032, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.4914, + "step": 380 + }, + { + "epoch": 0.6909997732940376, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 381 + }, + { + "epoch": 0.6928134209929722, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 382 + }, + { + "epoch": 0.6946270686919066, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.4492, + "step": 383 + }, + { + "epoch": 0.696440716390841, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 384 + }, + { + "epoch": 0.6982543640897756, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 385 + }, + { + "epoch": 0.70006801178871, + "grad_norm": 0.09423828125, + "learning_rate": 0.0002, + "loss": 1.465, + "step": 386 + }, + { + "epoch": 0.7018816594876445, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 387 + }, + { + "epoch": 0.703695307186579, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.3094, + "step": 388 + }, + { + "epoch": 0.7055089548855135, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 389 + }, + { + "epoch": 0.707322602584448, + "grad_norm": 0.1025390625, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 390 + }, + { + "epoch": 0.7091362502833825, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.4896, + "step": 391 + }, + { + "epoch": 0.7109498979823169, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 392 + }, + { + "epoch": 0.7127635456812514, + "grad_norm": 0.1064453125, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 393 + }, + { + "epoch": 0.7145771933801859, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 394 + }, + { + "epoch": 0.7163908410791204, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 395 + }, + { + "epoch": 0.7182044887780549, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 396 + }, + { + "epoch": 0.7200181364769893, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 397 + }, + { + "epoch": 0.7218317841759239, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 398 + }, + { + "epoch": 0.7236454318748583, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 1.4558, + "step": 399 + }, + { + "epoch": 0.7254590795737927, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 1.2466, + "step": 400 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.06103515625, + "learning_rate": 0.0002, + "loss": 1.2046, + "step": 401 + }, + { + "epoch": 0.7290863749716617, + "grad_norm": 0.05908203125, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 402 + }, + { + "epoch": 0.7309000226705963, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 403 + }, + { + "epoch": 0.7327136703695307, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.1886, + "step": 404 + }, + { + "epoch": 0.7345273180684652, + "grad_norm": 0.052734375, + "learning_rate": 0.0002, + "loss": 1.1449, + "step": 405 + }, + { + "epoch": 0.7363409657673997, + "grad_norm": 0.06103515625, + "learning_rate": 0.0002, + "loss": 1.1434, + "step": 406 + }, + { + "epoch": 0.7381546134663342, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.0986, + "step": 407 + }, + { + "epoch": 0.7399682611652686, + "grad_norm": 0.06103515625, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 408 + }, + { + "epoch": 0.7417819088642031, + "grad_norm": 0.05908203125, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 409 + }, + { + "epoch": 0.7435955565631376, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 0.9925, + "step": 410 + }, + { + "epoch": 0.7454092042620721, + "grad_norm": 0.06201171875, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 411 + }, + { + "epoch": 0.7472228519610066, + "grad_norm": 0.06201171875, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 412 + }, + { + "epoch": 0.749036499659941, + "grad_norm": 0.06787109375, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 413 + }, + { + "epoch": 0.7508501473588756, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 1.0261, + "step": 414 + }, + { + "epoch": 0.75266379505781, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.2148, + "step": 415 + }, + { + "epoch": 0.7544774427567446, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 416 + }, + { + "epoch": 0.756291090455679, + "grad_norm": 0.06201171875, + "learning_rate": 0.0002, + "loss": 1.1911, + "step": 417 + }, + { + "epoch": 0.7581047381546134, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 418 + }, + { + "epoch": 0.759918385853548, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002, + "loss": 1.0982, + "step": 419 + }, + { + "epoch": 0.7617320335524824, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 420 + }, + { + "epoch": 0.7635456812514169, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.3535, + "step": 421 + }, + { + "epoch": 0.7653593289503514, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 422 + }, + { + "epoch": 0.7671729766492859, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 423 + }, + { + "epoch": 0.7689866243482204, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.2945, + "step": 424 + }, + { + "epoch": 0.7708002720471548, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 425 + }, + { + "epoch": 0.7726139197460893, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.3039, + "step": 426 + }, + { + "epoch": 0.7744275674450238, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.247, + "step": 427 + }, + { + "epoch": 0.7762412151439583, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 428 + }, + { + "epoch": 0.7780548628428927, + "grad_norm": 0.076171875, + "learning_rate": 0.0002, + "loss": 1.3972, + "step": 429 + }, + { + "epoch": 0.7798685105418273, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.2373, + "step": 430 + }, + { + "epoch": 0.7816821582407617, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 431 + }, + { + "epoch": 0.7834958059396963, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 432 + }, + { + "epoch": 0.7853094536386307, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 433 + }, + { + "epoch": 0.7871231013375651, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.4948, + "step": 434 + }, + { + "epoch": 0.7889367490364997, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.3164, + "step": 435 + }, + { + "epoch": 0.7907503967354341, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.4155, + "step": 436 + }, + { + "epoch": 0.7925640444343687, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 437 + }, + { + "epoch": 0.7943776921333031, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.5757, + "step": 438 + }, + { + "epoch": 0.7961913398322376, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 439 + }, + { + "epoch": 0.7980049875311721, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 440 + }, + { + "epoch": 0.7998186352301065, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 441 + }, + { + "epoch": 0.801632282929041, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 442 + }, + { + "epoch": 0.8034459306279755, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 443 + }, + { + "epoch": 0.80525957832691, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 444 + }, + { + "epoch": 0.8070732260258445, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002, + "loss": 1.3422, + "step": 445 + }, + { + "epoch": 0.808886873724779, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 446 + }, + { + "epoch": 0.8107005214237134, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 447 + }, + { + "epoch": 0.812514169122648, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 448 + }, + { + "epoch": 0.8143278168215824, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 449 + }, + { + "epoch": 0.8161414645205168, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 1.4847, + "step": 450 + }, + { + "epoch": 0.8179551122194514, + "grad_norm": 0.0537109375, + "learning_rate": 0.0002, + "loss": 1.2204, + "step": 451 + }, + { + "epoch": 0.8197687599183858, + "grad_norm": 0.0556640625, + "learning_rate": 0.0002, + "loss": 1.1765, + "step": 452 + }, + { + "epoch": 0.8215824076173204, + "grad_norm": 0.060546875, + "learning_rate": 0.0002, + "loss": 0.9263, + "step": 453 + }, + { + "epoch": 0.8233960553162548, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002, + "loss": 1.1783, + "step": 454 + }, + { + "epoch": 0.8252097030151893, + "grad_norm": 0.05810546875, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 455 + }, + { + "epoch": 0.8270233507141238, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 456 + }, + { + "epoch": 0.8288369984130582, + "grad_norm": 0.058349609375, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 457 + }, + { + "epoch": 0.8306506461119928, + "grad_norm": 0.06005859375, + "learning_rate": 0.0002, + "loss": 1.1006, + "step": 458 + }, + { + "epoch": 0.8324642938109272, + "grad_norm": 0.07470703125, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 459 + }, + { + "epoch": 0.8342779415098617, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 460 + }, + { + "epoch": 0.8360915892087962, + "grad_norm": 0.06103515625, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 461 + }, + { + "epoch": 0.8379052369077307, + "grad_norm": 0.05908203125, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 462 + }, + { + "epoch": 0.8397188846066651, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 463 + }, + { + "epoch": 0.8415325323055997, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 464 + }, + { + "epoch": 0.8433461800045341, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 465 + }, + { + "epoch": 0.8451598277034686, + "grad_norm": 0.059326171875, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 466 + }, + { + "epoch": 0.8469734754024031, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.0614, + "step": 467 + }, + { + "epoch": 0.8487871231013375, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 468 + }, + { + "epoch": 0.8506007708002721, + "grad_norm": 0.060302734375, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 469 + }, + { + "epoch": 0.8524144184992065, + "grad_norm": 0.060546875, + "learning_rate": 0.0002, + "loss": 1.1823, + "step": 470 + }, + { + "epoch": 0.854228066198141, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.1419, + "step": 471 + }, + { + "epoch": 0.8560417138970755, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 472 + }, + { + "epoch": 0.85785536159601, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002, + "loss": 1.2389, + "step": 473 + }, + { + "epoch": 0.8596690092949445, + "grad_norm": 0.068359375, + "learning_rate": 0.0002, + "loss": 1.4032, + "step": 474 + }, + { + "epoch": 0.8614826569938789, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002, + "loss": 1.2054, + "step": 475 + }, + { + "epoch": 0.8632963046928134, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.0378, + "step": 476 + }, + { + "epoch": 0.8651099523917479, + "grad_norm": 0.07421875, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 477 + }, + { + "epoch": 0.8669236000906824, + "grad_norm": 0.07177734375, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 478 + }, + { + "epoch": 0.8687372477896169, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 479 + }, + { + "epoch": 0.8705508954885514, + "grad_norm": 0.0732421875, + "learning_rate": 0.0002, + "loss": 1.3191, + "step": 480 + }, + { + "epoch": 0.8723645431874858, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 481 + }, + { + "epoch": 0.8741781908864203, + "grad_norm": 0.08349609375, + "learning_rate": 0.0002, + "loss": 1.2528, + "step": 482 + }, + { + "epoch": 0.8759918385853548, + "grad_norm": 0.08154296875, + "learning_rate": 0.0002, + "loss": 1.331, + "step": 483 + }, + { + "epoch": 0.8778054862842892, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.4447, + "step": 484 + }, + { + "epoch": 0.8796191339832238, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.4324, + "step": 485 + }, + { + "epoch": 0.8814327816821582, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 486 + }, + { + "epoch": 0.8832464293810928, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 487 + }, + { + "epoch": 0.8850600770800272, + "grad_norm": 0.09423828125, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 488 + }, + { + "epoch": 0.8868737247789616, + "grad_norm": 0.09375, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 489 + }, + { + "epoch": 0.8886873724778962, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 490 + }, + { + "epoch": 0.8905010201768306, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.5108, + "step": 491 + }, + { + "epoch": 0.8923146678757651, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 492 + }, + { + "epoch": 0.8941283155746996, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 493 + }, + { + "epoch": 0.8959419632736341, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 494 + }, + { + "epoch": 0.8977556109725686, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 495 + }, + { + "epoch": 0.8995692586715031, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.5657, + "step": 496 + }, + { + "epoch": 0.9013829063704375, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 497 + }, + { + "epoch": 0.903196554069372, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 498 + }, + { + "epoch": 0.9050102017683065, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 499 + }, + { + "epoch": 0.906823849467241, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 1.4009, + "step": 500 + }, + { + "epoch": 0.9086374971661755, + "grad_norm": 0.05908203125, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 501 + }, + { + "epoch": 0.9104511448651099, + "grad_norm": 0.06005859375, + "learning_rate": 0.0002, + "loss": 1.0434, + "step": 502 + }, + { + "epoch": 0.9122647925640445, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002, + "loss": 1.032, + "step": 503 + }, + { + "epoch": 0.9140784402629789, + "grad_norm": 0.059326171875, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 504 + }, + { + "epoch": 0.9158920879619133, + "grad_norm": 0.05712890625, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 505 + }, + { + "epoch": 0.9177057356608479, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.1673, + "step": 506 + }, + { + "epoch": 0.9195193833597823, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 507 + }, + { + "epoch": 0.9213330310587169, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.22, + "step": 508 + }, + { + "epoch": 0.9231466787576513, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.1798, + "step": 509 + }, + { + "epoch": 0.9249603264565858, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.0613, + "step": 510 + }, + { + "epoch": 0.9267739741555203, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.3239, + "step": 511 + }, + { + "epoch": 0.9285876218544548, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 512 + }, + { + "epoch": 0.9304012695533893, + "grad_norm": 0.06201171875, + "learning_rate": 0.0002, + "loss": 1.1875, + "step": 513 + }, + { + "epoch": 0.9322149172523237, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.1469, + "step": 514 + }, + { + "epoch": 0.9340285649512582, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 515 + }, + { + "epoch": 0.9358422126501927, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 516 + }, + { + "epoch": 0.9376558603491272, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 1.1423, + "step": 517 + }, + { + "epoch": 0.9394695080480616, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002, + "loss": 1.0861, + "step": 518 + }, + { + "epoch": 0.9412831557469962, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002, + "loss": 1.2623, + "step": 519 + }, + { + "epoch": 0.9430968034459306, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 520 + }, + { + "epoch": 0.9449104511448652, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 521 + }, + { + "epoch": 0.9467240988437996, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.0998, + "step": 522 + }, + { + "epoch": 0.948537746542734, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 523 + }, + { + "epoch": 0.9503513942416686, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 524 + }, + { + "epoch": 0.952165041940603, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 525 + }, + { + "epoch": 0.9539786896395375, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002, + "loss": 1.1452, + "step": 526 + }, + { + "epoch": 0.955792337338472, + "grad_norm": 0.07275390625, + "learning_rate": 0.0002, + "loss": 1.2793, + "step": 527 + }, + { + "epoch": 0.9576059850374065, + "grad_norm": 0.07861328125, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 528 + }, + { + "epoch": 0.959419632736341, + "grad_norm": 0.08349609375, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 529 + }, + { + "epoch": 0.9612332804352755, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.3612, + "step": 530 + }, + { + "epoch": 0.9630469281342099, + "grad_norm": 0.078125, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 531 + }, + { + "epoch": 0.9648605758331444, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.4975, + "step": 532 + }, + { + "epoch": 0.9666742235320789, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 533 + }, + { + "epoch": 0.9684878712310134, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.3021, + "step": 534 + }, + { + "epoch": 0.9703015189299479, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.5162, + "step": 535 + }, + { + "epoch": 0.9721151666288823, + "grad_norm": 0.08837890625, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 536 + }, + { + "epoch": 0.9739288143278169, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.2148, + "step": 537 + }, + { + "epoch": 0.9757424620267513, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 538 + }, + { + "epoch": 0.9775561097256857, + "grad_norm": 0.103515625, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 539 + }, + { + "epoch": 0.9793697574246203, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 540 + }, + { + "epoch": 0.9811834051235547, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002, + "loss": 1.2271, + "step": 541 + }, + { + "epoch": 0.9829970528224893, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 542 + }, + { + "epoch": 0.9848107005214237, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002, + "loss": 1.3321, + "step": 543 + }, + { + "epoch": 0.9866243482203582, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 1.4907, + "step": 544 + }, + { + "epoch": 0.9884379959192927, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 545 + }, + { + "epoch": 0.9902516436182272, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.6704, + "step": 546 + }, + { + "epoch": 0.9920652913171616, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 547 + }, + { + "epoch": 0.9938789390160961, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.6728, + "step": 548 + }, + { + "epoch": 0.9956925867150306, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 1.4623, + "step": 549 + }, + { + "epoch": 0.9975062344139651, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 1.4649, + "step": 550 + }, + { + "epoch": 0.9993198821128996, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.4402, + "step": 551 + }, + { + "epoch": 1.001133529811834, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.3495, + "step": 552 + }, + { + "epoch": 1.0029471775107686, + "grad_norm": 0.0625, + "learning_rate": 0.0002, + "loss": 1.1794, + "step": 553 + }, + { + "epoch": 1.0047608252097031, + "grad_norm": 0.07177734375, + "learning_rate": 0.0002, + "loss": 1.0528, + "step": 554 + }, + { + "epoch": 1.0065744729086374, + "grad_norm": 0.0595703125, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 555 + }, + { + "epoch": 1.008388120607572, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002, + "loss": 1.1241, + "step": 556 + }, + { + "epoch": 1.0102017683065065, + "grad_norm": 0.05810546875, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 557 + }, + { + "epoch": 1.0120154160054409, + "grad_norm": 0.060791015625, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 558 + }, + { + "epoch": 1.0138290637043754, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002, + "loss": 1.3033, + "step": 559 + }, + { + "epoch": 1.01564271140331, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002, + "loss": 1.2358, + "step": 560 + }, + { + "epoch": 1.0174563591022443, + "grad_norm": 0.0673828125, + "learning_rate": 0.0002, + "loss": 1.4068, + "step": 561 + }, + { + "epoch": 1.0174563591022443, + "eval_loss": 1.3091340065002441, + "eval_runtime": 152.3982, + "eval_samples_per_second": 6.562, + "eval_steps_per_second": 6.562, + "step": 561 + }, + { + "epoch": 1.0174563591022443, + "mmlu_eval_accuracy": 0.33641092222710456, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.375, + "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.0, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.5, + "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.3103448275862069, + "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.45, + "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, + "mmlu_eval_accuracy_human_aging": 0.4782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, + "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.48484848484848486, + "mmlu_eval_accuracy_philosophy": 0.4117647058823529, + "mmlu_eval_accuracy_prehistory": 0.22857142857142856, + "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, + "mmlu_eval_accuracy_professional_law": 0.23529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.42105263157894735, + "mmlu_loss": 1.7300849244689567, + "step": 561 + }, + { + "epoch": 1.0192700068011789, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.2832, + "step": 562 + }, + { + "epoch": 1.0210836545001134, + "grad_norm": 0.06884765625, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 563 + }, + { + "epoch": 1.0228973021990477, + "grad_norm": 0.06640625, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 564 + }, + { + "epoch": 1.0247109498979823, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.103, + "step": 565 + }, + { + "epoch": 1.0265245975969168, + "grad_norm": 0.0703125, + "learning_rate": 0.0002, + "loss": 1.0946, + "step": 566 + }, + { + "epoch": 1.0283382452958514, + "grad_norm": 0.064453125, + "learning_rate": 0.0002, + "loss": 1.3528, + "step": 567 + }, + { + "epoch": 1.0301518929947857, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 568 + }, + { + "epoch": 1.0319655406937203, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 569 + }, + { + "epoch": 1.0337791883926548, + "grad_norm": 0.06884765625, + "learning_rate": 0.0002, + "loss": 1.1055, + "step": 570 + }, + { + "epoch": 1.0355928360915891, + "grad_norm": 0.076171875, + "learning_rate": 0.0002, + "loss": 1.0869, + "step": 571 + }, + { + "epoch": 1.0374064837905237, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002, + "loss": 1.201, + "step": 572 + }, + { + "epoch": 1.0392201314894582, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002, + "loss": 1.1547, + "step": 573 + }, + { + "epoch": 1.0410337791883926, + "grad_norm": 0.07666015625, + "learning_rate": 0.0002, + "loss": 1.1911, + "step": 574 + }, + { + "epoch": 1.0428474268873271, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.1878, + "step": 575 + }, + { + "epoch": 1.0446610745862617, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 576 + }, + { + "epoch": 1.046474722285196, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.4808, + "step": 577 + }, + { + "epoch": 1.0482883699841306, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 578 + }, + { + "epoch": 1.050102017683065, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 579 + }, + { + "epoch": 1.0519156653819997, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 580 + }, + { + "epoch": 1.053729313080934, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 581 + }, + { + "epoch": 1.0555429607798685, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 582 + }, + { + "epoch": 1.057356608478803, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.2038, + "step": 583 + }, + { + "epoch": 1.0591702561777374, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 584 + }, + { + "epoch": 1.060983903876672, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 585 + }, + { + "epoch": 1.0627975515756065, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 586 + }, + { + "epoch": 1.0646111992745408, + "grad_norm": 0.11279296875, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 587 + }, + { + "epoch": 1.0664248469734754, + "grad_norm": 0.1123046875, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 588 + }, + { + "epoch": 1.06823849467241, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.3052, + "step": 589 + }, + { + "epoch": 1.0700521423713443, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 590 + }, + { + "epoch": 1.0718657900702788, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 1.2474, + "step": 591 + }, + { + "epoch": 1.0736794377692134, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 592 + }, + { + "epoch": 1.075493085468148, + "grad_norm": 0.1123046875, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 593 + }, + { + "epoch": 1.0773067331670823, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 594 + }, + { + "epoch": 1.0791203808660168, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.4621, + "step": 595 + }, + { + "epoch": 1.0809340285649514, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.4573, + "step": 596 + }, + { + "epoch": 1.0827476762638857, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.2997, + "step": 597 + }, + { + "epoch": 1.0845613239628202, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 598 + }, + { + "epoch": 1.0863749716617548, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 599 + }, + { + "epoch": 1.0881886193606891, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 600 + }, + { + "epoch": 1.0900022670596237, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 601 + }, + { + "epoch": 1.0918159147585582, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.1456, + "step": 602 + }, + { + "epoch": 1.0936295624574925, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 603 + }, + { + "epoch": 1.095443210156427, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.1389, + "step": 604 + }, + { + "epoch": 1.0972568578553616, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 605 + }, + { + "epoch": 1.099070505554296, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.182, + "step": 606 + }, + { + "epoch": 1.1008841532532305, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.1303, + "step": 607 + }, + { + "epoch": 1.102697800952165, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 608 + }, + { + "epoch": 1.1045114486510996, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.1924, + "step": 609 + }, + { + "epoch": 1.106325096350034, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.1254, + "step": 610 + }, + { + "epoch": 1.1081387440489685, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002, + "loss": 1.0745, + "step": 611 + }, + { + "epoch": 1.109952391747903, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002, + "loss": 1.1565, + "step": 612 + }, + { + "epoch": 1.1117660394468374, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002, + "loss": 1.1284, + "step": 613 + }, + { + "epoch": 1.113579687145772, + "grad_norm": 0.072265625, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 614 + }, + { + "epoch": 1.1153933348447065, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 615 + }, + { + "epoch": 1.1172069825436408, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 0.9667, + "step": 616 + }, + { + "epoch": 1.1190206302425754, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.1027, + "step": 617 + }, + { + "epoch": 1.12083427794151, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.0697, + "step": 618 + }, + { + "epoch": 1.1226479256404442, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 619 + }, + { + "epoch": 1.1244615733393788, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 620 + }, + { + "epoch": 1.1262752210383133, + "grad_norm": 0.09326171875, + "learning_rate": 0.0002, + "loss": 1.2024, + "step": 621 + }, + { + "epoch": 1.128088868737248, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.0259, + "step": 622 + }, + { + "epoch": 1.1299025164361822, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 623 + }, + { + "epoch": 1.1317161641351168, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 624 + }, + { + "epoch": 1.1335298118340513, + "grad_norm": 0.08154296875, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 625 + }, + { + "epoch": 1.1353434595329857, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.0396, + "step": 626 + }, + { + "epoch": 1.1371571072319202, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.1517, + "step": 627 + }, + { + "epoch": 1.1389707549308548, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.2138, + "step": 628 + }, + { + "epoch": 1.140784402629789, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 629 + }, + { + "epoch": 1.1425980503287236, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 630 + }, + { + "epoch": 1.1444116980276582, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 631 + }, + { + "epoch": 1.1462253457265925, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.2521, + "step": 632 + }, + { + "epoch": 1.148038993425527, + "grad_norm": 0.103515625, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 633 + }, + { + "epoch": 1.1498526411244616, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.306, + "step": 634 + }, + { + "epoch": 1.151666288823396, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 635 + }, + { + "epoch": 1.1534799365223305, + "grad_norm": 0.107421875, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 636 + }, + { + "epoch": 1.155293584221265, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 637 + }, + { + "epoch": 1.1571072319201996, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 638 + }, + { + "epoch": 1.158920879619134, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 639 + }, + { + "epoch": 1.1607345273180685, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 640 + }, + { + "epoch": 1.162548175017003, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.3287, + "step": 641 + }, + { + "epoch": 1.1643618227159374, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 642 + }, + { + "epoch": 1.166175470414872, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.2102, + "step": 643 + }, + { + "epoch": 1.1679891181138065, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.4226, + "step": 644 + }, + { + "epoch": 1.1698027658127408, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 645 + }, + { + "epoch": 1.1716164135116753, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 646 + }, + { + "epoch": 1.1734300612106099, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 647 + }, + { + "epoch": 1.1752437089095444, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.2909, + "step": 648 + }, + { + "epoch": 1.1770573566084788, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.1209, + "step": 649 + }, + { + "epoch": 1.1788710043074133, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 650 + }, + { + "epoch": 1.1806846520063479, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 1.1885, + "step": 651 + }, + { + "epoch": 1.1824982997052822, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 652 + }, + { + "epoch": 1.1843119474042167, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.0987, + "step": 653 + }, + { + "epoch": 1.1861255951031513, + "grad_norm": 0.08837890625, + "learning_rate": 0.0002, + "loss": 1.0899, + "step": 654 + }, + { + "epoch": 1.1879392428020856, + "grad_norm": 0.08349609375, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 655 + }, + { + "epoch": 1.1897528905010202, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 656 + }, + { + "epoch": 1.1915665381999547, + "grad_norm": 0.0732421875, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 657 + }, + { + "epoch": 1.193380185898889, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.0527, + "step": 658 + }, + { + "epoch": 1.1951938335978236, + "grad_norm": 0.07666015625, + "learning_rate": 0.0002, + "loss": 1.1569, + "step": 659 + }, + { + "epoch": 1.1970074812967582, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.0259, + "step": 660 + }, + { + "epoch": 1.1988211289956925, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.1505, + "step": 661 + }, + { + "epoch": 1.200634776694627, + "grad_norm": 0.08837890625, + "learning_rate": 0.0002, + "loss": 1.1273, + "step": 662 + }, + { + "epoch": 1.2024484243935616, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.0746, + "step": 663 + }, + { + "epoch": 1.204262072092496, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 664 + }, + { + "epoch": 1.2060757197914305, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 665 + }, + { + "epoch": 1.207889367490365, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.1679, + "step": 666 + }, + { + "epoch": 1.2097030151892996, + "grad_norm": 0.09423828125, + "learning_rate": 0.0002, + "loss": 1.0739, + "step": 667 + }, + { + "epoch": 1.211516662888234, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.1693, + "step": 668 + }, + { + "epoch": 1.2133303105871684, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 0.9475, + "step": 669 + }, + { + "epoch": 1.215143958286103, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.2027, + "step": 670 + }, + { + "epoch": 1.2169576059850373, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 671 + }, + { + "epoch": 1.2187712536839719, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.084, + "step": 672 + }, + { + "epoch": 1.2205849013829064, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.2558, + "step": 673 + }, + { + "epoch": 1.2223985490818408, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 674 + }, + { + "epoch": 1.2242121967807753, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 675 + }, + { + "epoch": 1.2260258444797099, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.1306, + "step": 676 + }, + { + "epoch": 1.2278394921786444, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.1444, + "step": 677 + }, + { + "epoch": 1.2296531398775787, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 678 + }, + { + "epoch": 1.2314667875765133, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 679 + }, + { + "epoch": 1.2332804352754478, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 680 + }, + { + "epoch": 1.2350940829743822, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 681 + }, + { + "epoch": 1.2369077306733167, + "grad_norm": 0.1123046875, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 682 + }, + { + "epoch": 1.2387213783722513, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.2218, + "step": 683 + }, + { + "epoch": 1.2405350260711856, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.3345, + "step": 684 + }, + { + "epoch": 1.2423486737701201, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.2699, + "step": 685 + }, + { + "epoch": 1.2441623214690547, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.4857, + "step": 686 + }, + { + "epoch": 1.245975969167989, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 1.2654, + "step": 687 + }, + { + "epoch": 1.2477896168669236, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 688 + }, + { + "epoch": 1.2496032645658581, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.4494, + "step": 689 + }, + { + "epoch": 1.2514169122647925, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 690 + }, + { + "epoch": 1.253230559963727, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 691 + }, + { + "epoch": 1.2550442076626616, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.3986, + "step": 692 + }, + { + "epoch": 1.2568578553615959, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 693 + }, + { + "epoch": 1.2586715030605304, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.2672, + "step": 694 + }, + { + "epoch": 1.260485150759465, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 695 + }, + { + "epoch": 1.2622987984583993, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.3074, + "step": 696 + }, + { + "epoch": 1.2641124461573339, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 697 + }, + { + "epoch": 1.2659260938562684, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.4274, + "step": 698 + }, + { + "epoch": 1.267739741555203, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 1.5028, + "step": 699 + }, + { + "epoch": 1.2695533892541375, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 700 + }, + { + "epoch": 1.2713670369530718, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 1.176, + "step": 701 + }, + { + "epoch": 1.2731806846520064, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.0666, + "step": 702 + }, + { + "epoch": 1.274994332350941, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 703 + }, + { + "epoch": 1.2768079800498753, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.2862, + "step": 704 + }, + { + "epoch": 1.2786216277488098, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.0864, + "step": 705 + }, + { + "epoch": 1.2804352754477444, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 706 + }, + { + "epoch": 1.2822489231466787, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 707 + }, + { + "epoch": 1.2840625708456133, + "grad_norm": 0.078125, + "learning_rate": 0.0002, + "loss": 1.1477, + "step": 708 + }, + { + "epoch": 1.2858762185445478, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002, + "loss": 1.306, + "step": 709 + }, + { + "epoch": 1.2876898662434821, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 710 + }, + { + "epoch": 1.2895035139424167, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 711 + }, + { + "epoch": 1.2913171616413512, + "grad_norm": 0.0771484375, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 712 + }, + { + "epoch": 1.2931308093402856, + "grad_norm": 0.07861328125, + "learning_rate": 0.0002, + "loss": 1.0996, + "step": 713 + }, + { + "epoch": 1.2949444570392201, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 714 + }, + { + "epoch": 1.2967581047381547, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.2859, + "step": 715 + }, + { + "epoch": 1.298571752437089, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.0243, + "step": 716 + }, + { + "epoch": 1.3003854001360236, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 717 + }, + { + "epoch": 1.302199047834958, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 718 + }, + { + "epoch": 1.3040126955338924, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 0.9766, + "step": 719 + }, + { + "epoch": 1.305826343232827, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.1767, + "step": 720 + }, + { + "epoch": 1.3076399909317615, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.2959, + "step": 721 + }, + { + "epoch": 1.3094536386306959, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.0964, + "step": 722 + }, + { + "epoch": 1.3112672863296304, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.2494, + "step": 723 + }, + { + "epoch": 1.313080934028565, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.3522, + "step": 724 + }, + { + "epoch": 1.3148945817274995, + "grad_norm": 0.09375, + "learning_rate": 0.0002, + "loss": 1.1617, + "step": 725 + }, + { + "epoch": 1.3167082294264338, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.1767, + "step": 726 + }, + { + "epoch": 1.3185218771253684, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 727 + }, + { + "epoch": 1.320335524824303, + "grad_norm": 0.095703125, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 728 + }, + { + "epoch": 1.3221491725232375, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002, + "loss": 1.1325, + "step": 729 + }, + { + "epoch": 1.3239628202221718, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.4291, + "step": 730 + }, + { + "epoch": 1.3257764679211064, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 731 + }, + { + "epoch": 1.327590115620041, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 732 + }, + { + "epoch": 1.3294037633189753, + "grad_norm": 0.1123046875, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 733 + }, + { + "epoch": 1.3312174110179098, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 1.3928, + "step": 734 + }, + { + "epoch": 1.3330310587168444, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.2592, + "step": 735 + }, + { + "epoch": 1.3348447064157787, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 1.3279, + "step": 736 + }, + { + "epoch": 1.3366583541147132, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.3588, + "step": 737 + }, + { + "epoch": 1.3384720018136478, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 738 + }, + { + "epoch": 1.3402856495125821, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.4035, + "step": 739 + }, + { + "epoch": 1.3420992972115167, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 740 + }, + { + "epoch": 1.3439129449104512, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 741 + }, + { + "epoch": 1.3457265926093855, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.3456, + "step": 742 + }, + { + "epoch": 1.34754024030832, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 743 + }, + { + "epoch": 1.3493538880072546, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 744 + }, + { + "epoch": 1.351167535706189, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 745 + }, + { + "epoch": 1.3529811834051235, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 746 + }, + { + "epoch": 1.354794831104058, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.2813, + "step": 747 + }, + { + "epoch": 1.3566084788029924, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 748 + }, + { + "epoch": 1.3566084788029924, + "eval_loss": 1.3036738634109497, + "eval_runtime": 152.5753, + "eval_samples_per_second": 6.554, + "eval_steps_per_second": 6.554, + "step": 748 + }, + { + "epoch": 1.3566084788029924, + "mmlu_eval_accuracy": 0.32009850022918107, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.0, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, + "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.5333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.4230769230769231, + "mmlu_eval_accuracy_human_aging": 0.4782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.5, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.45454545454545453, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.2857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.22580645161290322, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, + "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.3157894736842105, + "mmlu_loss": 1.7180121196865803, + "step": 748 + }, + { + "epoch": 1.358422126501927, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 749 + }, + { + "epoch": 1.3602357742008615, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 1.229, + "step": 750 + }, + { + "epoch": 1.3620494218997958, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 751 + }, + { + "epoch": 1.3638630695987304, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 752 + }, + { + "epoch": 1.365676717297665, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 753 + }, + { + "epoch": 1.3674903649965995, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 754 + }, + { + "epoch": 1.3693040126955338, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 755 + }, + { + "epoch": 1.3711176603944684, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.0875, + "step": 756 + }, + { + "epoch": 1.372931308093403, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 1.2933, + "step": 757 + }, + { + "epoch": 1.3747449557923375, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 758 + }, + { + "epoch": 1.3765586034912718, + "grad_norm": 0.09326171875, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 759 + }, + { + "epoch": 1.3783722511902063, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.2011, + "step": 760 + }, + { + "epoch": 1.380185898889141, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 761 + }, + { + "epoch": 1.3819995465880752, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.2604, + "step": 762 + }, + { + "epoch": 1.3838131942870098, + "grad_norm": 0.07763671875, + "learning_rate": 0.0002, + "loss": 1.2757, + "step": 763 + }, + { + "epoch": 1.3856268419859443, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 764 + }, + { + "epoch": 1.3874404896848787, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 765 + }, + { + "epoch": 1.3892541373838132, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 766 + }, + { + "epoch": 1.3910677850827478, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.1689, + "step": 767 + }, + { + "epoch": 1.392881432781682, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.0772, + "step": 768 + }, + { + "epoch": 1.3946950804806166, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002, + "loss": 1.2206, + "step": 769 + }, + { + "epoch": 1.3965087281795512, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 0.9819, + "step": 770 + }, + { + "epoch": 1.3983223758784855, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.1067, + "step": 771 + }, + { + "epoch": 1.40013602357742, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 1.0804, + "step": 772 + }, + { + "epoch": 1.4019496712763546, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 773 + }, + { + "epoch": 1.403763318975289, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 774 + }, + { + "epoch": 1.4055769666742235, + "grad_norm": 0.095703125, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 775 + }, + { + "epoch": 1.407390614373158, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.2029, + "step": 776 + }, + { + "epoch": 1.4092042620720924, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 777 + }, + { + "epoch": 1.411017909771027, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.2337, + "step": 778 + }, + { + "epoch": 1.4128315574699615, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 779 + }, + { + "epoch": 1.414645205168896, + "grad_norm": 0.103515625, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 780 + }, + { + "epoch": 1.4164588528678304, + "grad_norm": 0.107421875, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 781 + }, + { + "epoch": 1.418272500566765, + "grad_norm": 0.11328125, + "learning_rate": 0.0002, + "loss": 1.3578, + "step": 782 + }, + { + "epoch": 1.4200861482656995, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002, + "loss": 1.2495, + "step": 783 + }, + { + "epoch": 1.421899795964634, + "grad_norm": 0.107421875, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 784 + }, + { + "epoch": 1.4237134436635683, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 785 + }, + { + "epoch": 1.4255270913625029, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.399, + "step": 786 + }, + { + "epoch": 1.4273407390614374, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.2636, + "step": 787 + }, + { + "epoch": 1.4291543867603718, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 788 + }, + { + "epoch": 1.4309680344593063, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 789 + }, + { + "epoch": 1.4327816821582409, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.3387, + "step": 790 + }, + { + "epoch": 1.4345953298571752, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 791 + }, + { + "epoch": 1.4364089775561097, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.4434, + "step": 792 + }, + { + "epoch": 1.4382226252550443, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.3416, + "step": 793 + }, + { + "epoch": 1.4400362729539786, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.2225, + "step": 794 + }, + { + "epoch": 1.4418499206529132, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 795 + }, + { + "epoch": 1.4436635683518477, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.3475, + "step": 796 + }, + { + "epoch": 1.445477216050782, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 797 + }, + { + "epoch": 1.4472908637497166, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 798 + }, + { + "epoch": 1.4491045114486512, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 799 + }, + { + "epoch": 1.4509181591475855, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 1.1473, + "step": 800 + }, + { + "epoch": 1.45273180684652, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 1.0466, + "step": 801 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.9371, + "step": 802 + }, + { + "epoch": 1.456359102244389, + "grad_norm": 0.091796875, + "learning_rate": 0.0002, + "loss": 1.2877, + "step": 803 + }, + { + "epoch": 1.4581727499433235, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.1164, + "step": 804 + }, + { + "epoch": 1.459986397642258, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.1594, + "step": 805 + }, + { + "epoch": 1.4618000453411923, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 0.939, + "step": 806 + }, + { + "epoch": 1.463613693040127, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 1.1459, + "step": 807 + }, + { + "epoch": 1.4654273407390614, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.0453, + "step": 808 + }, + { + "epoch": 1.467240988437996, + "grad_norm": 0.07568359375, + "learning_rate": 0.0002, + "loss": 1.0304, + "step": 809 + }, + { + "epoch": 1.4690546361369303, + "grad_norm": 0.0810546875, + "learning_rate": 0.0002, + "loss": 1.0118, + "step": 810 + }, + { + "epoch": 1.4708682838358649, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 811 + }, + { + "epoch": 1.4726819315347994, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 812 + }, + { + "epoch": 1.474495579233734, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.1236, + "step": 813 + }, + { + "epoch": 1.4763092269326683, + "grad_norm": 0.09375, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 814 + }, + { + "epoch": 1.4781228746316029, + "grad_norm": 0.09375, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 815 + }, + { + "epoch": 1.4799365223305374, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.0646, + "step": 816 + }, + { + "epoch": 1.4817501700294717, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 817 + }, + { + "epoch": 1.4835638177284063, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 818 + }, + { + "epoch": 1.4853774654273408, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.1931, + "step": 819 + }, + { + "epoch": 1.4871911131262752, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 820 + }, + { + "epoch": 1.4890047608252097, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 821 + }, + { + "epoch": 1.4908184085241443, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 822 + }, + { + "epoch": 1.4926320562230786, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 823 + }, + { + "epoch": 1.4944457039220131, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.1839, + "step": 824 + }, + { + "epoch": 1.4962593516209477, + "grad_norm": 0.1015625, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 825 + }, + { + "epoch": 1.498072999319882, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.2995, + "step": 826 + }, + { + "epoch": 1.4998866470188166, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.175, + "step": 827 + }, + { + "epoch": 1.5017002947177511, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 828 + }, + { + "epoch": 1.5035139424166855, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 829 + }, + { + "epoch": 1.50532759011562, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 1.1595, + "step": 830 + }, + { + "epoch": 1.5071412378145546, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.2649, + "step": 831 + }, + { + "epoch": 1.5089548855134889, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.4093, + "step": 832 + }, + { + "epoch": 1.5107685332124234, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.2861, + "step": 833 + }, + { + "epoch": 1.512582180911358, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.2829, + "step": 834 + }, + { + "epoch": 1.5143958286102923, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.3734, + "step": 835 + }, + { + "epoch": 1.516209476309227, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 1.1474, + "step": 836 + }, + { + "epoch": 1.5180231240081614, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.4568, + "step": 837 + }, + { + "epoch": 1.5198367717070957, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 838 + }, + { + "epoch": 1.5216504194060305, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.4612, + "step": 839 + }, + { + "epoch": 1.5234640671049648, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 840 + }, + { + "epoch": 1.5252777148038992, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.2487, + "step": 841 + }, + { + "epoch": 1.527091362502834, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.4567, + "step": 842 + }, + { + "epoch": 1.5289050102017683, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.3902, + "step": 843 + }, + { + "epoch": 1.5307186579007028, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 844 + }, + { + "epoch": 1.5325323055996374, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.4396, + "step": 845 + }, + { + "epoch": 1.5343459532985717, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.4198, + "step": 846 + }, + { + "epoch": 1.5361596009975063, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 847 + }, + { + "epoch": 1.5379732486964408, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 848 + }, + { + "epoch": 1.5397868963953751, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.2018, + "step": 849 + }, + { + "epoch": 1.5416005440943097, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 850 + }, + { + "epoch": 1.5434141917932442, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 1.154, + "step": 851 + }, + { + "epoch": 1.5452278394921786, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.023, + "step": 852 + }, + { + "epoch": 1.5470414871911131, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.0485, + "step": 853 + }, + { + "epoch": 1.5488551348900477, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 854 + }, + { + "epoch": 1.550668782588982, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 855 + }, + { + "epoch": 1.5524824302879165, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.2353, + "step": 856 + }, + { + "epoch": 1.554296077986851, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 857 + }, + { + "epoch": 1.5561097256857854, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 858 + }, + { + "epoch": 1.55792337338472, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 859 + }, + { + "epoch": 1.5597370210836545, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 860 + }, + { + "epoch": 1.5615506687825889, + "grad_norm": 0.08349609375, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 861 + }, + { + "epoch": 1.5633643164815236, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.0996, + "step": 862 + }, + { + "epoch": 1.565177964180458, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.1193, + "step": 863 + }, + { + "epoch": 1.5669916118793923, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.1666, + "step": 864 + }, + { + "epoch": 1.568805259578327, + "grad_norm": 0.0888671875, + "learning_rate": 0.0002, + "loss": 1.2014, + "step": 865 + }, + { + "epoch": 1.5706189072772614, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 866 + }, + { + "epoch": 1.5724325549761957, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 867 + }, + { + "epoch": 1.5742462026751305, + "grad_norm": 0.0908203125, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 868 + }, + { + "epoch": 1.5760598503740648, + "grad_norm": 0.095703125, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 869 + }, + { + "epoch": 1.5778734980729994, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 870 + }, + { + "epoch": 1.579687145771934, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 871 + }, + { + "epoch": 1.5815007934708682, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 872 + }, + { + "epoch": 1.5833144411698028, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 873 + }, + { + "epoch": 1.5851280888687374, + "grad_norm": 0.09326171875, + "learning_rate": 0.0002, + "loss": 1.119, + "step": 874 + }, + { + "epoch": 1.5869417365676717, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 875 + }, + { + "epoch": 1.5887553842666062, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 876 + }, + { + "epoch": 1.5905690319655408, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 877 + }, + { + "epoch": 1.592382679664475, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 878 + }, + { + "epoch": 1.5941963273634097, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.2722, + "step": 879 + }, + { + "epoch": 1.5960099750623442, + "grad_norm": 0.1103515625, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 880 + }, + { + "epoch": 1.5978236227612785, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002, + "loss": 1.2018, + "step": 881 + }, + { + "epoch": 1.599637270460213, + "grad_norm": 0.1123046875, + "learning_rate": 0.0002, + "loss": 1.3332, + "step": 882 + }, + { + "epoch": 1.6014509181591476, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 883 + }, + { + "epoch": 1.603264565858082, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.4709, + "step": 884 + }, + { + "epoch": 1.6050782135570165, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.0779, + "step": 885 + }, + { + "epoch": 1.606891861255951, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.2871, + "step": 886 + }, + { + "epoch": 1.6087055089548854, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 887 + }, + { + "epoch": 1.61051915665382, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 888 + }, + { + "epoch": 1.6123328043527545, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 889 + }, + { + "epoch": 1.6141464520516888, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 890 + }, + { + "epoch": 1.6159600997506236, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 891 + }, + { + "epoch": 1.617773747449558, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 892 + }, + { + "epoch": 1.6195873951484923, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 893 + }, + { + "epoch": 1.621401042847427, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.4234, + "step": 894 + }, + { + "epoch": 1.6232146905463614, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 895 + }, + { + "epoch": 1.6250283382452957, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.4376, + "step": 896 + }, + { + "epoch": 1.6268419859442305, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.3169, + "step": 897 + }, + { + "epoch": 1.6286556336431648, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 898 + }, + { + "epoch": 1.6304692813420993, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 899 + }, + { + "epoch": 1.632282929041034, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 1.1851, + "step": 900 + }, + { + "epoch": 1.6340965767399682, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 1.2691, + "step": 901 + }, + { + "epoch": 1.6359102244389028, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 902 + }, + { + "epoch": 1.6377238721378373, + "grad_norm": 0.08154296875, + "learning_rate": 0.0002, + "loss": 0.9717, + "step": 903 + }, + { + "epoch": 1.6395375198367717, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.1771, + "step": 904 + }, + { + "epoch": 1.6413511675357062, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 905 + }, + { + "epoch": 1.6431648152346408, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 906 + }, + { + "epoch": 1.644978462933575, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 907 + }, + { + "epoch": 1.6467921106325096, + "grad_norm": 0.083984375, + "learning_rate": 0.0002, + "loss": 1.126, + "step": 908 + }, + { + "epoch": 1.6486057583314442, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.0288, + "step": 909 + }, + { + "epoch": 1.6504194060303785, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 910 + }, + { + "epoch": 1.652233053729313, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.0263, + "step": 911 + }, + { + "epoch": 1.6540467014282476, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 912 + }, + { + "epoch": 1.655860349127182, + "grad_norm": 0.09375, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 913 + }, + { + "epoch": 1.6576739968261165, + "grad_norm": 0.091796875, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 914 + }, + { + "epoch": 1.659487644525051, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 915 + }, + { + "epoch": 1.6613012922239854, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.175, + "step": 916 + }, + { + "epoch": 1.66311493992292, + "grad_norm": 0.09521484375, + "learning_rate": 0.0002, + "loss": 0.9709, + "step": 917 + }, + { + "epoch": 1.6649285876218545, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 918 + }, + { + "epoch": 1.6667422353207888, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 919 + }, + { + "epoch": 1.6685558830197236, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 0.9995, + "step": 920 + }, + { + "epoch": 1.670369530718658, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 921 + }, + { + "epoch": 1.6721831784175922, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 922 + }, + { + "epoch": 1.673996826116527, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 923 + }, + { + "epoch": 1.6758104738154613, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 924 + }, + { + "epoch": 1.6776241215143959, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 925 + }, + { + "epoch": 1.6794377692133304, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.1545, + "step": 926 + }, + { + "epoch": 1.6812514169122648, + "grad_norm": 0.095703125, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 927 + }, + { + "epoch": 1.6830650646111993, + "grad_norm": 0.1025390625, + "learning_rate": 0.0002, + "loss": 1.0404, + "step": 928 + }, + { + "epoch": 1.6848787123101339, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 929 + }, + { + "epoch": 1.6866923600090682, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 930 + }, + { + "epoch": 1.6885060077080027, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.2377, + "step": 931 + }, + { + "epoch": 1.6903196554069373, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002, + "loss": 1.1489, + "step": 932 + }, + { + "epoch": 1.6921333031058716, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 933 + }, + { + "epoch": 1.6939469508048062, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002, + "loss": 1.2303, + "step": 934 + }, + { + "epoch": 1.6957605985037407, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 935 + }, + { + "epoch": 1.6957605985037407, + "eval_loss": 1.3003318309783936, + "eval_runtime": 152.7047, + "eval_samples_per_second": 6.549, + "eval_steps_per_second": 6.549, + "step": 935 + }, + { + "epoch": 1.6957605985037407, + "mmlu_eval_accuracy": 0.3463494491938701, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.0, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, + "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.5166666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.391304347826087, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, + "mmlu_eval_accuracy_human_aging": 0.391304347826087, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.2727272727272727, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.48484848484848486, + "mmlu_eval_accuracy_philosophy": 0.4411764705882353, + "mmlu_eval_accuracy_prehistory": 0.2857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, + "mmlu_eval_accuracy_professional_law": 0.27058823529411763, + "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, + "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.3157894736842105, + "mmlu_loss": 1.9676394225879872, + "step": 935 + }, + { + "epoch": 1.697574246202675, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.4804, + "step": 936 + }, + { + "epoch": 1.6993878939016096, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 937 + }, + { + "epoch": 1.7012015416005442, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 938 + }, + { + "epoch": 1.7030151892994785, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 939 + }, + { + "epoch": 1.704828836998413, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.4286, + "step": 940 + }, + { + "epoch": 1.7066424846973476, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.3309, + "step": 941 + }, + { + "epoch": 1.708456132396282, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.4955, + "step": 942 + }, + { + "epoch": 1.7102697800952165, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 943 + }, + { + "epoch": 1.712083427794151, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.5625, + "step": 944 + }, + { + "epoch": 1.7138970754930853, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 945 + }, + { + "epoch": 1.7157107231920201, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 946 + }, + { + "epoch": 1.7175243708909544, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 1.4719, + "step": 947 + }, + { + "epoch": 1.7193380185898888, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.3399, + "step": 948 + }, + { + "epoch": 1.7211516662888235, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 949 + }, + { + "epoch": 1.7229653139877579, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 1.1543, + "step": 950 + }, + { + "epoch": 1.7247789616866922, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 951 + }, + { + "epoch": 1.726592609385627, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 952 + }, + { + "epoch": 1.7284062570845613, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.2991, + "step": 953 + }, + { + "epoch": 1.7302199047834959, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 0.9756, + "step": 954 + }, + { + "epoch": 1.7320335524824304, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 955 + }, + { + "epoch": 1.7338472001813647, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 1.0084, + "step": 956 + }, + { + "epoch": 1.7356608478802993, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.0283, + "step": 957 + }, + { + "epoch": 1.7374744955792338, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 958 + }, + { + "epoch": 1.7392881432781682, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 0.9652, + "step": 959 + }, + { + "epoch": 1.7411017909771027, + "grad_norm": 0.087890625, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 960 + }, + { + "epoch": 1.7429154386760373, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 961 + }, + { + "epoch": 1.7447290863749716, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.0568, + "step": 962 + }, + { + "epoch": 1.7465427340739061, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.0658, + "step": 963 + }, + { + "epoch": 1.7483563817728407, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.0462, + "step": 964 + }, + { + "epoch": 1.750170029471775, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.0673, + "step": 965 + }, + { + "epoch": 1.7519836771707096, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 0.9063, + "step": 966 + }, + { + "epoch": 1.7537973248696441, + "grad_norm": 0.10009765625, + "learning_rate": 0.0002, + "loss": 1.1331, + "step": 967 + }, + { + "epoch": 1.7556109725685785, + "grad_norm": 0.08837890625, + "learning_rate": 0.0002, + "loss": 0.963, + "step": 968 + }, + { + "epoch": 1.757424620267513, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 969 + }, + { + "epoch": 1.7592382679664476, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.381, + "step": 970 + }, + { + "epoch": 1.7610519156653819, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.1115, + "step": 971 + }, + { + "epoch": 1.7628655633643164, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 972 + }, + { + "epoch": 1.764679211063251, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.0428, + "step": 973 + }, + { + "epoch": 1.7664928587621853, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002, + "loss": 1.2929, + "step": 974 + }, + { + "epoch": 1.76830650646112, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.1506, + "step": 975 + }, + { + "epoch": 1.7701201541600544, + "grad_norm": 0.10888671875, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 976 + }, + { + "epoch": 1.7719338018589887, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 977 + }, + { + "epoch": 1.7737474495579235, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 978 + }, + { + "epoch": 1.7755610972568578, + "grad_norm": 0.107421875, + "learning_rate": 0.0002, + "loss": 1.2052, + "step": 979 + }, + { + "epoch": 1.7773747449557922, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 980 + }, + { + "epoch": 1.779188392654727, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 981 + }, + { + "epoch": 1.7810020403536613, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 1.3721, + "step": 982 + }, + { + "epoch": 1.7828156880525958, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 983 + }, + { + "epoch": 1.7846293357515304, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 984 + }, + { + "epoch": 1.7864429834504647, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 985 + }, + { + "epoch": 1.7882566311493993, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 986 + }, + { + "epoch": 1.7900702788483338, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.3009, + "step": 987 + }, + { + "epoch": 1.7918839265472681, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.2828, + "step": 988 + }, + { + "epoch": 1.7936975742462027, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.3255, + "step": 989 + }, + { + "epoch": 1.7955112219451372, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.2644, + "step": 990 + }, + { + "epoch": 1.7973248696440716, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.2491, + "step": 991 + }, + { + "epoch": 1.7991385173430061, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 992 + }, + { + "epoch": 1.8009521650419407, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 993 + }, + { + "epoch": 1.802765812740875, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 994 + }, + { + "epoch": 1.8045794604398095, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 995 + }, + { + "epoch": 1.806393108138744, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.3654, + "step": 996 + }, + { + "epoch": 1.8082067558376784, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.4264, + "step": 997 + }, + { + "epoch": 1.810020403536613, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 1.1053, + "step": 998 + }, + { + "epoch": 1.8118340512355475, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 999 + }, + { + "epoch": 1.8136476989344819, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 1.1548, + "step": 1000 + }, + { + "epoch": 1.8154613466334164, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 1001 + }, + { + "epoch": 1.817274994332351, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 1002 + }, + { + "epoch": 1.8190886420312853, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 0.9535, + "step": 1003 + }, + { + "epoch": 1.82090228973022, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.1947, + "step": 1004 + }, + { + "epoch": 1.8227159374291544, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 1005 + }, + { + "epoch": 1.8245295851280887, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1006 + }, + { + "epoch": 1.8263432328270235, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.0795, + "step": 1007 + }, + { + "epoch": 1.8281568805259578, + "grad_norm": 0.08154296875, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 1008 + }, + { + "epoch": 1.8299705282248924, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 1009 + }, + { + "epoch": 1.831784175923827, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 1010 + }, + { + "epoch": 1.8335978236227612, + "grad_norm": 0.0849609375, + "learning_rate": 0.0002, + "loss": 1.1597, + "step": 1011 + }, + { + "epoch": 1.8354114713216958, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.0907, + "step": 1012 + }, + { + "epoch": 1.8372251190206303, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 1013 + }, + { + "epoch": 1.8390387667195647, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 1014 + }, + { + "epoch": 1.8408524144184992, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.1045, + "step": 1015 + }, + { + "epoch": 1.8426660621174338, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1016 + }, + { + "epoch": 1.844479709816368, + "grad_norm": 0.09423828125, + "learning_rate": 0.0002, + "loss": 1.2168, + "step": 1017 + }, + { + "epoch": 1.8462933575153027, + "grad_norm": 0.103515625, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1018 + }, + { + "epoch": 1.8481070052142372, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.2791, + "step": 1019 + }, + { + "epoch": 1.8499206529131715, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 1020 + }, + { + "epoch": 1.851734300612106, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.0449, + "step": 1021 + }, + { + "epoch": 1.8535479483110406, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 1022 + }, + { + "epoch": 1.855361596009975, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002, + "loss": 1.4557, + "step": 1023 + }, + { + "epoch": 1.8571752437089095, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 1024 + }, + { + "epoch": 1.858988891407844, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 1025 + }, + { + "epoch": 1.8608025391067784, + "grad_norm": 0.1015625, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 1026 + }, + { + "epoch": 1.862616186805713, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 1.2035, + "step": 1027 + }, + { + "epoch": 1.8644298345046475, + "grad_norm": 0.09814453125, + "learning_rate": 0.0002, + "loss": 1.0774, + "step": 1028 + }, + { + "epoch": 1.8662434822035818, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.2227, + "step": 1029 + }, + { + "epoch": 1.8680571299025166, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002, + "loss": 1.2881, + "step": 1030 + }, + { + "epoch": 1.869870777601451, + "grad_norm": 0.10546875, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 1031 + }, + { + "epoch": 1.8716844253003853, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 1032 + }, + { + "epoch": 1.87349807299932, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 1033 + }, + { + "epoch": 1.8753117206982544, + "grad_norm": 0.11328125, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 1034 + }, + { + "epoch": 1.8771253683971887, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.2136, + "step": 1035 + }, + { + "epoch": 1.8789390160961235, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.1565, + "step": 1036 + }, + { + "epoch": 1.8807526637950578, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 1037 + }, + { + "epoch": 1.8825663114939923, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 1038 + }, + { + "epoch": 1.884379959192927, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.2671, + "step": 1039 + }, + { + "epoch": 1.8861936068918612, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.2868, + "step": 1040 + }, + { + "epoch": 1.8880072545907958, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 1041 + }, + { + "epoch": 1.8898209022897303, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.3375, + "step": 1042 + }, + { + "epoch": 1.8916345499886646, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.2135, + "step": 1043 + }, + { + "epoch": 1.8934481976875992, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 1044 + }, + { + "epoch": 1.8952618453865338, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.4675, + "step": 1045 + }, + { + "epoch": 1.897075493085468, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.2738, + "step": 1046 + }, + { + "epoch": 1.8988891407844026, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 1047 + }, + { + "epoch": 1.9007027884833372, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.2947, + "step": 1048 + }, + { + "epoch": 1.9025164361822715, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 1.3452, + "step": 1049 + }, + { + "epoch": 1.904330083881206, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 1.1973, + "step": 1050 + }, + { + "epoch": 1.9061437315801406, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 1051 + }, + { + "epoch": 1.907957379279075, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1052 + }, + { + "epoch": 1.9097710269780095, + "grad_norm": 0.0859375, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 1053 + }, + { + "epoch": 1.911584674676944, + "grad_norm": 0.08203125, + "learning_rate": 0.0002, + "loss": 1.0812, + "step": 1054 + }, + { + "epoch": 1.9133983223758784, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 1055 + }, + { + "epoch": 1.915211970074813, + "grad_norm": 0.080078125, + "learning_rate": 0.0002, + "loss": 1.0647, + "step": 1056 + }, + { + "epoch": 1.9170256177737475, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 0.9838, + "step": 1057 + }, + { + "epoch": 1.9188392654726818, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002, + "loss": 1.087, + "step": 1058 + }, + { + "epoch": 1.9206529131716166, + "grad_norm": 0.08740234375, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 1059 + }, + { + "epoch": 1.922466560870551, + "grad_norm": 0.08544921875, + "learning_rate": 0.0002, + "loss": 0.9758, + "step": 1060 + }, + { + "epoch": 1.9242802085694852, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.1318, + "step": 1061 + }, + { + "epoch": 1.92609385626842, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.0723, + "step": 1062 + }, + { + "epoch": 1.9279075039673543, + "grad_norm": 0.0869140625, + "learning_rate": 0.0002, + "loss": 1.3201, + "step": 1063 + }, + { + "epoch": 1.9297211516662887, + "grad_norm": 0.09130859375, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 1064 + }, + { + "epoch": 1.9315347993652234, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.1885, + "step": 1065 + }, + { + "epoch": 1.9333484470641578, + "grad_norm": 0.09326171875, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 1066 + }, + { + "epoch": 1.9351620947630923, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.1693, + "step": 1067 + }, + { + "epoch": 1.9369757424620269, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 1.0877, + "step": 1068 + }, + { + "epoch": 1.9387893901609612, + "grad_norm": 0.08984375, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 1069 + }, + { + "epoch": 1.9406030378598957, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1070 + }, + { + "epoch": 1.9424166855588303, + "grad_norm": 0.091796875, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1071 + }, + { + "epoch": 1.9442303332577646, + "grad_norm": 0.0927734375, + "learning_rate": 0.0002, + "loss": 1.0211, + "step": 1072 + }, + { + "epoch": 1.9460439809566992, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 1073 + }, + { + "epoch": 1.9478576286556337, + "grad_norm": 0.1015625, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 1074 + }, + { + "epoch": 1.949671276354568, + "grad_norm": 0.10498046875, + "learning_rate": 0.0002, + "loss": 1.1, + "step": 1075 + }, + { + "epoch": 1.9514849240535026, + "grad_norm": 0.103515625, + "learning_rate": 0.0002, + "loss": 1.09, + "step": 1076 + }, + { + "epoch": 1.9532985717524372, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.271, + "step": 1077 + }, + { + "epoch": 1.9551122194513715, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002, + "loss": 1.2393, + "step": 1078 + }, + { + "epoch": 1.956925867150306, + "grad_norm": 0.10205078125, + "learning_rate": 0.0002, + "loss": 1.074, + "step": 1079 + }, + { + "epoch": 1.9587395148492406, + "grad_norm": 0.103515625, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 1080 + }, + { + "epoch": 1.960553162548175, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 1.4835, + "step": 1081 + }, + { + "epoch": 1.9623668102471095, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 1082 + }, + { + "epoch": 1.964180457946044, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.2204, + "step": 1083 + }, + { + "epoch": 1.9659941056449783, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 1084 + }, + { + "epoch": 1.967807753343913, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 1085 + }, + { + "epoch": 1.9696214010428474, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 1086 + }, + { + "epoch": 1.9714350487417818, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 1087 + }, + { + "epoch": 1.9732486964407165, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 1088 + }, + { + "epoch": 1.9750623441396509, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.3378, + "step": 1089 + }, + { + "epoch": 1.9768759918385852, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 1090 + }, + { + "epoch": 1.97868963953752, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 1091 + }, + { + "epoch": 1.9805032872364543, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 1092 + }, + { + "epoch": 1.9823169349353889, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 1093 + }, + { + "epoch": 1.9841305826343234, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.2161, + "step": 1094 + }, + { + "epoch": 1.9859442303332577, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 1095 + }, + { + "epoch": 1.9877578780321923, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.4681, + "step": 1096 + }, + { + "epoch": 1.9895715257311268, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 1097 + }, + { + "epoch": 1.9913851734300612, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 1098 + }, + { + "epoch": 1.9931988211289957, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 1.1854, + "step": 1099 + }, + { + "epoch": 1.9950124688279303, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 1.3777, + "step": 1100 + }, + { + "epoch": 1.9968261165268646, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 1.1697, + "step": 1101 + }, + { + "epoch": 1.9986397642257991, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.9272, + "step": 1102 + }, + { + "epoch": 2.0004534119247337, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 1103 + }, + { + "epoch": 2.002267059623668, + "grad_norm": 0.08056640625, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 1104 + }, + { + "epoch": 2.004080707322603, + "grad_norm": 0.076171875, + "learning_rate": 0.0002, + "loss": 1.0832, + "step": 1105 + }, + { + "epoch": 2.005894355021537, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002, + "loss": 1.1672, + "step": 1106 + }, + { + "epoch": 2.0077080027204715, + "grad_norm": 0.08837890625, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 1107 + }, + { + "epoch": 2.0095216504194062, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 1108 + }, + { + "epoch": 2.0113352981183406, + "grad_norm": 0.08154296875, + "learning_rate": 0.0002, + "loss": 0.9595, + "step": 1109 + }, + { + "epoch": 2.013148945817275, + "grad_norm": 0.09375, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 1110 + }, + { + "epoch": 2.0149625935162097, + "grad_norm": 0.091796875, + "learning_rate": 0.0002, + "loss": 1.0043, + "step": 1111 + }, + { + "epoch": 2.016776241215144, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002, + "loss": 0.9566, + "step": 1112 + }, + { + "epoch": 2.0185898889140783, + "grad_norm": 0.08935546875, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1113 + }, + { + "epoch": 2.020403536613013, + "grad_norm": 0.09228515625, + "learning_rate": 0.0002, + "loss": 1.1907, + "step": 1114 + }, + { + "epoch": 2.0222171843119474, + "grad_norm": 0.08251953125, + "learning_rate": 0.0002, + "loss": 1.0665, + "step": 1115 + }, + { + "epoch": 2.0240308320108817, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 1116 + }, + { + "epoch": 2.0258444797098165, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.1662, + "step": 1117 + }, + { + "epoch": 2.027658127408751, + "grad_norm": 0.09716796875, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 1118 + }, + { + "epoch": 2.029471775107685, + "grad_norm": 0.0986328125, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 1119 + }, + { + "epoch": 2.03128542280662, + "grad_norm": 0.09033203125, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1120 + }, + { + "epoch": 2.0330990705055543, + "grad_norm": 0.0947265625, + "learning_rate": 0.0002, + "loss": 1.0457, + "step": 1121 + }, + { + "epoch": 2.0349127182044886, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 0.9507, + "step": 1122 + }, + { + "epoch": 2.0349127182044886, + "eval_loss": 1.3159377574920654, + "eval_runtime": 152.8173, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 6.544, + "step": 1122 + }, + { + "epoch": 2.0349127182044886, + "mmlu_eval_accuracy": 0.34121329981931386, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, + "mmlu_eval_accuracy_high_school_microeconomics": 0.11538461538461539, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.5166666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, + "mmlu_eval_accuracy_management": 0.2727272727272727, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.45454545454545453, + "mmlu_eval_accuracy_philosophy": 0.4411764705882353, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, + "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.3684210526315789, + "mmlu_loss": 1.9105861847265182, + "step": 1122 + }, + { + "epoch": 2.0367263659034234, + "grad_norm": 0.0966796875, + "learning_rate": 0.0002, + "loss": 0.9072, + "step": 1123 + }, + { + "epoch": 2.0385400136023577, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 1124 + }, + { + "epoch": 2.040353661301292, + "grad_norm": 0.1005859375, + "learning_rate": 0.0002, + "loss": 1.0909, + "step": 1125 + }, + { + "epoch": 2.042167309000227, + "grad_norm": 0.11279296875, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1126 + }, + { + "epoch": 2.043980956699161, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002, + "loss": 1.0355, + "step": 1127 + }, + { + "epoch": 2.0457946043980955, + "grad_norm": 0.10595703125, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1128 + }, + { + "epoch": 2.0476082520970302, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002, + "loss": 1.1055, + "step": 1129 + }, + { + "epoch": 2.0494218997959646, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1130 + }, + { + "epoch": 2.0512355474948993, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1131 + }, + { + "epoch": 2.0530491951938337, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1132 + }, + { + "epoch": 2.054862842892768, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 0.879, + "step": 1133 + }, + { + "epoch": 2.0566764905917028, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.9238, + "step": 1134 + }, + { + "epoch": 2.058490138290637, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.1153, + "step": 1135 + }, + { + "epoch": 2.0603037859895714, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 1136 + }, + { + "epoch": 2.062117433688506, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.3619, + "step": 1137 + }, + { + "epoch": 2.0639310813874405, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.195, + "step": 1138 + }, + { + "epoch": 2.065744729086375, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.0982, + "step": 1139 + }, + { + "epoch": 2.0675583767853096, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1140 + }, + { + "epoch": 2.069372024484244, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 1141 + }, + { + "epoch": 2.0711856721831783, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.0071, + "step": 1142 + }, + { + "epoch": 2.072999319882113, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.1693, + "step": 1143 + }, + { + "epoch": 2.0748129675810474, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.0641, + "step": 1144 + }, + { + "epoch": 2.0766266152799817, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 1145 + }, + { + "epoch": 2.0784402629789165, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.3444, + "step": 1146 + }, + { + "epoch": 2.080253910677851, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 1.1044, + "step": 1147 + }, + { + "epoch": 2.082067558376785, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 1.0598, + "step": 1148 + }, + { + "epoch": 2.08388120607572, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 1149 + }, + { + "epoch": 2.0856948537746542, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 1150 + }, + { + "epoch": 2.0875085014735886, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 1151 + }, + { + "epoch": 2.0893221491725233, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 1152 + }, + { + "epoch": 2.0911357968714577, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 1153 + }, + { + "epoch": 2.092949444570392, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.2886, + "step": 1154 + }, + { + "epoch": 2.0947630922693268, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 1155 + }, + { + "epoch": 2.096576739968261, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.0156, + "step": 1156 + }, + { + "epoch": 2.0983903876671954, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 1157 + }, + { + "epoch": 2.10020403536613, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002, + "loss": 1.0509, + "step": 1158 + }, + { + "epoch": 2.1020176830650645, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 0.9727, + "step": 1159 + }, + { + "epoch": 2.1038313307639993, + "grad_norm": 0.10302734375, + "learning_rate": 0.0002, + "loss": 1.0211, + "step": 1160 + }, + { + "epoch": 2.1056449784629336, + "grad_norm": 0.10400390625, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 1161 + }, + { + "epoch": 2.107458626161868, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002, + "loss": 0.9344, + "step": 1162 + }, + { + "epoch": 2.1092722738608027, + "grad_norm": 0.109375, + "learning_rate": 0.0002, + "loss": 1.1165, + "step": 1163 + }, + { + "epoch": 2.111085921559737, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.1318, + "step": 1164 + }, + { + "epoch": 2.1128995692586714, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 1165 + }, + { + "epoch": 2.114713216957606, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 1.1694, + "step": 1166 + }, + { + "epoch": 2.1165268646565405, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.0552, + "step": 1167 + }, + { + "epoch": 2.118340512355475, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 1168 + }, + { + "epoch": 2.1201541600544096, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1169 + }, + { + "epoch": 2.121967807753344, + "grad_norm": 0.11328125, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1170 + }, + { + "epoch": 2.1237814554522783, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 1171 + }, + { + "epoch": 2.125595103151213, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002, + "loss": 0.9912, + "step": 1172 + }, + { + "epoch": 2.1274087508501474, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 1173 + }, + { + "epoch": 2.1292223985490817, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.0758, + "step": 1174 + }, + { + "epoch": 2.1310360462480165, + "grad_norm": 0.119140625, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 1175 + }, + { + "epoch": 2.132849693946951, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.3588, + "step": 1176 + }, + { + "epoch": 2.134663341645885, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 1177 + }, + { + "epoch": 2.13647698934482, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 1178 + }, + { + "epoch": 2.138290637043754, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 1179 + }, + { + "epoch": 2.1401042847426885, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 1180 + }, + { + "epoch": 2.1419179324416233, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 1181 + }, + { + "epoch": 2.1437315801405576, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.2082, + "step": 1182 + }, + { + "epoch": 2.145545227839492, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.16, + "step": 1183 + }, + { + "epoch": 2.1473588755384267, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 1184 + }, + { + "epoch": 2.149172523237361, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.1868, + "step": 1185 + }, + { + "epoch": 2.150986170936296, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.0637, + "step": 1186 + }, + { + "epoch": 2.15279981863523, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 1187 + }, + { + "epoch": 2.1546134663341645, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 1188 + }, + { + "epoch": 2.1564271140330993, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.2118, + "step": 1189 + }, + { + "epoch": 2.1582407617320336, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.1517, + "step": 1190 + }, + { + "epoch": 2.160054409430968, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.0659, + "step": 1191 + }, + { + "epoch": 2.1618680571299027, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.1776, + "step": 1192 + }, + { + "epoch": 2.163681704828837, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.0466, + "step": 1193 + }, + { + "epoch": 2.1654953525277714, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 1194 + }, + { + "epoch": 2.167309000226706, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 1.2162, + "step": 1195 + }, + { + "epoch": 2.1691226479256405, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 1196 + }, + { + "epoch": 2.170936295624575, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.0152, + "step": 1197 + }, + { + "epoch": 2.1727499433235096, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 1198 + }, + { + "epoch": 2.174563591022444, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.9876, + "step": 1199 + }, + { + "epoch": 2.1763772387213782, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.8731, + "step": 1200 + }, + { + "epoch": 2.178190886420313, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 1201 + }, + { + "epoch": 2.1800045341192473, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 1202 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 1203 + }, + { + "epoch": 2.1836318295171164, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 1204 + }, + { + "epoch": 2.1854454772160508, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 0.9352, + "step": 1205 + }, + { + "epoch": 2.187259124914985, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 1206 + }, + { + "epoch": 2.18907277261392, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1207 + }, + { + "epoch": 2.190886420312854, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 1208 + }, + { + "epoch": 2.1927000680117885, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002, + "loss": 1.0886, + "step": 1209 + }, + { + "epoch": 2.1945137157107233, + "grad_norm": 0.10888671875, + "learning_rate": 0.0002, + "loss": 1.0697, + "step": 1210 + }, + { + "epoch": 2.1963273634096576, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002, + "loss": 0.9965, + "step": 1211 + }, + { + "epoch": 2.198141011108592, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.2483, + "step": 1212 + }, + { + "epoch": 2.1999546588075267, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 1213 + }, + { + "epoch": 2.201768306506461, + "grad_norm": 0.10693359375, + "learning_rate": 0.0002, + "loss": 0.9292, + "step": 1214 + }, + { + "epoch": 2.2035819542053954, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 1215 + }, + { + "epoch": 2.20539560190433, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 1.028, + "step": 1216 + }, + { + "epoch": 2.2072092496032645, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1217 + }, + { + "epoch": 2.2090228973021993, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 1218 + }, + { + "epoch": 2.2108365450011336, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 1219 + }, + { + "epoch": 2.212650192700068, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 0.939, + "step": 1220 + }, + { + "epoch": 2.2144638403990027, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.9279, + "step": 1221 + }, + { + "epoch": 2.216277488097937, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 1222 + }, + { + "epoch": 2.2180911357968713, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1223 + }, + { + "epoch": 2.219904783495806, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.0664, + "step": 1224 + }, + { + "epoch": 2.2217184311947404, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.1497, + "step": 1225 + }, + { + "epoch": 2.2235320788936748, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.9337, + "step": 1226 + }, + { + "epoch": 2.2253457265926095, + "grad_norm": 0.119140625, + "learning_rate": 0.0002, + "loss": 1.0148, + "step": 1227 + }, + { + "epoch": 2.227159374291544, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 1.0999, + "step": 1228 + }, + { + "epoch": 2.228973021990478, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1229 + }, + { + "epoch": 2.230786669689413, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1230 + }, + { + "epoch": 2.2326003173883473, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.3034, + "step": 1231 + }, + { + "epoch": 2.2344139650872816, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 1232 + }, + { + "epoch": 2.2362276127862164, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.132, + "step": 1233 + }, + { + "epoch": 2.2380412604851507, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 1234 + }, + { + "epoch": 2.239854908184085, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 1235 + }, + { + "epoch": 2.24166855588302, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 1236 + }, + { + "epoch": 2.243482203581954, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 1237 + }, + { + "epoch": 2.2452958512808885, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 1238 + }, + { + "epoch": 2.2471094989798233, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.0348, + "step": 1239 + }, + { + "epoch": 2.2489231466787576, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 1240 + }, + { + "epoch": 2.2507367943776924, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 1241 + }, + { + "epoch": 2.2525504420766267, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.2482, + "step": 1242 + }, + { + "epoch": 2.254364089775561, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 1.0592, + "step": 1243 + }, + { + "epoch": 2.256177737474496, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.0438, + "step": 1244 + }, + { + "epoch": 2.25799138517343, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.1489, + "step": 1245 + }, + { + "epoch": 2.2598050328723644, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 1246 + }, + { + "epoch": 2.2616186805712992, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1247 + }, + { + "epoch": 2.2634323282702336, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 1248 + }, + { + "epoch": 2.265245975969168, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 1249 + }, + { + "epoch": 2.2670596236681027, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 1250 + }, + { + "epoch": 2.268873271367037, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 1251 + }, + { + "epoch": 2.2706869190659713, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 1252 + }, + { + "epoch": 2.272500566764906, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1253 + }, + { + "epoch": 2.2743142144638404, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.0713, + "step": 1254 + }, + { + "epoch": 2.2761278621627747, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1255 + }, + { + "epoch": 2.2779415098617095, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 1256 + }, + { + "epoch": 2.279755157560644, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 0.9883, + "step": 1257 + }, + { + "epoch": 2.281568805259578, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 1258 + }, + { + "epoch": 2.283382452958513, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002, + "loss": 1.0419, + "step": 1259 + }, + { + "epoch": 2.2851961006574473, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.0917, + "step": 1260 + }, + { + "epoch": 2.2870097483563816, + "grad_norm": 0.1103515625, + "learning_rate": 0.0002, + "loss": 1.0908, + "step": 1261 + }, + { + "epoch": 2.2888233960553164, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 1.252, + "step": 1262 + }, + { + "epoch": 2.2906370437542507, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 1263 + }, + { + "epoch": 2.292450691453185, + "grad_norm": 0.1123046875, + "learning_rate": 0.0002, + "loss": 1.0962, + "step": 1264 + }, + { + "epoch": 2.29426433915212, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1265 + }, + { + "epoch": 2.296077986851054, + "grad_norm": 0.119140625, + "learning_rate": 0.0002, + "loss": 0.9697, + "step": 1266 + }, + { + "epoch": 2.2978916345499885, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 0.9563, + "step": 1267 + }, + { + "epoch": 2.2997052822489232, + "grad_norm": 0.11279296875, + "learning_rate": 0.0002, + "loss": 0.9728, + "step": 1268 + }, + { + "epoch": 2.3015189299478576, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002, + "loss": 0.9444, + "step": 1269 + }, + { + "epoch": 2.303332577646792, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 1.0222, + "step": 1270 + }, + { + "epoch": 2.3051462253457267, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 0.8754, + "step": 1271 + }, + { + "epoch": 2.306959873044661, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1272 + }, + { + "epoch": 2.3087735207435953, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 1.113, + "step": 1273 + }, + { + "epoch": 2.31058716844253, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 1.037, + "step": 1274 + }, + { + "epoch": 2.3124008161414644, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 1275 + }, + { + "epoch": 2.314214463840399, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 1276 + }, + { + "epoch": 2.3160281115393335, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 1277 + }, + { + "epoch": 2.317841759238268, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.2321, + "step": 1278 + }, + { + "epoch": 2.3196554069372026, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.0239, + "step": 1279 + }, + { + "epoch": 2.321469054636137, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 1280 + }, + { + "epoch": 2.3232827023350713, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 0.9792, + "step": 1281 + }, + { + "epoch": 2.325096350034006, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.1973, + "step": 1282 + }, + { + "epoch": 2.3269099977329404, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 1283 + }, + { + "epoch": 2.3287236454318747, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 1284 + }, + { + "epoch": 2.3305372931308095, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 1285 + }, + { + "epoch": 2.332350940829744, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 1286 + }, + { + "epoch": 2.334164588528678, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 1287 + }, + { + "epoch": 2.335978236227613, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 1288 + }, + { + "epoch": 2.3377918839265472, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 1289 + }, + { + "epoch": 2.3396055316254816, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 1290 + }, + { + "epoch": 2.3414191793244163, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 1291 + }, + { + "epoch": 2.3432328270233507, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.9469, + "step": 1292 + }, + { + "epoch": 2.3450464747222854, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 1293 + }, + { + "epoch": 2.3468601224212198, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.1398, + "step": 1294 + }, + { + "epoch": 2.348673770120154, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.9641, + "step": 1295 + }, + { + "epoch": 2.350487417819089, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 1296 + }, + { + "epoch": 2.352301065518023, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 1.2916, + "step": 1297 + }, + { + "epoch": 2.3541147132169575, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.9539, + "step": 1298 + }, + { + "epoch": 2.3559283609158923, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1299 + }, + { + "epoch": 2.3577420086148266, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 1300 + }, + { + "epoch": 2.359555656313761, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.8853, + "step": 1301 + }, + { + "epoch": 2.3613693040126957, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 1302 + }, + { + "epoch": 2.36318295171163, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 1303 + }, + { + "epoch": 2.3649965994105644, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.2145, + "step": 1304 + }, + { + "epoch": 2.366810247109499, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 1305 + }, + { + "epoch": 2.3686238948084335, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.0048, + "step": 1306 + }, + { + "epoch": 2.370437542507368, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 1307 + }, + { + "epoch": 2.3722511902063026, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 0.8677, + "step": 1308 + }, + { + "epoch": 2.374064837905237, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 1309 + }, + { + "epoch": 2.374064837905237, + "eval_loss": 1.3710259199142456, + "eval_runtime": 153.6602, + "eval_samples_per_second": 6.508, + "eval_steps_per_second": 6.508, + "step": 1309 + }, + { + "epoch": 2.374064837905237, + "mmlu_eval_accuracy": 0.33831826050377845, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.3125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, + "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.5333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.2727272727272727, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.2857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.3897863577036667, + "step": 1309 + }, + { + "epoch": 2.3758784856041713, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 1310 + }, + { + "epoch": 2.377692133303106, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 1311 + }, + { + "epoch": 2.3795057810020404, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 0.937, + "step": 1312 + }, + { + "epoch": 2.3813194287009747, + "grad_norm": 0.111328125, + "learning_rate": 0.0002, + "loss": 1.1924, + "step": 1313 + }, + { + "epoch": 2.3831330763999095, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.2413, + "step": 1314 + }, + { + "epoch": 2.384946724098844, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 1.028, + "step": 1315 + }, + { + "epoch": 2.386760371797778, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.126, + "step": 1316 + }, + { + "epoch": 2.388574019496713, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 1317 + }, + { + "epoch": 2.390387667195647, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 1318 + }, + { + "epoch": 2.3922013148945815, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002, + "loss": 0.9881, + "step": 1319 + }, + { + "epoch": 2.3940149625935163, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 0.9071, + "step": 1320 + }, + { + "epoch": 2.3958286102924506, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.0469, + "step": 1321 + }, + { + "epoch": 2.397642257991385, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 1322 + }, + { + "epoch": 2.3994559056903197, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.0344, + "step": 1323 + }, + { + "epoch": 2.401269553389254, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002, + "loss": 1.0085, + "step": 1324 + }, + { + "epoch": 2.4030832010881884, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 1325 + }, + { + "epoch": 2.404896848787123, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.978, + "step": 1326 + }, + { + "epoch": 2.4067104964860575, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002, + "loss": 0.9407, + "step": 1327 + }, + { + "epoch": 2.408524144184992, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.001, + "step": 1328 + }, + { + "epoch": 2.4103377918839266, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1329 + }, + { + "epoch": 2.412151439582861, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1330 + }, + { + "epoch": 2.4139650872817953, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.0888, + "step": 1331 + }, + { + "epoch": 2.41577873498073, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.146, + "step": 1332 + }, + { + "epoch": 2.4175923826796644, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 0.9777, + "step": 1333 + }, + { + "epoch": 2.419406030378599, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 0.9455, + "step": 1334 + }, + { + "epoch": 2.4212196780775335, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1335 + }, + { + "epoch": 2.423033325776468, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.2438, + "step": 1336 + }, + { + "epoch": 2.4248469734754026, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.9998, + "step": 1337 + }, + { + "epoch": 2.426660621174337, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 1338 + }, + { + "epoch": 2.4284742688732712, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1339 + }, + { + "epoch": 2.430287916572206, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 1340 + }, + { + "epoch": 2.4321015642711403, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.3093, + "step": 1341 + }, + { + "epoch": 2.4339152119700747, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 1342 + }, + { + "epoch": 2.4357288596690094, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 1.2347, + "step": 1343 + }, + { + "epoch": 2.4375425073679438, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1344 + }, + { + "epoch": 2.439356155066878, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 1345 + }, + { + "epoch": 2.441169802765813, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 1346 + }, + { + "epoch": 2.442983450464747, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1347 + }, + { + "epoch": 2.4447970981636815, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 1348 + }, + { + "epoch": 2.4466107458626163, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 1349 + }, + { + "epoch": 2.4484243935615506, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 1350 + }, + { + "epoch": 2.4502380412604854, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 1351 + }, + { + "epoch": 2.4520516889594197, + "grad_norm": 0.478515625, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1352 + }, + { + "epoch": 2.453865336658354, + "grad_norm": 0.515625, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 1353 + }, + { + "epoch": 2.455678984357289, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1354 + }, + { + "epoch": 2.457492632056223, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.1811, + "step": 1355 + }, + { + "epoch": 2.4593062797551575, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 1356 + }, + { + "epoch": 2.4611199274540922, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.0496, + "step": 1357 + }, + { + "epoch": 2.4629335751530266, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 1358 + }, + { + "epoch": 2.464747222851961, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 1359 + }, + { + "epoch": 2.4665608705508957, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1360 + }, + { + "epoch": 2.46837451824983, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 1.1711, + "step": 1361 + }, + { + "epoch": 2.4701881659487643, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 1362 + }, + { + "epoch": 2.472001813647699, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1363 + }, + { + "epoch": 2.4738154613466334, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 1364 + }, + { + "epoch": 2.4756291090455678, + "grad_norm": 0.109375, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 1365 + }, + { + "epoch": 2.4774427567445025, + "grad_norm": 0.1083984375, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 1366 + }, + { + "epoch": 2.479256404443437, + "grad_norm": 0.11279296875, + "learning_rate": 0.0002, + "loss": 0.8934, + "step": 1367 + }, + { + "epoch": 2.481070052142371, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002, + "loss": 1.0883, + "step": 1368 + }, + { + "epoch": 2.482883699841306, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 1369 + }, + { + "epoch": 2.4846973475402403, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1370 + }, + { + "epoch": 2.4865109952391746, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.0441, + "step": 1371 + }, + { + "epoch": 2.4883246429381094, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 1372 + }, + { + "epoch": 2.4901382906370437, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 1373 + }, + { + "epoch": 2.491951938335978, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.1031, + "step": 1374 + }, + { + "epoch": 2.493765586034913, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.0543, + "step": 1375 + }, + { + "epoch": 2.495579233733847, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1376 + }, + { + "epoch": 2.4973928814327815, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.9139, + "step": 1377 + }, + { + "epoch": 2.4992065291317163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 1378 + }, + { + "epoch": 2.5010201768306506, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.0613, + "step": 1379 + }, + { + "epoch": 2.502833824529585, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.0057, + "step": 1380 + }, + { + "epoch": 2.5046474722285197, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.2414, + "step": 1381 + }, + { + "epoch": 2.506461119927454, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 1382 + }, + { + "epoch": 2.5082747676263883, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 1383 + }, + { + "epoch": 2.510088415325323, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.3518, + "step": 1384 + }, + { + "epoch": 2.5119020630242574, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 1385 + }, + { + "epoch": 2.5137157107231918, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 1386 + }, + { + "epoch": 2.5155293584221265, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1387 + }, + { + "epoch": 2.517343006121061, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1388 + }, + { + "epoch": 2.519156653819995, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 1389 + }, + { + "epoch": 2.52097030151893, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.147, + "step": 1390 + }, + { + "epoch": 2.5227839492178643, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.0457, + "step": 1391 + }, + { + "epoch": 2.5245975969167986, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 1392 + }, + { + "epoch": 2.5264112446157334, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 1393 + }, + { + "epoch": 2.5282248923146677, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 1.2925, + "step": 1394 + }, + { + "epoch": 2.5300385400136025, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 1395 + }, + { + "epoch": 2.531852187712537, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 1.3598, + "step": 1396 + }, + { + "epoch": 2.533665835411471, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 1397 + }, + { + "epoch": 2.535479483110406, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 1398 + }, + { + "epoch": 2.5372931308093403, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 1.0864, + "step": 1399 + }, + { + "epoch": 2.539106778508275, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.9675, + "step": 1400 + }, + { + "epoch": 2.5409204262072094, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1401 + }, + { + "epoch": 2.5427340739061437, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 1402 + }, + { + "epoch": 2.5445477216050785, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 1403 + }, + { + "epoch": 2.546361369304013, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 1404 + }, + { + "epoch": 2.548175017002947, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.1735, + "step": 1405 + }, + { + "epoch": 2.549988664701882, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1406 + }, + { + "epoch": 2.5518023124008162, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 1407 + }, + { + "epoch": 2.5536159600997506, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 0.8888, + "step": 1408 + }, + { + "epoch": 2.5554296077986853, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.0763, + "step": 1409 + }, + { + "epoch": 2.5572432554976197, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 0.9656, + "step": 1410 + }, + { + "epoch": 2.559056903196554, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 1411 + }, + { + "epoch": 2.5608705508954888, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 1412 + }, + { + "epoch": 2.562684198594423, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.0709, + "step": 1413 + }, + { + "epoch": 2.5644978462933574, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002, + "loss": 1.0308, + "step": 1414 + }, + { + "epoch": 2.566311493992292, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 1415 + }, + { + "epoch": 2.5681251416912265, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 0.9801, + "step": 1416 + }, + { + "epoch": 2.569938789390161, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1417 + }, + { + "epoch": 2.5717524370890956, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 1418 + }, + { + "epoch": 2.57356608478803, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 1419 + }, + { + "epoch": 2.5753797324869643, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 1420 + }, + { + "epoch": 2.577193380185899, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 0.876, + "step": 1421 + }, + { + "epoch": 2.5790070278848334, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 1422 + }, + { + "epoch": 2.5808206755837677, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.356, + "step": 1423 + }, + { + "epoch": 2.5826343232827025, + "grad_norm": 0.119140625, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1424 + }, + { + "epoch": 2.584447970981637, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 1425 + }, + { + "epoch": 2.586261618680571, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 1426 + }, + { + "epoch": 2.588075266379506, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.0634, + "step": 1427 + }, + { + "epoch": 2.5898889140784402, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.9888, + "step": 1428 + }, + { + "epoch": 2.5917025617773746, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 1429 + }, + { + "epoch": 2.5935162094763093, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 1430 + }, + { + "epoch": 2.5953298571752437, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.4122, + "step": 1431 + }, + { + "epoch": 2.597143504874178, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.2422, + "step": 1432 + }, + { + "epoch": 2.5989571525731128, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1433 + }, + { + "epoch": 2.600770800272047, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.0362, + "step": 1434 + }, + { + "epoch": 2.6025844479709814, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 1435 + }, + { + "epoch": 2.604398095669916, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1436 + }, + { + "epoch": 2.6062117433688505, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 1437 + }, + { + "epoch": 2.608025391067785, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 1.0653, + "step": 1438 + }, + { + "epoch": 2.6098390387667196, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.1117, + "step": 1439 + }, + { + "epoch": 2.611652686465654, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1440 + }, + { + "epoch": 2.6134663341645883, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.1985, + "step": 1441 + }, + { + "epoch": 2.615279981863523, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 1442 + }, + { + "epoch": 2.6170936295624574, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.1456, + "step": 1443 + }, + { + "epoch": 2.6189072772613917, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1444 + }, + { + "epoch": 2.6207209249603265, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 1.3088, + "step": 1445 + }, + { + "epoch": 2.622534572659261, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1446 + }, + { + "epoch": 2.624348220358195, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 1447 + }, + { + "epoch": 2.62616186805713, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 1.2337, + "step": 1448 + }, + { + "epoch": 2.6279755157560643, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 1449 + }, + { + "epoch": 2.629789163454999, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.9727, + "step": 1450 + }, + { + "epoch": 2.6316028111539334, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 1451 + }, + { + "epoch": 2.6334164588528677, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1452 + }, + { + "epoch": 2.6352301065518025, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 1453 + }, + { + "epoch": 2.637043754250737, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.09, + "step": 1454 + }, + { + "epoch": 2.638857401949671, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 1455 + }, + { + "epoch": 2.640671049648606, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 1456 + }, + { + "epoch": 2.64248469734754, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.9684, + "step": 1457 + }, + { + "epoch": 2.644298345046475, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.1683, + "step": 1458 + }, + { + "epoch": 2.6461119927454093, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 1459 + }, + { + "epoch": 2.6479256404443436, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 0.9166, + "step": 1460 + }, + { + "epoch": 2.6497392881432784, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1461 + }, + { + "epoch": 2.6515529358422127, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.9278, + "step": 1462 + }, + { + "epoch": 2.653366583541147, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 1463 + }, + { + "epoch": 2.655180231240082, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.9883, + "step": 1464 + }, + { + "epoch": 2.656993878939016, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.18, + "step": 1465 + }, + { + "epoch": 2.6588075266379505, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1466 + }, + { + "epoch": 2.6606211743368853, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1467 + }, + { + "epoch": 2.6624348220358196, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1468 + }, + { + "epoch": 2.664248469734754, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002, + "loss": 0.9741, + "step": 1469 + }, + { + "epoch": 2.6660621174336887, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1470 + }, + { + "epoch": 2.667875765132623, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.0486, + "step": 1471 + }, + { + "epoch": 2.6696894128315574, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 1472 + }, + { + "epoch": 2.671503060530492, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.1821, + "step": 1473 + }, + { + "epoch": 2.6733167082294265, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.0408, + "step": 1474 + }, + { + "epoch": 2.675130355928361, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 0.9681, + "step": 1475 + }, + { + "epoch": 2.6769440036272956, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1476 + }, + { + "epoch": 2.67875765132623, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.1859, + "step": 1477 + }, + { + "epoch": 2.6805712990251642, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.0036, + "step": 1478 + }, + { + "epoch": 2.682384946724099, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.953, + "step": 1479 + }, + { + "epoch": 2.6841985944230333, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.0595, + "step": 1480 + }, + { + "epoch": 2.6860122421219677, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 1481 + }, + { + "epoch": 2.6878258898209024, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 1.0746, + "step": 1482 + }, + { + "epoch": 2.6896395375198368, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.3303, + "step": 1483 + }, + { + "epoch": 2.691453185218771, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 1.4732, + "step": 1484 + }, + { + "epoch": 2.693266832917706, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 1485 + }, + { + "epoch": 2.69508048061664, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 1486 + }, + { + "epoch": 2.6968941283155745, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 1487 + }, + { + "epoch": 2.6987077760145093, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 1.0185, + "step": 1488 + }, + { + "epoch": 2.7005214237134436, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.1396, + "step": 1489 + }, + { + "epoch": 2.702335071412378, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.0043, + "step": 1490 + }, + { + "epoch": 2.7041487191113127, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 1491 + }, + { + "epoch": 2.705962366810247, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 1.1074, + "step": 1492 + }, + { + "epoch": 2.7077760145091814, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 1493 + }, + { + "epoch": 2.709589662208116, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 1.1263, + "step": 1494 + }, + { + "epoch": 2.7114033099070505, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.9731, + "step": 1495 + }, + { + "epoch": 2.713216957605985, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 1496 + }, + { + "epoch": 2.713216957605985, + "eval_loss": 1.3432918787002563, + "eval_runtime": 152.0876, + "eval_samples_per_second": 6.575, + "eval_steps_per_second": 6.575, + "step": 1496 + }, + { + "epoch": 2.713216957605985, + "mmlu_eval_accuracy": 0.33171537220228237, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.5625, + "mmlu_eval_accuracy_high_school_chemistry": 0.09090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, + "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.5166666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, + "mmlu_eval_accuracy_high_school_us_history": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, + "mmlu_eval_accuracy_human_aging": 0.391304347826087, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, + "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.6, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, + "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.42424242424242425, + "mmlu_eval_accuracy_philosophy": 0.4411764705882353, + "mmlu_eval_accuracy_prehistory": 0.2571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, + "mmlu_eval_accuracy_professional_law": 0.2411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.7185278401353015, + "step": 1496 + }, + { + "epoch": 2.7150306053049196, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 1497 + }, + { + "epoch": 2.716844253003854, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 1.1846, + "step": 1498 + }, + { + "epoch": 2.7186579007027882, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.9019, + "step": 1499 + }, + { + "epoch": 2.720471548401723, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 1500 + }, + { + "epoch": 2.7222851961006573, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.8663, + "step": 1501 + }, + { + "epoch": 2.7240988437995917, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 1502 + }, + { + "epoch": 2.7259124914985264, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 1503 + }, + { + "epoch": 2.7277261391974608, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 1504 + }, + { + "epoch": 2.7295397868963955, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 1.1278, + "step": 1505 + }, + { + "epoch": 2.73135343459533, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 1506 + }, + { + "epoch": 2.733167082294264, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 1507 + }, + { + "epoch": 2.734980729993199, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 1508 + }, + { + "epoch": 2.7367943776921333, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 1509 + }, + { + "epoch": 2.7386080253910676, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.2383, + "step": 1510 + }, + { + "epoch": 2.7404216730900024, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.0566, + "step": 1511 + }, + { + "epoch": 2.7422353207889367, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.0482, + "step": 1512 + }, + { + "epoch": 2.7440489684878715, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002, + "loss": 1.0465, + "step": 1513 + }, + { + "epoch": 2.745862616186806, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.2735, + "step": 1514 + }, + { + "epoch": 2.74767626388574, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 1515 + }, + { + "epoch": 2.749489911584675, + "grad_norm": 0.11767578125, + "learning_rate": 0.0002, + "loss": 1.0915, + "step": 1516 + }, + { + "epoch": 2.7513035592836093, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 1517 + }, + { + "epoch": 2.7531172069825436, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1518 + }, + { + "epoch": 2.7549308546814784, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.1446, + "step": 1519 + }, + { + "epoch": 2.7567445023804127, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.0902, + "step": 1520 + }, + { + "epoch": 2.758558150079347, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1521 + }, + { + "epoch": 2.760371797778282, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.9461, + "step": 1522 + }, + { + "epoch": 2.762185445477216, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.1278, + "step": 1523 + }, + { + "epoch": 2.7639990931761504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 1.0729, + "step": 1524 + }, + { + "epoch": 2.765812740875085, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.0219, + "step": 1525 + }, + { + "epoch": 2.7676263885740195, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 1526 + }, + { + "epoch": 2.769440036272954, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.1042, + "step": 1527 + }, + { + "epoch": 2.7712536839718886, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.0167, + "step": 1528 + }, + { + "epoch": 2.773067331670823, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 0.9811, + "step": 1529 + }, + { + "epoch": 2.7748809793697573, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1530 + }, + { + "epoch": 2.776694627068692, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 1531 + }, + { + "epoch": 2.7785082747676264, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 1532 + }, + { + "epoch": 2.7803219224665607, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 1533 + }, + { + "epoch": 2.7821355701654955, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 1534 + }, + { + "epoch": 2.78394921786443, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 1535 + }, + { + "epoch": 2.785762865563364, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 1536 + }, + { + "epoch": 2.787576513262299, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.3495, + "step": 1537 + }, + { + "epoch": 2.7893901609612333, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 1538 + }, + { + "epoch": 2.7912038086601676, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.0875, + "step": 1539 + }, + { + "epoch": 2.7930174563591024, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.0831, + "step": 1540 + }, + { + "epoch": 2.7948311040580367, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.2297, + "step": 1541 + }, + { + "epoch": 2.796644751756971, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 1542 + }, + { + "epoch": 2.798458399455906, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 1543 + }, + { + "epoch": 2.80027204715484, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 1544 + }, + { + "epoch": 2.8020856948537745, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 1545 + }, + { + "epoch": 2.8038993425527092, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.0311, + "step": 1546 + }, + { + "epoch": 2.8057129902516436, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 1547 + }, + { + "epoch": 2.807526637950578, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 1548 + }, + { + "epoch": 2.8093402856495127, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.9175, + "step": 1549 + }, + { + "epoch": 2.811153933348447, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 1550 + }, + { + "epoch": 2.8129675810473813, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1551 + }, + { + "epoch": 2.814781228746316, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 1552 + }, + { + "epoch": 2.8165948764452504, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 1553 + }, + { + "epoch": 2.8184085241441847, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.2214, + "step": 1554 + }, + { + "epoch": 2.8202221718431195, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.0899, + "step": 1555 + }, + { + "epoch": 2.822035819542054, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 1.21, + "step": 1556 + }, + { + "epoch": 2.823849467240988, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 1.2616, + "step": 1557 + }, + { + "epoch": 2.825663114939923, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.9803, + "step": 1558 + }, + { + "epoch": 2.8274767626388573, + "grad_norm": 0.11328125, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 1559 + }, + { + "epoch": 2.829290410337792, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 1560 + }, + { + "epoch": 2.8311040580367264, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 1.0821, + "step": 1561 + }, + { + "epoch": 2.8329177057356607, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 1.0026, + "step": 1562 + }, + { + "epoch": 2.8347313534345955, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 1563 + }, + { + "epoch": 2.83654500113353, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1564 + }, + { + "epoch": 2.838358648832464, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 1.0573, + "step": 1565 + }, + { + "epoch": 2.840172296531399, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 1566 + }, + { + "epoch": 2.8419859442303332, + "grad_norm": 0.126953125, + "learning_rate": 0.0002, + "loss": 1.0469, + "step": 1567 + }, + { + "epoch": 2.843799591929268, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1568 + }, + { + "epoch": 2.8456132396282023, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 0.9759, + "step": 1569 + }, + { + "epoch": 2.8474268873271367, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 1.1514, + "step": 1570 + }, + { + "epoch": 2.8492405350260714, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.9953, + "step": 1571 + }, + { + "epoch": 2.8510541827250058, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 1572 + }, + { + "epoch": 2.85286783042394, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1573 + }, + { + "epoch": 2.854681478122875, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.0403, + "step": 1574 + }, + { + "epoch": 2.856495125821809, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 1.1504, + "step": 1575 + }, + { + "epoch": 2.8583087735207435, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 1576 + }, + { + "epoch": 2.8601224212196783, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.9459, + "step": 1577 + }, + { + "epoch": 2.8619360689186126, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1578 + }, + { + "epoch": 2.863749716617547, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 1579 + }, + { + "epoch": 2.8655633643164817, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.0717, + "step": 1580 + }, + { + "epoch": 2.867377012015416, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 1581 + }, + { + "epoch": 2.8691906597143504, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 1582 + }, + { + "epoch": 2.871004307413285, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 1.0987, + "step": 1583 + }, + { + "epoch": 2.8728179551122195, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 1584 + }, + { + "epoch": 2.874631602811154, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.984, + "step": 1585 + }, + { + "epoch": 2.8764452505100886, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.1389, + "step": 1586 + }, + { + "epoch": 2.878258898209023, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 1587 + }, + { + "epoch": 2.8800725459079572, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 1588 + }, + { + "epoch": 2.881886193606892, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.3009, + "step": 1589 + }, + { + "epoch": 2.8836998413058264, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 1590 + }, + { + "epoch": 2.8855134890047607, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.1253, + "step": 1591 + }, + { + "epoch": 2.8873271367036955, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 1.0267, + "step": 1592 + }, + { + "epoch": 2.88914078440263, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 1.1875, + "step": 1593 + }, + { + "epoch": 2.890954432101564, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 1594 + }, + { + "epoch": 2.892768079800499, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 1595 + }, + { + "epoch": 2.894581727499433, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 1596 + }, + { + "epoch": 2.8963953751983675, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 1.0468, + "step": 1597 + }, + { + "epoch": 2.8982090228973023, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 1598 + }, + { + "epoch": 2.9000226705962366, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1599 + }, + { + "epoch": 2.901836318295171, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.8969, + "step": 1600 + }, + { + "epoch": 2.9036499659941057, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1601 + }, + { + "epoch": 2.90546361369304, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 1602 + }, + { + "epoch": 2.9072772613919744, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 1603 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 1604 + }, + { + "epoch": 2.9109045567898435, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1605 + }, + { + "epoch": 2.912718204488778, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 1606 + }, + { + "epoch": 2.9145318521877126, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 1.0355, + "step": 1607 + }, + { + "epoch": 2.916345499886647, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 1608 + }, + { + "epoch": 2.9181591475855813, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 1609 + }, + { + "epoch": 2.919972795284516, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 1610 + }, + { + "epoch": 2.9217864429834504, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 1611 + }, + { + "epoch": 2.9236000906823847, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 1612 + }, + { + "epoch": 2.9254137383813195, + "grad_norm": 0.11865234375, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 1613 + }, + { + "epoch": 2.927227386080254, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 1.103, + "step": 1614 + }, + { + "epoch": 2.929041033779188, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 0.9085, + "step": 1615 + }, + { + "epoch": 2.930854681478123, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 1616 + }, + { + "epoch": 2.932668329177057, + "grad_norm": 0.119140625, + "learning_rate": 0.0002, + "loss": 0.9213, + "step": 1617 + }, + { + "epoch": 2.934481976875992, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.1589, + "step": 1618 + }, + { + "epoch": 2.9362956245749263, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1619 + }, + { + "epoch": 2.9381092722738607, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 1620 + }, + { + "epoch": 2.9399229199727954, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 1.0442, + "step": 1621 + }, + { + "epoch": 2.9417365676717298, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 1622 + }, + { + "epoch": 2.9435502153706645, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 1623 + }, + { + "epoch": 2.945363863069599, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.933, + "step": 1624 + }, + { + "epoch": 2.947177510768533, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 1.0166, + "step": 1625 + }, + { + "epoch": 2.948991158467468, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1626 + }, + { + "epoch": 2.9508048061664023, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.1843, + "step": 1627 + }, + { + "epoch": 2.9526184538653366, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.8874, + "step": 1628 + }, + { + "epoch": 2.9544321015642714, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1629 + }, + { + "epoch": 2.9562457492632057, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.3613, + "step": 1630 + }, + { + "epoch": 2.95805939696214, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 1631 + }, + { + "epoch": 2.959873044661075, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.1545, + "step": 1632 + }, + { + "epoch": 2.961686692360009, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1633 + }, + { + "epoch": 2.9635003400589435, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 1634 + }, + { + "epoch": 2.9653139877578782, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 1635 + }, + { + "epoch": 2.9671276354568126, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1636 + }, + { + "epoch": 2.968941283155747, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 1637 + }, + { + "epoch": 2.9707549308546817, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 1.0, + "step": 1638 + }, + { + "epoch": 2.972568578553616, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 1639 + }, + { + "epoch": 2.9743822262525503, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 1640 + }, + { + "epoch": 2.976195873951485, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.1903, + "step": 1641 + }, + { + "epoch": 2.9780095216504194, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 1642 + }, + { + "epoch": 2.9798231693493538, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9868, + "step": 1643 + }, + { + "epoch": 2.9816368170482885, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 1644 + }, + { + "epoch": 2.983450464747223, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.9751, + "step": 1645 + }, + { + "epoch": 2.985264112446157, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 1.3124, + "step": 1646 + }, + { + "epoch": 2.987077760145092, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 1647 + }, + { + "epoch": 2.9888914078440263, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 1648 + }, + { + "epoch": 2.9907050555429606, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 1649 + }, + { + "epoch": 2.9925187032418954, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.9723, + "step": 1650 + }, + { + "epoch": 2.9943323509408297, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1651 + }, + { + "epoch": 2.996145998639764, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 1652 + }, + { + "epoch": 2.997959646338699, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 1653 + }, + { + "epoch": 2.999773294037633, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 1654 + }, + { + "epoch": 3.0015869417365675, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.9394, + "step": 1655 + }, + { + "epoch": 3.0034005894355023, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1656 + }, + { + "epoch": 3.0052142371344366, + "grad_norm": 0.10986328125, + "learning_rate": 0.0002, + "loss": 0.8536, + "step": 1657 + }, + { + "epoch": 3.007027884833371, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 0.9129, + "step": 1658 + }, + { + "epoch": 3.0088415325323057, + "grad_norm": 0.11181640625, + "learning_rate": 0.0002, + "loss": 0.9847, + "step": 1659 + }, + { + "epoch": 3.01065518023124, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 0.8875, + "step": 1660 + }, + { + "epoch": 3.0124688279301743, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 0.9319, + "step": 1661 + }, + { + "epoch": 3.014282475629109, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 1.0231, + "step": 1662 + }, + { + "epoch": 3.0160961233280434, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002, + "loss": 0.9334, + "step": 1663 + }, + { + "epoch": 3.017909771026978, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 1664 + }, + { + "epoch": 3.0197234187259125, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 1665 + }, + { + "epoch": 3.021537066424847, + "grad_norm": 0.134765625, + "learning_rate": 0.0002, + "loss": 0.9536, + "step": 1666 + }, + { + "epoch": 3.0233507141237816, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 1667 + }, + { + "epoch": 3.025164361822716, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1668 + }, + { + "epoch": 3.0269780095216503, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 0.9206, + "step": 1669 + }, + { + "epoch": 3.028791657220585, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1670 + }, + { + "epoch": 3.0306053049195194, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.8923, + "step": 1671 + }, + { + "epoch": 3.0324189526184537, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 0.9847, + "step": 1672 + }, + { + "epoch": 3.0342326003173885, + "grad_norm": 0.1171875, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 1673 + }, + { + "epoch": 3.036046248016323, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.9283, + "step": 1674 + }, + { + "epoch": 3.037859895715257, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.9193, + "step": 1675 + }, + { + "epoch": 3.039673543414192, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 1676 + }, + { + "epoch": 3.0414871911131263, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 1677 + }, + { + "epoch": 3.0433008388120606, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.0235, + "step": 1678 + }, + { + "epoch": 3.0451144865109954, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.9456, + "step": 1679 + }, + { + "epoch": 3.0469281342099297, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 1.0025, + "step": 1680 + }, + { + "epoch": 3.048741781908864, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1681 + }, + { + "epoch": 3.050555429607799, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.0139, + "step": 1682 + }, + { + "epoch": 3.052369077306733, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 1683 + }, + { + "epoch": 3.052369077306733, + "eval_loss": 1.387996792793274, + "eval_runtime": 152.4061, + "eval_samples_per_second": 6.561, + "eval_steps_per_second": 6.561, + "step": 1683 + }, + { + "epoch": 3.052369077306733, + "mmlu_eval_accuracy": 0.3399757982782723, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.4375, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.25, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.5625, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_geography": 0.5909090909090909, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, + "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.5, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.23076923076923078, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, + "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.43023255813953487, + "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.5757575757575758, + "mmlu_eval_accuracy_philosophy": 0.4411764705882353, + "mmlu_eval_accuracy_prehistory": 0.2857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.8283339652027826, + "step": 1683 + }, + { + "epoch": 3.0541827250056675, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1684 + }, + { + "epoch": 3.0559963727046022, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.9316, + "step": 1685 + }, + { + "epoch": 3.0578100204035366, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.9668, + "step": 1686 + }, + { + "epoch": 3.059623668102471, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.9507, + "step": 1687 + }, + { + "epoch": 3.0614373158014057, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 1688 + }, + { + "epoch": 3.06325096350034, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.0242, + "step": 1689 + }, + { + "epoch": 3.0650646111992748, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.9167, + "step": 1690 + }, + { + "epoch": 3.066878258898209, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 1691 + }, + { + "epoch": 3.0686919065971434, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.0009, + "step": 1692 + }, + { + "epoch": 3.070505554296078, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.902, + "step": 1693 + }, + { + "epoch": 3.0723192019950125, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 1694 + }, + { + "epoch": 3.074132849693947, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 1695 + }, + { + "epoch": 3.0759464973928816, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7987, + "step": 1696 + }, + { + "epoch": 3.077760145091816, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 1697 + }, + { + "epoch": 3.0795737927907503, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 1698 + }, + { + "epoch": 3.081387440489685, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 1699 + }, + { + "epoch": 3.0832010881886194, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 1700 + }, + { + "epoch": 3.0850147358875537, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 1701 + }, + { + "epoch": 3.0868283835864885, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 1702 + }, + { + "epoch": 3.088642031285423, + "grad_norm": 0.427734375, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 1703 + }, + { + "epoch": 3.090455678984357, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 1704 + }, + { + "epoch": 3.092269326683292, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 1705 + }, + { + "epoch": 3.0940829743822262, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1706 + }, + { + "epoch": 3.0958966220811606, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.9665, + "step": 1707 + }, + { + "epoch": 3.0977102697800953, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.8396, + "step": 1708 + }, + { + "epoch": 3.0995239174790297, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.9005, + "step": 1709 + }, + { + "epoch": 3.101337565177964, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 1710 + }, + { + "epoch": 3.1031512128768988, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 1711 + }, + { + "epoch": 3.104964860575833, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 1712 + }, + { + "epoch": 3.1067785082747674, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 1713 + }, + { + "epoch": 3.108592155973702, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 1714 + }, + { + "epoch": 3.1104058036726365, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 1715 + }, + { + "epoch": 3.112219451371571, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 0.9623, + "step": 1716 + }, + { + "epoch": 3.1140330990705056, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.0396, + "step": 1717 + }, + { + "epoch": 3.11584674676944, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 0.8904, + "step": 1718 + }, + { + "epoch": 3.1176603944683747, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 1719 + }, + { + "epoch": 3.119474042167309, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 1.1259, + "step": 1720 + }, + { + "epoch": 3.1212876898662434, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.9305, + "step": 1721 + }, + { + "epoch": 3.123101337565178, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 1722 + }, + { + "epoch": 3.1249149852641125, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1723 + }, + { + "epoch": 3.126728632963047, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 1724 + }, + { + "epoch": 3.1285422806619816, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 1725 + }, + { + "epoch": 3.130355928360916, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 1726 + }, + { + "epoch": 3.1321695760598502, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1727 + }, + { + "epoch": 3.133983223758785, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1728 + }, + { + "epoch": 3.1357968714577193, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 1729 + }, + { + "epoch": 3.1376105191566537, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 1730 + }, + { + "epoch": 3.1394241668555884, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.9708, + "step": 1731 + }, + { + "epoch": 3.141237814554523, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.9943, + "step": 1732 + }, + { + "epoch": 3.143051462253457, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 1733 + }, + { + "epoch": 3.144865109952392, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1734 + }, + { + "epoch": 3.146678757651326, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 1735 + }, + { + "epoch": 3.1484924053502605, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 1.1165, + "step": 1736 + }, + { + "epoch": 3.1503060530491953, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 1.0148, + "step": 1737 + }, + { + "epoch": 3.1521197007481296, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 1.0592, + "step": 1738 + }, + { + "epoch": 3.153933348447064, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.8912, + "step": 1739 + }, + { + "epoch": 3.1557469961459987, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 1740 + }, + { + "epoch": 3.157560643844933, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.9565, + "step": 1741 + }, + { + "epoch": 3.1593742915438674, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 1742 + }, + { + "epoch": 3.161187939242802, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 1743 + }, + { + "epoch": 3.1630015869417365, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1744 + }, + { + "epoch": 3.1648152346406713, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 1745 + }, + { + "epoch": 3.1666288823396056, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1746 + }, + { + "epoch": 3.16844253003854, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 1747 + }, + { + "epoch": 3.1702561777374747, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 1748 + }, + { + "epoch": 3.172069825436409, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 1749 + }, + { + "epoch": 3.1738834731353434, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 1750 + }, + { + "epoch": 3.175697120834278, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 1751 + }, + { + "epoch": 3.1775107685332125, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 1752 + }, + { + "epoch": 3.179324416232147, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 1753 + }, + { + "epoch": 3.1811380639310816, + "grad_norm": 0.427734375, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 1754 + }, + { + "epoch": 3.182951711630016, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 1.0969, + "step": 1755 + }, + { + "epoch": 3.18476535932895, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.9927, + "step": 1756 + }, + { + "epoch": 3.186579007027885, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.8851, + "step": 1757 + }, + { + "epoch": 3.1883926547268193, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.016, + "step": 1758 + }, + { + "epoch": 3.1902063024257536, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.0327, + "step": 1759 + }, + { + "epoch": 3.1920199501246884, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.9016, + "step": 1760 + }, + { + "epoch": 3.1938335978236227, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 1761 + }, + { + "epoch": 3.195647245522557, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.7826, + "step": 1762 + }, + { + "epoch": 3.197460893221492, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.9915, + "step": 1763 + }, + { + "epoch": 3.199274540920426, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.9155, + "step": 1764 + }, + { + "epoch": 3.2010881886193605, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 1765 + }, + { + "epoch": 3.2029018363182953, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 1766 + }, + { + "epoch": 3.2047154840172296, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.9334, + "step": 1767 + }, + { + "epoch": 3.206529131716164, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 1768 + }, + { + "epoch": 3.2083427794150987, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 1769 + }, + { + "epoch": 3.210156427114033, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1770 + }, + { + "epoch": 3.2119700748129674, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 1771 + }, + { + "epoch": 3.213783722511902, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002, + "loss": 0.9387, + "step": 1772 + }, + { + "epoch": 3.2155973702108365, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.89, + "step": 1773 + }, + { + "epoch": 3.217411017909771, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.8272, + "step": 1774 + }, + { + "epoch": 3.2192246656087056, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 1.0023, + "step": 1775 + }, + { + "epoch": 3.22103831330764, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.9437, + "step": 1776 + }, + { + "epoch": 3.2228519610065747, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.9861, + "step": 1777 + }, + { + "epoch": 3.224665608705509, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 1778 + }, + { + "epoch": 3.2264792564044433, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.9995, + "step": 1779 + }, + { + "epoch": 3.228292904103378, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.9945, + "step": 1780 + }, + { + "epoch": 3.2301065518023124, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.2207, + "step": 1781 + }, + { + "epoch": 3.2319201995012468, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 1782 + }, + { + "epoch": 3.2337338472001815, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.9833, + "step": 1783 + }, + { + "epoch": 3.235547494899116, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1784 + }, + { + "epoch": 3.23736114259805, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 1785 + }, + { + "epoch": 3.239174790296985, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.9785, + "step": 1786 + }, + { + "epoch": 3.2409884379959193, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.9583, + "step": 1787 + }, + { + "epoch": 3.2428020856948536, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.8693, + "step": 1788 + }, + { + "epoch": 3.2446157333937884, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.9053, + "step": 1789 + }, + { + "epoch": 3.2464293810927227, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 1790 + }, + { + "epoch": 3.248243028791657, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 1791 + }, + { + "epoch": 3.250056676490592, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 1792 + }, + { + "epoch": 3.251870324189526, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.9478, + "step": 1793 + }, + { + "epoch": 3.2536839718884605, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1794 + }, + { + "epoch": 3.2554976195873953, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.9513, + "step": 1795 + }, + { + "epoch": 3.2573112672863296, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 1796 + }, + { + "epoch": 3.2591249149852644, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 1797 + }, + { + "epoch": 3.2609385626841987, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1798 + }, + { + "epoch": 3.262752210383133, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 1799 + }, + { + "epoch": 3.264565858082068, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 1800 + }, + { + "epoch": 3.266379505781002, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 1801 + }, + { + "epoch": 3.2681931534799364, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 1802 + }, + { + "epoch": 3.270006801178871, + "grad_norm": 0.41015625, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 1803 + }, + { + "epoch": 3.2718204488778055, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 1804 + }, + { + "epoch": 3.27363409657674, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 1805 + }, + { + "epoch": 3.2754477442756746, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 1806 + }, + { + "epoch": 3.277261391974609, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1807 + }, + { + "epoch": 3.2790750396735433, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.0181, + "step": 1808 + }, + { + "epoch": 3.280888687372478, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 1809 + }, + { + "epoch": 3.2827023350714124, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 1.056, + "step": 1810 + }, + { + "epoch": 3.2845159827703467, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 1811 + }, + { + "epoch": 3.2863296304692815, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 1812 + }, + { + "epoch": 3.288143278168216, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.9534, + "step": 1813 + }, + { + "epoch": 3.28995692586715, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 1814 + }, + { + "epoch": 3.291770573566085, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1815 + }, + { + "epoch": 3.2935842212650193, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 1.0478, + "step": 1816 + }, + { + "epoch": 3.2953978689639536, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.9933, + "step": 1817 + }, + { + "epoch": 3.2972115166628884, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1818 + }, + { + "epoch": 3.2990251643618227, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.9648, + "step": 1819 + }, + { + "epoch": 3.300838812060757, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.8742, + "step": 1820 + }, + { + "epoch": 3.302652459759692, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1821 + }, + { + "epoch": 3.304466107458626, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 1822 + }, + { + "epoch": 3.3062797551575605, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 1823 + }, + { + "epoch": 3.3080934028564952, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1824 + }, + { + "epoch": 3.3099070505554296, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.8632, + "step": 1825 + }, + { + "epoch": 3.311720698254364, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.9527, + "step": 1826 + }, + { + "epoch": 3.3135343459532987, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 1827 + }, + { + "epoch": 3.315347993652233, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.9155, + "step": 1828 + }, + { + "epoch": 3.3171616413511673, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.9693, + "step": 1829 + }, + { + "epoch": 3.318975289050102, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9708, + "step": 1830 + }, + { + "epoch": 3.3207889367490364, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 1831 + }, + { + "epoch": 3.3226025844479707, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.9238, + "step": 1832 + }, + { + "epoch": 3.3244162321469055, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.8328, + "step": 1833 + }, + { + "epoch": 3.32622987984584, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.9553, + "step": 1834 + }, + { + "epoch": 3.3280435275447746, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 1835 + }, + { + "epoch": 3.329857175243709, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.8777, + "step": 1836 + }, + { + "epoch": 3.3316708229426433, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1837 + }, + { + "epoch": 3.333484470641578, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 1.0183, + "step": 1838 + }, + { + "epoch": 3.3352981183405124, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1839 + }, + { + "epoch": 3.3371117660394467, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.928, + "step": 1840 + }, + { + "epoch": 3.3389254137383815, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 1841 + }, + { + "epoch": 3.340739061437316, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1842 + }, + { + "epoch": 3.34255270913625, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 1843 + }, + { + "epoch": 3.344366356835185, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 1844 + }, + { + "epoch": 3.3461800045341192, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 1845 + }, + { + "epoch": 3.3479936522330536, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.862, + "step": 1846 + }, + { + "epoch": 3.3498072999319883, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 1847 + }, + { + "epoch": 3.3516209476309227, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 1848 + }, + { + "epoch": 3.353434595329857, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 1849 + }, + { + "epoch": 3.3552482430287918, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1850 + }, + { + "epoch": 3.357061890727726, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 1851 + }, + { + "epoch": 3.358875538426661, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.4728, + "step": 1852 + }, + { + "epoch": 3.360689186125595, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.6133, + "step": 1853 + }, + { + "epoch": 3.3625028338245295, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 1854 + }, + { + "epoch": 3.3643164815234643, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.9128, + "step": 1855 + }, + { + "epoch": 3.3661301292223986, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 1.123, + "step": 1856 + }, + { + "epoch": 3.367943776921333, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 1857 + }, + { + "epoch": 3.3697574246202677, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1858 + }, + { + "epoch": 3.371571072319202, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.9086, + "step": 1859 + }, + { + "epoch": 3.3733847200181364, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 1.0318, + "step": 1860 + }, + { + "epoch": 3.375198367717071, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.8874, + "step": 1861 + }, + { + "epoch": 3.3770120154160055, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 1862 + }, + { + "epoch": 3.37882566311494, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 0.988, + "step": 1863 + }, + { + "epoch": 3.3806393108138746, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 1864 + }, + { + "epoch": 3.382452958512809, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 1.0273, + "step": 1865 + }, + { + "epoch": 3.3842666062117432, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.9953, + "step": 1866 + }, + { + "epoch": 3.386080253910678, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.951, + "step": 1867 + }, + { + "epoch": 3.3878939016096123, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 1868 + }, + { + "epoch": 3.3897075493085467, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 1869 + }, + { + "epoch": 3.3915211970074814, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1870 + }, + { + "epoch": 3.3915211970074814, + "eval_loss": 1.3864154815673828, + "eval_runtime": 152.6665, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 6.55, + "step": 1870 + }, + { + "epoch": 3.3915211970074814, + "mmlu_eval_accuracy": 0.34068781646248997, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.375, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.53125, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.37209302325581395, + "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.5166666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, + "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.5151515151515151, + "mmlu_eval_accuracy_philosophy": 0.5, + "mmlu_eval_accuracy_prehistory": 0.2857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.6363636363636364, + "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.845924968426715, + "step": 1870 + }, + { + "epoch": 3.3933348447064158, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 1871 + }, + { + "epoch": 3.39514849240535, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 1872 + }, + { + "epoch": 3.396962140104285, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 1873 + }, + { + "epoch": 3.398775787803219, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1874 + }, + { + "epoch": 3.4005894355021535, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.9705, + "step": 1875 + }, + { + "epoch": 3.4024030832010883, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 1876 + }, + { + "epoch": 3.4042167309000226, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 1.0664, + "step": 1877 + }, + { + "epoch": 3.406030378598957, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 1878 + }, + { + "epoch": 3.4078440262978917, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 1879 + }, + { + "epoch": 3.409657673996826, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.0752, + "step": 1880 + }, + { + "epoch": 3.4114713216957604, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8931, + "step": 1881 + }, + { + "epoch": 3.413284969394695, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 1882 + }, + { + "epoch": 3.4150986170936295, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 1883 + }, + { + "epoch": 3.416912264792564, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 1884 + }, + { + "epoch": 3.4187259124914986, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1885 + }, + { + "epoch": 3.420539560190433, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.9844, + "step": 1886 + }, + { + "epoch": 3.4223532078893673, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.9951, + "step": 1887 + }, + { + "epoch": 3.424166855588302, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 1888 + }, + { + "epoch": 3.4259805032872364, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.8491, + "step": 1889 + }, + { + "epoch": 3.427794150986171, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 1.0125, + "step": 1890 + }, + { + "epoch": 3.4296077986851055, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 1891 + }, + { + "epoch": 3.43142144638404, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.8824, + "step": 1892 + }, + { + "epoch": 3.4332350940829746, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 1893 + }, + { + "epoch": 3.435048741781909, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 1894 + }, + { + "epoch": 3.436862389480843, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 1895 + }, + { + "epoch": 3.438676037179778, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 1896 + }, + { + "epoch": 3.4404896848787123, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 1897 + }, + { + "epoch": 3.4423033325776466, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.9498, + "step": 1898 + }, + { + "epoch": 3.4441169802765814, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 1899 + }, + { + "epoch": 3.4459306279755157, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 1900 + }, + { + "epoch": 3.44774427567445, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 1901 + }, + { + "epoch": 3.449557923373385, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 1902 + }, + { + "epoch": 3.451371571072319, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 1903 + }, + { + "epoch": 3.4531852187712535, + "grad_norm": 0.46484375, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 1904 + }, + { + "epoch": 3.4549988664701883, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 1905 + }, + { + "epoch": 3.4568125141691226, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 1.0289, + "step": 1906 + }, + { + "epoch": 3.458626161868057, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 1907 + }, + { + "epoch": 3.4604398095669917, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 1908 + }, + { + "epoch": 3.462253457265926, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.8766, + "step": 1909 + }, + { + "epoch": 3.464067104964861, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 1910 + }, + { + "epoch": 3.465880752663795, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 1911 + }, + { + "epoch": 3.4676944003627295, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 1912 + }, + { + "epoch": 3.4695080480616642, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.9527, + "step": 1913 + }, + { + "epoch": 3.4713216957605986, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1914 + }, + { + "epoch": 3.473135343459533, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.8926, + "step": 1915 + }, + { + "epoch": 3.4749489911584677, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.0036, + "step": 1916 + }, + { + "epoch": 3.476762638857402, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.9563, + "step": 1917 + }, + { + "epoch": 3.4785762865563363, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 1.0999, + "step": 1918 + }, + { + "epoch": 3.480389934255271, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 1919 + }, + { + "epoch": 3.4822035819542054, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1920 + }, + { + "epoch": 3.4840172296531398, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1921 + }, + { + "epoch": 3.4858308773520745, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 1922 + }, + { + "epoch": 3.487644525051009, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.8195, + "step": 1923 + }, + { + "epoch": 3.489458172749943, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 1.0151, + "step": 1924 + }, + { + "epoch": 3.491271820448878, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 1925 + }, + { + "epoch": 3.4930854681478123, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.935, + "step": 1926 + }, + { + "epoch": 3.4948991158467466, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.9971, + "step": 1927 + }, + { + "epoch": 3.4967127635456814, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 1928 + }, + { + "epoch": 3.4985264112446157, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.9583, + "step": 1929 + }, + { + "epoch": 3.50034005894355, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 1930 + }, + { + "epoch": 3.502153706642485, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1931 + }, + { + "epoch": 3.503967354341419, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.8857, + "step": 1932 + }, + { + "epoch": 3.5057810020403535, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 1933 + }, + { + "epoch": 3.5075946497392883, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 1934 + }, + { + "epoch": 3.5094082974382226, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.8775, + "step": 1935 + }, + { + "epoch": 3.511221945137157, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 1936 + }, + { + "epoch": 3.5130355928360917, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 1937 + }, + { + "epoch": 3.514849240535026, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 1938 + }, + { + "epoch": 3.5166628882339603, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 1939 + }, + { + "epoch": 3.518476535932895, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 1940 + }, + { + "epoch": 3.5202901836318294, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 1941 + }, + { + "epoch": 3.5221038313307638, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 1.074, + "step": 1942 + }, + { + "epoch": 3.5239174790296985, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 1943 + }, + { + "epoch": 3.525731126728633, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7814, + "step": 1944 + }, + { + "epoch": 3.527544774427567, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 1945 + }, + { + "epoch": 3.529358422126502, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.8286, + "step": 1946 + }, + { + "epoch": 3.5311720698254363, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 1947 + }, + { + "epoch": 3.5329857175243706, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 1948 + }, + { + "epoch": 3.5347993652233054, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 1949 + }, + { + "epoch": 3.5366130129222397, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.893, + "step": 1950 + }, + { + "epoch": 3.5384266606211745, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 1951 + }, + { + "epoch": 3.540240308320109, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 1952 + }, + { + "epoch": 3.542053956019043, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 1953 + }, + { + "epoch": 3.543867603717978, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.4866, + "step": 1954 + }, + { + "epoch": 3.5456812514169123, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.977, + "step": 1955 + }, + { + "epoch": 3.5474948991158466, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.9259, + "step": 1956 + }, + { + "epoch": 3.5493085468147814, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 1957 + }, + { + "epoch": 3.5511221945137157, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 1.0299, + "step": 1958 + }, + { + "epoch": 3.5529358422126505, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.9681, + "step": 1959 + }, + { + "epoch": 3.554749489911585, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 1960 + }, + { + "epoch": 3.556563137610519, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 1961 + }, + { + "epoch": 3.558376785309454, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.9758, + "step": 1962 + }, + { + "epoch": 3.5601904330083882, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.0234, + "step": 1963 + }, + { + "epoch": 3.5620040807073226, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.0713, + "step": 1964 + }, + { + "epoch": 3.5638177284062573, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 1.0402, + "step": 1965 + }, + { + "epoch": 3.5656313761051917, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 1966 + }, + { + "epoch": 3.567445023804126, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1967 + }, + { + "epoch": 3.5692586715030608, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.0869, + "step": 1968 + }, + { + "epoch": 3.571072319201995, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.9673, + "step": 1969 + }, + { + "epoch": 3.5728859669009294, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.9459, + "step": 1970 + }, + { + "epoch": 3.574699614599864, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1971 + }, + { + "epoch": 3.5765132622987985, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 1.0311, + "step": 1972 + }, + { + "epoch": 3.578326909997733, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 1973 + }, + { + "epoch": 3.5801405576966676, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.9774, + "step": 1974 + }, + { + "epoch": 3.581954205395602, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.899, + "step": 1975 + }, + { + "epoch": 3.5837678530945363, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 1976 + }, + { + "epoch": 3.585581500793471, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1977 + }, + { + "epoch": 3.5873951484924054, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 1978 + }, + { + "epoch": 3.5892087961913397, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 1979 + }, + { + "epoch": 3.5910224438902745, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 1980 + }, + { + "epoch": 3.592836091589209, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1981 + }, + { + "epoch": 3.594649739288143, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.9295, + "step": 1982 + }, + { + "epoch": 3.596463386987078, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1983 + }, + { + "epoch": 3.5982770346860122, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.0836, + "step": 1984 + }, + { + "epoch": 3.6000906823849466, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1985 + }, + { + "epoch": 3.6019043300838813, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.9725, + "step": 1986 + }, + { + "epoch": 3.6037179777828157, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.9562, + "step": 1987 + }, + { + "epoch": 3.60553162548175, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.9289, + "step": 1988 + }, + { + "epoch": 3.6073452731806848, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.8863, + "step": 1989 + }, + { + "epoch": 3.609158920879619, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.8862, + "step": 1990 + }, + { + "epoch": 3.6109725685785534, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.89, + "step": 1991 + }, + { + "epoch": 3.612786216277488, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1992 + }, + { + "epoch": 3.6145998639764225, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 1.0044, + "step": 1993 + }, + { + "epoch": 3.616413511675357, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 1994 + }, + { + "epoch": 3.6182271593742916, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 1995 + }, + { + "epoch": 3.620040807073226, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 1996 + }, + { + "epoch": 3.6218544547721603, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 1997 + }, + { + "epoch": 3.623668102471095, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 1998 + }, + { + "epoch": 3.6254817501700294, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.7239, + "step": 1999 + }, + { + "epoch": 3.6272953978689637, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 2000 + }, + { + "epoch": 3.6291090455678985, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 2001 + }, + { + "epoch": 3.630922693266833, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 2002 + }, + { + "epoch": 3.632736340965767, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4504, + "step": 2003 + }, + { + "epoch": 3.634549988664702, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 2004 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.9629, + "step": 2005 + }, + { + "epoch": 3.6381772840625706, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 1.1605, + "step": 2006 + }, + { + "epoch": 3.6399909317615053, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2007 + }, + { + "epoch": 3.6418045794604397, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.1031, + "step": 2008 + }, + { + "epoch": 3.6436182271593744, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.867, + "step": 2009 + }, + { + "epoch": 3.6454318748583088, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.9761, + "step": 2010 + }, + { + "epoch": 3.647245522557243, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.0518, + "step": 2011 + }, + { + "epoch": 3.649059170256178, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2012 + }, + { + "epoch": 3.650872817955112, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.8938, + "step": 2013 + }, + { + "epoch": 3.652686465654047, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 2014 + }, + { + "epoch": 3.6545001133529813, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2015 + }, + { + "epoch": 3.6563137610519156, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 2016 + }, + { + "epoch": 3.6581274087508504, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 2017 + }, + { + "epoch": 3.6599410564497847, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.8929, + "step": 2018 + }, + { + "epoch": 3.661754704148719, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.8394, + "step": 2019 + }, + { + "epoch": 3.663568351847654, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.915, + "step": 2020 + }, + { + "epoch": 3.665381999546588, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.9116, + "step": 2021 + }, + { + "epoch": 3.6671956472455225, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 2022 + }, + { + "epoch": 3.6690092949444573, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2023 + }, + { + "epoch": 3.6708229426433916, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 1.0528, + "step": 2024 + }, + { + "epoch": 3.672636590342326, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2025 + }, + { + "epoch": 3.6744502380412607, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 2026 + }, + { + "epoch": 3.676263885740195, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 2027 + }, + { + "epoch": 3.6780775334391294, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 2028 + }, + { + "epoch": 3.679891181138064, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2029 + }, + { + "epoch": 3.6817048288369985, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.9945, + "step": 2030 + }, + { + "epoch": 3.683518476535933, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 2031 + }, + { + "epoch": 3.6853321242348676, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 1.0287, + "step": 2032 + }, + { + "epoch": 3.687145771933802, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.89, + "step": 2033 + }, + { + "epoch": 3.688959419632736, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 1.3357, + "step": 2034 + }, + { + "epoch": 3.690773067331671, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 2035 + }, + { + "epoch": 3.6925867150306053, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8753, + "step": 2036 + }, + { + "epoch": 3.6944003627295396, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.8871, + "step": 2037 + }, + { + "epoch": 3.6962140104284744, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.1742, + "step": 2038 + }, + { + "epoch": 3.6980276581274087, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.8656, + "step": 2039 + }, + { + "epoch": 3.699841305826343, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2040 + }, + { + "epoch": 3.701654953525278, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 2041 + }, + { + "epoch": 3.703468601224212, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 2042 + }, + { + "epoch": 3.7052822489231465, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 2043 + }, + { + "epoch": 3.7070958966220813, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 2044 + }, + { + "epoch": 3.7089095443210156, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.8356, + "step": 2045 + }, + { + "epoch": 3.71072319201995, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2046 + }, + { + "epoch": 3.7125368397188847, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.9084, + "step": 2047 + }, + { + "epoch": 3.714350487417819, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 2048 + }, + { + "epoch": 3.7161641351167534, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 2049 + }, + { + "epoch": 3.717977782815688, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2050 + }, + { + "epoch": 3.7197914305146225, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 2051 + }, + { + "epoch": 3.721605078213557, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 2052 + }, + { + "epoch": 3.7234187259124916, + "grad_norm": 0.5078125, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 2053 + }, + { + "epoch": 3.725232373611426, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 2054 + }, + { + "epoch": 3.7270460213103602, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.9707, + "step": 2055 + }, + { + "epoch": 3.728859669009295, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.9584, + "step": 2056 + }, + { + "epoch": 3.7306733167082293, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.9249, + "step": 2057 + }, + { + "epoch": 3.7306733167082293, + "eval_loss": 1.4946327209472656, + "eval_runtime": 152.4485, + "eval_samples_per_second": 6.56, + "eval_steps_per_second": 6.56, + "step": 2057 + }, + { + "epoch": 3.7306733167082293, + "mmlu_eval_accuracy": 0.348965135141256, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.375, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.53125, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.4666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.6111111111111112, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.48484848484848486, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, + "mmlu_eval_accuracy_professional_law": 0.2647058823529412, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.6818181818181818, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.6174835193242847, + "step": 2057 + }, + { + "epoch": 3.7324869644071637, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2058 + }, + { + "epoch": 3.7343006121060984, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 2059 + }, + { + "epoch": 3.7361142598050328, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9199, + "step": 2060 + }, + { + "epoch": 3.737927907503967, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.9965, + "step": 2061 + }, + { + "epoch": 3.739741555202902, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2062 + }, + { + "epoch": 3.741555202901836, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.0345, + "step": 2063 + }, + { + "epoch": 3.743368850600771, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.9197, + "step": 2064 + }, + { + "epoch": 3.7451824982997053, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 2065 + }, + { + "epoch": 3.7469961459986396, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 2066 + }, + { + "epoch": 3.7488097936975744, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.8941, + "step": 2067 + }, + { + "epoch": 3.7506234413965087, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 2068 + }, + { + "epoch": 3.752437089095443, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.9778, + "step": 2069 + }, + { + "epoch": 3.754250736794378, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 2070 + }, + { + "epoch": 3.756064384493312, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 2071 + }, + { + "epoch": 3.757878032192247, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.9469, + "step": 2072 + }, + { + "epoch": 3.7596916798911812, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 2073 + }, + { + "epoch": 3.7615053275901156, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.8461, + "step": 2074 + }, + { + "epoch": 3.7633189752890503, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 2075 + }, + { + "epoch": 3.7651326229879847, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.9269, + "step": 2076 + }, + { + "epoch": 3.766946270686919, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.8996, + "step": 2077 + }, + { + "epoch": 3.768759918385854, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.9822, + "step": 2078 + }, + { + "epoch": 3.770573566084788, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 1.1719, + "step": 2079 + }, + { + "epoch": 3.7723872137837224, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 2080 + }, + { + "epoch": 3.774200861482657, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2081 + }, + { + "epoch": 3.7760145091815915, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.8909, + "step": 2082 + }, + { + "epoch": 3.777828156880526, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 2083 + }, + { + "epoch": 3.7796418045794606, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 2084 + }, + { + "epoch": 3.781455452278395, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 1.0705, + "step": 2085 + }, + { + "epoch": 3.7832690999773293, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.9372, + "step": 2086 + }, + { + "epoch": 3.785082747676264, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2087 + }, + { + "epoch": 3.7868963953751984, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.9676, + "step": 2088 + }, + { + "epoch": 3.7887100430741327, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.9228, + "step": 2089 + }, + { + "epoch": 3.7905236907730675, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2090 + }, + { + "epoch": 3.792337338472002, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.8733, + "step": 2091 + }, + { + "epoch": 3.794150986170936, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.9626, + "step": 2092 + }, + { + "epoch": 3.795964633869871, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2093 + }, + { + "epoch": 3.7977782815688053, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 2094 + }, + { + "epoch": 3.7995919292677396, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 1.013, + "step": 2095 + }, + { + "epoch": 3.8014055769666744, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.9341, + "step": 2096 + }, + { + "epoch": 3.8032192246656087, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 2097 + }, + { + "epoch": 3.805032872364543, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 2098 + }, + { + "epoch": 3.806846520063478, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.8328, + "step": 2099 + }, + { + "epoch": 3.808660167762412, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 2100 + }, + { + "epoch": 3.8104738154613464, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 2101 + }, + { + "epoch": 3.812287463160281, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 2102 + }, + { + "epoch": 3.8141011108592155, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 2103 + }, + { + "epoch": 3.81591475855815, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.4091, + "step": 2104 + }, + { + "epoch": 3.8177284062570847, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 2105 + }, + { + "epoch": 3.819542053956019, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 1.1667, + "step": 2106 + }, + { + "epoch": 3.8213557016549533, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2107 + }, + { + "epoch": 3.823169349353888, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.984, + "step": 2108 + }, + { + "epoch": 3.8249829970528224, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 2109 + }, + { + "epoch": 3.8267966447517567, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 1.0149, + "step": 2110 + }, + { + "epoch": 3.8286102924506915, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 1.0539, + "step": 2111 + }, + { + "epoch": 3.830423940149626, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.8935, + "step": 2112 + }, + { + "epoch": 3.83223758784856, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.9301, + "step": 2113 + }, + { + "epoch": 3.834051235547495, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.8229, + "step": 2114 + }, + { + "epoch": 3.8358648832464293, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.9512, + "step": 2115 + }, + { + "epoch": 3.8376785309453636, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 2116 + }, + { + "epoch": 3.8394921786442984, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 2117 + }, + { + "epoch": 3.8413058263432327, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 2118 + }, + { + "epoch": 3.8431194740421675, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.938, + "step": 2119 + }, + { + "epoch": 3.844933121741102, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 2120 + }, + { + "epoch": 3.846746769440036, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 2121 + }, + { + "epoch": 3.848560417138971, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 2122 + }, + { + "epoch": 3.8503740648379052, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 1.0182, + "step": 2123 + }, + { + "epoch": 3.8521877125368396, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 2124 + }, + { + "epoch": 3.8540013602357743, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.9831, + "step": 2125 + }, + { + "epoch": 3.8558150079347087, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2126 + }, + { + "epoch": 3.8576286556336434, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.8875, + "step": 2127 + }, + { + "epoch": 3.8594423033325778, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 1.1782, + "step": 2128 + }, + { + "epoch": 3.861255951031512, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.9554, + "step": 2129 + }, + { + "epoch": 3.863069598730447, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.9037, + "step": 2130 + }, + { + "epoch": 3.864883246429381, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 2131 + }, + { + "epoch": 3.8666968941283155, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 1.009, + "step": 2132 + }, + { + "epoch": 3.8685105418272503, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.9102, + "step": 2133 + }, + { + "epoch": 3.8703241895261846, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 2134 + }, + { + "epoch": 3.872137837225119, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 1.1545, + "step": 2135 + }, + { + "epoch": 3.8739514849240537, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8891, + "step": 2136 + }, + { + "epoch": 3.875765132622988, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.9353, + "step": 2137 + }, + { + "epoch": 3.8775787803219224, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.8802, + "step": 2138 + }, + { + "epoch": 3.879392428020857, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 2139 + }, + { + "epoch": 3.8812060757197915, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.8451, + "step": 2140 + }, + { + "epoch": 3.883019723418726, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 2141 + }, + { + "epoch": 3.8848333711176606, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 2142 + }, + { + "epoch": 3.886647018816595, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.9642, + "step": 2143 + }, + { + "epoch": 3.8884606665155292, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 1.074, + "step": 2144 + }, + { + "epoch": 3.890274314214464, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 2145 + }, + { + "epoch": 3.8920879619133983, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2146 + }, + { + "epoch": 3.8939016096123327, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 2147 + }, + { + "epoch": 3.8957152573112674, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 2148 + }, + { + "epoch": 3.8975289050102018, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 2149 + }, + { + "epoch": 3.899342552709136, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 2150 + }, + { + "epoch": 3.901156200408071, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 2151 + }, + { + "epoch": 3.902969848107005, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 2152 + }, + { + "epoch": 3.9047834958059395, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.4649, + "step": 2153 + }, + { + "epoch": 3.9065971435048743, + "grad_norm": 0.43359375, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 2154 + }, + { + "epoch": 3.9084107912038086, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 1.1134, + "step": 2155 + }, + { + "epoch": 3.910224438902743, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 1.0215, + "step": 2156 + }, + { + "epoch": 3.9120380866016777, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.8975, + "step": 2157 + }, + { + "epoch": 3.913851734300612, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.964, + "step": 2158 + }, + { + "epoch": 3.9156653819995464, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 2159 + }, + { + "epoch": 3.917479029698481, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 1.0072, + "step": 2160 + }, + { + "epoch": 3.9192926773974155, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.9557, + "step": 2161 + }, + { + "epoch": 3.92110632509635, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.9848, + "step": 2162 + }, + { + "epoch": 3.9229199727952846, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 2163 + }, + { + "epoch": 3.924733620494219, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.993, + "step": 2164 + }, + { + "epoch": 3.9265472681931533, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 2165 + }, + { + "epoch": 3.928360915892088, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.9009, + "step": 2166 + }, + { + "epoch": 3.9301745635910224, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 2167 + }, + { + "epoch": 3.9319882112899567, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 2168 + }, + { + "epoch": 3.9338018589888915, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.9553, + "step": 2169 + }, + { + "epoch": 3.935615506687826, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 0.9159, + "step": 2170 + }, + { + "epoch": 3.93742915438676, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2171 + }, + { + "epoch": 3.939242802085695, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.942, + "step": 2172 + }, + { + "epoch": 3.941056449784629, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2173 + }, + { + "epoch": 3.9428700974835635, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.8762, + "step": 2174 + }, + { + "epoch": 3.9446837451824983, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.9233, + "step": 2175 + }, + { + "epoch": 3.9464973928814326, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 2176 + }, + { + "epoch": 3.9483110405803674, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 2177 + }, + { + "epoch": 3.9501246882793017, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2178 + }, + { + "epoch": 3.951938335978236, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 2179 + }, + { + "epoch": 3.953751983677171, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 2180 + }, + { + "epoch": 3.955565631376105, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.9693, + "step": 2181 + }, + { + "epoch": 3.95737927907504, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 2182 + }, + { + "epoch": 3.9591929267739743, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.9999, + "step": 2183 + }, + { + "epoch": 3.9610065744729086, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.9128, + "step": 2184 + }, + { + "epoch": 3.9628202221718434, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 2185 + }, + { + "epoch": 3.9646338698707777, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 1.0309, + "step": 2186 + }, + { + "epoch": 3.966447517569712, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.9088, + "step": 2187 + }, + { + "epoch": 3.968261165268647, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 1.0499, + "step": 2188 + }, + { + "epoch": 3.970074812967581, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.9558, + "step": 2189 + }, + { + "epoch": 3.9718884606665155, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.959, + "step": 2190 + }, + { + "epoch": 3.9737021083654502, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.9383, + "step": 2191 + }, + { + "epoch": 3.9755157560643846, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 2192 + }, + { + "epoch": 3.977329403763319, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.9102, + "step": 2193 + }, + { + "epoch": 3.9791430514622537, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 2194 + }, + { + "epoch": 3.980956699161188, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.8789, + "step": 2195 + }, + { + "epoch": 3.9827703468601223, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 2196 + }, + { + "epoch": 3.984583994559057, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.7263, + "step": 2197 + }, + { + "epoch": 3.9863976422579914, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2198 + }, + { + "epoch": 3.9882112899569258, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 2199 + }, + { + "epoch": 3.9900249376558605, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 2200 + }, + { + "epoch": 3.991838585354795, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 2201 + }, + { + "epoch": 3.993652233053729, + "grad_norm": 0.51171875, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 2202 + }, + { + "epoch": 3.995465880752664, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2203 + }, + { + "epoch": 3.9972795284515983, + "grad_norm": 0.4609375, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 2204 + }, + { + "epoch": 3.9990931761505326, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 2205 + }, + { + "epoch": 4.000906823849467, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.8785, + "step": 2206 + }, + { + "epoch": 4.002720471548402, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 2207 + }, + { + "epoch": 4.004534119247336, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 2208 + }, + { + "epoch": 4.006347766946271, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 2209 + }, + { + "epoch": 4.008161414645206, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 2210 + }, + { + "epoch": 4.0099750623441395, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 2211 + }, + { + "epoch": 4.011788710043074, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 2212 + }, + { + "epoch": 4.013602357742009, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.9146, + "step": 2213 + }, + { + "epoch": 4.015416005440943, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2214 + }, + { + "epoch": 4.017229653139878, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 2215 + }, + { + "epoch": 4.0190433008388124, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 2216 + }, + { + "epoch": 4.020856948537746, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2217 + }, + { + "epoch": 4.022670596236681, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 2218 + }, + { + "epoch": 4.024484243935616, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 2219 + }, + { + "epoch": 4.02629789163455, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2220 + }, + { + "epoch": 4.0281115393334845, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 2221 + }, + { + "epoch": 4.029925187032419, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 2222 + }, + { + "epoch": 4.031738834731353, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 2223 + }, + { + "epoch": 4.033552482430288, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 2224 + }, + { + "epoch": 4.035366130129223, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 2225 + }, + { + "epoch": 4.037179777828157, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 2226 + }, + { + "epoch": 4.038993425527091, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 2227 + }, + { + "epoch": 4.040807073226026, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 2228 + }, + { + "epoch": 4.04262072092496, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 2229 + }, + { + "epoch": 4.044434368623895, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 2230 + }, + { + "epoch": 4.04624801632283, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 2231 + }, + { + "epoch": 4.0480616640217635, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 2232 + }, + { + "epoch": 4.049875311720698, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 2233 + }, + { + "epoch": 4.051688959419633, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2234 + }, + { + "epoch": 4.053502607118567, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 2235 + }, + { + "epoch": 4.055316254817502, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 2236 + }, + { + "epoch": 4.0571299025164365, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 2237 + }, + { + "epoch": 4.05894355021537, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 2238 + }, + { + "epoch": 4.060757197914305, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2239 + }, + { + "epoch": 4.06257084561324, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2240 + }, + { + "epoch": 4.064384493312174, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 2241 + }, + { + "epoch": 4.0661981410111085, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 2242 + }, + { + "epoch": 4.068011788710043, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 2243 + }, + { + "epoch": 4.069825436408977, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 2244 + }, + { + "epoch": 4.069825436408977, + "eval_loss": 1.5165576934814453, + "eval_runtime": 152.5485, + "eval_samples_per_second": 6.555, + "eval_steps_per_second": 6.555, + "step": 2244 + }, + { + "epoch": 4.069825436408977, + "mmlu_eval_accuracy": 0.3178681103957788, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.4, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.391304347826087, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.45454545454545453, + "mmlu_eval_accuracy_philosophy": 0.4117647058823529, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27058823529411763, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2222222222222222, + "mmlu_eval_accuracy_world_religions": 0.15789473684210525, + "mmlu_loss": 1.707381868814036, + "step": 2244 + }, + { + "epoch": 4.071639084107912, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 2245 + }, + { + "epoch": 4.073452731806847, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 2246 + }, + { + "epoch": 4.075266379505781, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 2247 + }, + { + "epoch": 4.077080027204715, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 2248 + }, + { + "epoch": 4.07889367490365, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.4743, + "step": 2249 + }, + { + "epoch": 4.080707322602584, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 2250 + }, + { + "epoch": 4.082520970301519, + "grad_norm": 0.416015625, + "learning_rate": 0.0002, + "loss": 0.3329, + "step": 2251 + }, + { + "epoch": 4.084334618000454, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 2252 + }, + { + "epoch": 4.0861482656993875, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.3526, + "step": 2253 + }, + { + "epoch": 4.087961913398322, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 2254 + }, + { + "epoch": 4.089775561097257, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3317, + "step": 2255 + }, + { + "epoch": 4.091589208796191, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2256 + }, + { + "epoch": 4.093402856495126, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 2257 + }, + { + "epoch": 4.0952165041940605, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 2258 + }, + { + "epoch": 4.097030151892994, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.9337, + "step": 2259 + }, + { + "epoch": 4.098843799591929, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 2260 + }, + { + "epoch": 4.100657447290864, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 2261 + }, + { + "epoch": 4.102471094989799, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 2262 + }, + { + "epoch": 4.104284742688733, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 2263 + }, + { + "epoch": 4.106098390387667, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 2264 + }, + { + "epoch": 4.107912038086602, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2265 + }, + { + "epoch": 4.109725685785536, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2266 + }, + { + "epoch": 4.111539333484471, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2267 + }, + { + "epoch": 4.1133529811834055, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 2268 + }, + { + "epoch": 4.115166628882339, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 2269 + }, + { + "epoch": 4.116980276581274, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 2270 + }, + { + "epoch": 4.118793924280209, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 2271 + }, + { + "epoch": 4.120607571979143, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 2272 + }, + { + "epoch": 4.122421219678078, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 2273 + }, + { + "epoch": 4.124234867377012, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2274 + }, + { + "epoch": 4.126048515075946, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.9034, + "step": 2275 + }, + { + "epoch": 4.127862162774881, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 2276 + }, + { + "epoch": 4.129675810473816, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 2277 + }, + { + "epoch": 4.13148945817275, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2278 + }, + { + "epoch": 4.1333031058716845, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 2279 + }, + { + "epoch": 4.135116753570619, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 2280 + }, + { + "epoch": 4.136930401269553, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 2281 + }, + { + "epoch": 4.138744048968488, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.8761, + "step": 2282 + }, + { + "epoch": 4.140557696667423, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 2283 + }, + { + "epoch": 4.142371344366357, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 2284 + }, + { + "epoch": 4.144184992065291, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 2285 + }, + { + "epoch": 4.145998639764226, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 2286 + }, + { + "epoch": 4.14781228746316, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 2287 + }, + { + "epoch": 4.149625935162095, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2288 + }, + { + "epoch": 4.1514395828610295, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 2289 + }, + { + "epoch": 4.153253230559963, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 2290 + }, + { + "epoch": 4.155066878258898, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 2291 + }, + { + "epoch": 4.156880525957833, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 2292 + }, + { + "epoch": 4.158694173656767, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 2293 + }, + { + "epoch": 4.160507821355702, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 2294 + }, + { + "epoch": 4.162321469054636, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 2295 + }, + { + "epoch": 4.16413511675357, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4799, + "step": 2296 + }, + { + "epoch": 4.165948764452505, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2297 + }, + { + "epoch": 4.16776241215144, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 2298 + }, + { + "epoch": 4.169576059850374, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3407, + "step": 2299 + }, + { + "epoch": 4.1713897075493085, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 2300 + }, + { + "epoch": 4.173203355248243, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.404, + "step": 2301 + }, + { + "epoch": 4.175017002947177, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 2302 + }, + { + "epoch": 4.176830650646112, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.3843, + "step": 2303 + }, + { + "epoch": 4.178644298345047, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.3671, + "step": 2304 + }, + { + "epoch": 4.180457946043981, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3186, + "step": 2305 + }, + { + "epoch": 4.182271593742915, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 2306 + }, + { + "epoch": 4.18408524144185, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 2307 + }, + { + "epoch": 4.185898889140784, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.9741, + "step": 2308 + }, + { + "epoch": 4.187712536839719, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 2309 + }, + { + "epoch": 4.1895261845386536, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 2310 + }, + { + "epoch": 4.191339832237587, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 2311 + }, + { + "epoch": 4.193153479936522, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.8706, + "step": 2312 + }, + { + "epoch": 4.194967127635457, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 2313 + }, + { + "epoch": 4.196780775334391, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 2314 + }, + { + "epoch": 4.198594423033326, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 2315 + }, + { + "epoch": 4.20040807073226, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 2316 + }, + { + "epoch": 4.202221718431195, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 2317 + }, + { + "epoch": 4.204035366130129, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 2318 + }, + { + "epoch": 4.205849013829064, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 2319 + }, + { + "epoch": 4.207662661527999, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.9088, + "step": 2320 + }, + { + "epoch": 4.2094763092269325, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 2321 + }, + { + "epoch": 4.211289956925867, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 2322 + }, + { + "epoch": 4.213103604624802, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 2323 + }, + { + "epoch": 4.214917252323736, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 2324 + }, + { + "epoch": 4.216730900022671, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 2325 + }, + { + "epoch": 4.2185445477216055, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 2326 + }, + { + "epoch": 4.220358195420539, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 2327 + }, + { + "epoch": 4.222171843119474, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 2328 + }, + { + "epoch": 4.223985490818409, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 2329 + }, + { + "epoch": 4.225799138517343, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 2330 + }, + { + "epoch": 4.227612786216278, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.8909, + "step": 2331 + }, + { + "epoch": 4.229426433915212, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 2332 + }, + { + "epoch": 4.231240081614146, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2333 + }, + { + "epoch": 4.233053729313081, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 2334 + }, + { + "epoch": 4.234867377012016, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.9458, + "step": 2335 + }, + { + "epoch": 4.23668102471095, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 2336 + }, + { + "epoch": 4.238494672409884, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 2337 + }, + { + "epoch": 4.240308320108819, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.9315, + "step": 2338 + }, + { + "epoch": 4.242121967807753, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 2339 + }, + { + "epoch": 4.243935615506688, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 2340 + }, + { + "epoch": 4.245749263205623, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 2341 + }, + { + "epoch": 4.2475629109045565, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.4724, + "step": 2342 + }, + { + "epoch": 4.249376558603491, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 2343 + }, + { + "epoch": 4.251190206302426, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 2344 + }, + { + "epoch": 4.25300385400136, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 2345 + }, + { + "epoch": 4.254817501700295, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 2346 + }, + { + "epoch": 4.2566311493992295, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 2347 + }, + { + "epoch": 4.258444797098163, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 2348 + }, + { + "epoch": 4.260258444797098, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.4884, + "step": 2349 + }, + { + "epoch": 4.262072092496033, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 2350 + }, + { + "epoch": 4.263885740194967, + "grad_norm": 0.4609375, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 2351 + }, + { + "epoch": 4.265699387893902, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 2352 + }, + { + "epoch": 4.267513035592836, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.3639, + "step": 2353 + }, + { + "epoch": 4.26932668329177, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.3471, + "step": 2354 + }, + { + "epoch": 4.271140330990705, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3463, + "step": 2355 + }, + { + "epoch": 4.27295397868964, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 2356 + }, + { + "epoch": 4.274767626388574, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 2357 + }, + { + "epoch": 4.276581274087508, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2358 + }, + { + "epoch": 4.278394921786443, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.9819, + "step": 2359 + }, + { + "epoch": 4.280208569485377, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 2360 + }, + { + "epoch": 4.282022217184312, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 2361 + }, + { + "epoch": 4.283835864883247, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 2362 + }, + { + "epoch": 4.2856495125821805, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 2363 + }, + { + "epoch": 4.287463160281115, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 2364 + }, + { + "epoch": 4.28927680798005, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 2365 + }, + { + "epoch": 4.291090455678984, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 2366 + }, + { + "epoch": 4.292904103377919, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 2367 + }, + { + "epoch": 4.2947177510768535, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 2368 + }, + { + "epoch": 4.296531398775787, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2369 + }, + { + "epoch": 4.298345046474722, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 2370 + }, + { + "epoch": 4.300158694173657, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 2371 + }, + { + "epoch": 4.301972341872592, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 2372 + }, + { + "epoch": 4.303785989571526, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.6892, + "step": 2373 + }, + { + "epoch": 4.30559963727046, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 2374 + }, + { + "epoch": 4.307413284969394, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 2375 + }, + { + "epoch": 4.309226932668329, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2376 + }, + { + "epoch": 4.311040580367264, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2377 + }, + { + "epoch": 4.312854228066199, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 2378 + }, + { + "epoch": 4.314667875765132, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 2379 + }, + { + "epoch": 4.316481523464067, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2380 + }, + { + "epoch": 4.318295171163002, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2381 + }, + { + "epoch": 4.320108818861936, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 2382 + }, + { + "epoch": 4.321922466560871, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 2383 + }, + { + "epoch": 4.323736114259805, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 2384 + }, + { + "epoch": 4.325549761958739, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 2385 + }, + { + "epoch": 4.327363409657674, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 2386 + }, + { + "epoch": 4.329177057356609, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 2387 + }, + { + "epoch": 4.330990705055543, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 2388 + }, + { + "epoch": 4.3328043527544775, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 2389 + }, + { + "epoch": 4.334618000453412, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 2390 + }, + { + "epoch": 4.336431648152346, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 2391 + }, + { + "epoch": 4.338245295851281, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 2392 + }, + { + "epoch": 4.340058943550216, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 2393 + }, + { + "epoch": 4.34187259124915, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 2394 + }, + { + "epoch": 4.343686238948084, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 2395 + }, + { + "epoch": 4.345499886647019, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 2396 + }, + { + "epoch": 4.347313534345953, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 2397 + }, + { + "epoch": 4.349127182044888, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 2398 + }, + { + "epoch": 4.350940829743823, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 2399 + }, + { + "epoch": 4.3527544774427565, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 2400 + }, + { + "epoch": 4.354568125141691, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 2401 + }, + { + "epoch": 4.356381772840626, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3668, + "step": 2402 + }, + { + "epoch": 4.35819542053956, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.38, + "step": 2403 + }, + { + "epoch": 4.360009068238495, + "grad_norm": 0.55078125, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 2404 + }, + { + "epoch": 4.361822715937429, + "grad_norm": 0.5625, + "learning_rate": 0.0002, + "loss": 0.3988, + "step": 2405 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 2406 + }, + { + "epoch": 4.365450011335298, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.9518, + "step": 2407 + }, + { + "epoch": 4.367263659034233, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 2408 + }, + { + "epoch": 4.369077306733167, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2409 + }, + { + "epoch": 4.3708909544321015, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 2410 + }, + { + "epoch": 4.372704602131036, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.9418, + "step": 2411 + }, + { + "epoch": 4.37451824982997, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.8574, + "step": 2412 + }, + { + "epoch": 4.376331897528905, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2413 + }, + { + "epoch": 4.37814554522784, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 2414 + }, + { + "epoch": 4.379959192926774, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 2415 + }, + { + "epoch": 4.381772840625708, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 2416 + }, + { + "epoch": 4.383586488324643, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.9175, + "step": 2417 + }, + { + "epoch": 4.385400136023577, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 2418 + }, + { + "epoch": 4.387213783722512, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 2419 + }, + { + "epoch": 4.389027431421447, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 2420 + }, + { + "epoch": 4.3908410791203805, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 2421 + }, + { + "epoch": 4.392654726819315, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.8195, + "step": 2422 + }, + { + "epoch": 4.39446837451825, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.9081, + "step": 2423 + }, + { + "epoch": 4.396282022217184, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 2424 + }, + { + "epoch": 4.398095669916119, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 2425 + }, + { + "epoch": 4.399909317615053, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 2426 + }, + { + "epoch": 4.401722965313988, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 2427 + }, + { + "epoch": 4.403536613012922, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.8706, + "step": 2428 + }, + { + "epoch": 4.405350260711857, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8551, + "step": 2429 + }, + { + "epoch": 4.407163908410791, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 2430 + }, + { + "epoch": 4.4089775561097255, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 2431 + }, + { + "epoch": 4.4089775561097255, + "eval_loss": 1.4575642347335815, + "eval_runtime": 150.6655, + "eval_samples_per_second": 6.637, + "eval_steps_per_second": 6.637, + "step": 2431 + }, + { + "epoch": 4.4089775561097255, + "mmlu_eval_accuracy": 0.32540992148593395, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.5, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, + "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, + "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.18181818181818182, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, + "mmlu_eval_accuracy_miscellaneous": 0.5116279069767442, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.8006818812707917, + "step": 2431 + }, + { + "epoch": 4.41079120380866, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 2432 + }, + { + "epoch": 4.412604851507595, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 2433 + }, + { + "epoch": 4.414418499206529, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 2434 + }, + { + "epoch": 4.416232146905464, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 2435 + }, + { + "epoch": 4.4180457946043985, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 2436 + }, + { + "epoch": 4.419859442303332, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 2437 + }, + { + "epoch": 4.421673090002267, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 2438 + }, + { + "epoch": 4.423486737701202, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 2439 + }, + { + "epoch": 4.425300385400136, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 2440 + }, + { + "epoch": 4.427114033099071, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 2441 + }, + { + "epoch": 4.428927680798005, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 2442 + }, + { + "epoch": 4.430741328496939, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 2443 + }, + { + "epoch": 4.432554976195874, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 2444 + }, + { + "epoch": 4.434368623894809, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 2445 + }, + { + "epoch": 4.436182271593743, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 2446 + }, + { + "epoch": 4.4379959192926774, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 2447 + }, + { + "epoch": 4.439809566991612, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 2448 + }, + { + "epoch": 4.441623214690546, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.5099, + "step": 2449 + }, + { + "epoch": 4.443436862389481, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 2450 + }, + { + "epoch": 4.445250510088416, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 2451 + }, + { + "epoch": 4.4470641577873495, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.3815, + "step": 2452 + }, + { + "epoch": 4.448877805486284, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.399, + "step": 2453 + }, + { + "epoch": 4.450691453185219, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.3913, + "step": 2454 + }, + { + "epoch": 4.452505100884153, + "grad_norm": 0.474609375, + "learning_rate": 0.0002, + "loss": 0.3643, + "step": 2455 + }, + { + "epoch": 4.454318748583088, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 2456 + }, + { + "epoch": 4.4561323962820225, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 2457 + }, + { + "epoch": 4.457946043980956, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 2458 + }, + { + "epoch": 4.459759691679891, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 2459 + }, + { + "epoch": 4.461573339378826, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2460 + }, + { + "epoch": 4.46338698707776, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 2461 + }, + { + "epoch": 4.465200634776695, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 2462 + }, + { + "epoch": 4.467014282475629, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 2463 + }, + { + "epoch": 4.468827930174563, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2464 + }, + { + "epoch": 4.470641577873498, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 2465 + }, + { + "epoch": 4.472455225572433, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 2466 + }, + { + "epoch": 4.474268873271367, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 2467 + }, + { + "epoch": 4.4760825209703015, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.9247, + "step": 2468 + }, + { + "epoch": 4.477896168669236, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 2469 + }, + { + "epoch": 4.47970981636817, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 2470 + }, + { + "epoch": 4.481523464067105, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 2471 + }, + { + "epoch": 4.48333711176604, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 2472 + }, + { + "epoch": 4.4851507594649735, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 2473 + }, + { + "epoch": 4.486964407163908, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 2474 + }, + { + "epoch": 4.488778054862843, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 2475 + }, + { + "epoch": 4.490591702561777, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2476 + }, + { + "epoch": 4.492405350260712, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 2477 + }, + { + "epoch": 4.4942189979596465, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 2478 + }, + { + "epoch": 4.49603264565858, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 2479 + }, + { + "epoch": 4.497846293357515, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 2480 + }, + { + "epoch": 4.49965994105645, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 2481 + }, + { + "epoch": 4.501473588755385, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 2482 + }, + { + "epoch": 4.503287236454319, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2483 + }, + { + "epoch": 4.505100884153253, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 2484 + }, + { + "epoch": 4.506914531852187, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 2485 + }, + { + "epoch": 4.508728179551122, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2486 + }, + { + "epoch": 4.510541827250057, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 2487 + }, + { + "epoch": 4.512355474948992, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 2488 + }, + { + "epoch": 4.5141691226479255, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 2489 + }, + { + "epoch": 4.51598277034686, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 2490 + }, + { + "epoch": 4.517796418045794, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 2491 + }, + { + "epoch": 4.519610065744729, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 2492 + }, + { + "epoch": 4.521423713443664, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 2493 + }, + { + "epoch": 4.5232373611425984, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 2494 + }, + { + "epoch": 4.525051008841532, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 2495 + }, + { + "epoch": 4.526864656540467, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 2496 + }, + { + "epoch": 4.528678304239402, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 2497 + }, + { + "epoch": 4.530491951938336, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 2498 + }, + { + "epoch": 4.5323055996372705, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 2499 + }, + { + "epoch": 4.534119247336205, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 2500 + }, + { + "epoch": 4.535932895035139, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.515, + "step": 2501 + }, + { + "epoch": 4.537746542734074, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.4188, + "step": 2502 + }, + { + "epoch": 4.539560190433009, + "grad_norm": 0.466796875, + "learning_rate": 0.0002, + "loss": 0.399, + "step": 2503 + }, + { + "epoch": 4.541373838131943, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3524, + "step": 2504 + }, + { + "epoch": 4.543187485830877, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 2505 + }, + { + "epoch": 4.545001133529812, + "grad_norm": 0.412109375, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 2506 + }, + { + "epoch": 4.546814781228746, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 2507 + }, + { + "epoch": 4.548628428927681, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.7985, + "step": 2508 + }, + { + "epoch": 4.550442076626616, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 2509 + }, + { + "epoch": 4.5522557243255495, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.9228, + "step": 2510 + }, + { + "epoch": 4.554069372024484, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 2511 + }, + { + "epoch": 4.555883019723419, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 2512 + }, + { + "epoch": 4.557696667422353, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 2513 + }, + { + "epoch": 4.559510315121288, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 2514 + }, + { + "epoch": 4.5613239628202225, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 2515 + }, + { + "epoch": 4.563137610519156, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 2516 + }, + { + "epoch": 4.564951258218091, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 1.0007, + "step": 2517 + }, + { + "epoch": 4.566764905917026, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 2518 + }, + { + "epoch": 4.56857855361596, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 2519 + }, + { + "epoch": 4.5703922013148945, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 2520 + }, + { + "epoch": 4.572205849013829, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 2521 + }, + { + "epoch": 4.574019496712763, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.8801, + "step": 2522 + }, + { + "epoch": 4.575833144411698, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.8663, + "step": 2523 + }, + { + "epoch": 4.577646792110633, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 2524 + }, + { + "epoch": 4.579460439809567, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.924, + "step": 2525 + }, + { + "epoch": 4.581274087508501, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.8613, + "step": 2526 + }, + { + "epoch": 4.583087735207436, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 2527 + }, + { + "epoch": 4.58490138290637, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.7987, + "step": 2528 + }, + { + "epoch": 4.586715030605305, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 2529 + }, + { + "epoch": 4.58852867830424, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 2530 + }, + { + "epoch": 4.590342326003174, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 2531 + }, + { + "epoch": 4.592155973702108, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 2532 + }, + { + "epoch": 4.593969621401043, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2533 + }, + { + "epoch": 4.595783269099977, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 2534 + }, + { + "epoch": 4.597596916798912, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.8688, + "step": 2535 + }, + { + "epoch": 4.5994105644978465, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 2536 + }, + { + "epoch": 4.601224212196781, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 2537 + }, + { + "epoch": 4.603037859895715, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 2538 + }, + { + "epoch": 4.60485150759465, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 2539 + }, + { + "epoch": 4.606665155293584, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 2540 + }, + { + "epoch": 4.6084788029925186, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 2541 + }, + { + "epoch": 4.610292450691453, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 2542 + }, + { + "epoch": 4.612106098390388, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 2543 + }, + { + "epoch": 4.613919746089322, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 2544 + }, + { + "epoch": 4.615733393788257, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 2545 + }, + { + "epoch": 4.617547041487191, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 2546 + }, + { + "epoch": 4.619360689186125, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2547 + }, + { + "epoch": 4.62117433688506, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 2548 + }, + { + "epoch": 4.622987984583995, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 2549 + }, + { + "epoch": 4.624801632282929, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 2550 + }, + { + "epoch": 4.626615279981864, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 2551 + }, + { + "epoch": 4.628428927680798, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3929, + "step": 2552 + }, + { + "epoch": 4.630242575379732, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 2553 + }, + { + "epoch": 4.632056223078667, + "grad_norm": 0.44140625, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 2554 + }, + { + "epoch": 4.633869870777602, + "grad_norm": 0.4453125, + "learning_rate": 0.0002, + "loss": 0.3813, + "step": 2555 + }, + { + "epoch": 4.635683518476536, + "grad_norm": 0.4296875, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 2556 + }, + { + "epoch": 4.6374971661754705, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2557 + }, + { + "epoch": 4.639310813874405, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.9445, + "step": 2558 + }, + { + "epoch": 4.641124461573339, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 2559 + }, + { + "epoch": 4.642938109272274, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 2560 + }, + { + "epoch": 4.644751756971209, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 2561 + }, + { + "epoch": 4.646565404670143, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 2562 + }, + { + "epoch": 4.648379052369077, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 2563 + }, + { + "epoch": 4.650192700068012, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 2564 + }, + { + "epoch": 4.652006347766946, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.8965, + "step": 2565 + }, + { + "epoch": 4.653819995465881, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 2566 + }, + { + "epoch": 4.6556336431648155, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2567 + }, + { + "epoch": 4.657447290863749, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 2568 + }, + { + "epoch": 4.659260938562684, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 2569 + }, + { + "epoch": 4.661074586261619, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 2570 + }, + { + "epoch": 4.662888233960553, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 2571 + }, + { + "epoch": 4.664701881659488, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 2572 + }, + { + "epoch": 4.666515529358422, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.8562, + "step": 2573 + }, + { + "epoch": 4.668329177057356, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2574 + }, + { + "epoch": 4.670142824756291, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 2575 + }, + { + "epoch": 4.671956472455226, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2576 + }, + { + "epoch": 4.67377012015416, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.8962, + "step": 2577 + }, + { + "epoch": 4.6755837678530945, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 2578 + }, + { + "epoch": 4.677397415552029, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8294, + "step": 2579 + }, + { + "epoch": 4.679211063250963, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 2580 + }, + { + "epoch": 4.681024710949898, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.8953, + "step": 2581 + }, + { + "epoch": 4.682838358648833, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2582 + }, + { + "epoch": 4.684652006347767, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 2583 + }, + { + "epoch": 4.686465654046701, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 2584 + }, + { + "epoch": 4.688279301745636, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 2585 + }, + { + "epoch": 4.690092949444571, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 2586 + }, + { + "epoch": 4.691906597143505, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2587 + }, + { + "epoch": 4.6937202448424395, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 2588 + }, + { + "epoch": 4.695533892541373, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 2589 + }, + { + "epoch": 4.697347540240308, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 2590 + }, + { + "epoch": 4.699161187939243, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 2591 + }, + { + "epoch": 4.700974835638178, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 2592 + }, + { + "epoch": 4.702788483337112, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 2593 + }, + { + "epoch": 4.704602131036046, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 2594 + }, + { + "epoch": 4.70641577873498, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 2595 + }, + { + "epoch": 4.708229426433915, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 2596 + }, + { + "epoch": 4.71004307413285, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 2597 + }, + { + "epoch": 4.711856721831785, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 2598 + }, + { + "epoch": 4.7136703695307185, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 2599 + }, + { + "epoch": 4.715484017229653, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.4159, + "step": 2600 + }, + { + "epoch": 4.717297664928587, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4193, + "step": 2601 + }, + { + "epoch": 4.719111312627522, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 2602 + }, + { + "epoch": 4.720924960326457, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.3749, + "step": 2603 + }, + { + "epoch": 4.7227386080253915, + "grad_norm": 0.68359375, + "learning_rate": 0.0002, + "loss": 0.3681, + "step": 2604 + }, + { + "epoch": 4.724552255724325, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.3851, + "step": 2605 + }, + { + "epoch": 4.72636590342326, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.8777, + "step": 2606 + }, + { + "epoch": 4.728179551122195, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.8619, + "step": 2607 + }, + { + "epoch": 4.729993198821129, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 2608 + }, + { + "epoch": 4.731806846520064, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 2609 + }, + { + "epoch": 4.733620494218998, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.9199, + "step": 2610 + }, + { + "epoch": 4.735434141917932, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2611 + }, + { + "epoch": 4.737247789616867, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2612 + }, + { + "epoch": 4.739061437315802, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 2613 + }, + { + "epoch": 4.740875085014736, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.8941, + "step": 2614 + }, + { + "epoch": 4.74268873271367, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 2615 + }, + { + "epoch": 4.744502380412605, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 2616 + }, + { + "epoch": 4.746316028111539, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 2617 + }, + { + "epoch": 4.748129675810474, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 2618 + }, + { + "epoch": 4.748129675810474, + "eval_loss": 1.4874032735824585, + "eval_runtime": 150.6086, + "eval_samples_per_second": 6.64, + "eval_steps_per_second": 6.64, + "step": 2618 + }, + { + "epoch": 4.748129675810474, + "mmlu_eval_accuracy": 0.330382995808742, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, + "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, + "mmlu_eval_accuracy_human_aging": 0.391304347826087, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.5555555555555556, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.4883720930232558, + "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2647058823529412, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.7621390248419488, + "step": 2618 + }, + { + "epoch": 4.749943323509409, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.8619, + "step": 2619 + }, + { + "epoch": 4.7517569712083425, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.8971, + "step": 2620 + }, + { + "epoch": 4.753570618907277, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 2621 + }, + { + "epoch": 4.755384266606212, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 2622 + }, + { + "epoch": 4.757197914305146, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.933, + "step": 2623 + }, + { + "epoch": 4.759011562004081, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2624 + }, + { + "epoch": 4.7608252097030155, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.9044, + "step": 2625 + }, + { + "epoch": 4.762638857401949, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 2626 + }, + { + "epoch": 4.764452505100884, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.9009, + "step": 2627 + }, + { + "epoch": 4.766266152799819, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 2628 + }, + { + "epoch": 4.768079800498753, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2629 + }, + { + "epoch": 4.769893448197688, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 2630 + }, + { + "epoch": 4.771707095896622, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 2631 + }, + { + "epoch": 4.773520743595556, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 2632 + }, + { + "epoch": 4.775334391294491, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.888, + "step": 2633 + }, + { + "epoch": 4.777148038993426, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 2634 + }, + { + "epoch": 4.77896168669236, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.9642, + "step": 2635 + }, + { + "epoch": 4.780775334391294, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.8736, + "step": 2636 + }, + { + "epoch": 4.782588982090229, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 2637 + }, + { + "epoch": 4.784402629789163, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 2638 + }, + { + "epoch": 4.786216277488098, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 2639 + }, + { + "epoch": 4.788029925187033, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.8815, + "step": 2640 + }, + { + "epoch": 4.7898435728859665, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 2641 + }, + { + "epoch": 4.791657220584901, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 2642 + }, + { + "epoch": 4.793470868283836, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 2643 + }, + { + "epoch": 4.79528451598277, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 2644 + }, + { + "epoch": 4.797098163681705, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 2645 + }, + { + "epoch": 4.7989118113806395, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 2646 + }, + { + "epoch": 4.800725459079574, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.5053, + "step": 2647 + }, + { + "epoch": 4.802539106778508, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2648 + }, + { + "epoch": 4.804352754477443, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 2649 + }, + { + "epoch": 4.806166402176377, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 2650 + }, + { + "epoch": 4.807980049875312, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.5246, + "step": 2651 + }, + { + "epoch": 4.809793697574246, + "grad_norm": 0.427734375, + "learning_rate": 0.0002, + "loss": 0.4634, + "step": 2652 + }, + { + "epoch": 4.811607345273181, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.3014, + "step": 2653 + }, + { + "epoch": 4.813420992972115, + "grad_norm": 0.43359375, + "learning_rate": 0.0002, + "loss": 0.3712, + "step": 2654 + }, + { + "epoch": 4.81523464067105, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.4013, + "step": 2655 + }, + { + "epoch": 4.817048288369984, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 2656 + }, + { + "epoch": 4.818861936068918, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.8721, + "step": 2657 + }, + { + "epoch": 4.820675583767853, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 2658 + }, + { + "epoch": 4.822489231466788, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2659 + }, + { + "epoch": 4.824302879165722, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.9692, + "step": 2660 + }, + { + "epoch": 4.826116526864657, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 2661 + }, + { + "epoch": 4.8279301745635905, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.9721, + "step": 2662 + }, + { + "epoch": 4.829743822262525, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.847, + "step": 2663 + }, + { + "epoch": 4.83155746996146, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.8837, + "step": 2664 + }, + { + "epoch": 4.833371117660395, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.9025, + "step": 2665 + }, + { + "epoch": 4.835184765359329, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 2666 + }, + { + "epoch": 4.8369984130582635, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 2667 + }, + { + "epoch": 4.838812060757198, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2668 + }, + { + "epoch": 4.840625708456132, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.8936, + "step": 2669 + }, + { + "epoch": 4.842439356155067, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 2670 + }, + { + "epoch": 4.844253003854002, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2671 + }, + { + "epoch": 4.846066651552936, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2672 + }, + { + "epoch": 4.84788029925187, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 2673 + }, + { + "epoch": 4.849693946950805, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2674 + }, + { + "epoch": 4.851507594649739, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.9456, + "step": 2675 + }, + { + "epoch": 4.853321242348674, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2676 + }, + { + "epoch": 4.855134890047609, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 2677 + }, + { + "epoch": 4.8569485377465424, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.883, + "step": 2678 + }, + { + "epoch": 4.858762185445477, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 2679 + }, + { + "epoch": 4.860575833144412, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 2680 + }, + { + "epoch": 4.862389480843346, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 2681 + }, + { + "epoch": 4.864203128542281, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 2682 + }, + { + "epoch": 4.866016776241215, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 2683 + }, + { + "epoch": 4.867830423940149, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 2684 + }, + { + "epoch": 4.869644071639084, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 2685 + }, + { + "epoch": 4.871457719338019, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 2686 + }, + { + "epoch": 4.873271367036953, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 2687 + }, + { + "epoch": 4.8750850147358875, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 2688 + }, + { + "epoch": 4.876898662434822, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 2689 + }, + { + "epoch": 4.878712310133756, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 2690 + }, + { + "epoch": 4.880525957832691, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 2691 + }, + { + "epoch": 4.882339605531626, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 2692 + }, + { + "epoch": 4.88415325323056, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 2693 + }, + { + "epoch": 4.885966900929494, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 2694 + }, + { + "epoch": 4.887780548628429, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 2695 + }, + { + "epoch": 4.889594196327363, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 2696 + }, + { + "epoch": 4.891407844026298, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 2697 + }, + { + "epoch": 4.893221491725233, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 2698 + }, + { + "epoch": 4.8950351394241665, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 2699 + }, + { + "epoch": 4.896848787123101, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 2700 + }, + { + "epoch": 4.898662434822036, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 2701 + }, + { + "epoch": 4.900476082520971, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.4803, + "step": 2702 + }, + { + "epoch": 4.902289730219905, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.4439, + "step": 2703 + }, + { + "epoch": 4.904103377918839, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 2704 + }, + { + "epoch": 4.905917025617773, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.369, + "step": 2705 + }, + { + "epoch": 4.907730673316708, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 2706 + }, + { + "epoch": 4.909544321015643, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 1.0202, + "step": 2707 + }, + { + "epoch": 4.911357968714578, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 2708 + }, + { + "epoch": 4.9131716164135115, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 1.0192, + "step": 2709 + }, + { + "epoch": 4.914985264112446, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 2710 + }, + { + "epoch": 4.91679891181138, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 2711 + }, + { + "epoch": 4.918612559510315, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.9337, + "step": 2712 + }, + { + "epoch": 4.92042620720925, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.9774, + "step": 2713 + }, + { + "epoch": 4.9222398549081845, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.8549, + "step": 2714 + }, + { + "epoch": 4.924053502607118, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 2715 + }, + { + "epoch": 4.925867150306053, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 2716 + }, + { + "epoch": 4.927680798004987, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.9075, + "step": 2717 + }, + { + "epoch": 4.929494445703922, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 2718 + }, + { + "epoch": 4.931308093402857, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2719 + }, + { + "epoch": 4.933121741101791, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 2720 + }, + { + "epoch": 4.934935388800725, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 2721 + }, + { + "epoch": 4.93674903649966, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 2722 + }, + { + "epoch": 4.938562684198595, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2723 + }, + { + "epoch": 4.940376331897529, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 2724 + }, + { + "epoch": 4.9421899795964634, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.8719, + "step": 2725 + }, + { + "epoch": 4.944003627295398, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 2726 + }, + { + "epoch": 4.945817274994332, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.9112, + "step": 2727 + }, + { + "epoch": 4.947630922693267, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 2728 + }, + { + "epoch": 4.949444570392202, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 2729 + }, + { + "epoch": 4.9512582180911355, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.9249, + "step": 2730 + }, + { + "epoch": 4.95307186579007, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 2731 + }, + { + "epoch": 4.954885513489005, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 2732 + }, + { + "epoch": 4.956699161187939, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 2733 + }, + { + "epoch": 4.958512808886874, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 2734 + }, + { + "epoch": 4.9603264565858085, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2735 + }, + { + "epoch": 4.962140104284742, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 2736 + }, + { + "epoch": 4.963953751983677, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 2737 + }, + { + "epoch": 4.965767399682612, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 2738 + }, + { + "epoch": 4.967581047381546, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2739 + }, + { + "epoch": 4.969394695080481, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.8595, + "step": 2740 + }, + { + "epoch": 4.971208342779415, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 2741 + }, + { + "epoch": 4.973021990478349, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 2742 + }, + { + "epoch": 4.974835638177284, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 2743 + }, + { + "epoch": 4.976649285876219, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 2744 + }, + { + "epoch": 4.978462933575153, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 2745 + }, + { + "epoch": 4.9802765812740875, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 2746 + }, + { + "epoch": 4.982090228973022, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 2747 + }, + { + "epoch": 4.983903876671956, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 2748 + }, + { + "epoch": 4.985717524370891, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 2749 + }, + { + "epoch": 4.987531172069826, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.4812, + "step": 2750 + }, + { + "epoch": 4.9893448197687595, + "grad_norm": 1.0078125, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 2751 + }, + { + "epoch": 4.991158467467694, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 2752 + }, + { + "epoch": 4.992972115166629, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.4012, + "step": 2753 + }, + { + "epoch": 4.994785762865563, + "grad_norm": 0.423828125, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 2754 + }, + { + "epoch": 4.996599410564498, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.3649, + "step": 2755 + }, + { + "epoch": 4.9984130582634325, + "grad_norm": 0.482421875, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 2756 + }, + { + "epoch": 5.000226705962366, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2757 + }, + { + "epoch": 5.002040353661301, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 2758 + }, + { + "epoch": 5.003854001360236, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 2759 + }, + { + "epoch": 5.00566764905917, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 2760 + }, + { + "epoch": 5.007481296758105, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.8822, + "step": 2761 + }, + { + "epoch": 5.009294944457039, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 2762 + }, + { + "epoch": 5.011108592155973, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 2763 + }, + { + "epoch": 5.012922239854908, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 2764 + }, + { + "epoch": 5.014735887553843, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 2765 + }, + { + "epoch": 5.016549535252778, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2766 + }, + { + "epoch": 5.0183631829517115, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 2767 + }, + { + "epoch": 5.020176830650646, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 2768 + }, + { + "epoch": 5.021990478349581, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2769 + }, + { + "epoch": 5.023804126048515, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 2770 + }, + { + "epoch": 5.02561777374745, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 2771 + }, + { + "epoch": 5.027431421446384, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2772 + }, + { + "epoch": 5.029245069145318, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 2773 + }, + { + "epoch": 5.031058716844253, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 2774 + }, + { + "epoch": 5.032872364543188, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 2775 + }, + { + "epoch": 5.034686012242122, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 2776 + }, + { + "epoch": 5.0364996599410565, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 2777 + }, + { + "epoch": 5.038313307639991, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 2778 + }, + { + "epoch": 5.040126955338925, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 2779 + }, + { + "epoch": 5.04194060303786, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 2780 + }, + { + "epoch": 5.043754250736795, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 2781 + }, + { + "epoch": 5.045567898435729, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.4538, + "step": 2782 + }, + { + "epoch": 5.047381546134663, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 2783 + }, + { + "epoch": 5.049195193833598, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 2784 + }, + { + "epoch": 5.051008841532532, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 2785 + }, + { + "epoch": 5.052822489231467, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 2786 + }, + { + "epoch": 5.054636136930402, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 2787 + }, + { + "epoch": 5.0564497846293355, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.4726, + "step": 2788 + }, + { + "epoch": 5.05826343232827, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 2789 + }, + { + "epoch": 5.060077080027205, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 2790 + }, + { + "epoch": 5.061890727726139, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4408, + "step": 2791 + }, + { + "epoch": 5.063704375425074, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 2792 + }, + { + "epoch": 5.0655180231240085, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4656, + "step": 2793 + }, + { + "epoch": 5.067331670822942, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 2794 + }, + { + "epoch": 5.069145318521877, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3867, + "step": 2795 + }, + { + "epoch": 5.070958966220812, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.4781, + "step": 2796 + }, + { + "epoch": 5.072772613919746, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.3807, + "step": 2797 + }, + { + "epoch": 5.0745862616186805, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2941, + "step": 2798 + }, + { + "epoch": 5.076399909317615, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2988, + "step": 2799 + }, + { + "epoch": 5.078213557016549, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 2800 + }, + { + "epoch": 5.080027204715484, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 2801 + }, + { + "epoch": 5.081840852414419, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 2802 + }, + { + "epoch": 5.083654500113353, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 2803 + }, + { + "epoch": 5.085468147812287, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2917, + "step": 2804 + }, + { + "epoch": 5.087281795511222, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.2016, + "step": 2805 + }, + { + "epoch": 5.087281795511222, + "eval_loss": 1.688940167427063, + "eval_runtime": 153.2775, + "eval_samples_per_second": 6.524, + "eval_steps_per_second": 6.524, + "step": 2805 + }, + { + "epoch": 5.087281795511222, + "mmlu_eval_accuracy": 0.32203038527577005, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.5, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.32, + "mmlu_eval_accuracy_nutrition": 0.42424242424242425, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.3157894736842105, + "mmlu_loss": 1.6905166505988636, + "step": 2805 + }, + { + "epoch": 5.089095443210156, + "grad_norm": 0.443359375, + "learning_rate": 0.0002, + "loss": 0.3128, + "step": 2806 + }, + { + "epoch": 5.090909090909091, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 2807 + }, + { + "epoch": 5.092722738608026, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 2808 + }, + { + "epoch": 5.0945363863069595, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 2809 + }, + { + "epoch": 5.096350034005894, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 2810 + }, + { + "epoch": 5.098163681704829, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 2811 + }, + { + "epoch": 5.099977329403763, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2812 + }, + { + "epoch": 5.101790977102698, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 2813 + }, + { + "epoch": 5.1036046248016325, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 2814 + }, + { + "epoch": 5.105418272500566, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 2815 + }, + { + "epoch": 5.107231920199501, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 2816 + }, + { + "epoch": 5.109045567898436, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 2817 + }, + { + "epoch": 5.11085921559737, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 2818 + }, + { + "epoch": 5.1126728632963045, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 2819 + }, + { + "epoch": 5.114486510995239, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 2820 + }, + { + "epoch": 5.116300158694174, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 2821 + }, + { + "epoch": 5.118113806393108, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 2822 + }, + { + "epoch": 5.119927454092043, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 2823 + }, + { + "epoch": 5.1217411017909775, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 2824 + }, + { + "epoch": 5.123554749489911, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 2825 + }, + { + "epoch": 5.125368397188846, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2826 + }, + { + "epoch": 5.127182044887781, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 2827 + }, + { + "epoch": 5.128995692586715, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 2828 + }, + { + "epoch": 5.13080934028565, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2829 + }, + { + "epoch": 5.132622987984584, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 2830 + }, + { + "epoch": 5.134436635683518, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 2831 + }, + { + "epoch": 5.136250283382453, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 2832 + }, + { + "epoch": 5.138063931081388, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 2833 + }, + { + "epoch": 5.139877578780322, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 2834 + }, + { + "epoch": 5.1416912264792565, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 2835 + }, + { + "epoch": 5.143504874178191, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 2836 + }, + { + "epoch": 5.145318521877125, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 2837 + }, + { + "epoch": 5.14713216957606, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 2838 + }, + { + "epoch": 5.148945817274995, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 2839 + }, + { + "epoch": 5.150759464973929, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 2840 + }, + { + "epoch": 5.152573112672863, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 2841 + }, + { + "epoch": 5.154386760371798, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 2842 + }, + { + "epoch": 5.156200408070732, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4098, + "step": 2843 + }, + { + "epoch": 5.158014055769667, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 2844 + }, + { + "epoch": 5.1598277034686015, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3923, + "step": 2845 + }, + { + "epoch": 5.161641351167535, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4657, + "step": 2846 + }, + { + "epoch": 5.16345499886647, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4079, + "step": 2847 + }, + { + "epoch": 5.165268646565405, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 2848 + }, + { + "epoch": 5.167082294264339, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3626, + "step": 2849 + }, + { + "epoch": 5.168895941963274, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3125, + "step": 2850 + }, + { + "epoch": 5.170709589662208, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.4285, + "step": 2851 + }, + { + "epoch": 5.172523237361142, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.3113, + "step": 2852 + }, + { + "epoch": 5.174336885060077, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.3004, + "step": 2853 + }, + { + "epoch": 5.176150532759012, + "grad_norm": 0.40234375, + "learning_rate": 0.0002, + "loss": 0.2871, + "step": 2854 + }, + { + "epoch": 5.177964180457946, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 2855 + }, + { + "epoch": 5.1797778281568805, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 2856 + }, + { + "epoch": 5.181591475855815, + "grad_norm": 0.47265625, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 2857 + }, + { + "epoch": 5.183405123554749, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 2858 + }, + { + "epoch": 5.185218771253684, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 2859 + }, + { + "epoch": 5.187032418952619, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 2860 + }, + { + "epoch": 5.188846066651553, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 2861 + }, + { + "epoch": 5.190659714350487, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 2862 + }, + { + "epoch": 5.192473362049422, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 2863 + }, + { + "epoch": 5.194287009748356, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 2864 + }, + { + "epoch": 5.196100657447291, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 2865 + }, + { + "epoch": 5.1979143051462255, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 2866 + }, + { + "epoch": 5.199727952845159, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 2867 + }, + { + "epoch": 5.201541600544094, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 2868 + }, + { + "epoch": 5.203355248243029, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 2869 + }, + { + "epoch": 5.205168895941963, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 2870 + }, + { + "epoch": 5.206982543640898, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 2871 + }, + { + "epoch": 5.208796191339832, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 2872 + }, + { + "epoch": 5.210609839038767, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 2873 + }, + { + "epoch": 5.212423486737701, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 2874 + }, + { + "epoch": 5.214237134436636, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 2875 + }, + { + "epoch": 5.21605078213557, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 2876 + }, + { + "epoch": 5.2178644298345045, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 2877 + }, + { + "epoch": 5.219678077533439, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 2878 + }, + { + "epoch": 5.221491725232374, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 2879 + }, + { + "epoch": 5.223305372931308, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 2880 + }, + { + "epoch": 5.225119020630243, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 2881 + }, + { + "epoch": 5.2269326683291775, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 2882 + }, + { + "epoch": 5.228746316028111, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 2883 + }, + { + "epoch": 5.230559963727046, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 2884 + }, + { + "epoch": 5.232373611425981, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 2885 + }, + { + "epoch": 5.234187259124915, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 2886 + }, + { + "epoch": 5.2360009068238496, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 2887 + }, + { + "epoch": 5.237814554522784, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 2888 + }, + { + "epoch": 5.239628202221718, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 2889 + }, + { + "epoch": 5.241441849920653, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 2890 + }, + { + "epoch": 5.243255497619588, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 2891 + }, + { + "epoch": 5.245069145318522, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 2892 + }, + { + "epoch": 5.246882793017456, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.3565, + "step": 2893 + }, + { + "epoch": 5.248696440716391, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 2894 + }, + { + "epoch": 5.250510088415325, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 2895 + }, + { + "epoch": 5.25232373611426, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 2896 + }, + { + "epoch": 5.254137383813195, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4047, + "step": 2897 + }, + { + "epoch": 5.2559510315121285, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3857, + "step": 2898 + }, + { + "epoch": 5.257764679211063, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 2899 + }, + { + "epoch": 5.259578326909998, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.3294, + "step": 2900 + }, + { + "epoch": 5.261391974608932, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2893, + "step": 2901 + }, + { + "epoch": 5.263205622307867, + "grad_norm": 0.447265625, + "learning_rate": 0.0002, + "loss": 0.3013, + "step": 2902 + }, + { + "epoch": 5.2650192700068015, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.3008, + "step": 2903 + }, + { + "epoch": 5.266832917705735, + "grad_norm": 0.51171875, + "learning_rate": 0.0002, + "loss": 0.2993, + "step": 2904 + }, + { + "epoch": 5.26864656540467, + "grad_norm": 0.439453125, + "learning_rate": 0.0002, + "loss": 0.3575, + "step": 2905 + }, + { + "epoch": 5.270460213103605, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2373, + "step": 2906 + }, + { + "epoch": 5.272273860802539, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3382, + "step": 2907 + }, + { + "epoch": 5.274087508501474, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 2908 + }, + { + "epoch": 5.275901156200408, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.9848, + "step": 2909 + }, + { + "epoch": 5.277714803899342, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 2910 + }, + { + "epoch": 5.279528451598277, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 2911 + }, + { + "epoch": 5.281342099297212, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 2912 + }, + { + "epoch": 5.283155746996146, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 2913 + }, + { + "epoch": 5.28496939469508, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 2914 + }, + { + "epoch": 5.286783042394015, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 2915 + }, + { + "epoch": 5.288596690092949, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2916 + }, + { + "epoch": 5.290410337791884, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 2917 + }, + { + "epoch": 5.292223985490819, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 2918 + }, + { + "epoch": 5.2940376331897525, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2919 + }, + { + "epoch": 5.295851280888687, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 2920 + }, + { + "epoch": 5.297664928587622, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 2921 + }, + { + "epoch": 5.299478576286556, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 2922 + }, + { + "epoch": 5.301292223985491, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 2923 + }, + { + "epoch": 5.3031058716844255, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 2924 + }, + { + "epoch": 5.304919519383359, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 2925 + }, + { + "epoch": 5.306733167082294, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 2926 + }, + { + "epoch": 5.308546814781229, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 2927 + }, + { + "epoch": 5.310360462480164, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 2928 + }, + { + "epoch": 5.312174110179098, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 2929 + }, + { + "epoch": 5.313987757878032, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 2930 + }, + { + "epoch": 5.315801405576966, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 2931 + }, + { + "epoch": 5.317615053275901, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 2932 + }, + { + "epoch": 5.319428700974836, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 2933 + }, + { + "epoch": 5.3212423486737706, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 2934 + }, + { + "epoch": 5.323055996372704, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.4679, + "step": 2935 + }, + { + "epoch": 5.324869644071639, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2936 + }, + { + "epoch": 5.326683291770574, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 2937 + }, + { + "epoch": 5.328496939469508, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 2938 + }, + { + "epoch": 5.330310587168443, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 2939 + }, + { + "epoch": 5.332124234867377, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 2940 + }, + { + "epoch": 5.333937882566311, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.5062, + "step": 2941 + }, + { + "epoch": 5.335751530265246, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 2942 + }, + { + "epoch": 5.337565177964181, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.407, + "step": 2943 + }, + { + "epoch": 5.339378825663115, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 2944 + }, + { + "epoch": 5.3411924733620495, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 2945 + }, + { + "epoch": 5.343006121060984, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.43, + "step": 2946 + }, + { + "epoch": 5.344819768759918, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 2947 + }, + { + "epoch": 5.346633416458853, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.4881, + "step": 2948 + }, + { + "epoch": 5.348447064157788, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.3562, + "step": 2949 + }, + { + "epoch": 5.350260711856722, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 2950 + }, + { + "epoch": 5.352074359555656, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.3616, + "step": 2951 + }, + { + "epoch": 5.353888007254591, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2723, + "step": 2952 + }, + { + "epoch": 5.355701654953525, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2652, + "step": 2953 + }, + { + "epoch": 5.35751530265246, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.322, + "step": 2954 + }, + { + "epoch": 5.359328950351395, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2946, + "step": 2955 + }, + { + "epoch": 5.3611425980503284, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.2311, + "step": 2956 + }, + { + "epoch": 5.362956245749263, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 2957 + }, + { + "epoch": 5.364769893448198, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 2958 + }, + { + "epoch": 5.366583541147132, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 2959 + }, + { + "epoch": 5.368397188846067, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 2960 + }, + { + "epoch": 5.370210836545001, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 2961 + }, + { + "epoch": 5.372024484243935, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 2962 + }, + { + "epoch": 5.37383813194287, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 2963 + }, + { + "epoch": 5.375651779641805, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 2964 + }, + { + "epoch": 5.377465427340739, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 2965 + }, + { + "epoch": 5.3792790750396735, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 2966 + }, + { + "epoch": 5.381092722738608, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 2967 + }, + { + "epoch": 5.382906370437542, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 2968 + }, + { + "epoch": 5.384720018136477, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 2969 + }, + { + "epoch": 5.386533665835412, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 2970 + }, + { + "epoch": 5.388347313534346, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 2971 + }, + { + "epoch": 5.39016096123328, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 2972 + }, + { + "epoch": 5.391974608932215, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 2973 + }, + { + "epoch": 5.393788256631149, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 2974 + }, + { + "epoch": 5.395601904330084, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 2975 + }, + { + "epoch": 5.397415552029019, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 2976 + }, + { + "epoch": 5.3992291997279525, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 2977 + }, + { + "epoch": 5.401042847426887, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 2978 + }, + { + "epoch": 5.402856495125822, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 2979 + }, + { + "epoch": 5.404670142824756, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 2980 + }, + { + "epoch": 5.406483790523691, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 2981 + }, + { + "epoch": 5.408297438222625, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 2982 + }, + { + "epoch": 5.41011108592156, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 2983 + }, + { + "epoch": 5.411924733620494, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 2984 + }, + { + "epoch": 5.413738381319429, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 2985 + }, + { + "epoch": 5.415552029018363, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 2986 + }, + { + "epoch": 5.4173656767172975, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 2987 + }, + { + "epoch": 5.419179324416232, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 2988 + }, + { + "epoch": 5.420992972115167, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 2989 + }, + { + "epoch": 5.422806619814101, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 2990 + }, + { + "epoch": 5.424620267513036, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 2991 + }, + { + "epoch": 5.42643391521197, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 2992 + }, + { + "epoch": 5.42643391521197, + "eval_loss": 1.6355559825897217, + "eval_runtime": 152.5142, + "eval_samples_per_second": 6.557, + "eval_steps_per_second": 6.557, + "step": 2992 + }, + { + "epoch": 5.42643391521197, + "mmlu_eval_accuracy": 0.3209948145579392, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.5, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.5, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.64, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.6601912918860113, + "step": 2992 + }, + { + "epoch": 5.428247562910904, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 2993 + }, + { + "epoch": 5.430061210609839, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 2994 + }, + { + "epoch": 5.431874858308774, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3943, + "step": 2995 + }, + { + "epoch": 5.433688506007708, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.4089, + "step": 2996 + }, + { + "epoch": 5.435502153706643, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 2997 + }, + { + "epoch": 5.437315801405577, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.3324, + "step": 2998 + }, + { + "epoch": 5.439129449104511, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 2999 + }, + { + "epoch": 5.440943096803446, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3583, + "step": 3000 + }, + { + "epoch": 5.442756744502381, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3639, + "step": 3001 + }, + { + "epoch": 5.444570392201315, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2979, + "step": 3002 + }, + { + "epoch": 5.446384039900249, + "grad_norm": 0.4609375, + "learning_rate": 0.0002, + "loss": 0.4448, + "step": 3003 + }, + { + "epoch": 5.448197687599184, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3467, + "step": 3004 + }, + { + "epoch": 5.450011335298118, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.3168, + "step": 3005 + }, + { + "epoch": 5.451824982997053, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 0.3148, + "step": 3006 + }, + { + "epoch": 5.453638630695988, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.444, + "step": 3007 + }, + { + "epoch": 5.4554522783949215, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 3008 + }, + { + "epoch": 5.457265926093856, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 3009 + }, + { + "epoch": 5.459079573792791, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 3010 + }, + { + "epoch": 5.460893221491725, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 3011 + }, + { + "epoch": 5.46270686919066, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 3012 + }, + { + "epoch": 5.4645205168895945, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 3013 + }, + { + "epoch": 5.466334164588528, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 3014 + }, + { + "epoch": 5.468147812287463, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 3015 + }, + { + "epoch": 5.469961459986398, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 3016 + }, + { + "epoch": 5.471775107685332, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 3017 + }, + { + "epoch": 5.473588755384267, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 3018 + }, + { + "epoch": 5.475402403083201, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 3019 + }, + { + "epoch": 5.477216050782135, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 3020 + }, + { + "epoch": 5.47902969848107, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3021 + }, + { + "epoch": 5.480843346180005, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 3022 + }, + { + "epoch": 5.482656993878939, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 3023 + }, + { + "epoch": 5.4844706415778735, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 3024 + }, + { + "epoch": 5.486284289276808, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 3025 + }, + { + "epoch": 5.488097936975742, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 3026 + }, + { + "epoch": 5.489911584674677, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 3027 + }, + { + "epoch": 5.491725232373612, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 3028 + }, + { + "epoch": 5.4935388800725455, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 3029 + }, + { + "epoch": 5.49535252777148, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 3030 + }, + { + "epoch": 5.497166175470415, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 3031 + }, + { + "epoch": 5.498979823169349, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 3032 + }, + { + "epoch": 5.500793470868284, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.553, + "step": 3033 + }, + { + "epoch": 5.5026071185672185, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 3034 + }, + { + "epoch": 5.504420766266152, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 3035 + }, + { + "epoch": 5.506234413965087, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 3036 + }, + { + "epoch": 5.508048061664022, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 3037 + }, + { + "epoch": 5.509861709362957, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 3038 + }, + { + "epoch": 5.511675357061891, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 3039 + }, + { + "epoch": 5.513489004760825, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 3040 + }, + { + "epoch": 5.515302652459759, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4033, + "step": 3041 + }, + { + "epoch": 5.517116300158694, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 3042 + }, + { + "epoch": 5.518929947857629, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 3043 + }, + { + "epoch": 5.520743595556564, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3794, + "step": 3044 + }, + { + "epoch": 5.5225572432554975, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 3045 + }, + { + "epoch": 5.524370890954432, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 3046 + }, + { + "epoch": 5.526184538653366, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3034, + "step": 3047 + }, + { + "epoch": 5.527998186352301, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.4039, + "step": 3048 + }, + { + "epoch": 5.529811834051236, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.3665, + "step": 3049 + }, + { + "epoch": 5.53162548175017, + "grad_norm": 0.421875, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 3050 + }, + { + "epoch": 5.533439129449104, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.294, + "step": 3051 + }, + { + "epoch": 5.535252777148039, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 3052 + }, + { + "epoch": 5.537066424846973, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.2902, + "step": 3053 + }, + { + "epoch": 5.538880072545908, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 3054 + }, + { + "epoch": 5.5406937202448425, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.2516, + "step": 3055 + }, + { + "epoch": 5.542507367943777, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.3031, + "step": 3056 + }, + { + "epoch": 5.544321015642711, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.3205, + "step": 3057 + }, + { + "epoch": 5.546134663341646, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 3058 + }, + { + "epoch": 5.547948311040581, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3059 + }, + { + "epoch": 5.549761958739515, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 3060 + }, + { + "epoch": 5.551575606438449, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 3061 + }, + { + "epoch": 5.553389254137384, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 3062 + }, + { + "epoch": 5.555202901836318, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 3063 + }, + { + "epoch": 5.557016549535253, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 3064 + }, + { + "epoch": 5.558830197234188, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 3065 + }, + { + "epoch": 5.5606438449331215, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.769, + "step": 3066 + }, + { + "epoch": 5.562457492632056, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 3067 + }, + { + "epoch": 5.564271140330991, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 3068 + }, + { + "epoch": 5.566084788029925, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 3069 + }, + { + "epoch": 5.56789843572886, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 3070 + }, + { + "epoch": 5.5697120834277944, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 3071 + }, + { + "epoch": 5.571525731126728, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 3072 + }, + { + "epoch": 5.573339378825663, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 3073 + }, + { + "epoch": 5.575153026524598, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 3074 + }, + { + "epoch": 5.576966674223532, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 3075 + }, + { + "epoch": 5.5787803219224665, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 3076 + }, + { + "epoch": 5.580593969621401, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 3077 + }, + { + "epoch": 5.582407617320335, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 3078 + }, + { + "epoch": 5.58422126501927, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 3079 + }, + { + "epoch": 5.586034912718205, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 3080 + }, + { + "epoch": 5.587848560417139, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 3081 + }, + { + "epoch": 5.589662208116073, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 3082 + }, + { + "epoch": 5.591475855815008, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 3083 + }, + { + "epoch": 5.593289503513942, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 3084 + }, + { + "epoch": 5.595103151212877, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 3085 + }, + { + "epoch": 5.596916798911812, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 3086 + }, + { + "epoch": 5.5987304466107455, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.4721, + "step": 3087 + }, + { + "epoch": 5.60054409430968, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 3088 + }, + { + "epoch": 5.602357742008615, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 3089 + }, + { + "epoch": 5.604171389707549, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4158, + "step": 3090 + }, + { + "epoch": 5.605985037406484, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 3091 + }, + { + "epoch": 5.6077986851054185, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 3092 + }, + { + "epoch": 5.609612332804353, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.4699, + "step": 3093 + }, + { + "epoch": 5.611425980503287, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 3094 + }, + { + "epoch": 5.613239628202222, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.4384, + "step": 3095 + }, + { + "epoch": 5.615053275901156, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 3096 + }, + { + "epoch": 5.6168669236000905, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 3097 + }, + { + "epoch": 5.618680571299025, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 3098 + }, + { + "epoch": 5.62049421899796, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3523, + "step": 3099 + }, + { + "epoch": 5.622307866696894, + "grad_norm": 0.431640625, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 3100 + }, + { + "epoch": 5.624121514395829, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.3084, + "step": 3101 + }, + { + "epoch": 5.625935162094763, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 3102 + }, + { + "epoch": 5.627748809793697, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 3103 + }, + { + "epoch": 5.629562457492632, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.3161, + "step": 3104 + }, + { + "epoch": 5.631376105191567, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.3135, + "step": 3105 + }, + { + "epoch": 5.633189752890501, + "grad_norm": 0.48828125, + "learning_rate": 0.0002, + "loss": 0.3495, + "step": 3106 + }, + { + "epoch": 5.635003400589436, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 3107 + }, + { + "epoch": 5.6368170482883695, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 3108 + }, + { + "epoch": 5.638630695987304, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 3109 + }, + { + "epoch": 5.640444343686239, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 3110 + }, + { + "epoch": 5.642257991385174, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 3111 + }, + { + "epoch": 5.644071639084108, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 3112 + }, + { + "epoch": 5.6458852867830425, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 3113 + }, + { + "epoch": 5.647698934481977, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3114 + }, + { + "epoch": 5.649512582180911, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 3115 + }, + { + "epoch": 5.651326229879846, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 3116 + }, + { + "epoch": 5.653139877578781, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 3117 + }, + { + "epoch": 5.6549535252777146, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 3118 + }, + { + "epoch": 5.656767172976649, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 3119 + }, + { + "epoch": 5.658580820675584, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 3120 + }, + { + "epoch": 5.660394468374518, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 3121 + }, + { + "epoch": 5.662208116073453, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 3122 + }, + { + "epoch": 5.6640217637723875, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 3123 + }, + { + "epoch": 5.665835411471321, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 3124 + }, + { + "epoch": 5.667649059170256, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 3125 + }, + { + "epoch": 5.669462706869191, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 3126 + }, + { + "epoch": 5.671276354568125, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 3127 + }, + { + "epoch": 5.67309000226706, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7613, + "step": 3128 + }, + { + "epoch": 5.674903649965994, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 3129 + }, + { + "epoch": 5.676717297664928, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 3130 + }, + { + "epoch": 5.678530945363863, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 3131 + }, + { + "epoch": 5.680344593062798, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3132 + }, + { + "epoch": 5.682158240761732, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 3133 + }, + { + "epoch": 5.6839718884606665, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 3134 + }, + { + "epoch": 5.685785536159601, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 3135 + }, + { + "epoch": 5.687599183858535, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 3136 + }, + { + "epoch": 5.68941283155747, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 3137 + }, + { + "epoch": 5.691226479256405, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 3138 + }, + { + "epoch": 5.693040126955339, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 3139 + }, + { + "epoch": 5.694853774654273, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4127, + "step": 3140 + }, + { + "epoch": 5.696667422353208, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 3141 + }, + { + "epoch": 5.698481070052142, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4193, + "step": 3142 + }, + { + "epoch": 5.700294717751077, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.4734, + "step": 3143 + }, + { + "epoch": 5.7021083654500115, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 3144 + }, + { + "epoch": 5.703922013148945, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.4342, + "step": 3145 + }, + { + "epoch": 5.70573566084788, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.4603, + "step": 3146 + }, + { + "epoch": 5.707549308546815, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.4498, + "step": 3147 + }, + { + "epoch": 5.70936295624575, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 3148 + }, + { + "epoch": 5.711176603944684, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 3149 + }, + { + "epoch": 5.712990251643618, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3796, + "step": 3150 + }, + { + "epoch": 5.714803899342552, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.3364, + "step": 3151 + }, + { + "epoch": 5.716617547041487, + "grad_norm": 0.443359375, + "learning_rate": 0.0002, + "loss": 0.3038, + "step": 3152 + }, + { + "epoch": 5.718431194740422, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 0.2828, + "step": 3153 + }, + { + "epoch": 5.720244842439357, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3343, + "step": 3154 + }, + { + "epoch": 5.7220584901382905, + "grad_norm": 0.439453125, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 3155 + }, + { + "epoch": 5.723872137837225, + "grad_norm": 0.494140625, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 3156 + }, + { + "epoch": 5.725685785536159, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3369, + "step": 3157 + }, + { + "epoch": 5.727499433235094, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3158 + }, + { + "epoch": 5.729313080934029, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 3159 + }, + { + "epoch": 5.7311267286329635, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 3160 + }, + { + "epoch": 5.732940376331897, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 3161 + }, + { + "epoch": 5.734754024030832, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 3162 + }, + { + "epoch": 5.736567671729766, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 3163 + }, + { + "epoch": 5.738381319428701, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 3164 + }, + { + "epoch": 5.7401949671276356, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 3165 + }, + { + "epoch": 5.74200861482657, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 3166 + }, + { + "epoch": 5.743822262525504, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 3167 + }, + { + "epoch": 5.745635910224439, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.8783, + "step": 3168 + }, + { + "epoch": 5.747449557923374, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 3169 + }, + { + "epoch": 5.749263205622308, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 3170 + }, + { + "epoch": 5.751076853321242, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 3171 + }, + { + "epoch": 5.752890501020177, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 3172 + }, + { + "epoch": 5.754704148719111, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 3173 + }, + { + "epoch": 5.756517796418046, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.7987, + "step": 3174 + }, + { + "epoch": 5.758331444116981, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 3175 + }, + { + "epoch": 5.7601450918159145, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 3176 + }, + { + "epoch": 5.761958739514849, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 3177 + }, + { + "epoch": 5.763772387213784, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 3178 + }, + { + "epoch": 5.765586034912718, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 3179 + }, + { + "epoch": 5.765586034912718, + "eval_loss": 1.527501106262207, + "eval_runtime": 152.6136, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 6.552, + "step": 3179 + }, + { + "epoch": 5.765586034912718, + "mmlu_eval_accuracy": 0.3172040749795567, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.375, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.5, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, + "mmlu_eval_accuracy_miscellaneous": 0.4883720930232558, + "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, + "mmlu_eval_accuracy_moral_scenarios": 0.26, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27058823529411763, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.8536448302337503, + "step": 3179 + }, + { + "epoch": 5.767399682611653, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3180 + }, + { + "epoch": 5.7692133303105875, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 3181 + }, + { + "epoch": 5.771026978009521, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 3182 + }, + { + "epoch": 5.772840625708456, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 3183 + }, + { + "epoch": 5.774654273407391, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 3184 + }, + { + "epoch": 5.776467921106325, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 3185 + }, + { + "epoch": 5.77828156880526, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 3186 + }, + { + "epoch": 5.780095216504194, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 3187 + }, + { + "epoch": 5.781908864203128, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 3188 + }, + { + "epoch": 5.783722511902063, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 3189 + }, + { + "epoch": 5.785536159600998, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 3190 + }, + { + "epoch": 5.787349807299932, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.4316, + "step": 3191 + }, + { + "epoch": 5.789163454998866, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 3192 + }, + { + "epoch": 5.790977102697801, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.5187, + "step": 3193 + }, + { + "epoch": 5.792790750396735, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4405, + "step": 3194 + }, + { + "epoch": 5.79460439809567, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 3195 + }, + { + "epoch": 5.796418045794605, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.3249, + "step": 3196 + }, + { + "epoch": 5.7982316934935385, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.4423, + "step": 3197 + }, + { + "epoch": 5.800045341192473, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 3198 + }, + { + "epoch": 5.801858988891408, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 3199 + }, + { + "epoch": 5.803672636590342, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3399, + "step": 3200 + }, + { + "epoch": 5.805486284289277, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 3201 + }, + { + "epoch": 5.8072999319882115, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.3459, + "step": 3202 + }, + { + "epoch": 5.809113579687146, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 3203 + }, + { + "epoch": 5.81092722738608, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 3204 + }, + { + "epoch": 5.812740875085015, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2677, + "step": 3205 + }, + { + "epoch": 5.814554522783949, + "grad_norm": 0.75, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 3206 + }, + { + "epoch": 5.816368170482884, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 3207 + }, + { + "epoch": 5.818181818181818, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.9342, + "step": 3208 + }, + { + "epoch": 5.819995465880753, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 3209 + }, + { + "epoch": 5.821809113579687, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 3210 + }, + { + "epoch": 5.823622761278622, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 3211 + }, + { + "epoch": 5.825436408977556, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3212 + }, + { + "epoch": 5.82725005667649, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 3213 + }, + { + "epoch": 5.829063704375425, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 3214 + }, + { + "epoch": 5.83087735207436, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 3215 + }, + { + "epoch": 5.832690999773294, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 3216 + }, + { + "epoch": 5.834504647472229, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 3217 + }, + { + "epoch": 5.8363182951711625, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 3218 + }, + { + "epoch": 5.838131942870097, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 3219 + }, + { + "epoch": 5.839945590569032, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 3220 + }, + { + "epoch": 5.841759238267967, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 3221 + }, + { + "epoch": 5.843572885966901, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 3222 + }, + { + "epoch": 5.8453865336658355, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3223 + }, + { + "epoch": 5.84720018136477, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 3224 + }, + { + "epoch": 5.849013829063704, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 3225 + }, + { + "epoch": 5.850827476762639, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 3226 + }, + { + "epoch": 5.852641124461574, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 3227 + }, + { + "epoch": 5.854454772160508, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 3228 + }, + { + "epoch": 5.856268419859442, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 3229 + }, + { + "epoch": 5.858082067558377, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 3230 + }, + { + "epoch": 5.859895715257311, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 3231 + }, + { + "epoch": 5.861709362956246, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 3232 + }, + { + "epoch": 5.863523010655181, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 3233 + }, + { + "epoch": 5.865336658354114, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 3234 + }, + { + "epoch": 5.867150306053049, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 3235 + }, + { + "epoch": 5.868963953751984, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 3236 + }, + { + "epoch": 5.870777601450918, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 3237 + }, + { + "epoch": 5.872591249149853, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 3238 + }, + { + "epoch": 5.874404896848787, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 3239 + }, + { + "epoch": 5.876218544547721, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 3240 + }, + { + "epoch": 5.878032192246656, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 3241 + }, + { + "epoch": 5.879845839945591, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 3242 + }, + { + "epoch": 5.881659487644525, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 3243 + }, + { + "epoch": 5.8834731353434595, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.4814, + "step": 3244 + }, + { + "epoch": 5.885286783042394, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 3245 + }, + { + "epoch": 5.887100430741328, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.4286, + "step": 3246 + }, + { + "epoch": 5.888914078440263, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.3648, + "step": 3247 + }, + { + "epoch": 5.890727726139198, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3701, + "step": 3248 + }, + { + "epoch": 5.892541373838132, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3777, + "step": 3249 + }, + { + "epoch": 5.894355021537066, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.3681, + "step": 3250 + }, + { + "epoch": 5.896168669236001, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 3251 + }, + { + "epoch": 5.897982316934935, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3166, + "step": 3252 + }, + { + "epoch": 5.89979596463387, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.3459, + "step": 3253 + }, + { + "epoch": 5.901609612332805, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.2826, + "step": 3254 + }, + { + "epoch": 5.9034232600317385, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.2852, + "step": 3255 + }, + { + "epoch": 5.905236907730673, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.311, + "step": 3256 + }, + { + "epoch": 5.907050555429608, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4135, + "step": 3257 + }, + { + "epoch": 5.908864203128543, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 3258 + }, + { + "epoch": 5.910677850827477, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 3259 + }, + { + "epoch": 5.912491498526411, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 3260 + }, + { + "epoch": 5.914305146225345, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 3261 + }, + { + "epoch": 5.91611879392428, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.8639, + "step": 3262 + }, + { + "epoch": 5.917932441623215, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 3263 + }, + { + "epoch": 5.91974608932215, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.7876, + "step": 3264 + }, + { + "epoch": 5.9215597370210835, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 3265 + }, + { + "epoch": 5.923373384720018, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 3266 + }, + { + "epoch": 5.925187032418952, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 3267 + }, + { + "epoch": 5.927000680117887, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3268 + }, + { + "epoch": 5.928814327816822, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 3269 + }, + { + "epoch": 5.9306279755157565, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 3270 + }, + { + "epoch": 5.93244162321469, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 3271 + }, + { + "epoch": 5.934255270913625, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 3272 + }, + { + "epoch": 5.936068918612559, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 3273 + }, + { + "epoch": 5.937882566311494, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 3274 + }, + { + "epoch": 5.939696214010429, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 3275 + }, + { + "epoch": 5.941509861709363, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 3276 + }, + { + "epoch": 5.943323509408297, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 3277 + }, + { + "epoch": 5.945137157107232, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 3278 + }, + { + "epoch": 5.946950804806167, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 3279 + }, + { + "epoch": 5.948764452505101, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 3280 + }, + { + "epoch": 5.950578100204035, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 3281 + }, + { + "epoch": 5.95239174790297, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 3282 + }, + { + "epoch": 5.954205395601904, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 3283 + }, + { + "epoch": 5.956019043300839, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 3284 + }, + { + "epoch": 5.957832690999774, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 3285 + }, + { + "epoch": 5.9596463386987075, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 3286 + }, + { + "epoch": 5.961459986397642, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 3287 + }, + { + "epoch": 5.963273634096577, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 3288 + }, + { + "epoch": 5.965087281795511, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3289 + }, + { + "epoch": 5.966900929494446, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 3290 + }, + { + "epoch": 5.9687145771933805, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4942, + "step": 3291 + }, + { + "epoch": 5.970528224892314, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 3292 + }, + { + "epoch": 5.972341872591249, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 3293 + }, + { + "epoch": 5.974155520290184, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 3294 + }, + { + "epoch": 5.975969167989118, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 3295 + }, + { + "epoch": 5.977782815688053, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 3296 + }, + { + "epoch": 5.979596463386987, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 3297 + }, + { + "epoch": 5.981410111085921, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 3298 + }, + { + "epoch": 5.983223758784856, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 3299 + }, + { + "epoch": 5.985037406483791, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 3300 + }, + { + "epoch": 5.986851054182725, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 3301 + }, + { + "epoch": 5.9886647018816594, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2904, + "step": 3302 + }, + { + "epoch": 5.990478349580594, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.3678, + "step": 3303 + }, + { + "epoch": 5.992291997279528, + "grad_norm": 0.447265625, + "learning_rate": 0.0002, + "loss": 0.2914, + "step": 3304 + }, + { + "epoch": 5.994105644978463, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 3305 + }, + { + "epoch": 5.995919292677398, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.3094, + "step": 3306 + }, + { + "epoch": 5.9977329403763315, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 0.3865, + "step": 3307 + }, + { + "epoch": 5.999546588075266, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 3308 + }, + { + "epoch": 6.001360235774201, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 3309 + }, + { + "epoch": 6.003173883473135, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 3310 + }, + { + "epoch": 6.00498753117207, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 3311 + }, + { + "epoch": 6.0068011788710045, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 3312 + }, + { + "epoch": 6.008614826569938, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 3313 + }, + { + "epoch": 6.010428474268873, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 3314 + }, + { + "epoch": 6.012242121967808, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 3315 + }, + { + "epoch": 6.014055769666742, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 3316 + }, + { + "epoch": 6.015869417365677, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 3317 + }, + { + "epoch": 6.017683065064611, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 3318 + }, + { + "epoch": 6.019496712763545, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 3319 + }, + { + "epoch": 6.02131036046248, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 3320 + }, + { + "epoch": 6.023124008161415, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3321 + }, + { + "epoch": 6.024937655860349, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 3322 + }, + { + "epoch": 6.0267513035592835, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 3323 + }, + { + "epoch": 6.028564951258218, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 3324 + }, + { + "epoch": 6.030378598957153, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.4416, + "step": 3325 + }, + { + "epoch": 6.032192246656087, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.3846, + "step": 3326 + }, + { + "epoch": 6.034005894355022, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 3327 + }, + { + "epoch": 6.035819542053956, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.4232, + "step": 3328 + }, + { + "epoch": 6.03763318975289, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.4846, + "step": 3329 + }, + { + "epoch": 6.039446837451825, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3330 + }, + { + "epoch": 6.04126048515076, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.448, + "step": 3331 + }, + { + "epoch": 6.043074132849694, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.4066, + "step": 3332 + }, + { + "epoch": 6.0448877805486285, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 3333 + }, + { + "epoch": 6.046701428247563, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.4435, + "step": 3334 + }, + { + "epoch": 6.048515075946497, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.365, + "step": 3335 + }, + { + "epoch": 6.050328723645432, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 3336 + }, + { + "epoch": 6.052142371344367, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 3337 + }, + { + "epoch": 6.053956019043301, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.3925, + "step": 3338 + }, + { + "epoch": 6.055769666742235, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4757, + "step": 3339 + }, + { + "epoch": 6.05758331444117, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 3340 + }, + { + "epoch": 6.059396962140104, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.3031, + "step": 3341 + }, + { + "epoch": 6.061210609839039, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4714, + "step": 3342 + }, + { + "epoch": 6.063024257537974, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2986, + "step": 3343 + }, + { + "epoch": 6.0648379052369075, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2857, + "step": 3344 + }, + { + "epoch": 6.066651552935842, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2871, + "step": 3345 + }, + { + "epoch": 6.068465200634777, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.2239, + "step": 3346 + }, + { + "epoch": 6.070278848333711, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2565, + "step": 3347 + }, + { + "epoch": 6.072092496032646, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 3348 + }, + { + "epoch": 6.07390614373158, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.3227, + "step": 3349 + }, + { + "epoch": 6.075719791430514, + "grad_norm": 0.421875, + "learning_rate": 0.0002, + "loss": 0.2712, + "step": 3350 + }, + { + "epoch": 6.077533439129449, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 3351 + }, + { + "epoch": 6.079347086828384, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.2813, + "step": 3352 + }, + { + "epoch": 6.081160734527318, + "grad_norm": 0.48828125, + "learning_rate": 0.0002, + "loss": 0.2546, + "step": 3353 + }, + { + "epoch": 6.0829743822262525, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.2333, + "step": 3354 + }, + { + "epoch": 6.084788029925187, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.2, + "step": 3355 + }, + { + "epoch": 6.086601677624121, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1945, + "step": 3356 + }, + { + "epoch": 6.088415325323056, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2072, + "step": 3357 + }, + { + "epoch": 6.090228973021991, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.2059, + "step": 3358 + }, + { + "epoch": 6.092042620720925, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 3359 + }, + { + "epoch": 6.093856268419859, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 3360 + }, + { + "epoch": 6.095669916118794, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 3361 + }, + { + "epoch": 6.097483563817728, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 3362 + }, + { + "epoch": 6.099297211516663, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 3363 + }, + { + "epoch": 6.101110859215598, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 3364 + }, + { + "epoch": 6.1029245069145315, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 3365 + }, + { + "epoch": 6.104738154613466, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 3366 + }, + { + "epoch": 6.104738154613466, + "eval_loss": 1.828487515449524, + "eval_runtime": 152.5227, + "eval_samples_per_second": 6.556, + "eval_steps_per_second": 6.556, + "step": 3366 + }, + { + "epoch": 6.104738154613466, + "mmlu_eval_accuracy": 0.32565134623454167, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.375, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.09090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.47674418604651164, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.31, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.4411764705882353, + "mmlu_eval_accuracy_prehistory": 0.45714285714285713, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.15789473684210525, + "mmlu_loss": 1.4922463240224813, + "step": 3366 + }, + { + "epoch": 6.106551802312401, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 3367 + }, + { + "epoch": 6.108365450011335, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 3368 + }, + { + "epoch": 6.11017909771027, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 3369 + }, + { + "epoch": 6.1119927454092045, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 3370 + }, + { + "epoch": 6.113806393108138, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 3371 + }, + { + "epoch": 6.115620040807073, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4555, + "step": 3372 + }, + { + "epoch": 6.117433688506008, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 3373 + }, + { + "epoch": 6.119247336204942, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 3374 + }, + { + "epoch": 6.1210609839038765, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.4068, + "step": 3375 + }, + { + "epoch": 6.122874631602811, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 3376 + }, + { + "epoch": 6.124688279301745, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.3763, + "step": 3377 + }, + { + "epoch": 6.12650192700068, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 3378 + }, + { + "epoch": 6.128315574699615, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.3835, + "step": 3379 + }, + { + "epoch": 6.1301292223985495, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 3380 + }, + { + "epoch": 6.131942870097483, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 3381 + }, + { + "epoch": 6.133756517796418, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 3382 + }, + { + "epoch": 6.135570165495353, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 3383 + }, + { + "epoch": 6.137383813194287, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 3384 + }, + { + "epoch": 6.139197460893222, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4232, + "step": 3385 + }, + { + "epoch": 6.141011108592156, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 3386 + }, + { + "epoch": 6.14282475629109, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 3387 + }, + { + "epoch": 6.144638403990025, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 3388 + }, + { + "epoch": 6.14645205168896, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 3389 + }, + { + "epoch": 6.148265699387894, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.3861, + "step": 3390 + }, + { + "epoch": 6.1500793470868285, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.3382, + "step": 3391 + }, + { + "epoch": 6.151892994785763, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 3392 + }, + { + "epoch": 6.153706642484697, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3619, + "step": 3393 + }, + { + "epoch": 6.155520290183632, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3075, + "step": 3394 + }, + { + "epoch": 6.157333937882567, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 3395 + }, + { + "epoch": 6.1591475855815006, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2835, + "step": 3396 + }, + { + "epoch": 6.160961233280435, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2503, + "step": 3397 + }, + { + "epoch": 6.16277488097937, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3151, + "step": 3398 + }, + { + "epoch": 6.164588528678304, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2888, + "step": 3399 + }, + { + "epoch": 6.166402176377239, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2188, + "step": 3400 + }, + { + "epoch": 6.1682158240761735, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.2411, + "step": 3401 + }, + { + "epoch": 6.170029471775107, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.2313, + "step": 3402 + }, + { + "epoch": 6.171843119474042, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 3403 + }, + { + "epoch": 6.173656767172977, + "grad_norm": 0.490234375, + "learning_rate": 0.0002, + "loss": 0.2888, + "step": 3404 + }, + { + "epoch": 6.175470414871911, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2111, + "step": 3405 + }, + { + "epoch": 6.177284062570846, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 3406 + }, + { + "epoch": 6.17909771026978, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.2094, + "step": 3407 + }, + { + "epoch": 6.180911357968714, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 3408 + }, + { + "epoch": 6.182725005667649, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.5271, + "step": 3409 + }, + { + "epoch": 6.184538653366584, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 3410 + }, + { + "epoch": 6.186352301065518, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 3411 + }, + { + "epoch": 6.1881659487644525, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 3412 + }, + { + "epoch": 6.189979596463387, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.5095, + "step": 3413 + }, + { + "epoch": 6.191793244162321, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 3414 + }, + { + "epoch": 6.193606891861256, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.4719, + "step": 3415 + }, + { + "epoch": 6.195420539560191, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 3416 + }, + { + "epoch": 6.197234187259125, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 3417 + }, + { + "epoch": 6.199047834958059, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 3418 + }, + { + "epoch": 6.200861482656994, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 3419 + }, + { + "epoch": 6.202675130355928, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 3420 + }, + { + "epoch": 6.204488778054863, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 3421 + }, + { + "epoch": 6.2063024257537975, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 3422 + }, + { + "epoch": 6.208116073452731, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 3423 + }, + { + "epoch": 6.209929721151666, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 3424 + }, + { + "epoch": 6.211743368850601, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.3631, + "step": 3425 + }, + { + "epoch": 6.213557016549535, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 3426 + }, + { + "epoch": 6.21537066424847, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 3427 + }, + { + "epoch": 6.217184311947404, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 3428 + }, + { + "epoch": 6.218997959646338, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 3429 + }, + { + "epoch": 6.220811607345273, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 3430 + }, + { + "epoch": 6.222625255044208, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.4397, + "step": 3431 + }, + { + "epoch": 6.224438902743142, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 3432 + }, + { + "epoch": 6.2262525504420765, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 3433 + }, + { + "epoch": 6.228066198141011, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 3434 + }, + { + "epoch": 6.229879845839946, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4345, + "step": 3435 + }, + { + "epoch": 6.23169349353888, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.3918, + "step": 3436 + }, + { + "epoch": 6.233507141237815, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.4672, + "step": 3437 + }, + { + "epoch": 6.2353207889367495, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3406, + "step": 3438 + }, + { + "epoch": 6.237134436635683, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4243, + "step": 3439 + }, + { + "epoch": 6.238948084334618, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.428, + "step": 3440 + }, + { + "epoch": 6.240761732033553, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3474, + "step": 3441 + }, + { + "epoch": 6.242575379732487, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2502, + "step": 3442 + }, + { + "epoch": 6.2443890274314215, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3655, + "step": 3443 + }, + { + "epoch": 6.246202675130356, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.3002, + "step": 3444 + }, + { + "epoch": 6.24801632282929, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.2988, + "step": 3445 + }, + { + "epoch": 6.249829970528225, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.2621, + "step": 3446 + }, + { + "epoch": 6.25164361822716, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.3209, + "step": 3447 + }, + { + "epoch": 6.253457265926094, + "grad_norm": 0.44921875, + "learning_rate": 0.0002, + "loss": 0.261, + "step": 3448 + }, + { + "epoch": 6.255270913625028, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2938, + "step": 3449 + }, + { + "epoch": 6.257084561323963, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2218, + "step": 3450 + }, + { + "epoch": 6.258898209022897, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2571, + "step": 3451 + }, + { + "epoch": 6.260711856721832, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 0.2659, + "step": 3452 + }, + { + "epoch": 6.262525504420767, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1831, + "step": 3453 + }, + { + "epoch": 6.2643391521197005, + "grad_norm": 0.419921875, + "learning_rate": 0.0002, + "loss": 0.2404, + "step": 3454 + }, + { + "epoch": 6.266152799818635, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2379, + "step": 3455 + }, + { + "epoch": 6.26796644751757, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.2198, + "step": 3456 + }, + { + "epoch": 6.269780095216504, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.2498, + "step": 3457 + }, + { + "epoch": 6.271593742915439, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.2958, + "step": 3458 + }, + { + "epoch": 6.2734073906143735, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 3459 + }, + { + "epoch": 6.275221038313307, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.4564, + "step": 3460 + }, + { + "epoch": 6.277034686012242, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 3461 + }, + { + "epoch": 6.278848333711177, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 3462 + }, + { + "epoch": 6.280661981410111, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 3463 + }, + { + "epoch": 6.282475629109046, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 3464 + }, + { + "epoch": 6.28428927680798, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 3465 + }, + { + "epoch": 6.286102924506914, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 3466 + }, + { + "epoch": 6.287916572205849, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 3467 + }, + { + "epoch": 6.289730219904784, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 3468 + }, + { + "epoch": 6.291543867603718, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 3469 + }, + { + "epoch": 6.293357515302652, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 3470 + }, + { + "epoch": 6.295171163001587, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 3471 + }, + { + "epoch": 6.296984810700521, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.5387, + "step": 3472 + }, + { + "epoch": 6.298798458399456, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 3473 + }, + { + "epoch": 6.300612106098391, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 3474 + }, + { + "epoch": 6.3024257537973245, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 3475 + }, + { + "epoch": 6.304239401496259, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 3476 + }, + { + "epoch": 6.306053049195194, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 3477 + }, + { + "epoch": 6.307866696894128, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 3478 + }, + { + "epoch": 6.309680344593063, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4438, + "step": 3479 + }, + { + "epoch": 6.3114939922919975, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 3480 + }, + { + "epoch": 6.313307639990931, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3945, + "step": 3481 + }, + { + "epoch": 6.315121287689866, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 3482 + }, + { + "epoch": 6.316934935388801, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4553, + "step": 3483 + }, + { + "epoch": 6.318748583087735, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4474, + "step": 3484 + }, + { + "epoch": 6.32056223078667, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.4935, + "step": 3485 + }, + { + "epoch": 6.322375878485604, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3718, + "step": 3486 + }, + { + "epoch": 6.324189526184538, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.3483, + "step": 3487 + }, + { + "epoch": 6.326003173883473, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 3488 + }, + { + "epoch": 6.327816821582408, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4289, + "step": 3489 + }, + { + "epoch": 6.3296304692813425, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3916, + "step": 3490 + }, + { + "epoch": 6.331444116980276, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 3491 + }, + { + "epoch": 6.333257764679211, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3384, + "step": 3492 + }, + { + "epoch": 6.335071412378145, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3184, + "step": 3493 + }, + { + "epoch": 6.33688506007708, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3176, + "step": 3494 + }, + { + "epoch": 6.338698707776015, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2999, + "step": 3495 + }, + { + "epoch": 6.340512355474949, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.282, + "step": 3496 + }, + { + "epoch": 6.342326003173883, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.3148, + "step": 3497 + }, + { + "epoch": 6.344139650872818, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2881, + "step": 3498 + }, + { + "epoch": 6.345953298571753, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.3122, + "step": 3499 + }, + { + "epoch": 6.347766946270687, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 3500 + }, + { + "epoch": 6.3495805939696215, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3056, + "step": 3501 + }, + { + "epoch": 6.351394241668556, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.3153, + "step": 3502 + }, + { + "epoch": 6.35320788936749, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2661, + "step": 3503 + }, + { + "epoch": 6.355021537066425, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1925, + "step": 3504 + }, + { + "epoch": 6.35683518476536, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.2526, + "step": 3505 + }, + { + "epoch": 6.358648832464294, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.2192, + "step": 3506 + }, + { + "epoch": 6.360462480163228, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2334, + "step": 3507 + }, + { + "epoch": 6.362276127862163, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.2101, + "step": 3508 + }, + { + "epoch": 6.364089775561097, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 3509 + }, + { + "epoch": 6.365903423260032, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 3510 + }, + { + "epoch": 6.3677170709589666, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 3511 + }, + { + "epoch": 6.3695307186579, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 3512 + }, + { + "epoch": 6.371344366356835, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 3513 + }, + { + "epoch": 6.37315801405577, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 3514 + }, + { + "epoch": 6.374971661754704, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 3515 + }, + { + "epoch": 6.376785309453639, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 3516 + }, + { + "epoch": 6.378598957152573, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.3875, + "step": 3517 + }, + { + "epoch": 6.380412604851507, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 3518 + }, + { + "epoch": 6.382226252550442, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 3519 + }, + { + "epoch": 6.384039900249377, + "grad_norm": 0.41015625, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 3520 + }, + { + "epoch": 6.385853547948311, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 3521 + }, + { + "epoch": 6.3876671956472455, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 3522 + }, + { + "epoch": 6.38948084334618, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.454, + "step": 3523 + }, + { + "epoch": 6.391294491045114, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 3524 + }, + { + "epoch": 6.393108138744049, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 3525 + }, + { + "epoch": 6.394921786442984, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.4516, + "step": 3526 + }, + { + "epoch": 6.396735434141918, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.4219, + "step": 3527 + }, + { + "epoch": 6.398549081840852, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.411, + "step": 3528 + }, + { + "epoch": 6.400362729539787, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.4435, + "step": 3529 + }, + { + "epoch": 6.402176377238721, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.4528, + "step": 3530 + }, + { + "epoch": 6.403990024937656, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 3531 + }, + { + "epoch": 6.405803672636591, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 3532 + }, + { + "epoch": 6.4076173203355244, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 3533 + }, + { + "epoch": 6.409430968034459, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 3534 + }, + { + "epoch": 6.411244615733394, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 3535 + }, + { + "epoch": 6.413058263432328, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4674, + "step": 3536 + }, + { + "epoch": 6.414871911131263, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 3537 + }, + { + "epoch": 6.416685558830197, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4081, + "step": 3538 + }, + { + "epoch": 6.418499206529131, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4568, + "step": 3539 + }, + { + "epoch": 6.420312854228066, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 3540 + }, + { + "epoch": 6.422126501927001, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 3541 + }, + { + "epoch": 6.423940149625935, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3345, + "step": 3542 + }, + { + "epoch": 6.4257537973248695, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.2834, + "step": 3543 + }, + { + "epoch": 6.427567445023804, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.297, + "step": 3544 + }, + { + "epoch": 6.429381092722739, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3697, + "step": 3545 + }, + { + "epoch": 6.431194740421673, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2775, + "step": 3546 + }, + { + "epoch": 6.433008388120608, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 3547 + }, + { + "epoch": 6.434822035819542, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2348, + "step": 3548 + }, + { + "epoch": 6.436635683518476, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.3391, + "step": 3549 + }, + { + "epoch": 6.438449331217411, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2349, + "step": 3550 + }, + { + "epoch": 6.440262978916346, + "grad_norm": 0.412109375, + "learning_rate": 0.0002, + "loss": 0.3022, + "step": 3551 + }, + { + "epoch": 6.44207662661528, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.2082, + "step": 3552 + }, + { + "epoch": 6.443890274314215, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 0.2364, + "step": 3553 + }, + { + "epoch": 6.443890274314215, + "eval_loss": 1.851540207862854, + "eval_runtime": 152.4468, + "eval_samples_per_second": 6.56, + "eval_steps_per_second": 6.56, + "step": 3553 + }, + { + "epoch": 6.443890274314215, + "mmlu_eval_accuracy": 0.3136716821486766, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.375, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.5, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.4883720930232558, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.27, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.5295933871040306, + "step": 3553 + }, + { + "epoch": 6.445703922013149, + "grad_norm": 0.498046875, + "learning_rate": 0.0002, + "loss": 0.2874, + "step": 3554 + }, + { + "epoch": 6.447517569712083, + "grad_norm": 0.4453125, + "learning_rate": 0.0002, + "loss": 0.2712, + "step": 3555 + }, + { + "epoch": 6.449331217411018, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.237, + "step": 3556 + }, + { + "epoch": 6.451144865109953, + "grad_norm": 0.439453125, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 3557 + }, + { + "epoch": 6.452958512808887, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.2823, + "step": 3558 + }, + { + "epoch": 6.454772160507821, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 3559 + }, + { + "epoch": 6.456585808206756, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 3560 + }, + { + "epoch": 6.45839945590569, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.4663, + "step": 3561 + }, + { + "epoch": 6.460213103604625, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 3562 + }, + { + "epoch": 6.46202675130356, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3563 + }, + { + "epoch": 6.4638403990024935, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 3564 + }, + { + "epoch": 6.465654046701428, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 3565 + }, + { + "epoch": 6.467467694400363, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 3566 + }, + { + "epoch": 6.469281342099297, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 3567 + }, + { + "epoch": 6.471094989798232, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4613, + "step": 3568 + }, + { + "epoch": 6.4729086374971665, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 3569 + }, + { + "epoch": 6.4747222851961, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 3570 + }, + { + "epoch": 6.476535932895035, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4747, + "step": 3571 + }, + { + "epoch": 6.47834958059397, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 3572 + }, + { + "epoch": 6.480163228292904, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 3573 + }, + { + "epoch": 6.481976875991839, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 3574 + }, + { + "epoch": 6.483790523690773, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 3575 + }, + { + "epoch": 6.485604171389707, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.443, + "step": 3576 + }, + { + "epoch": 6.487417819088642, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 3577 + }, + { + "epoch": 6.489231466787577, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 3578 + }, + { + "epoch": 6.491045114486511, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 3579 + }, + { + "epoch": 6.492858762185445, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 3580 + }, + { + "epoch": 6.49467240988438, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 3581 + }, + { + "epoch": 6.496486057583314, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 3582 + }, + { + "epoch": 6.498299705282249, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.3888, + "step": 3583 + }, + { + "epoch": 6.500113352981184, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 3584 + }, + { + "epoch": 6.5019270006801175, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3801, + "step": 3585 + }, + { + "epoch": 6.503740648379052, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 3586 + }, + { + "epoch": 6.505554296077987, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.391, + "step": 3587 + }, + { + "epoch": 6.507367943776921, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 3588 + }, + { + "epoch": 6.509181591475856, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.4382, + "step": 3589 + }, + { + "epoch": 6.5109952391747905, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3144, + "step": 3590 + }, + { + "epoch": 6.512808886873724, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.4037, + "step": 3591 + }, + { + "epoch": 6.514622534572659, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.3074, + "step": 3592 + }, + { + "epoch": 6.516436182271594, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.3069, + "step": 3593 + }, + { + "epoch": 6.518249829970529, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.3283, + "step": 3594 + }, + { + "epoch": 6.520063477669463, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.327, + "step": 3595 + }, + { + "epoch": 6.521877125368397, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.2945, + "step": 3596 + }, + { + "epoch": 6.523690773067331, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2345, + "step": 3597 + }, + { + "epoch": 6.525504420766266, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2199, + "step": 3598 + }, + { + "epoch": 6.527318068465201, + "grad_norm": 0.416015625, + "learning_rate": 0.0002, + "loss": 0.3333, + "step": 3599 + }, + { + "epoch": 6.529131716164136, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.281, + "step": 3600 + }, + { + "epoch": 6.5309453638630695, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1768, + "step": 3601 + }, + { + "epoch": 6.532759011562004, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.2313, + "step": 3602 + }, + { + "epoch": 6.534572659260938, + "grad_norm": 0.55078125, + "learning_rate": 0.0002, + "loss": 0.216, + "step": 3603 + }, + { + "epoch": 6.536386306959873, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2675, + "step": 3604 + }, + { + "epoch": 6.538199954658808, + "grad_norm": 0.453125, + "learning_rate": 0.0002, + "loss": 0.2292, + "step": 3605 + }, + { + "epoch": 6.540013602357742, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1977, + "step": 3606 + }, + { + "epoch": 6.541827250056676, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2492, + "step": 3607 + }, + { + "epoch": 6.543640897755611, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2542, + "step": 3608 + }, + { + "epoch": 6.545454545454545, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 3609 + }, + { + "epoch": 6.54726819315348, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 3610 + }, + { + "epoch": 6.5490818408524145, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 3611 + }, + { + "epoch": 6.550895488551349, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4738, + "step": 3612 + }, + { + "epoch": 6.552709136250283, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.5094, + "step": 3613 + }, + { + "epoch": 6.554522783949218, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 3614 + }, + { + "epoch": 6.556336431648153, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 3615 + }, + { + "epoch": 6.558150079347087, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.476, + "step": 3616 + }, + { + "epoch": 6.559963727046021, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 3617 + }, + { + "epoch": 6.561777374744956, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 3618 + }, + { + "epoch": 6.56359102244389, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 3619 + }, + { + "epoch": 6.565404670142825, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 3620 + }, + { + "epoch": 6.56721831784176, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 3621 + }, + { + "epoch": 6.5690319655406935, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 3622 + }, + { + "epoch": 6.570845613239628, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.515, + "step": 3623 + }, + { + "epoch": 6.572659260938563, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 3624 + }, + { + "epoch": 6.574472908637497, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 3625 + }, + { + "epoch": 6.576286556336432, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 3626 + }, + { + "epoch": 6.578100204035366, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4932, + "step": 3627 + }, + { + "epoch": 6.5799138517343, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 3628 + }, + { + "epoch": 6.581727499433235, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 3629 + }, + { + "epoch": 6.58354114713217, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4605, + "step": 3630 + }, + { + "epoch": 6.585354794831104, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 3631 + }, + { + "epoch": 6.5871684425300385, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 3632 + }, + { + "epoch": 6.588982090228973, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 3633 + }, + { + "epoch": 6.590795737927907, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 3634 + }, + { + "epoch": 6.592609385626842, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3762, + "step": 3635 + }, + { + "epoch": 6.594423033325777, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4303, + "step": 3636 + }, + { + "epoch": 6.596236681024711, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 3637 + }, + { + "epoch": 6.598050328723645, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 3638 + }, + { + "epoch": 6.59986397642258, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.386, + "step": 3639 + }, + { + "epoch": 6.601677624121514, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.4247, + "step": 3640 + }, + { + "epoch": 6.603491271820449, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 3641 + }, + { + "epoch": 6.605304919519384, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4412, + "step": 3642 + }, + { + "epoch": 6.6071185672183175, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 3643 + }, + { + "epoch": 6.608932214917252, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 3644 + }, + { + "epoch": 6.610745862616187, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.3872, + "step": 3645 + }, + { + "epoch": 6.612559510315121, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.3615, + "step": 3646 + }, + { + "epoch": 6.614373158014056, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2779, + "step": 3647 + }, + { + "epoch": 6.6161868057129904, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2594, + "step": 3648 + }, + { + "epoch": 6.618000453411925, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 3649 + }, + { + "epoch": 6.619814101110859, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3106, + "step": 3650 + }, + { + "epoch": 6.621627748809794, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 3651 + }, + { + "epoch": 6.623441396508728, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1596, + "step": 3652 + }, + { + "epoch": 6.6252550442076625, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2369, + "step": 3653 + }, + { + "epoch": 6.627068691906597, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.224, + "step": 3654 + }, + { + "epoch": 6.628882339605532, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1907, + "step": 3655 + }, + { + "epoch": 6.630695987304466, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1911, + "step": 3656 + }, + { + "epoch": 6.632509635003401, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2192, + "step": 3657 + }, + { + "epoch": 6.634323282702335, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.3132, + "step": 3658 + }, + { + "epoch": 6.636136930401269, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 3659 + }, + { + "epoch": 6.637950578100204, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 3660 + }, + { + "epoch": 6.639764225799139, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 3661 + }, + { + "epoch": 6.641577873498073, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.469, + "step": 3662 + }, + { + "epoch": 6.643391521197008, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 3663 + }, + { + "epoch": 6.6452051688959415, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 3664 + }, + { + "epoch": 6.647018816594876, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 3665 + }, + { + "epoch": 6.648832464293811, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 3666 + }, + { + "epoch": 6.650646111992746, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 3667 + }, + { + "epoch": 6.65245975969168, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 3668 + }, + { + "epoch": 6.6542734073906145, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 3669 + }, + { + "epoch": 6.656087055089549, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 3670 + }, + { + "epoch": 6.657900702788483, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 3671 + }, + { + "epoch": 6.659714350487418, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 3672 + }, + { + "epoch": 6.661527998186353, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.451, + "step": 3673 + }, + { + "epoch": 6.6633416458852865, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 3674 + }, + { + "epoch": 6.665155293584221, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 3675 + }, + { + "epoch": 6.666968941283156, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 3676 + }, + { + "epoch": 6.66878258898209, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 3677 + }, + { + "epoch": 6.670596236681025, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 3678 + }, + { + "epoch": 6.6724098843799595, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 3679 + }, + { + "epoch": 6.674223532078893, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.452, + "step": 3680 + }, + { + "epoch": 6.676037179777828, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3991, + "step": 3681 + }, + { + "epoch": 6.677850827476763, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 3682 + }, + { + "epoch": 6.679664475175697, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.4214, + "step": 3683 + }, + { + "epoch": 6.681478122874632, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 3684 + }, + { + "epoch": 6.683291770573566, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 3685 + }, + { + "epoch": 6.6851054182725, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 3686 + }, + { + "epoch": 6.686919065971435, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4962, + "step": 3687 + }, + { + "epoch": 6.68873271367037, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.4095, + "step": 3688 + }, + { + "epoch": 6.690546361369304, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3753, + "step": 3689 + }, + { + "epoch": 6.6923600090682385, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4663, + "step": 3690 + }, + { + "epoch": 6.694173656767173, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 3691 + }, + { + "epoch": 6.695987304466107, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3465, + "step": 3692 + }, + { + "epoch": 6.697800952165042, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3734, + "step": 3693 + }, + { + "epoch": 6.699614599863977, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3382, + "step": 3694 + }, + { + "epoch": 6.701428247562911, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.3049, + "step": 3695 + }, + { + "epoch": 6.703241895261845, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3207, + "step": 3696 + }, + { + "epoch": 6.70505554296078, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.3054, + "step": 3697 + }, + { + "epoch": 6.706869190659714, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 3698 + }, + { + "epoch": 6.708682838358649, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 3699 + }, + { + "epoch": 6.7104964860575835, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3079, + "step": 3700 + }, + { + "epoch": 6.712310133756517, + "grad_norm": 0.4296875, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 3701 + }, + { + "epoch": 6.714123781455452, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.298, + "step": 3702 + }, + { + "epoch": 6.715937429154387, + "grad_norm": 0.4140625, + "learning_rate": 0.0002, + "loss": 0.3076, + "step": 3703 + }, + { + "epoch": 6.717751076853322, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.255, + "step": 3704 + }, + { + "epoch": 6.719564724552256, + "grad_norm": 0.40234375, + "learning_rate": 0.0002, + "loss": 0.2245, + "step": 3705 + }, + { + "epoch": 6.72137837225119, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 3706 + }, + { + "epoch": 6.723192019950124, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 3707 + }, + { + "epoch": 6.725005667649059, + "grad_norm": 0.5234375, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 3708 + }, + { + "epoch": 6.726819315347994, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 3709 + }, + { + "epoch": 6.728632963046929, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 3710 + }, + { + "epoch": 6.7304466107458625, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 3711 + }, + { + "epoch": 6.732260258444797, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 3712 + }, + { + "epoch": 6.734073906143731, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 3713 + }, + { + "epoch": 6.735887553842666, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 3714 + }, + { + "epoch": 6.737701201541601, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 3715 + }, + { + "epoch": 6.7395148492405355, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 3716 + }, + { + "epoch": 6.741328496939469, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 3717 + }, + { + "epoch": 6.743142144638404, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 3718 + }, + { + "epoch": 6.744955792337338, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 3719 + }, + { + "epoch": 6.746769440036273, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 3720 + }, + { + "epoch": 6.7485830877352075, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 3721 + }, + { + "epoch": 6.750396735434142, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 3722 + }, + { + "epoch": 6.752210383133076, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 3723 + }, + { + "epoch": 6.754024030832011, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 3724 + }, + { + "epoch": 6.755837678530946, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 3725 + }, + { + "epoch": 6.75765132622988, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 3726 + }, + { + "epoch": 6.759464973928814, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.4154, + "step": 3727 + }, + { + "epoch": 6.761278621627749, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.4711, + "step": 3728 + }, + { + "epoch": 6.763092269326683, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 3729 + }, + { + "epoch": 6.764905917025618, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 3730 + }, + { + "epoch": 6.766719564724553, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 3731 + }, + { + "epoch": 6.7685332124234865, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 3732 + }, + { + "epoch": 6.770346860122421, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 3733 + }, + { + "epoch": 6.772160507821356, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 3734 + }, + { + "epoch": 6.77397415552029, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 3735 + }, + { + "epoch": 6.775787803219225, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.4734, + "step": 3736 + }, + { + "epoch": 6.7776014509181595, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.4199, + "step": 3737 + }, + { + "epoch": 6.779415098617093, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.3858, + "step": 3738 + }, + { + "epoch": 6.781228746316028, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 3739 + }, + { + "epoch": 6.783042394014963, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3734, + "step": 3740 + }, + { + "epoch": 6.783042394014963, + "eval_loss": 1.7052702903747559, + "eval_runtime": 152.5593, + "eval_samples_per_second": 6.555, + "eval_steps_per_second": 6.555, + "step": 3740 + }, + { + "epoch": 6.783042394014963, + "mmlu_eval_accuracy": 0.30854550260425556, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.4, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.5, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.23, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.7823183545873182, + "step": 3740 + }, + { + "epoch": 6.784856041713897, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 3741 + }, + { + "epoch": 6.7866696894128316, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.3801, + "step": 3742 + }, + { + "epoch": 6.788483337111766, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3219, + "step": 3743 + }, + { + "epoch": 6.7902969848107, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.338, + "step": 3744 + }, + { + "epoch": 6.792110632509635, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2813, + "step": 3745 + }, + { + "epoch": 6.79392428020857, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.299, + "step": 3746 + }, + { + "epoch": 6.795737927907504, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.2882, + "step": 3747 + }, + { + "epoch": 6.797551575606438, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.2668, + "step": 3748 + }, + { + "epoch": 6.799365223305373, + "grad_norm": 0.4140625, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 3749 + }, + { + "epoch": 6.801178871004307, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2365, + "step": 3750 + }, + { + "epoch": 6.802992518703242, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3174, + "step": 3751 + }, + { + "epoch": 6.804806166402177, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.2899, + "step": 3752 + }, + { + "epoch": 6.8066198141011105, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.2595, + "step": 3753 + }, + { + "epoch": 6.808433461800045, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.225, + "step": 3754 + }, + { + "epoch": 6.81024710949898, + "grad_norm": 0.431640625, + "learning_rate": 0.0002, + "loss": 0.2247, + "step": 3755 + }, + { + "epoch": 6.812060757197914, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2317, + "step": 3756 + }, + { + "epoch": 6.813874404896849, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 3757 + }, + { + "epoch": 6.8156880525957835, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 3758 + }, + { + "epoch": 6.817501700294718, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 3759 + }, + { + "epoch": 6.819315347993652, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 3760 + }, + { + "epoch": 6.821128995692587, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 3761 + }, + { + "epoch": 6.822942643391521, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 3762 + }, + { + "epoch": 6.824756291090456, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 3763 + }, + { + "epoch": 6.82656993878939, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 3764 + }, + { + "epoch": 6.828383586488325, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 3765 + }, + { + "epoch": 6.830197234187259, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 3766 + }, + { + "epoch": 6.832010881886194, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 3767 + }, + { + "epoch": 6.833824529585128, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.6171, + "step": 3768 + }, + { + "epoch": 6.835638177284062, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 3769 + }, + { + "epoch": 6.837451824982997, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 3770 + }, + { + "epoch": 6.839265472681932, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 3771 + }, + { + "epoch": 6.841079120380866, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 3772 + }, + { + "epoch": 6.842892768079801, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.4937, + "step": 3773 + }, + { + "epoch": 6.8447064157787345, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 3774 + }, + { + "epoch": 6.846520063477669, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 3775 + }, + { + "epoch": 6.848333711176604, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 3776 + }, + { + "epoch": 6.850147358875539, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 3777 + }, + { + "epoch": 6.851961006574473, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4826, + "step": 3778 + }, + { + "epoch": 6.8537746542734075, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 3779 + }, + { + "epoch": 6.855588301972342, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 3780 + }, + { + "epoch": 6.857401949671276, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3984, + "step": 3781 + }, + { + "epoch": 6.859215597370211, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 3782 + }, + { + "epoch": 6.861029245069146, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4389, + "step": 3783 + }, + { + "epoch": 6.86284289276808, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 3784 + }, + { + "epoch": 6.864656540467014, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 3785 + }, + { + "epoch": 6.866470188165949, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.4734, + "step": 3786 + }, + { + "epoch": 6.868283835864883, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 3787 + }, + { + "epoch": 6.870097483563818, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4162, + "step": 3788 + }, + { + "epoch": 6.8719111312627525, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3652, + "step": 3789 + }, + { + "epoch": 6.873724778961686, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.378, + "step": 3790 + }, + { + "epoch": 6.875538426660621, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4785, + "step": 3791 + }, + { + "epoch": 6.877352074359556, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3805, + "step": 3792 + }, + { + "epoch": 6.87916572205849, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.3499, + "step": 3793 + }, + { + "epoch": 6.880979369757425, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3179, + "step": 3794 + }, + { + "epoch": 6.882793017456359, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 3795 + }, + { + "epoch": 6.884606665155293, + "grad_norm": 0.423828125, + "learning_rate": 0.0002, + "loss": 0.3354, + "step": 3796 + }, + { + "epoch": 6.886420312854228, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.362, + "step": 3797 + }, + { + "epoch": 6.888233960553163, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.2806, + "step": 3798 + }, + { + "epoch": 6.890047608252097, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.3204, + "step": 3799 + }, + { + "epoch": 6.8918612559510315, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.2414, + "step": 3800 + }, + { + "epoch": 6.893674903649966, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.3342, + "step": 3801 + }, + { + "epoch": 6.8954885513489, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 3802 + }, + { + "epoch": 6.897302199047835, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 3803 + }, + { + "epoch": 6.89911584674677, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2169, + "step": 3804 + }, + { + "epoch": 6.900929494445704, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.3094, + "step": 3805 + }, + { + "epoch": 6.902743142144638, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2347, + "step": 3806 + }, + { + "epoch": 6.904556789843573, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2232, + "step": 3807 + }, + { + "epoch": 6.906370437542507, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2621, + "step": 3808 + }, + { + "epoch": 6.908184085241442, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 3809 + }, + { + "epoch": 6.909997732940377, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 3810 + }, + { + "epoch": 6.91181138063931, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 3811 + }, + { + "epoch": 6.913625028338245, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 3812 + }, + { + "epoch": 6.91543867603718, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 3813 + }, + { + "epoch": 6.917252323736114, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 3814 + }, + { + "epoch": 6.919065971435049, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 3815 + }, + { + "epoch": 6.920879619133983, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 3816 + }, + { + "epoch": 6.922693266832917, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 3817 + }, + { + "epoch": 6.924506914531852, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 3818 + }, + { + "epoch": 6.926320562230787, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 3819 + }, + { + "epoch": 6.928134209929722, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 3820 + }, + { + "epoch": 6.9299478576286555, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 3821 + }, + { + "epoch": 6.93176150532759, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 3822 + }, + { + "epoch": 6.933575153026524, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 3823 + }, + { + "epoch": 6.935388800725459, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 3824 + }, + { + "epoch": 6.937202448424394, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 3825 + }, + { + "epoch": 6.9390160961233285, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 3826 + }, + { + "epoch": 6.940829743822262, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5158, + "step": 3827 + }, + { + "epoch": 6.942643391521197, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 3828 + }, + { + "epoch": 6.944457039220131, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 3829 + }, + { + "epoch": 6.946270686919066, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 3830 + }, + { + "epoch": 6.948084334618001, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.5016, + "step": 3831 + }, + { + "epoch": 6.949897982316935, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 3832 + }, + { + "epoch": 6.951711630015869, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4368, + "step": 3833 + }, + { + "epoch": 6.953525277714804, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 3834 + }, + { + "epoch": 6.955338925413738, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.4785, + "step": 3835 + }, + { + "epoch": 6.957152573112673, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.4228, + "step": 3836 + }, + { + "epoch": 6.958966220811607, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.4652, + "step": 3837 + }, + { + "epoch": 6.960779868510542, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.3987, + "step": 3838 + }, + { + "epoch": 6.962593516209476, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 3839 + }, + { + "epoch": 6.964407163908411, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3898, + "step": 3840 + }, + { + "epoch": 6.966220811607346, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.413, + "step": 3841 + }, + { + "epoch": 6.9680344593062795, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3759, + "step": 3842 + }, + { + "epoch": 6.969848107005214, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3616, + "step": 3843 + }, + { + "epoch": 6.971661754704149, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 3844 + }, + { + "epoch": 6.973475402403083, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 3845 + }, + { + "epoch": 6.975289050102018, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.3682, + "step": 3846 + }, + { + "epoch": 6.9771026978009525, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.2838, + "step": 3847 + }, + { + "epoch": 6.978916345499886, + "grad_norm": 0.423828125, + "learning_rate": 0.0002, + "loss": 0.3199, + "step": 3848 + }, + { + "epoch": 6.980729993198821, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.2667, + "step": 3849 + }, + { + "epoch": 6.982543640897756, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2194, + "step": 3850 + }, + { + "epoch": 6.98435728859669, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 3851 + }, + { + "epoch": 6.986170936295625, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.2165, + "step": 3852 + }, + { + "epoch": 6.987984583994559, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.307, + "step": 3853 + }, + { + "epoch": 6.989798231693493, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.2486, + "step": 3854 + }, + { + "epoch": 6.991611879392428, + "grad_norm": 0.470703125, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 3855 + }, + { + "epoch": 6.993425527091363, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2224, + "step": 3856 + }, + { + "epoch": 6.995239174790297, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2421, + "step": 3857 + }, + { + "epoch": 6.997052822489231, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2828, + "step": 3858 + }, + { + "epoch": 6.998866470188166, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.5361, + "step": 3859 + }, + { + "epoch": 7.0006801178871, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4449, + "step": 3860 + }, + { + "epoch": 7.002493765586035, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 3861 + }, + { + "epoch": 7.00430741328497, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 3862 + }, + { + "epoch": 7.0061210609839035, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 3863 + }, + { + "epoch": 7.007934708682838, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 3864 + }, + { + "epoch": 7.009748356381773, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.3952, + "step": 3865 + }, + { + "epoch": 7.011562004080707, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.3475, + "step": 3866 + }, + { + "epoch": 7.013375651779642, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 3867 + }, + { + "epoch": 7.0151892994785765, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 3868 + }, + { + "epoch": 7.01700294717751, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.4472, + "step": 3869 + }, + { + "epoch": 7.018816594876445, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 3870 + }, + { + "epoch": 7.02063024257538, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 3871 + }, + { + "epoch": 7.022443890274314, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.379, + "step": 3872 + }, + { + "epoch": 7.024257537973249, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 3873 + }, + { + "epoch": 7.026071185672183, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3009, + "step": 3874 + }, + { + "epoch": 7.027884833371117, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 3875 + }, + { + "epoch": 7.029698481070052, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 3876 + }, + { + "epoch": 7.031512128768987, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4456, + "step": 3877 + }, + { + "epoch": 7.033325776467921, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3899, + "step": 3878 + }, + { + "epoch": 7.0351394241668554, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.3628, + "step": 3879 + }, + { + "epoch": 7.03695307186579, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3407, + "step": 3880 + }, + { + "epoch": 7.038766719564725, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4307, + "step": 3881 + }, + { + "epoch": 7.040580367263659, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 3882 + }, + { + "epoch": 7.042394014962594, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.4277, + "step": 3883 + }, + { + "epoch": 7.044207662661528, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4136, + "step": 3884 + }, + { + "epoch": 7.046021310360462, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2832, + "step": 3885 + }, + { + "epoch": 7.047834958059397, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3391, + "step": 3886 + }, + { + "epoch": 7.049648605758332, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 3887 + }, + { + "epoch": 7.051462253457266, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3296, + "step": 3888 + }, + { + "epoch": 7.0532759011562005, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3396, + "step": 3889 + }, + { + "epoch": 7.055089548855135, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3082, + "step": 3890 + }, + { + "epoch": 7.056903196554069, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2124, + "step": 3891 + }, + { + "epoch": 7.058716844253004, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.2194, + "step": 3892 + }, + { + "epoch": 7.060530491951939, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 3893 + }, + { + "epoch": 7.062344139650873, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2155, + "step": 3894 + }, + { + "epoch": 7.064157787349807, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 3895 + }, + { + "epoch": 7.065971435048742, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2127, + "step": 3896 + }, + { + "epoch": 7.067785082747676, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2302, + "step": 3897 + }, + { + "epoch": 7.069598730446611, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.3067, + "step": 3898 + }, + { + "epoch": 7.071412378145546, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2351, + "step": 3899 + }, + { + "epoch": 7.0732260258444795, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1856, + "step": 3900 + }, + { + "epoch": 7.075039673543414, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.2246, + "step": 3901 + }, + { + "epoch": 7.076853321242349, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1763, + "step": 3902 + }, + { + "epoch": 7.078666968941283, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1958, + "step": 3903 + }, + { + "epoch": 7.080480616640218, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2119, + "step": 3904 + }, + { + "epoch": 7.082294264339152, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1888, + "step": 3905 + }, + { + "epoch": 7.084107912038086, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1836, + "step": 3906 + }, + { + "epoch": 7.085921559737021, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.2052, + "step": 3907 + }, + { + "epoch": 7.087735207435956, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1677, + "step": 3908 + }, + { + "epoch": 7.08954885513489, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1932, + "step": 3909 + }, + { + "epoch": 7.0913625028338245, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.364, + "step": 3910 + }, + { + "epoch": 7.093176150532759, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.4824, + "step": 3911 + }, + { + "epoch": 7.094989798231693, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.3873, + "step": 3912 + }, + { + "epoch": 7.096803445930628, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.3641, + "step": 3913 + }, + { + "epoch": 7.098617093629563, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 3914 + }, + { + "epoch": 7.100430741328497, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.4088, + "step": 3915 + }, + { + "epoch": 7.102244389027431, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 3916 + }, + { + "epoch": 7.104058036726366, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.3839, + "step": 3917 + }, + { + "epoch": 7.1058716844253, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 3918 + }, + { + "epoch": 7.107685332124235, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 3919 + }, + { + "epoch": 7.10949897982317, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3687, + "step": 3920 + }, + { + "epoch": 7.1113126275221035, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3543, + "step": 3921 + }, + { + "epoch": 7.113126275221038, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3959, + "step": 3922 + }, + { + "epoch": 7.114939922919973, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 3923 + }, + { + "epoch": 7.116753570618907, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3188, + "step": 3924 + }, + { + "epoch": 7.118567218317842, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 3925 + }, + { + "epoch": 7.1203808660167764, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4289, + "step": 3926 + }, + { + "epoch": 7.12219451371571, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3691, + "step": 3927 + }, + { + "epoch": 7.12219451371571, + "eval_loss": 1.8441636562347412, + "eval_runtime": 152.9087, + "eval_samples_per_second": 6.54, + "eval_steps_per_second": 6.54, + "step": 3927 + }, + { + "epoch": 7.12219451371571, + "mmlu_eval_accuracy": 0.304247740420161, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.375, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.5, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.21052631578947367, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.2857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.22941176470588234, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.841658030048373, + "step": 3927 + }, + { + "epoch": 7.124008161414645, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.38, + "step": 3928 + }, + { + "epoch": 7.12582180911358, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3804, + "step": 3929 + }, + { + "epoch": 7.127635456812514, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3611, + "step": 3930 + }, + { + "epoch": 7.1294491045114485, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3541, + "step": 3931 + }, + { + "epoch": 7.131262752210383, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 3932 + }, + { + "epoch": 7.133076399909317, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 3933 + }, + { + "epoch": 7.134890047608252, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3833, + "step": 3934 + }, + { + "epoch": 7.136703695307187, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3274, + "step": 3935 + }, + { + "epoch": 7.1385173430061215, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3754, + "step": 3936 + }, + { + "epoch": 7.140330990705055, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2343, + "step": 3937 + }, + { + "epoch": 7.14214463840399, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3366, + "step": 3938 + }, + { + "epoch": 7.143958286102924, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3075, + "step": 3939 + }, + { + "epoch": 7.145771933801859, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3477, + "step": 3940 + }, + { + "epoch": 7.147585581500794, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2555, + "step": 3941 + }, + { + "epoch": 7.149399229199728, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2947, + "step": 3942 + }, + { + "epoch": 7.151212876898662, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3015, + "step": 3943 + }, + { + "epoch": 7.153026524597597, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1881, + "step": 3944 + }, + { + "epoch": 7.154840172296532, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2059, + "step": 3945 + }, + { + "epoch": 7.156653819995466, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2079, + "step": 3946 + }, + { + "epoch": 7.1584674676944005, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2414, + "step": 3947 + }, + { + "epoch": 7.160281115393335, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1905, + "step": 3948 + }, + { + "epoch": 7.162094763092269, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.2409, + "step": 3949 + }, + { + "epoch": 7.163908410791204, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2308, + "step": 3950 + }, + { + "epoch": 7.165722058490139, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1637, + "step": 3951 + }, + { + "epoch": 7.1675357061890725, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1955, + "step": 3952 + }, + { + "epoch": 7.169349353888007, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 3953 + }, + { + "epoch": 7.171163001586942, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.2238, + "step": 3954 + }, + { + "epoch": 7.172976649285876, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2052, + "step": 3955 + }, + { + "epoch": 7.174790296984811, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1756, + "step": 3956 + }, + { + "epoch": 7.1766039446837455, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1668, + "step": 3957 + }, + { + "epoch": 7.178417592382679, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 3958 + }, + { + "epoch": 7.180231240081614, + "grad_norm": 0.455078125, + "learning_rate": 0.0002, + "loss": 0.221, + "step": 3959 + }, + { + "epoch": 7.182044887780549, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 3960 + }, + { + "epoch": 7.183858535479483, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 3961 + }, + { + "epoch": 7.185672183178418, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4694, + "step": 3962 + }, + { + "epoch": 7.187485830877352, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 3963 + }, + { + "epoch": 7.189299478576286, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.3492, + "step": 3964 + }, + { + "epoch": 7.191113126275221, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.493, + "step": 3965 + }, + { + "epoch": 7.192926773974156, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 3966 + }, + { + "epoch": 7.19474042167309, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 3967 + }, + { + "epoch": 7.1965540693720245, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 3968 + }, + { + "epoch": 7.198367717070959, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 3969 + }, + { + "epoch": 7.200181364769893, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 3970 + }, + { + "epoch": 7.201995012468828, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 3971 + }, + { + "epoch": 7.203808660167763, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.4284, + "step": 3972 + }, + { + "epoch": 7.2056223078666966, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 3973 + }, + { + "epoch": 7.207435955565631, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4166, + "step": 3974 + }, + { + "epoch": 7.209249603264566, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3729, + "step": 3975 + }, + { + "epoch": 7.2110632509635, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.354, + "step": 3976 + }, + { + "epoch": 7.212876898662435, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3448, + "step": 3977 + }, + { + "epoch": 7.2146905463613695, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3258, + "step": 3978 + }, + { + "epoch": 7.216504194060303, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 3979 + }, + { + "epoch": 7.218317841759238, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3476, + "step": 3980 + }, + { + "epoch": 7.220131489458173, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3899, + "step": 3981 + }, + { + "epoch": 7.221945137157107, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3325, + "step": 3982 + }, + { + "epoch": 7.223758784856042, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 3983 + }, + { + "epoch": 7.225572432554976, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3254, + "step": 3984 + }, + { + "epoch": 7.22738608025391, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 3985 + }, + { + "epoch": 7.229199727952845, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4203, + "step": 3986 + }, + { + "epoch": 7.23101337565178, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.3075, + "step": 3987 + }, + { + "epoch": 7.232827023350714, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 3988 + }, + { + "epoch": 7.2346406710496485, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2789, + "step": 3989 + }, + { + "epoch": 7.236454318748583, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 3990 + }, + { + "epoch": 7.238267966447518, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2505, + "step": 3991 + }, + { + "epoch": 7.240081614146452, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.311, + "step": 3992 + }, + { + "epoch": 7.241895261845387, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 3993 + }, + { + "epoch": 7.243708909544321, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 3994 + }, + { + "epoch": 7.245522557243255, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2009, + "step": 3995 + }, + { + "epoch": 7.24733620494219, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2281, + "step": 3996 + }, + { + "epoch": 7.249149852641125, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 3997 + }, + { + "epoch": 7.250963500340059, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1805, + "step": 3998 + }, + { + "epoch": 7.2527771480389935, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2015, + "step": 3999 + }, + { + "epoch": 7.254590795737928, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1691, + "step": 4000 + }, + { + "epoch": 7.256404443436862, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1941, + "step": 4001 + }, + { + "epoch": 7.258218091135797, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1686, + "step": 4002 + }, + { + "epoch": 7.260031738834732, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1842, + "step": 4003 + }, + { + "epoch": 7.261845386533666, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1764, + "step": 4004 + }, + { + "epoch": 7.2636590342326, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2198, + "step": 4005 + }, + { + "epoch": 7.265472681931535, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1921, + "step": 4006 + }, + { + "epoch": 7.267286329630469, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1565, + "step": 4007 + }, + { + "epoch": 7.269099977329404, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1905, + "step": 4008 + }, + { + "epoch": 7.270913625028339, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.2015, + "step": 4009 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 4010 + }, + { + "epoch": 7.274540920426207, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 4011 + }, + { + "epoch": 7.276354568125142, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 4012 + }, + { + "epoch": 7.278168215824076, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.4006, + "step": 4013 + }, + { + "epoch": 7.279981863523011, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.4738, + "step": 4014 + }, + { + "epoch": 7.2817955112219455, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 4015 + }, + { + "epoch": 7.283609158920879, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.3247, + "step": 4016 + }, + { + "epoch": 7.285422806619814, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 4017 + }, + { + "epoch": 7.287236454318749, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4051, + "step": 4018 + }, + { + "epoch": 7.289050102017683, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4083, + "step": 4019 + }, + { + "epoch": 7.2908637497166175, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 4020 + }, + { + "epoch": 7.292677397415552, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3643, + "step": 4021 + }, + { + "epoch": 7.294491045114486, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 4022 + }, + { + "epoch": 7.296304692813421, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 4023 + }, + { + "epoch": 7.298118340512356, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 4024 + }, + { + "epoch": 7.29993198821129, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3328, + "step": 4025 + }, + { + "epoch": 7.301745635910224, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3145, + "step": 4026 + }, + { + "epoch": 7.303559283609159, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3132, + "step": 4027 + }, + { + "epoch": 7.305372931308093, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4094, + "step": 4028 + }, + { + "epoch": 7.307186579007028, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 4029 + }, + { + "epoch": 7.309000226705963, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.357, + "step": 4030 + }, + { + "epoch": 7.3108138744048965, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3546, + "step": 4031 + }, + { + "epoch": 7.312627522103831, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 4032 + }, + { + "epoch": 7.314441169802766, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3258, + "step": 4033 + }, + { + "epoch": 7.3162548175017, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3743, + "step": 4034 + }, + { + "epoch": 7.318068465200635, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3233, + "step": 4035 + }, + { + "epoch": 7.3198821128995695, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.3484, + "step": 4036 + }, + { + "epoch": 7.321695760598503, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.3499, + "step": 4037 + }, + { + "epoch": 7.323509408297438, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 4038 + }, + { + "epoch": 7.325323055996373, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.326, + "step": 4039 + }, + { + "epoch": 7.327136703695307, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 4040 + }, + { + "epoch": 7.328950351394242, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.3785, + "step": 4041 + }, + { + "epoch": 7.330763999093176, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2149, + "step": 4042 + }, + { + "epoch": 7.33257764679211, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2348, + "step": 4043 + }, + { + "epoch": 7.334391294491045, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1671, + "step": 4044 + }, + { + "epoch": 7.33620494218998, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1577, + "step": 4045 + }, + { + "epoch": 7.3380185898889145, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2142, + "step": 4046 + }, + { + "epoch": 7.339832237587848, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1923, + "step": 4047 + }, + { + "epoch": 7.341645885286783, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1876, + "step": 4048 + }, + { + "epoch": 7.343459532985717, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1982, + "step": 4049 + }, + { + "epoch": 7.345273180684652, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1841, + "step": 4050 + }, + { + "epoch": 7.347086828383587, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1589, + "step": 4051 + }, + { + "epoch": 7.348900476082521, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1967, + "step": 4052 + }, + { + "epoch": 7.350714123781455, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1698, + "step": 4053 + }, + { + "epoch": 7.35252777148039, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.2497, + "step": 4054 + }, + { + "epoch": 7.354341419179325, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1675, + "step": 4055 + }, + { + "epoch": 7.356155066878259, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1669, + "step": 4056 + }, + { + "epoch": 7.3579687145771935, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.2195, + "step": 4057 + }, + { + "epoch": 7.359782362276128, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2196, + "step": 4058 + }, + { + "epoch": 7.361596009975062, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.235, + "step": 4059 + }, + { + "epoch": 7.363409657673997, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3991, + "step": 4060 + }, + { + "epoch": 7.365223305372932, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.378, + "step": 4061 + }, + { + "epoch": 7.367036953071866, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4588, + "step": 4062 + }, + { + "epoch": 7.3688506007708, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 4063 + }, + { + "epoch": 7.370664248469735, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 4064 + }, + { + "epoch": 7.372477896168669, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.3846, + "step": 4065 + }, + { + "epoch": 7.374291543867604, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 4066 + }, + { + "epoch": 7.3761051915665385, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4265, + "step": 4067 + }, + { + "epoch": 7.377918839265472, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4004, + "step": 4068 + }, + { + "epoch": 7.379732486964407, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.373, + "step": 4069 + }, + { + "epoch": 7.381546134663342, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 4070 + }, + { + "epoch": 7.383359782362276, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 4071 + }, + { + "epoch": 7.385173430061211, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3981, + "step": 4072 + }, + { + "epoch": 7.386987077760145, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3316, + "step": 4073 + }, + { + "epoch": 7.388800725459079, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3781, + "step": 4074 + }, + { + "epoch": 7.390614373158014, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4728, + "step": 4075 + }, + { + "epoch": 7.392428020856949, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.3392, + "step": 4076 + }, + { + "epoch": 7.394241668555883, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3354, + "step": 4077 + }, + { + "epoch": 7.3960553162548175, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3575, + "step": 4078 + }, + { + "epoch": 7.397868963953752, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.4081, + "step": 4079 + }, + { + "epoch": 7.399682611652686, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.4108, + "step": 4080 + }, + { + "epoch": 7.401496259351621, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3981, + "step": 4081 + }, + { + "epoch": 7.403309907050556, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 4082 + }, + { + "epoch": 7.40512355474949, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.3502, + "step": 4083 + }, + { + "epoch": 7.406937202448424, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3155, + "step": 4084 + }, + { + "epoch": 7.408750850147359, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.3632, + "step": 4085 + }, + { + "epoch": 7.410564497846293, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.384, + "step": 4086 + }, + { + "epoch": 7.412378145545228, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2916, + "step": 4087 + }, + { + "epoch": 7.4141917932441626, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3925, + "step": 4088 + }, + { + "epoch": 7.416005440943096, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3181, + "step": 4089 + }, + { + "epoch": 7.417819088642031, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2957, + "step": 4090 + }, + { + "epoch": 7.419632736340966, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 4091 + }, + { + "epoch": 7.4214463840399, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2964, + "step": 4092 + }, + { + "epoch": 7.423260031738835, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2123, + "step": 4093 + }, + { + "epoch": 7.425073679437769, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2152, + "step": 4094 + }, + { + "epoch": 7.426887327136703, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2111, + "step": 4095 + }, + { + "epoch": 7.428700974835638, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.2019, + "step": 4096 + }, + { + "epoch": 7.430514622534573, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.2054, + "step": 4097 + }, + { + "epoch": 7.432328270233507, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1958, + "step": 4098 + }, + { + "epoch": 7.4341419179324415, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1925, + "step": 4099 + }, + { + "epoch": 7.435955565631376, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 4100 + }, + { + "epoch": 7.437769213330311, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.156, + "step": 4101 + }, + { + "epoch": 7.439582861029245, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3208, + "step": 4102 + }, + { + "epoch": 7.44139650872818, + "grad_norm": 0.4140625, + "learning_rate": 0.0002, + "loss": 0.2751, + "step": 4103 + }, + { + "epoch": 7.443210156427114, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2131, + "step": 4104 + }, + { + "epoch": 7.445023804126048, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1632, + "step": 4105 + }, + { + "epoch": 7.446837451824983, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2026, + "step": 4106 + }, + { + "epoch": 7.448651099523918, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1725, + "step": 4107 + }, + { + "epoch": 7.450464747222852, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1796, + "step": 4108 + }, + { + "epoch": 7.452278394921787, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2229, + "step": 4109 + }, + { + "epoch": 7.454092042620721, + "grad_norm": 0.59375, + "learning_rate": 0.0002, + "loss": 0.4567, + "step": 4110 + }, + { + "epoch": 7.455905690319655, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4209, + "step": 4111 + }, + { + "epoch": 7.45771933801859, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.3674, + "step": 4112 + }, + { + "epoch": 7.459532985717525, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 4113 + }, + { + "epoch": 7.461346633416459, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.4452, + "step": 4114 + }, + { + "epoch": 7.461346633416459, + "eval_loss": 1.9495465755462646, + "eval_runtime": 152.9119, + "eval_samples_per_second": 6.54, + "eval_steps_per_second": 6.54, + "step": 4114 + }, + { + "epoch": 7.461346633416459, + "mmlu_eval_accuracy": 0.30907816413171113, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.0625, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.46875, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.43478260869565216, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.5348837209302325, + "mmlu_eval_accuracy_moral_disputes": 0.21052631578947367, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.2647058823529412, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.23529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.6078224223007336, + "step": 4114 + }, + { + "epoch": 7.463160281115393, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4446, + "step": 4115 + }, + { + "epoch": 7.464973928814328, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 4116 + }, + { + "epoch": 7.466787576513262, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.4369, + "step": 4117 + }, + { + "epoch": 7.468601224212197, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 4118 + }, + { + "epoch": 7.470414871911132, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4019, + "step": 4119 + }, + { + "epoch": 7.4722285196100655, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 4120 + }, + { + "epoch": 7.474042167309, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.4901, + "step": 4121 + }, + { + "epoch": 7.475855815007935, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3668, + "step": 4122 + }, + { + "epoch": 7.477669462706869, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3936, + "step": 4123 + }, + { + "epoch": 7.479483110405804, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.36, + "step": 4124 + }, + { + "epoch": 7.4812967581047385, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4419, + "step": 4125 + }, + { + "epoch": 7.483110405803672, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3244, + "step": 4126 + }, + { + "epoch": 7.484924053502607, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 4127 + }, + { + "epoch": 7.486737701201542, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 4128 + }, + { + "epoch": 7.488551348900476, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3954, + "step": 4129 + }, + { + "epoch": 7.490364996599411, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3763, + "step": 4130 + }, + { + "epoch": 7.492178644298345, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2979, + "step": 4131 + }, + { + "epoch": 7.493992291997279, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 4132 + }, + { + "epoch": 7.495805939696214, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3498, + "step": 4133 + }, + { + "epoch": 7.497619587395149, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3306, + "step": 4134 + }, + { + "epoch": 7.499433235094083, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3689, + "step": 4135 + }, + { + "epoch": 7.501246882793017, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3194, + "step": 4136 + }, + { + "epoch": 7.503060530491952, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.366, + "step": 4137 + }, + { + "epoch": 7.504874178190886, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3201, + "step": 4138 + }, + { + "epoch": 7.506687825889821, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2858, + "step": 4139 + }, + { + "epoch": 7.508501473588756, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.399, + "step": 4140 + }, + { + "epoch": 7.5103151212876895, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.3334, + "step": 4141 + }, + { + "epoch": 7.512128768986624, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.2661, + "step": 4142 + }, + { + "epoch": 7.513942416685559, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.3466, + "step": 4143 + }, + { + "epoch": 7.515756064384493, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2197, + "step": 4144 + }, + { + "epoch": 7.517569712083428, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2024, + "step": 4145 + }, + { + "epoch": 7.5193833597823625, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2026, + "step": 4146 + }, + { + "epoch": 7.521197007481296, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.2167, + "step": 4147 + }, + { + "epoch": 7.523010655180231, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2389, + "step": 4148 + }, + { + "epoch": 7.524824302879166, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.2571, + "step": 4149 + }, + { + "epoch": 7.526637950578101, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.165, + "step": 4150 + }, + { + "epoch": 7.528451598277035, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1714, + "step": 4151 + }, + { + "epoch": 7.530265245975969, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2239, + "step": 4152 + }, + { + "epoch": 7.532078893674903, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4153 + }, + { + "epoch": 7.533892541373838, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.2174, + "step": 4154 + }, + { + "epoch": 7.535706189072773, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1912, + "step": 4155 + }, + { + "epoch": 7.537519836771708, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1507, + "step": 4156 + }, + { + "epoch": 7.5393334844706414, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1984, + "step": 4157 + }, + { + "epoch": 7.541147132169576, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1901, + "step": 4158 + }, + { + "epoch": 7.54296077986851, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.2054, + "step": 4159 + }, + { + "epoch": 7.544774427567445, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.3696, + "step": 4160 + }, + { + "epoch": 7.54658807526638, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4592, + "step": 4161 + }, + { + "epoch": 7.548401722965314, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.4513, + "step": 4162 + }, + { + "epoch": 7.550215370664248, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4243, + "step": 4163 + }, + { + "epoch": 7.552029018363183, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 4164 + }, + { + "epoch": 7.553842666062117, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 4165 + }, + { + "epoch": 7.555656313761052, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5062, + "step": 4166 + }, + { + "epoch": 7.5574699614599865, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 4167 + }, + { + "epoch": 7.559283609158921, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 4168 + }, + { + "epoch": 7.561097256857855, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 4169 + }, + { + "epoch": 7.56291090455679, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 4170 + }, + { + "epoch": 7.564724552255725, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 4171 + }, + { + "epoch": 7.566538199954659, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.4204, + "step": 4172 + }, + { + "epoch": 7.568351847653593, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 4173 + }, + { + "epoch": 7.570165495352528, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.4289, + "step": 4174 + }, + { + "epoch": 7.571979143051462, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.415, + "step": 4175 + }, + { + "epoch": 7.573792790750397, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4044, + "step": 4176 + }, + { + "epoch": 7.575606438449332, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.45, + "step": 4177 + }, + { + "epoch": 7.5774200861482655, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 4178 + }, + { + "epoch": 7.5792337338472, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3822, + "step": 4179 + }, + { + "epoch": 7.581047381546135, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4019, + "step": 4180 + }, + { + "epoch": 7.582861029245069, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.3141, + "step": 4181 + }, + { + "epoch": 7.584674676944004, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 4182 + }, + { + "epoch": 7.586488324642938, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3644, + "step": 4183 + }, + { + "epoch": 7.588301972341872, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3546, + "step": 4184 + }, + { + "epoch": 7.590115620040807, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3497, + "step": 4185 + }, + { + "epoch": 7.591929267739742, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2946, + "step": 4186 + }, + { + "epoch": 7.593742915438676, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2933, + "step": 4187 + }, + { + "epoch": 7.5955565631376105, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 4188 + }, + { + "epoch": 7.597370210836545, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3618, + "step": 4189 + }, + { + "epoch": 7.599183858535479, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3184, + "step": 4190 + }, + { + "epoch": 7.600997506234414, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3396, + "step": 4191 + }, + { + "epoch": 7.602811153933349, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3232, + "step": 4192 + }, + { + "epoch": 7.604624801632283, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.363, + "step": 4193 + }, + { + "epoch": 7.606438449331217, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 4194 + }, + { + "epoch": 7.608252097030152, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2929, + "step": 4195 + }, + { + "epoch": 7.610065744729086, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.3138, + "step": 4196 + }, + { + "epoch": 7.611879392428021, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.155, + "step": 4197 + }, + { + "epoch": 7.613693040126956, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1613, + "step": 4198 + }, + { + "epoch": 7.6155066878258895, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2327, + "step": 4199 + }, + { + "epoch": 7.617320335524824, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1614, + "step": 4200 + }, + { + "epoch": 7.619133983223759, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 4201 + }, + { + "epoch": 7.620947630922693, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1834, + "step": 4202 + }, + { + "epoch": 7.622761278621628, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2081, + "step": 4203 + }, + { + "epoch": 7.624574926320562, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1429, + "step": 4204 + }, + { + "epoch": 7.626388574019496, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1849, + "step": 4205 + }, + { + "epoch": 7.628202221718431, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.2018, + "step": 4206 + }, + { + "epoch": 7.630015869417366, + "grad_norm": 0.47265625, + "learning_rate": 0.0002, + "loss": 0.2265, + "step": 4207 + }, + { + "epoch": 7.6318295171163, + "grad_norm": 0.59375, + "learning_rate": 0.0002, + "loss": 0.2019, + "step": 4208 + }, + { + "epoch": 7.6336431648152345, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.1953, + "step": 4209 + }, + { + "epoch": 7.635456812514169, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4956, + "step": 4210 + }, + { + "epoch": 7.637270460213104, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.4695, + "step": 4211 + }, + { + "epoch": 7.639084107912038, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.3251, + "step": 4212 + }, + { + "epoch": 7.640897755610973, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.4757, + "step": 4213 + }, + { + "epoch": 7.642711403309907, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.3884, + "step": 4214 + }, + { + "epoch": 7.644525051008841, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 4215 + }, + { + "epoch": 7.646338698707776, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 4216 + }, + { + "epoch": 7.648152346406711, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 4217 + }, + { + "epoch": 7.649965994105645, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 4218 + }, + { + "epoch": 7.65177964180458, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 4219 + }, + { + "epoch": 7.6535932895035135, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 4220 + }, + { + "epoch": 7.655406937202448, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.4711, + "step": 4221 + }, + { + "epoch": 7.657220584901383, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 4222 + }, + { + "epoch": 7.659034232600318, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4279, + "step": 4223 + }, + { + "epoch": 7.660847880299252, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 4224 + }, + { + "epoch": 7.6626615279981865, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 4225 + }, + { + "epoch": 7.66447517569712, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4005, + "step": 4226 + }, + { + "epoch": 7.666288823396055, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 4227 + }, + { + "epoch": 7.66810247109499, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 4228 + }, + { + "epoch": 7.669916118793925, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.4329, + "step": 4229 + }, + { + "epoch": 7.6717297664928585, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 4230 + }, + { + "epoch": 7.673543414191793, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3766, + "step": 4231 + }, + { + "epoch": 7.675357061890728, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3802, + "step": 4232 + }, + { + "epoch": 7.677170709589662, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.4803, + "step": 4233 + }, + { + "epoch": 7.678984357288597, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.3259, + "step": 4234 + }, + { + "epoch": 7.6807980049875315, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3424, + "step": 4235 + }, + { + "epoch": 7.682611652686465, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.349, + "step": 4236 + }, + { + "epoch": 7.6844253003854, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3022, + "step": 4237 + }, + { + "epoch": 7.686238948084335, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.3329, + "step": 4238 + }, + { + "epoch": 7.688052595783269, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3764, + "step": 4239 + }, + { + "epoch": 7.689866243482204, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3054, + "step": 4240 + }, + { + "epoch": 7.691679891181138, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 4241 + }, + { + "epoch": 7.693493538880072, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 4242 + }, + { + "epoch": 7.695307186579007, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2341, + "step": 4243 + }, + { + "epoch": 7.697120834277942, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.2541, + "step": 4244 + }, + { + "epoch": 7.698934481976876, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2076, + "step": 4245 + }, + { + "epoch": 7.7007481296758105, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2759, + "step": 4246 + }, + { + "epoch": 7.702561777374745, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2121, + "step": 4247 + }, + { + "epoch": 7.704375425073679, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3224, + "step": 4248 + }, + { + "epoch": 7.706189072772614, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1918, + "step": 4249 + }, + { + "epoch": 7.708002720471549, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1813, + "step": 4250 + }, + { + "epoch": 7.7098163681704825, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.163, + "step": 4251 + }, + { + "epoch": 7.711630015869417, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2415, + "step": 4252 + }, + { + "epoch": 7.713443663568352, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1702, + "step": 4253 + }, + { + "epoch": 7.715257311267286, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1852, + "step": 4254 + }, + { + "epoch": 7.717070958966221, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1767, + "step": 4255 + }, + { + "epoch": 7.7188846066651555, + "grad_norm": 0.54296875, + "learning_rate": 0.0002, + "loss": 0.1988, + "step": 4256 + }, + { + "epoch": 7.720698254364089, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1819, + "step": 4257 + }, + { + "epoch": 7.722511902063024, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1883, + "step": 4258 + }, + { + "epoch": 7.724325549761959, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2283, + "step": 4259 + }, + { + "epoch": 7.726139197460893, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 4260 + }, + { + "epoch": 7.727952845159828, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 4261 + }, + { + "epoch": 7.729766492858762, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 4262 + }, + { + "epoch": 7.731580140557696, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 4263 + }, + { + "epoch": 7.733393788256631, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 4264 + }, + { + "epoch": 7.735207435955566, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 4265 + }, + { + "epoch": 7.737021083654501, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.36, + "step": 4266 + }, + { + "epoch": 7.7388347313534345, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 4267 + }, + { + "epoch": 7.740648379052369, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4268 + }, + { + "epoch": 7.742462026751303, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 4269 + }, + { + "epoch": 7.744275674450238, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 4270 + }, + { + "epoch": 7.746089322149173, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3759, + "step": 4271 + }, + { + "epoch": 7.7479029698481074, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 4272 + }, + { + "epoch": 7.749716617547041, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4164, + "step": 4273 + }, + { + "epoch": 7.751530265245976, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.4317, + "step": 4274 + }, + { + "epoch": 7.75334391294491, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 4275 + }, + { + "epoch": 7.755157560643845, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 4276 + }, + { + "epoch": 7.7569712083427795, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3672, + "step": 4277 + }, + { + "epoch": 7.758784856041714, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4033, + "step": 4278 + }, + { + "epoch": 7.760598503740648, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3862, + "step": 4279 + }, + { + "epoch": 7.762412151439583, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.437, + "step": 4280 + }, + { + "epoch": 7.764225799138517, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 4281 + }, + { + "epoch": 7.766039446837452, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.4339, + "step": 4282 + }, + { + "epoch": 7.767853094536386, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.4691, + "step": 4283 + }, + { + "epoch": 7.769666742235321, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2996, + "step": 4284 + }, + { + "epoch": 7.771480389934255, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2619, + "step": 4285 + }, + { + "epoch": 7.77329403763319, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.3653, + "step": 4286 + }, + { + "epoch": 7.775107685332125, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.3963, + "step": 4287 + }, + { + "epoch": 7.7769213330310585, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3052, + "step": 4288 + }, + { + "epoch": 7.778734980729993, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2694, + "step": 4289 + }, + { + "epoch": 7.780548628428928, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2418, + "step": 4290 + }, + { + "epoch": 7.782362276127862, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2569, + "step": 4291 + }, + { + "epoch": 7.784175923826797, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2226, + "step": 4292 + }, + { + "epoch": 7.7859895715257315, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.3152, + "step": 4293 + }, + { + "epoch": 7.787803219224665, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2155, + "step": 4294 + }, + { + "epoch": 7.7896168669236, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 4295 + }, + { + "epoch": 7.791430514622535, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.2234, + "step": 4296 + }, + { + "epoch": 7.793244162321469, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2299, + "step": 4297 + }, + { + "epoch": 7.7950578100204035, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1935, + "step": 4298 + }, + { + "epoch": 7.796871457719338, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.217, + "step": 4299 + }, + { + "epoch": 7.798685105418272, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 4300 + }, + { + "epoch": 7.800498753117207, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2076, + "step": 4301 + }, + { + "epoch": 7.800498753117207, + "eval_loss": 1.9194700717926025, + "eval_runtime": 153.0309, + "eval_samples_per_second": 6.535, + "eval_steps_per_second": 6.535, + "step": 4301 + }, + { + "epoch": 7.800498753117207, + "mmlu_eval_accuracy": 0.29605439886040297, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.4, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.4883720930232558, + "mmlu_eval_accuracy_moral_disputes": 0.21052631578947367, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2647058823529412, + "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.36363636363636365, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.6964148499465783, + "step": 4301 + }, + { + "epoch": 7.802312400816142, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 4302 + }, + { + "epoch": 7.804126048515076, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 4303 + }, + { + "epoch": 7.80593969621401, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 4304 + }, + { + "epoch": 7.807753343912945, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.1939, + "step": 4305 + }, + { + "epoch": 7.809566991611879, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.2116, + "step": 4306 + }, + { + "epoch": 7.811380639310814, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1944, + "step": 4307 + }, + { + "epoch": 7.813194287009749, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.2035, + "step": 4308 + }, + { + "epoch": 7.8150079347086825, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.2291, + "step": 4309 + }, + { + "epoch": 7.816821582407617, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 4310 + }, + { + "epoch": 7.818635230106552, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 4311 + }, + { + "epoch": 7.820448877805486, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 4312 + }, + { + "epoch": 7.822262525504421, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 4313 + }, + { + "epoch": 7.8240761732033555, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4273, + "step": 4314 + }, + { + "epoch": 7.825889820902289, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 4315 + }, + { + "epoch": 7.827703468601224, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 4316 + }, + { + "epoch": 7.829517116300159, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 4317 + }, + { + "epoch": 7.831330763999093, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 4318 + }, + { + "epoch": 7.8331444116980276, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4143, + "step": 4319 + }, + { + "epoch": 7.834958059396962, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.4391, + "step": 4320 + }, + { + "epoch": 7.836771707095897, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 4321 + }, + { + "epoch": 7.838585354794831, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 4322 + }, + { + "epoch": 7.840399002493766, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 4323 + }, + { + "epoch": 7.8422126501927, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4211, + "step": 4324 + }, + { + "epoch": 7.844026297891634, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3841, + "step": 4325 + }, + { + "epoch": 7.845839945590569, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 4326 + }, + { + "epoch": 7.847653593289504, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.4109, + "step": 4327 + }, + { + "epoch": 7.849467240988438, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 4328 + }, + { + "epoch": 7.851280888687373, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4303, + "step": 4329 + }, + { + "epoch": 7.8530945363863065, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3479, + "step": 4330 + }, + { + "epoch": 7.854908184085241, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4274, + "step": 4331 + }, + { + "epoch": 7.856721831784176, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3617, + "step": 4332 + }, + { + "epoch": 7.858535479483111, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 4333 + }, + { + "epoch": 7.860349127182045, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.312, + "step": 4334 + }, + { + "epoch": 7.8621627748809795, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4335 + }, + { + "epoch": 7.863976422579913, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 4336 + }, + { + "epoch": 7.865790070278848, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3683, + "step": 4337 + }, + { + "epoch": 7.867603717977783, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.4073, + "step": 4338 + }, + { + "epoch": 7.869417365676718, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2995, + "step": 4339 + }, + { + "epoch": 7.871231013375652, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 4340 + }, + { + "epoch": 7.873044661074586, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3603, + "step": 4341 + }, + { + "epoch": 7.874858308773521, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3503, + "step": 4342 + }, + { + "epoch": 7.876671956472455, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 4343 + }, + { + "epoch": 7.87848560417139, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 4344 + }, + { + "epoch": 7.8802992518703245, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.2764, + "step": 4345 + }, + { + "epoch": 7.882112899569258, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2247, + "step": 4346 + }, + { + "epoch": 7.883926547268193, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2692, + "step": 4347 + }, + { + "epoch": 7.885740194967128, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.2637, + "step": 4348 + }, + { + "epoch": 7.887553842666062, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1928, + "step": 4349 + }, + { + "epoch": 7.889367490364997, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2139, + "step": 4350 + }, + { + "epoch": 7.891181138063931, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.3471, + "step": 4351 + }, + { + "epoch": 7.892994785762865, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1804, + "step": 4352 + }, + { + "epoch": 7.8948084334618, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.192, + "step": 4353 + }, + { + "epoch": 7.896622081160735, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1657, + "step": 4354 + }, + { + "epoch": 7.898435728859669, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4355 + }, + { + "epoch": 7.9002493765586035, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.1947, + "step": 4356 + }, + { + "epoch": 7.902063024257538, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1911, + "step": 4357 + }, + { + "epoch": 7.903876671956472, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1983, + "step": 4358 + }, + { + "epoch": 7.905690319655407, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2206, + "step": 4359 + }, + { + "epoch": 7.907503967354342, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 4360 + }, + { + "epoch": 7.909317615053276, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 4361 + }, + { + "epoch": 7.91113126275221, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.481, + "step": 4362 + }, + { + "epoch": 7.912944910451145, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 4363 + }, + { + "epoch": 7.914758558150079, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 4364 + }, + { + "epoch": 7.916572205849014, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 4365 + }, + { + "epoch": 7.9183858535479485, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 4366 + }, + { + "epoch": 7.920199501246882, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 4367 + }, + { + "epoch": 7.922013148945817, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 4368 + }, + { + "epoch": 7.923826796644752, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 4369 + }, + { + "epoch": 7.925640444343686, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 4370 + }, + { + "epoch": 7.927454092042621, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 4371 + }, + { + "epoch": 7.929267739741555, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4079, + "step": 4372 + }, + { + "epoch": 7.931081387440489, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 4373 + }, + { + "epoch": 7.932895035139424, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.4031, + "step": 4374 + }, + { + "epoch": 7.934708682838359, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.4155, + "step": 4375 + }, + { + "epoch": 7.936522330537294, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4208, + "step": 4376 + }, + { + "epoch": 7.9383359782362275, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 4377 + }, + { + "epoch": 7.940149625935162, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4443, + "step": 4378 + }, + { + "epoch": 7.941963273634096, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.4298, + "step": 4379 + }, + { + "epoch": 7.943776921333031, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.4136, + "step": 4380 + }, + { + "epoch": 7.945590569031966, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 4381 + }, + { + "epoch": 7.9474042167309005, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3194, + "step": 4382 + }, + { + "epoch": 7.949217864429834, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3468, + "step": 4383 + }, + { + "epoch": 7.951031512128769, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2902, + "step": 4384 + }, + { + "epoch": 7.952845159827703, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.3681, + "step": 4385 + }, + { + "epoch": 7.954658807526638, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.388, + "step": 4386 + }, + { + "epoch": 7.956472455225573, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 4387 + }, + { + "epoch": 7.958286102924507, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.3037, + "step": 4388 + }, + { + "epoch": 7.960099750623441, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 4389 + }, + { + "epoch": 7.961913398322376, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3309, + "step": 4390 + }, + { + "epoch": 7.96372704602131, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2266, + "step": 4391 + }, + { + "epoch": 7.965540693720245, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.2892, + "step": 4392 + }, + { + "epoch": 7.967354341419179, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2902, + "step": 4393 + }, + { + "epoch": 7.969167989118114, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2615, + "step": 4394 + }, + { + "epoch": 7.970981636817048, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.3436, + "step": 4395 + }, + { + "epoch": 7.972795284515983, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1944, + "step": 4396 + }, + { + "epoch": 7.974608932214918, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2289, + "step": 4397 + }, + { + "epoch": 7.9764225799138515, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.3273, + "step": 4398 + }, + { + "epoch": 7.978236227612786, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.252, + "step": 4399 + }, + { + "epoch": 7.980049875311721, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.2029, + "step": 4400 + }, + { + "epoch": 7.981863523010655, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1603, + "step": 4401 + }, + { + "epoch": 7.98367717070959, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.177, + "step": 4402 + }, + { + "epoch": 7.9854908184085245, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 4403 + }, + { + "epoch": 7.987304466107458, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2107, + "step": 4404 + }, + { + "epoch": 7.989118113806393, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.2036, + "step": 4405 + }, + { + "epoch": 7.990931761505328, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1843, + "step": 4406 + }, + { + "epoch": 7.992745409204262, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.2081, + "step": 4407 + }, + { + "epoch": 7.994559056903197, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.2239, + "step": 4408 + }, + { + "epoch": 7.996372704602131, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.2538, + "step": 4409 + }, + { + "epoch": 7.998186352301065, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.3969, + "step": 4410 + }, + { + "epoch": 8.0, + "grad_norm": 0.45703125, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 4411 + }, + { + "epoch": 8.001813647698935, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 4412 + }, + { + "epoch": 8.00362729539787, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.2992, + "step": 4413 + }, + { + "epoch": 8.005440943096804, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.2677, + "step": 4414 + }, + { + "epoch": 8.007254590795737, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.3451, + "step": 4415 + }, + { + "epoch": 8.009068238494672, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.3537, + "step": 4416 + }, + { + "epoch": 8.010881886193607, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.4189, + "step": 4417 + }, + { + "epoch": 8.012695533892542, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 4418 + }, + { + "epoch": 8.014509181591476, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.3891, + "step": 4419 + }, + { + "epoch": 8.016322829290411, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.352, + "step": 4420 + }, + { + "epoch": 8.018136476989344, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.3016, + "step": 4421 + }, + { + "epoch": 8.019950124688279, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 4422 + }, + { + "epoch": 8.021763772387214, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.3284, + "step": 4423 + }, + { + "epoch": 8.023577420086148, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3112, + "step": 4424 + }, + { + "epoch": 8.025391067785083, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2324, + "step": 4425 + }, + { + "epoch": 8.027204715484018, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2124, + "step": 4426 + }, + { + "epoch": 8.029018363182951, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3522, + "step": 4427 + }, + { + "epoch": 8.030832010881886, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.257, + "step": 4428 + }, + { + "epoch": 8.03264565858082, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2026, + "step": 4429 + }, + { + "epoch": 8.034459306279755, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 4430 + }, + { + "epoch": 8.03627295397869, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2659, + "step": 4431 + }, + { + "epoch": 8.038086601677625, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.3092, + "step": 4432 + }, + { + "epoch": 8.039900249376558, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.2974, + "step": 4433 + }, + { + "epoch": 8.041713897075493, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2216, + "step": 4434 + }, + { + "epoch": 8.043527544774427, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2845, + "step": 4435 + }, + { + "epoch": 8.045341192473362, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1959, + "step": 4436 + }, + { + "epoch": 8.047154840172297, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2319, + "step": 4437 + }, + { + "epoch": 8.048968487871232, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2189, + "step": 4438 + }, + { + "epoch": 8.050782135570165, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1736, + "step": 4439 + }, + { + "epoch": 8.0525957832691, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2409, + "step": 4440 + }, + { + "epoch": 8.054409430968034, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2334, + "step": 4441 + }, + { + "epoch": 8.056223078666969, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.213, + "step": 4442 + }, + { + "epoch": 8.058036726365904, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2416, + "step": 4443 + }, + { + "epoch": 8.059850374064839, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1916, + "step": 4444 + }, + { + "epoch": 8.061664021763772, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1776, + "step": 4445 + }, + { + "epoch": 8.063477669462706, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1404, + "step": 4446 + }, + { + "epoch": 8.065291317161641, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1605, + "step": 4447 + }, + { + "epoch": 8.067104964860576, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1559, + "step": 4448 + }, + { + "epoch": 8.06891861255951, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1314, + "step": 4449 + }, + { + "epoch": 8.070732260258445, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1328, + "step": 4450 + }, + { + "epoch": 8.072545907957378, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1726, + "step": 4451 + }, + { + "epoch": 8.074359555656313, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.139, + "step": 4452 + }, + { + "epoch": 8.076173203355248, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.198, + "step": 4453 + }, + { + "epoch": 8.077986851054183, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1939, + "step": 4454 + }, + { + "epoch": 8.079800498753118, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1377, + "step": 4455 + }, + { + "epoch": 8.081614146452052, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1741, + "step": 4456 + }, + { + "epoch": 8.083427794150985, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1584, + "step": 4457 + }, + { + "epoch": 8.08524144184992, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1881, + "step": 4458 + }, + { + "epoch": 8.087055089548855, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1523, + "step": 4459 + }, + { + "epoch": 8.08886873724779, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1895, + "step": 4460 + }, + { + "epoch": 8.090682384946724, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.2398, + "step": 4461 + }, + { + "epoch": 8.09249603264566, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 4462 + }, + { + "epoch": 8.094309680344592, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.3664, + "step": 4463 + }, + { + "epoch": 8.096123328043527, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.2557, + "step": 4464 + }, + { + "epoch": 8.097936975742462, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.3217, + "step": 4465 + }, + { + "epoch": 8.099750623441397, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3975, + "step": 4466 + }, + { + "epoch": 8.101564271140331, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.284, + "step": 4467 + }, + { + "epoch": 8.103377918839266, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.2762, + "step": 4468 + }, + { + "epoch": 8.1051915665382, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.3249, + "step": 4469 + }, + { + "epoch": 8.107005214237134, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.3173, + "step": 4470 + }, + { + "epoch": 8.108818861936069, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 4471 + }, + { + "epoch": 8.110632509635003, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.2658, + "step": 4472 + }, + { + "epoch": 8.112446157333938, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 4473 + }, + { + "epoch": 8.114259805032873, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.3781, + "step": 4474 + }, + { + "epoch": 8.116073452731808, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2377, + "step": 4475 + }, + { + "epoch": 8.11788710043074, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2382, + "step": 4476 + }, + { + "epoch": 8.119700748129675, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2242, + "step": 4477 + }, + { + "epoch": 8.12151439582861, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 4478 + }, + { + "epoch": 8.123328043527545, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2303, + "step": 4479 + }, + { + "epoch": 8.12514169122648, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2305, + "step": 4480 + }, + { + "epoch": 8.126955338925415, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2864, + "step": 4481 + }, + { + "epoch": 8.128768986624348, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2392, + "step": 4482 + }, + { + "epoch": 8.130582634323282, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1928, + "step": 4483 + }, + { + "epoch": 8.132396282022217, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.345, + "step": 4484 + }, + { + "epoch": 8.134209929721152, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.3084, + "step": 4485 + }, + { + "epoch": 8.136023577420087, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1932, + "step": 4486 + }, + { + "epoch": 8.137837225119021, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2112, + "step": 4487 + }, + { + "epoch": 8.139650872817954, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.2793, + "step": 4488 + }, + { + "epoch": 8.139650872817954, + "eval_loss": 1.9102683067321777, + "eval_runtime": 152.405, + "eval_samples_per_second": 6.561, + "eval_steps_per_second": 6.561, + "step": 4488 + }, + { + "epoch": 8.139650872817954, + "mmlu_eval_accuracy": 0.2915706556873578, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.2727272727272727, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.25, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.26666666666666666, + "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.21739130434782608, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.2727272727272727, + "mmlu_eval_accuracy_marketing": 0.36, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.18421052631578946, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.4857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.7600339715345321, + "step": 4488 + }, + { + "epoch": 8.14146452051689, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1662, + "step": 4489 + }, + { + "epoch": 8.143278168215824, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.211, + "step": 4490 + }, + { + "epoch": 8.145091815914759, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 4491 + }, + { + "epoch": 8.146905463613693, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2055, + "step": 4492 + }, + { + "epoch": 8.148719111312628, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1767, + "step": 4493 + }, + { + "epoch": 8.150532759011561, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1904, + "step": 4494 + }, + { + "epoch": 8.152346406710496, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1898, + "step": 4495 + }, + { + "epoch": 8.15416005440943, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1541, + "step": 4496 + }, + { + "epoch": 8.155973702108366, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1373, + "step": 4497 + }, + { + "epoch": 8.1577873498073, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1446, + "step": 4498 + }, + { + "epoch": 8.159600997506235, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1827, + "step": 4499 + }, + { + "epoch": 8.161414645205168, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1716, + "step": 4500 + }, + { + "epoch": 8.163228292904103, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1498, + "step": 4501 + }, + { + "epoch": 8.165041940603038, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1398, + "step": 4502 + }, + { + "epoch": 8.166855588301972, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1187, + "step": 4503 + }, + { + "epoch": 8.168669236000907, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1342, + "step": 4504 + }, + { + "epoch": 8.170482883699842, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1762, + "step": 4505 + }, + { + "epoch": 8.172296531398775, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1815, + "step": 4506 + }, + { + "epoch": 8.17411017909771, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1378, + "step": 4507 + }, + { + "epoch": 8.175923826796645, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1508, + "step": 4508 + }, + { + "epoch": 8.17773747449558, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1472, + "step": 4509 + }, + { + "epoch": 8.179551122194514, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1964, + "step": 4510 + }, + { + "epoch": 8.181364769893449, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 4511 + }, + { + "epoch": 8.183178417592382, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 4512 + }, + { + "epoch": 8.184992065291317, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.346, + "step": 4513 + }, + { + "epoch": 8.186805712990251, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2868, + "step": 4514 + }, + { + "epoch": 8.188619360689186, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 4515 + }, + { + "epoch": 8.190433008388121, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3739, + "step": 4516 + }, + { + "epoch": 8.192246656087056, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2856, + "step": 4517 + }, + { + "epoch": 8.194060303785989, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.3458, + "step": 4518 + }, + { + "epoch": 8.195873951484923, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2789, + "step": 4519 + }, + { + "epoch": 8.197687599183858, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.3124, + "step": 4520 + }, + { + "epoch": 8.199501246882793, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3937, + "step": 4521 + }, + { + "epoch": 8.201314894581728, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.361, + "step": 4522 + }, + { + "epoch": 8.203128542280663, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 4523 + }, + { + "epoch": 8.204942189979597, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2843, + "step": 4524 + }, + { + "epoch": 8.20675583767853, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3546, + "step": 4525 + }, + { + "epoch": 8.208569485377465, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 4526 + }, + { + "epoch": 8.2103831330764, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3195, + "step": 4527 + }, + { + "epoch": 8.212196780775335, + "grad_norm": 0.66796875, + "learning_rate": 0.0002, + "loss": 0.3055, + "step": 4528 + }, + { + "epoch": 8.21401042847427, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2552, + "step": 4529 + }, + { + "epoch": 8.215824076173204, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 4530 + }, + { + "epoch": 8.217637723872137, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2768, + "step": 4531 + }, + { + "epoch": 8.219451371571072, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2744, + "step": 4532 + }, + { + "epoch": 8.221265019270007, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.2872, + "step": 4533 + }, + { + "epoch": 8.223078666968942, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 4534 + }, + { + "epoch": 8.224892314667876, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2315, + "step": 4535 + }, + { + "epoch": 8.226705962366811, + "grad_norm": 0.609375, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 4536 + }, + { + "epoch": 8.228519610065744, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 4537 + }, + { + "epoch": 8.230333257764679, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2502, + "step": 4538 + }, + { + "epoch": 8.232146905463614, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2225, + "step": 4539 + }, + { + "epoch": 8.233960553162548, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.2947, + "step": 4540 + }, + { + "epoch": 8.235774200861483, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1787, + "step": 4541 + }, + { + "epoch": 8.237587848560418, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1878, + "step": 4542 + }, + { + "epoch": 8.239401496259351, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2082, + "step": 4543 + }, + { + "epoch": 8.241215143958286, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2791, + "step": 4544 + }, + { + "epoch": 8.24302879165722, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1527, + "step": 4545 + }, + { + "epoch": 8.244842439356155, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1727, + "step": 4546 + }, + { + "epoch": 8.24665608705509, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1559, + "step": 4547 + }, + { + "epoch": 8.248469734754025, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1721, + "step": 4548 + }, + { + "epoch": 8.250283382452958, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 4549 + }, + { + "epoch": 8.252097030151893, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1725, + "step": 4550 + }, + { + "epoch": 8.253910677850827, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1465, + "step": 4551 + }, + { + "epoch": 8.255724325549762, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1363, + "step": 4552 + }, + { + "epoch": 8.257537973248697, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1668, + "step": 4553 + }, + { + "epoch": 8.259351620947632, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1861, + "step": 4554 + }, + { + "epoch": 8.261165268646565, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.165, + "step": 4555 + }, + { + "epoch": 8.2629789163455, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1498, + "step": 4556 + }, + { + "epoch": 8.264792564044434, + "grad_norm": 0.4453125, + "learning_rate": 0.0002, + "loss": 0.201, + "step": 4557 + }, + { + "epoch": 8.266606211743369, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.1728, + "step": 4558 + }, + { + "epoch": 8.268419859442304, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.15, + "step": 4559 + }, + { + "epoch": 8.270233507141239, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1835, + "step": 4560 + }, + { + "epoch": 8.272047154840172, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1981, + "step": 4561 + }, + { + "epoch": 8.273860802539106, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 4562 + }, + { + "epoch": 8.275674450238041, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.3733, + "step": 4563 + }, + { + "epoch": 8.277488097936976, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.3572, + "step": 4564 + }, + { + "epoch": 8.27930174563591, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.3527, + "step": 4565 + }, + { + "epoch": 8.281115393334845, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2996, + "step": 4566 + }, + { + "epoch": 8.282929041033778, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.3544, + "step": 4567 + }, + { + "epoch": 8.284742688732713, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.3388, + "step": 4568 + }, + { + "epoch": 8.286556336431648, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3882, + "step": 4569 + }, + { + "epoch": 8.288369984130583, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3874, + "step": 4570 + }, + { + "epoch": 8.290183631829517, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2699, + "step": 4571 + }, + { + "epoch": 8.291997279528452, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.3992, + "step": 4572 + }, + { + "epoch": 8.293810927227385, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 4573 + }, + { + "epoch": 8.29562457492632, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2905, + "step": 4574 + }, + { + "epoch": 8.297438222625255, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2268, + "step": 4575 + }, + { + "epoch": 8.29925187032419, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2805, + "step": 4576 + }, + { + "epoch": 8.301065518023124, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.386, + "step": 4577 + }, + { + "epoch": 8.302879165722059, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2882, + "step": 4578 + }, + { + "epoch": 8.304692813420992, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3091, + "step": 4579 + }, + { + "epoch": 8.306506461119927, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.223, + "step": 4580 + }, + { + "epoch": 8.308320108818862, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3141, + "step": 4581 + }, + { + "epoch": 8.310133756517796, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.282, + "step": 4582 + }, + { + "epoch": 8.311947404216731, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2063, + "step": 4583 + }, + { + "epoch": 8.313761051915666, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2624, + "step": 4584 + }, + { + "epoch": 8.3155746996146, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2124, + "step": 4585 + }, + { + "epoch": 8.317388347313534, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2113, + "step": 4586 + }, + { + "epoch": 8.319201995012468, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.3243, + "step": 4587 + }, + { + "epoch": 8.321015642711403, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2056, + "step": 4588 + }, + { + "epoch": 8.322829290410338, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.187, + "step": 4589 + }, + { + "epoch": 8.324642938109273, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2034, + "step": 4590 + }, + { + "epoch": 8.326456585808208, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.224, + "step": 4591 + }, + { + "epoch": 8.32827023350714, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.171, + "step": 4592 + }, + { + "epoch": 8.330083881206075, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.215, + "step": 4593 + }, + { + "epoch": 8.33189752890501, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1364, + "step": 4594 + }, + { + "epoch": 8.333711176603945, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1756, + "step": 4595 + }, + { + "epoch": 8.33552482430288, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1659, + "step": 4596 + }, + { + "epoch": 8.337338472001814, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1679, + "step": 4597 + }, + { + "epoch": 8.339152119700747, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 4598 + }, + { + "epoch": 8.340965767399682, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1568, + "step": 4599 + }, + { + "epoch": 8.342779415098617, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1371, + "step": 4600 + }, + { + "epoch": 8.344593062797552, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1812, + "step": 4601 + }, + { + "epoch": 8.346406710496487, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1624, + "step": 4602 + }, + { + "epoch": 8.348220358195421, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1963, + "step": 4603 + }, + { + "epoch": 8.350034005894354, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1935, + "step": 4604 + }, + { + "epoch": 8.351847653593289, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1773, + "step": 4605 + }, + { + "epoch": 8.353661301292224, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1923, + "step": 4606 + }, + { + "epoch": 8.355474948991159, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.1644, + "step": 4607 + }, + { + "epoch": 8.357288596690093, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1638, + "step": 4608 + }, + { + "epoch": 8.359102244389028, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1664, + "step": 4609 + }, + { + "epoch": 8.360915892087961, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.1935, + "step": 4610 + }, + { + "epoch": 8.362729539786896, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.228, + "step": 4611 + }, + { + "epoch": 8.36454318748583, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 4612 + }, + { + "epoch": 8.366356835184765, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.4114, + "step": 4613 + }, + { + "epoch": 8.3681704828837, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.3497, + "step": 4614 + }, + { + "epoch": 8.369984130582635, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.4707, + "step": 4615 + }, + { + "epoch": 8.371797778281568, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.3955, + "step": 4616 + }, + { + "epoch": 8.373611425980503, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.3658, + "step": 4617 + }, + { + "epoch": 8.375425073679438, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.3481, + "step": 4618 + }, + { + "epoch": 8.377238721378372, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.334, + "step": 4619 + }, + { + "epoch": 8.379052369077307, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 4620 + }, + { + "epoch": 8.380866016776242, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3002, + "step": 4621 + }, + { + "epoch": 8.382679664475175, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.49, + "step": 4622 + }, + { + "epoch": 8.38449331217411, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 4623 + }, + { + "epoch": 8.386306959873044, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 4624 + }, + { + "epoch": 8.38812060757198, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.356, + "step": 4625 + }, + { + "epoch": 8.389934255270914, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.3432, + "step": 4626 + }, + { + "epoch": 8.391747902969849, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3111, + "step": 4627 + }, + { + "epoch": 8.393561550668782, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 4628 + }, + { + "epoch": 8.395375198367717, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 4629 + }, + { + "epoch": 8.397188846066651, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.3303, + "step": 4630 + }, + { + "epoch": 8.399002493765586, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3124, + "step": 4631 + }, + { + "epoch": 8.40081614146452, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 4632 + }, + { + "epoch": 8.402629789163456, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2372, + "step": 4633 + }, + { + "epoch": 8.40444343686239, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2954, + "step": 4634 + }, + { + "epoch": 8.406257084561323, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 4635 + }, + { + "epoch": 8.408070732260258, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1955, + "step": 4636 + }, + { + "epoch": 8.409884379959193, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2547, + "step": 4637 + }, + { + "epoch": 8.411698027658128, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.2817, + "step": 4638 + }, + { + "epoch": 8.413511675357062, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2478, + "step": 4639 + }, + { + "epoch": 8.415325323055997, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2418, + "step": 4640 + }, + { + "epoch": 8.41713897075493, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2176, + "step": 4641 + }, + { + "epoch": 8.418952618453865, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1805, + "step": 4642 + }, + { + "epoch": 8.4207662661528, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1985, + "step": 4643 + }, + { + "epoch": 8.422579913851735, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.2416, + "step": 4644 + }, + { + "epoch": 8.42439356155067, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1508, + "step": 4645 + }, + { + "epoch": 8.426207209249604, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1396, + "step": 4646 + }, + { + "epoch": 8.428020856948537, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1641, + "step": 4647 + }, + { + "epoch": 8.429834504647472, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1714, + "step": 4648 + }, + { + "epoch": 8.431648152346407, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1896, + "step": 4649 + }, + { + "epoch": 8.433461800045341, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1975, + "step": 4650 + }, + { + "epoch": 8.435275447744276, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.2252, + "step": 4651 + }, + { + "epoch": 8.437089095443211, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.3588, + "step": 4652 + }, + { + "epoch": 8.438902743142144, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1516, + "step": 4653 + }, + { + "epoch": 8.440716390841079, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2257, + "step": 4654 + }, + { + "epoch": 8.442530038540013, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1517, + "step": 4655 + }, + { + "epoch": 8.444343686238948, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1446, + "step": 4656 + }, + { + "epoch": 8.446157333937883, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1674, + "step": 4657 + }, + { + "epoch": 8.447970981636818, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1783, + "step": 4658 + }, + { + "epoch": 8.44978462933575, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1592, + "step": 4659 + }, + { + "epoch": 8.451598277034686, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1791, + "step": 4660 + }, + { + "epoch": 8.45341192473362, + "grad_norm": 0.57421875, + "learning_rate": 0.0002, + "loss": 0.3506, + "step": 4661 + }, + { + "epoch": 8.455225572432555, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3721, + "step": 4662 + }, + { + "epoch": 8.45703922013149, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.4066, + "step": 4663 + }, + { + "epoch": 8.458852867830425, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3981, + "step": 4664 + }, + { + "epoch": 8.460666515529358, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 4665 + }, + { + "epoch": 8.462480163228292, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.3855, + "step": 4666 + }, + { + "epoch": 8.464293810927227, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.398, + "step": 4667 + }, + { + "epoch": 8.466107458626162, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 4668 + }, + { + "epoch": 8.467921106325097, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3376, + "step": 4669 + }, + { + "epoch": 8.469734754024032, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.289, + "step": 4670 + }, + { + "epoch": 8.471548401722965, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3259, + "step": 4671 + }, + { + "epoch": 8.4733620494219, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 4672 + }, + { + "epoch": 8.475175697120834, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3024, + "step": 4673 + }, + { + "epoch": 8.476989344819769, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3938, + "step": 4674 + }, + { + "epoch": 8.478802992518704, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2388, + "step": 4675 + }, + { + "epoch": 8.478802992518704, + "eval_loss": 1.9957058429718018, + "eval_runtime": 152.9909, + "eval_samples_per_second": 6.536, + "eval_steps_per_second": 6.536, + "step": 4675 + }, + { + "epoch": 8.478802992518704, + "mmlu_eval_accuracy": 0.29803512020941314, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.375, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.15789473684210525, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.29411764705882354, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.9029088770151916, + "step": 4675 + }, + { + "epoch": 8.480616640217638, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.3918, + "step": 4676 + }, + { + "epoch": 8.482430287916571, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2803, + "step": 4677 + }, + { + "epoch": 8.484243935615506, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 4678 + }, + { + "epoch": 8.486057583314441, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3615, + "step": 4679 + }, + { + "epoch": 8.487871231013376, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 4680 + }, + { + "epoch": 8.48968487871231, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 4681 + }, + { + "epoch": 8.491498526411245, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2804, + "step": 4682 + }, + { + "epoch": 8.493312174110178, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2909, + "step": 4683 + }, + { + "epoch": 8.495125821809113, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2713, + "step": 4684 + }, + { + "epoch": 8.496939469508048, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.3178, + "step": 4685 + }, + { + "epoch": 8.498753117206983, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2719, + "step": 4686 + }, + { + "epoch": 8.500566764905917, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 4687 + }, + { + "epoch": 8.502380412604852, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2961, + "step": 4688 + }, + { + "epoch": 8.504194060303785, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.2072, + "step": 4689 + }, + { + "epoch": 8.50600770800272, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2079, + "step": 4690 + }, + { + "epoch": 8.507821355701655, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2268, + "step": 4691 + }, + { + "epoch": 8.50963500340059, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1821, + "step": 4692 + }, + { + "epoch": 8.511448651099524, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1762, + "step": 4693 + }, + { + "epoch": 8.513262298798459, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1945, + "step": 4694 + }, + { + "epoch": 8.515075946497394, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1793, + "step": 4695 + }, + { + "epoch": 8.516889594196327, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.169, + "step": 4696 + }, + { + "epoch": 8.518703241895262, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1285, + "step": 4697 + }, + { + "epoch": 8.520516889594196, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1856, + "step": 4698 + }, + { + "epoch": 8.522330537293131, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1933, + "step": 4699 + }, + { + "epoch": 8.524144184992066, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.171, + "step": 4700 + }, + { + "epoch": 8.525957832691, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1958, + "step": 4701 + }, + { + "epoch": 8.527771480389934, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 4702 + }, + { + "epoch": 8.529585128088868, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 4703 + }, + { + "epoch": 8.531398775787803, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1393, + "step": 4704 + }, + { + "epoch": 8.533212423486738, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1731, + "step": 4705 + }, + { + "epoch": 8.535026071185673, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.1571, + "step": 4706 + }, + { + "epoch": 8.536839718884607, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1661, + "step": 4707 + }, + { + "epoch": 8.53865336658354, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1483, + "step": 4708 + }, + { + "epoch": 8.540467014282475, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.169, + "step": 4709 + }, + { + "epoch": 8.54228066198141, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1965, + "step": 4710 + }, + { + "epoch": 8.544094309680345, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.2612, + "step": 4711 + }, + { + "epoch": 8.54590795737928, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4178, + "step": 4712 + }, + { + "epoch": 8.547721605078214, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.4055, + "step": 4713 + }, + { + "epoch": 8.549535252777147, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 4714 + }, + { + "epoch": 8.551348900476082, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3396, + "step": 4715 + }, + { + "epoch": 8.553162548175017, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 4716 + }, + { + "epoch": 8.554976195873952, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.339, + "step": 4717 + }, + { + "epoch": 8.556789843572886, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.3599, + "step": 4718 + }, + { + "epoch": 8.558603491271821, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 4719 + }, + { + "epoch": 8.560417138970754, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3547, + "step": 4720 + }, + { + "epoch": 8.562230786669689, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3528, + "step": 4721 + }, + { + "epoch": 8.564044434368624, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.4155, + "step": 4722 + }, + { + "epoch": 8.565858082067558, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2819, + "step": 4723 + }, + { + "epoch": 8.567671729766493, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3453, + "step": 4724 + }, + { + "epoch": 8.569485377465428, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.3564, + "step": 4725 + }, + { + "epoch": 8.571299025164361, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2884, + "step": 4726 + }, + { + "epoch": 8.573112672863296, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2983, + "step": 4727 + }, + { + "epoch": 8.57492632056223, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.3207, + "step": 4728 + }, + { + "epoch": 8.576739968261165, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.313, + "step": 4729 + }, + { + "epoch": 8.5785536159601, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.254, + "step": 4730 + }, + { + "epoch": 8.580367263659035, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3344, + "step": 4731 + }, + { + "epoch": 8.582180911357968, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.254, + "step": 4732 + }, + { + "epoch": 8.583994559056903, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2597, + "step": 4733 + }, + { + "epoch": 8.585808206755837, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.3492, + "step": 4734 + }, + { + "epoch": 8.587621854454772, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2683, + "step": 4735 + }, + { + "epoch": 8.589435502153707, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 4736 + }, + { + "epoch": 8.591249149852642, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.2907, + "step": 4737 + }, + { + "epoch": 8.593062797551575, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2588, + "step": 4738 + }, + { + "epoch": 8.59487644525051, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2034, + "step": 4739 + }, + { + "epoch": 8.596690092949444, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2527, + "step": 4740 + }, + { + "epoch": 8.598503740648379, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2252, + "step": 4741 + }, + { + "epoch": 8.600317388347314, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.2254, + "step": 4742 + }, + { + "epoch": 8.602131036046249, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.2233, + "step": 4743 + }, + { + "epoch": 8.603944683745183, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1964, + "step": 4744 + }, + { + "epoch": 8.605758331444116, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2182, + "step": 4745 + }, + { + "epoch": 8.607571979143051, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1675, + "step": 4746 + }, + { + "epoch": 8.609385626841986, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1661, + "step": 4747 + }, + { + "epoch": 8.61119927454092, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1529, + "step": 4748 + }, + { + "epoch": 8.613012922239855, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1754, + "step": 4749 + }, + { + "epoch": 8.614826569938788, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1446, + "step": 4750 + }, + { + "epoch": 8.616640217637723, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1778, + "step": 4751 + }, + { + "epoch": 8.618453865336658, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1989, + "step": 4752 + }, + { + "epoch": 8.620267513035593, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1917, + "step": 4753 + }, + { + "epoch": 8.622081160734528, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2023, + "step": 4754 + }, + { + "epoch": 8.623894808433462, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 4755 + }, + { + "epoch": 8.625708456132397, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1442, + "step": 4756 + }, + { + "epoch": 8.62752210383133, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1558, + "step": 4757 + }, + { + "epoch": 8.629335751530265, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1643, + "step": 4758 + }, + { + "epoch": 8.6311493992292, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1957, + "step": 4759 + }, + { + "epoch": 8.632963046928134, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.217, + "step": 4760 + }, + { + "epoch": 8.63477669462707, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 4761 + }, + { + "epoch": 8.636590342326004, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 4762 + }, + { + "epoch": 8.638403990024937, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 4763 + }, + { + "epoch": 8.640217637723872, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3359, + "step": 4764 + }, + { + "epoch": 8.642031285422807, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.3152, + "step": 4765 + }, + { + "epoch": 8.643844933121741, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.3515, + "step": 4766 + }, + { + "epoch": 8.645658580820676, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3864, + "step": 4767 + }, + { + "epoch": 8.64747222851961, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3505, + "step": 4768 + }, + { + "epoch": 8.649285876218544, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.3867, + "step": 4769 + }, + { + "epoch": 8.651099523917479, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2993, + "step": 4770 + }, + { + "epoch": 8.652913171616413, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 4771 + }, + { + "epoch": 8.654726819315348, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 4772 + }, + { + "epoch": 8.656540467014283, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2754, + "step": 4773 + }, + { + "epoch": 8.658354114713218, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.3193, + "step": 4774 + }, + { + "epoch": 8.66016776241215, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3051, + "step": 4775 + }, + { + "epoch": 8.661981410111085, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3124, + "step": 4776 + }, + { + "epoch": 8.66379505781002, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 4777 + }, + { + "epoch": 8.665608705508955, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3365, + "step": 4778 + }, + { + "epoch": 8.66742235320789, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.277, + "step": 4779 + }, + { + "epoch": 8.669236000906825, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.3423, + "step": 4780 + }, + { + "epoch": 8.671049648605758, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2522, + "step": 4781 + }, + { + "epoch": 8.672863296304692, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.3587, + "step": 4782 + }, + { + "epoch": 8.674676944003627, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2356, + "step": 4783 + }, + { + "epoch": 8.676490591702562, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2596, + "step": 4784 + }, + { + "epoch": 8.678304239401497, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.2154, + "step": 4785 + }, + { + "epoch": 8.680117887100431, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2797, + "step": 4786 + }, + { + "epoch": 8.681931534799364, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.2306, + "step": 4787 + }, + { + "epoch": 8.6837451824983, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2255, + "step": 4788 + }, + { + "epoch": 8.685558830197234, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 4789 + }, + { + "epoch": 8.687372477896169, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.258, + "step": 4790 + }, + { + "epoch": 8.689186125595104, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1645, + "step": 4791 + }, + { + "epoch": 8.690999773294038, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2221, + "step": 4792 + }, + { + "epoch": 8.692813420992973, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1622, + "step": 4793 + }, + { + "epoch": 8.694627068691906, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1849, + "step": 4794 + }, + { + "epoch": 8.69644071639084, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1976, + "step": 4795 + }, + { + "epoch": 8.698254364089776, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2218, + "step": 4796 + }, + { + "epoch": 8.70006801178871, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1644, + "step": 4797 + }, + { + "epoch": 8.701881659487645, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1537, + "step": 4798 + }, + { + "epoch": 8.703695307186578, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1835, + "step": 4799 + }, + { + "epoch": 8.705508954885513, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1522, + "step": 4800 + }, + { + "epoch": 8.707322602584448, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.188, + "step": 4801 + }, + { + "epoch": 8.709136250283382, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1572, + "step": 4802 + }, + { + "epoch": 8.710949897982317, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1373, + "step": 4803 + }, + { + "epoch": 8.712763545681252, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2012, + "step": 4804 + }, + { + "epoch": 8.714577193380187, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1328, + "step": 4805 + }, + { + "epoch": 8.71639084107912, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1384, + "step": 4806 + }, + { + "epoch": 8.718204488778055, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1657, + "step": 4807 + }, + { + "epoch": 8.72001813647699, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.174, + "step": 4808 + }, + { + "epoch": 8.721831784175924, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1763, + "step": 4809 + }, + { + "epoch": 8.723645431874859, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1784, + "step": 4810 + }, + { + "epoch": 8.725459079573792, + "grad_norm": 0.421875, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 4811 + }, + { + "epoch": 8.727272727272727, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.4265, + "step": 4812 + }, + { + "epoch": 8.729086374971661, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3638, + "step": 4813 + }, + { + "epoch": 8.730900022670596, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 4814 + }, + { + "epoch": 8.732713670369531, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.43, + "step": 4815 + }, + { + "epoch": 8.734527318068466, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4185, + "step": 4816 + }, + { + "epoch": 8.7363409657674, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.2532, + "step": 4817 + }, + { + "epoch": 8.738154613466333, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3686, + "step": 4818 + }, + { + "epoch": 8.739968261165268, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 4819 + }, + { + "epoch": 8.741781908864203, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 4820 + }, + { + "epoch": 8.743595556563138, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2747, + "step": 4821 + }, + { + "epoch": 8.745409204262073, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3328, + "step": 4822 + }, + { + "epoch": 8.747222851961007, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3777, + "step": 4823 + }, + { + "epoch": 8.74903649965994, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.3508, + "step": 4824 + }, + { + "epoch": 8.750850147358875, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.4089, + "step": 4825 + }, + { + "epoch": 8.75266379505781, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.257, + "step": 4826 + }, + { + "epoch": 8.754477442756745, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.3368, + "step": 4827 + }, + { + "epoch": 8.75629109045568, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.3009, + "step": 4828 + }, + { + "epoch": 8.758104738154614, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2965, + "step": 4829 + }, + { + "epoch": 8.759918385853547, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2775, + "step": 4830 + }, + { + "epoch": 8.761732033552482, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.3695, + "step": 4831 + }, + { + "epoch": 8.763545681251417, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3282, + "step": 4832 + }, + { + "epoch": 8.765359328950352, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3589, + "step": 4833 + }, + { + "epoch": 8.767172976649286, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.3061, + "step": 4834 + }, + { + "epoch": 8.768986624348221, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2831, + "step": 4835 + }, + { + "epoch": 8.770800272047154, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2344, + "step": 4836 + }, + { + "epoch": 8.772613919746089, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.222, + "step": 4837 + }, + { + "epoch": 8.774427567445024, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2372, + "step": 4838 + }, + { + "epoch": 8.776241215143958, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.3008, + "step": 4839 + }, + { + "epoch": 8.778054862842893, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.2756, + "step": 4840 + }, + { + "epoch": 8.779868510541828, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.273, + "step": 4841 + }, + { + "epoch": 8.781682158240761, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2381, + "step": 4842 + }, + { + "epoch": 8.783495805939696, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2037, + "step": 4843 + }, + { + "epoch": 8.78530945363863, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1932, + "step": 4844 + }, + { + "epoch": 8.787123101337565, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.2015, + "step": 4845 + }, + { + "epoch": 8.7889367490365, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1589, + "step": 4846 + }, + { + "epoch": 8.790750396735435, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2254, + "step": 4847 + }, + { + "epoch": 8.792564044434368, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1503, + "step": 4848 + }, + { + "epoch": 8.794377692133303, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1485, + "step": 4849 + }, + { + "epoch": 8.796191339832237, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2239, + "step": 4850 + }, + { + "epoch": 8.798004987531172, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1703, + "step": 4851 + }, + { + "epoch": 8.799818635230107, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1511, + "step": 4852 + }, + { + "epoch": 8.801632282929042, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.172, + "step": 4853 + }, + { + "epoch": 8.803445930627976, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1867, + "step": 4854 + }, + { + "epoch": 8.80525957832691, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1941, + "step": 4855 + }, + { + "epoch": 8.807073226025844, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.196, + "step": 4856 + }, + { + "epoch": 8.808886873724779, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.189, + "step": 4857 + }, + { + "epoch": 8.810700521423714, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2021, + "step": 4858 + }, + { + "epoch": 8.812514169122649, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.1917, + "step": 4859 + }, + { + "epoch": 8.814327816821582, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.212, + "step": 4860 + }, + { + "epoch": 8.816141464520516, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2896, + "step": 4861 + }, + { + "epoch": 8.817955112219451, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 4862 + }, + { + "epoch": 8.817955112219451, + "eval_loss": 2.0253140926361084, + "eval_runtime": 150.3873, + "eval_samples_per_second": 6.649, + "eval_steps_per_second": 6.649, + "step": 4862 + }, + { + "epoch": 8.817955112219451, + "mmlu_eval_accuracy": 0.3074791647843592, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.3, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.0, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.4857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.053722585516767, + "step": 4862 + }, + { + "epoch": 8.819768759918386, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3922, + "step": 4863 + }, + { + "epoch": 8.82158240761732, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3918, + "step": 4864 + }, + { + "epoch": 8.823396055316255, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.3155, + "step": 4865 + }, + { + "epoch": 8.82520970301519, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.4714, + "step": 4866 + }, + { + "epoch": 8.827023350714123, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 4867 + }, + { + "epoch": 8.828836998413058, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.3741, + "step": 4868 + }, + { + "epoch": 8.830650646111993, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 4869 + }, + { + "epoch": 8.832464293810927, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 4870 + }, + { + "epoch": 8.834277941509862, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.3034, + "step": 4871 + }, + { + "epoch": 8.836091589208797, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.3137, + "step": 4872 + }, + { + "epoch": 8.83790523690773, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2372, + "step": 4873 + }, + { + "epoch": 8.839718884606665, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.3824, + "step": 4874 + }, + { + "epoch": 8.8415325323056, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3619, + "step": 4875 + }, + { + "epoch": 8.843346180004534, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.3991, + "step": 4876 + }, + { + "epoch": 8.845159827703469, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 4877 + }, + { + "epoch": 8.846973475402404, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.277, + "step": 4878 + }, + { + "epoch": 8.848787123101337, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 4879 + }, + { + "epoch": 8.850600770800272, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2721, + "step": 4880 + }, + { + "epoch": 8.852414418499206, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.3155, + "step": 4881 + }, + { + "epoch": 8.854228066198141, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2273, + "step": 4882 + }, + { + "epoch": 8.856041713897076, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 4883 + }, + { + "epoch": 8.85785536159601, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2633, + "step": 4884 + }, + { + "epoch": 8.859669009294944, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 4885 + }, + { + "epoch": 8.861482656993878, + "grad_norm": 0.40234375, + "learning_rate": 0.0002, + "loss": 0.3199, + "step": 4886 + }, + { + "epoch": 8.863296304692813, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 4887 + }, + { + "epoch": 8.865109952391748, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 4888 + }, + { + "epoch": 8.866923600090683, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 4889 + }, + { + "epoch": 8.868737247789618, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 4890 + }, + { + "epoch": 8.87055089548855, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2185, + "step": 4891 + }, + { + "epoch": 8.872364543187485, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1739, + "step": 4892 + }, + { + "epoch": 8.87417819088642, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2418, + "step": 4893 + }, + { + "epoch": 8.875991838585355, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.208, + "step": 4894 + }, + { + "epoch": 8.87780548628429, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.2015, + "step": 4895 + }, + { + "epoch": 8.879619133983224, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2067, + "step": 4896 + }, + { + "epoch": 8.881432781682157, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1874, + "step": 4897 + }, + { + "epoch": 8.883246429381092, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1771, + "step": 4898 + }, + { + "epoch": 8.885060077080027, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1749, + "step": 4899 + }, + { + "epoch": 8.886873724778962, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.1962, + "step": 4900 + }, + { + "epoch": 8.888687372477897, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1404, + "step": 4901 + }, + { + "epoch": 8.890501020176831, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1574, + "step": 4902 + }, + { + "epoch": 8.892314667875764, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1475, + "step": 4903 + }, + { + "epoch": 8.894128315574699, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1892, + "step": 4904 + }, + { + "epoch": 8.895941963273634, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.2365, + "step": 4905 + }, + { + "epoch": 8.897755610972569, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1549, + "step": 4906 + }, + { + "epoch": 8.899569258671503, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.1623, + "step": 4907 + }, + { + "epoch": 8.901382906370438, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1836, + "step": 4908 + }, + { + "epoch": 8.903196554069371, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.173, + "step": 4909 + }, + { + "epoch": 8.905010201768306, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.2095, + "step": 4910 + }, + { + "epoch": 8.90682384946724, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.2404, + "step": 4911 + }, + { + "epoch": 8.908637497166175, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.3588, + "step": 4912 + }, + { + "epoch": 8.91045114486511, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.4695, + "step": 4913 + }, + { + "epoch": 8.912264792564045, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 4914 + }, + { + "epoch": 8.91407844026298, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.3085, + "step": 4915 + }, + { + "epoch": 8.915892087961913, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 4916 + }, + { + "epoch": 8.917705735660848, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3563, + "step": 4917 + }, + { + "epoch": 8.919519383359782, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3722, + "step": 4918 + }, + { + "epoch": 8.921333031058717, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3359, + "step": 4919 + }, + { + "epoch": 8.923146678757652, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4076, + "step": 4920 + }, + { + "epoch": 8.924960326456585, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.4035, + "step": 4921 + }, + { + "epoch": 8.92677397415552, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.321, + "step": 4922 + }, + { + "epoch": 8.928587621854454, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 4923 + }, + { + "epoch": 8.93040126955339, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3471, + "step": 4924 + }, + { + "epoch": 8.932214917252324, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3171, + "step": 4925 + }, + { + "epoch": 8.934028564951259, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 4926 + }, + { + "epoch": 8.935842212650194, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.3991, + "step": 4927 + }, + { + "epoch": 8.937655860349127, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.3129, + "step": 4928 + }, + { + "epoch": 8.939469508048061, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2525, + "step": 4929 + }, + { + "epoch": 8.941283155746996, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.3359, + "step": 4930 + }, + { + "epoch": 8.94309680344593, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.4342, + "step": 4931 + }, + { + "epoch": 8.944910451144866, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 4932 + }, + { + "epoch": 8.9467240988438, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.293, + "step": 4933 + }, + { + "epoch": 8.948537746542733, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2148, + "step": 4934 + }, + { + "epoch": 8.950351394241668, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.2758, + "step": 4935 + }, + { + "epoch": 8.952165041940603, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.3262, + "step": 4936 + }, + { + "epoch": 8.953978689639538, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.2091, + "step": 4937 + }, + { + "epoch": 8.955792337338472, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 4938 + }, + { + "epoch": 8.957605985037407, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2257, + "step": 4939 + }, + { + "epoch": 8.95941963273634, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.3274, + "step": 4940 + }, + { + "epoch": 8.961233280435275, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2996, + "step": 4941 + }, + { + "epoch": 8.96304692813421, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1936, + "step": 4942 + }, + { + "epoch": 8.964860575833145, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2021, + "step": 4943 + }, + { + "epoch": 8.96667422353208, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.2392, + "step": 4944 + }, + { + "epoch": 8.968487871231014, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1742, + "step": 4945 + }, + { + "epoch": 8.970301518929947, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1983, + "step": 4946 + }, + { + "epoch": 8.972115166628882, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1594, + "step": 4947 + }, + { + "epoch": 8.973928814327817, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1663, + "step": 4948 + }, + { + "epoch": 8.975742462026751, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2098, + "step": 4949 + }, + { + "epoch": 8.977556109725686, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1534, + "step": 4950 + }, + { + "epoch": 8.979369757424621, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1565, + "step": 4951 + }, + { + "epoch": 8.981183405123554, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.17, + "step": 4952 + }, + { + "epoch": 8.982997052822489, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1907, + "step": 4953 + }, + { + "epoch": 8.984810700521423, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1907, + "step": 4954 + }, + { + "epoch": 8.986624348220358, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1683, + "step": 4955 + }, + { + "epoch": 8.988437995919293, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1619, + "step": 4956 + }, + { + "epoch": 8.990251643618228, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1797, + "step": 4957 + }, + { + "epoch": 8.99206529131716, + "grad_norm": 0.52734375, + "learning_rate": 0.0002, + "loss": 0.1841, + "step": 4958 + }, + { + "epoch": 8.993878939016096, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1789, + "step": 4959 + }, + { + "epoch": 8.99569258671503, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2022, + "step": 4960 + }, + { + "epoch": 8.997506234413965, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2608, + "step": 4961 + }, + { + "epoch": 8.9993198821129, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 4962 + }, + { + "epoch": 9.001133529811835, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.2843, + "step": 4963 + }, + { + "epoch": 9.002947177510768, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.2897, + "step": 4964 + }, + { + "epoch": 9.004760825209702, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.274, + "step": 4965 + }, + { + "epoch": 9.006574472908637, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.2256, + "step": 4966 + }, + { + "epoch": 9.008388120607572, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.2867, + "step": 4967 + }, + { + "epoch": 9.010201768306507, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 4968 + }, + { + "epoch": 9.012015416005442, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.2875, + "step": 4969 + }, + { + "epoch": 9.013829063704376, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 4970 + }, + { + "epoch": 9.01564271140331, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.2154, + "step": 4971 + }, + { + "epoch": 9.017456359102244, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.253, + "step": 4972 + }, + { + "epoch": 9.019270006801179, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3188, + "step": 4973 + }, + { + "epoch": 9.021083654500114, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2378, + "step": 4974 + }, + { + "epoch": 9.022897302199048, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1633, + "step": 4975 + }, + { + "epoch": 9.024710949897983, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2134, + "step": 4976 + }, + { + "epoch": 9.026524597596916, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1864, + "step": 4977 + }, + { + "epoch": 9.028338245295851, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.2101, + "step": 4978 + }, + { + "epoch": 9.030151892994786, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2059, + "step": 4979 + }, + { + "epoch": 9.03196554069372, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1513, + "step": 4980 + }, + { + "epoch": 9.033779188392655, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1932, + "step": 4981 + }, + { + "epoch": 9.03559283609159, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 4982 + }, + { + "epoch": 9.037406483790523, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1981, + "step": 4983 + }, + { + "epoch": 9.039220131489458, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2072, + "step": 4984 + }, + { + "epoch": 9.041033779188393, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1795, + "step": 4985 + }, + { + "epoch": 9.042847426887327, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1846, + "step": 4986 + }, + { + "epoch": 9.044661074586262, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1771, + "step": 4987 + }, + { + "epoch": 9.046474722285197, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 4988 + }, + { + "epoch": 9.04828836998413, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1509, + "step": 4989 + }, + { + "epoch": 9.050102017683065, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1544, + "step": 4990 + }, + { + "epoch": 9.051915665382, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1496, + "step": 4991 + }, + { + "epoch": 9.053729313080934, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1396, + "step": 4992 + }, + { + "epoch": 9.055542960779869, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.151, + "step": 4993 + }, + { + "epoch": 9.057356608478804, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1471, + "step": 4994 + }, + { + "epoch": 9.059170256177737, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1235, + "step": 4995 + }, + { + "epoch": 9.060983903876672, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1181, + "step": 4996 + }, + { + "epoch": 9.062797551575606, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0964, + "step": 4997 + }, + { + "epoch": 9.064611199274541, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1383, + "step": 4998 + }, + { + "epoch": 9.066424846973476, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1096, + "step": 4999 + }, + { + "epoch": 9.06823849467241, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1254, + "step": 5000 + }, + { + "epoch": 9.070052142371344, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1416, + "step": 5001 + }, + { + "epoch": 9.071865790070278, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.1245, + "step": 5002 + }, + { + "epoch": 9.073679437769213, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1589, + "step": 5003 + }, + { + "epoch": 9.075493085468148, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.123, + "step": 5004 + }, + { + "epoch": 9.077306733167083, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1591, + "step": 5005 + }, + { + "epoch": 9.079120380866017, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.137, + "step": 5006 + }, + { + "epoch": 9.08093402856495, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1288, + "step": 5007 + }, + { + "epoch": 9.082747676263885, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1294, + "step": 5008 + }, + { + "epoch": 9.08456132396282, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.1478, + "step": 5009 + }, + { + "epoch": 9.086374971661755, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 0.1568, + "step": 5010 + }, + { + "epoch": 9.08818861936069, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.1382, + "step": 5011 + }, + { + "epoch": 9.090002267059624, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2018, + "step": 5012 + }, + { + "epoch": 9.091815914758557, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2533, + "step": 5013 + }, + { + "epoch": 9.093629562457492, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.3194, + "step": 5014 + }, + { + "epoch": 9.095443210156427, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.368, + "step": 5015 + }, + { + "epoch": 9.097256857855362, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2905, + "step": 5016 + }, + { + "epoch": 9.099070505554296, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2904, + "step": 5017 + }, + { + "epoch": 9.100884153253231, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.2162, + "step": 5018 + }, + { + "epoch": 9.102697800952164, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.301, + "step": 5019 + }, + { + "epoch": 9.104511448651099, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.2058, + "step": 5020 + }, + { + "epoch": 9.106325096350034, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.2378, + "step": 5021 + }, + { + "epoch": 9.108138744048969, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.3427, + "step": 5022 + }, + { + "epoch": 9.109952391747903, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2535, + "step": 5023 + }, + { + "epoch": 9.111766039446838, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2731, + "step": 5024 + }, + { + "epoch": 9.113579687145773, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2055, + "step": 5025 + }, + { + "epoch": 9.115393334844706, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 5026 + }, + { + "epoch": 9.11720698254364, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1816, + "step": 5027 + }, + { + "epoch": 9.119020630242575, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2374, + "step": 5028 + }, + { + "epoch": 9.12083427794151, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2111, + "step": 5029 + }, + { + "epoch": 9.122647925640445, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2735, + "step": 5030 + }, + { + "epoch": 9.12446157333938, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1725, + "step": 5031 + }, + { + "epoch": 9.126275221038313, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2169, + "step": 5032 + }, + { + "epoch": 9.128088868737247, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1775, + "step": 5033 + }, + { + "epoch": 9.129902516436182, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 5034 + }, + { + "epoch": 9.131716164135117, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1536, + "step": 5035 + }, + { + "epoch": 9.133529811834052, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1427, + "step": 5036 + }, + { + "epoch": 9.135343459532987, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.1718, + "step": 5037 + }, + { + "epoch": 9.13715710723192, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.197, + "step": 5038 + }, + { + "epoch": 9.138970754930854, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1332, + "step": 5039 + }, + { + "epoch": 9.140784402629789, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1601, + "step": 5040 + }, + { + "epoch": 9.142598050328724, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.194, + "step": 5041 + }, + { + "epoch": 9.144411698027659, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1283, + "step": 5042 + }, + { + "epoch": 9.146225345726593, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1446, + "step": 5043 + }, + { + "epoch": 9.148038993425526, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1156, + "step": 5044 + }, + { + "epoch": 9.149852641124461, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2043, + "step": 5045 + }, + { + "epoch": 9.151666288823396, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1219, + "step": 5046 + }, + { + "epoch": 9.15347993652233, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1405, + "step": 5047 + }, + { + "epoch": 9.155293584221265, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1119, + "step": 5048 + }, + { + "epoch": 9.1571072319202, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 5049 + }, + { + "epoch": 9.1571072319202, + "eval_loss": 1.9997354745864868, + "eval_runtime": 152.2828, + "eval_samples_per_second": 6.567, + "eval_steps_per_second": 6.567, + "step": 5049 + }, + { + "epoch": 9.1571072319202, + "mmlu_eval_accuracy": 0.3023945693713378, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.1111111111111111, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.3, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.21739130434782608, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.36, + "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.859329504117243, + "step": 5049 + }, + { + "epoch": 9.158920879619133, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1265, + "step": 5050 + }, + { + "epoch": 9.160734527318068, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 5051 + }, + { + "epoch": 9.162548175017003, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1752, + "step": 5052 + }, + { + "epoch": 9.164361822715938, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.119, + "step": 5053 + }, + { + "epoch": 9.166175470414872, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.166, + "step": 5054 + }, + { + "epoch": 9.167989118113807, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1637, + "step": 5055 + }, + { + "epoch": 9.16980276581274, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1759, + "step": 5056 + }, + { + "epoch": 9.171616413511675, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 5057 + }, + { + "epoch": 9.17343006121061, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.1572, + "step": 5058 + }, + { + "epoch": 9.175243708909544, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.1353, + "step": 5059 + }, + { + "epoch": 9.17705735660848, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.1639, + "step": 5060 + }, + { + "epoch": 9.178871004307414, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1674, + "step": 5061 + }, + { + "epoch": 9.180684652006347, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.2141, + "step": 5062 + }, + { + "epoch": 9.182498299705282, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 5063 + }, + { + "epoch": 9.184311947404217, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2915, + "step": 5064 + }, + { + "epoch": 9.186125595103151, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.3386, + "step": 5065 + }, + { + "epoch": 9.187939242802086, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.3249, + "step": 5066 + }, + { + "epoch": 9.18975289050102, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.2542, + "step": 5067 + }, + { + "epoch": 9.191566538199954, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 5068 + }, + { + "epoch": 9.193380185898889, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.3237, + "step": 5069 + }, + { + "epoch": 9.195193833597823, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.251, + "step": 5070 + }, + { + "epoch": 9.197007481296758, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 5071 + }, + { + "epoch": 9.198821128995693, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.2258, + "step": 5072 + }, + { + "epoch": 9.200634776694628, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3055, + "step": 5073 + }, + { + "epoch": 9.20244842439356, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2795, + "step": 5074 + }, + { + "epoch": 9.204262072092495, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2254, + "step": 5075 + }, + { + "epoch": 9.20607571979143, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1985, + "step": 5076 + }, + { + "epoch": 9.207889367490365, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1894, + "step": 5077 + }, + { + "epoch": 9.2097030151893, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.2635, + "step": 5078 + }, + { + "epoch": 9.211516662888235, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1821, + "step": 5079 + }, + { + "epoch": 9.213330310587168, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.2203, + "step": 5080 + }, + { + "epoch": 9.215143958286102, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 5081 + }, + { + "epoch": 9.216957605985037, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1581, + "step": 5082 + }, + { + "epoch": 9.218771253683972, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1992, + "step": 5083 + }, + { + "epoch": 9.220584901382907, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1762, + "step": 5084 + }, + { + "epoch": 9.222398549081841, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2053, + "step": 5085 + }, + { + "epoch": 9.224212196780776, + "grad_norm": 0.41015625, + "learning_rate": 0.0002, + "loss": 0.2517, + "step": 5086 + }, + { + "epoch": 9.22602584447971, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1477, + "step": 5087 + }, + { + "epoch": 9.227839492178644, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1993, + "step": 5088 + }, + { + "epoch": 9.229653139877579, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1591, + "step": 5089 + }, + { + "epoch": 9.231466787576514, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1785, + "step": 5090 + }, + { + "epoch": 9.233280435275448, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1515, + "step": 5091 + }, + { + "epoch": 9.235094082974383, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1675, + "step": 5092 + }, + { + "epoch": 9.236907730673316, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.185, + "step": 5093 + }, + { + "epoch": 9.23872137837225, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1292, + "step": 5094 + }, + { + "epoch": 9.240535026071186, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 5095 + }, + { + "epoch": 9.24234867377012, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1325, + "step": 5096 + }, + { + "epoch": 9.244162321469055, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1315, + "step": 5097 + }, + { + "epoch": 9.24597596916799, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1383, + "step": 5098 + }, + { + "epoch": 9.247789616866923, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 5099 + }, + { + "epoch": 9.249603264565858, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1431, + "step": 5100 + }, + { + "epoch": 9.251416912264792, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 5101 + }, + { + "epoch": 9.253230559963727, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1299, + "step": 5102 + }, + { + "epoch": 9.255044207662662, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1801, + "step": 5103 + }, + { + "epoch": 9.256857855361597, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1631, + "step": 5104 + }, + { + "epoch": 9.25867150306053, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1383, + "step": 5105 + }, + { + "epoch": 9.260485150759465, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1231, + "step": 5106 + }, + { + "epoch": 9.2622987984584, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.1642, + "step": 5107 + }, + { + "epoch": 9.264112446157334, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1382, + "step": 5108 + }, + { + "epoch": 9.265926093856269, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1405, + "step": 5109 + }, + { + "epoch": 9.267739741555204, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.1406, + "step": 5110 + }, + { + "epoch": 9.269553389254137, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.1638, + "step": 5111 + }, + { + "epoch": 9.271367036953071, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.16, + "step": 5112 + }, + { + "epoch": 9.273180684652006, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 5113 + }, + { + "epoch": 9.274994332350941, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 5114 + }, + { + "epoch": 9.276807980049876, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2371, + "step": 5115 + }, + { + "epoch": 9.27862162774881, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.4167, + "step": 5116 + }, + { + "epoch": 9.280435275447743, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.2782, + "step": 5117 + }, + { + "epoch": 9.282248923146678, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 5118 + }, + { + "epoch": 9.284062570845613, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 5119 + }, + { + "epoch": 9.285876218544548, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.2517, + "step": 5120 + }, + { + "epoch": 9.287689866243483, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2243, + "step": 5121 + }, + { + "epoch": 9.289503513942417, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2298, + "step": 5122 + }, + { + "epoch": 9.29131716164135, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2016, + "step": 5123 + }, + { + "epoch": 9.293130809340285, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3073, + "step": 5124 + }, + { + "epoch": 9.29494445703922, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 5125 + }, + { + "epoch": 9.296758104738155, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2288, + "step": 5126 + }, + { + "epoch": 9.29857175243709, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2152, + "step": 5127 + }, + { + "epoch": 9.300385400136024, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1971, + "step": 5128 + }, + { + "epoch": 9.302199047834957, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2346, + "step": 5129 + }, + { + "epoch": 9.304012695533892, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.18, + "step": 5130 + }, + { + "epoch": 9.305826343232827, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 5131 + }, + { + "epoch": 9.307639990931762, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1556, + "step": 5132 + }, + { + "epoch": 9.309453638630696, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.226, + "step": 5133 + }, + { + "epoch": 9.311267286329631, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.209, + "step": 5134 + }, + { + "epoch": 9.313080934028566, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 5135 + }, + { + "epoch": 9.314894581727499, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1613, + "step": 5136 + }, + { + "epoch": 9.316708229426434, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1616, + "step": 5137 + }, + { + "epoch": 9.318521877125368, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1559, + "step": 5138 + }, + { + "epoch": 9.320335524824303, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1635, + "step": 5139 + }, + { + "epoch": 9.322149172523238, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.2189, + "step": 5140 + }, + { + "epoch": 9.323962820222171, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1754, + "step": 5141 + }, + { + "epoch": 9.325776467921106, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1752, + "step": 5142 + }, + { + "epoch": 9.32759011562004, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1701, + "step": 5143 + }, + { + "epoch": 9.329403763318975, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1209, + "step": 5144 + }, + { + "epoch": 9.33121741101791, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1368, + "step": 5145 + }, + { + "epoch": 9.333031058716845, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1234, + "step": 5146 + }, + { + "epoch": 9.33484470641578, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1194, + "step": 5147 + }, + { + "epoch": 9.336658354114713, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1238, + "step": 5148 + }, + { + "epoch": 9.338472001813647, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1807, + "step": 5149 + }, + { + "epoch": 9.340285649512582, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 5150 + }, + { + "epoch": 9.342099297211517, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1149, + "step": 5151 + }, + { + "epoch": 9.343912944910452, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1153, + "step": 5152 + }, + { + "epoch": 9.345726592609386, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1537, + "step": 5153 + }, + { + "epoch": 9.34754024030832, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1239, + "step": 5154 + }, + { + "epoch": 9.349353888007254, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1557, + "step": 5155 + }, + { + "epoch": 9.351167535706189, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.13, + "step": 5156 + }, + { + "epoch": 9.352981183405124, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 0.1947, + "step": 5157 + }, + { + "epoch": 9.354794831104059, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1359, + "step": 5158 + }, + { + "epoch": 9.356608478802993, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.1359, + "step": 5159 + }, + { + "epoch": 9.358422126501926, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 5160 + }, + { + "epoch": 9.360235774200861, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1975, + "step": 5161 + }, + { + "epoch": 9.362049421899796, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.2142, + "step": 5162 + }, + { + "epoch": 9.36386306959873, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.3391, + "step": 5163 + }, + { + "epoch": 9.365676717297665, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2172, + "step": 5164 + }, + { + "epoch": 9.3674903649966, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.279, + "step": 5165 + }, + { + "epoch": 9.369304012695533, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.2523, + "step": 5166 + }, + { + "epoch": 9.371117660394468, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.307, + "step": 5167 + }, + { + "epoch": 9.372931308093403, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 5168 + }, + { + "epoch": 9.374744955792337, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.3106, + "step": 5169 + }, + { + "epoch": 9.376558603491272, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 5170 + }, + { + "epoch": 9.378372251190207, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.279, + "step": 5171 + }, + { + "epoch": 9.38018589888914, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3182, + "step": 5172 + }, + { + "epoch": 9.381999546588075, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2495, + "step": 5173 + }, + { + "epoch": 9.38381319428701, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.273, + "step": 5174 + }, + { + "epoch": 9.385626841985944, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.225, + "step": 5175 + }, + { + "epoch": 9.387440489684879, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2392, + "step": 5176 + }, + { + "epoch": 9.389254137383814, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1802, + "step": 5177 + }, + { + "epoch": 9.391067785082747, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.2628, + "step": 5178 + }, + { + "epoch": 9.392881432781682, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1656, + "step": 5179 + }, + { + "epoch": 9.394695080480616, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1818, + "step": 5180 + }, + { + "epoch": 9.396508728179551, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 5181 + }, + { + "epoch": 9.398322375878486, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1855, + "step": 5182 + }, + { + "epoch": 9.40013602357742, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.185, + "step": 5183 + }, + { + "epoch": 9.401949671276354, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2052, + "step": 5184 + }, + { + "epoch": 9.403763318975288, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.2151, + "step": 5185 + }, + { + "epoch": 9.405576966674223, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.2113, + "step": 5186 + }, + { + "epoch": 9.407390614373158, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1799, + "step": 5187 + }, + { + "epoch": 9.409204262072093, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1957, + "step": 5188 + }, + { + "epoch": 9.411017909771028, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1985, + "step": 5189 + }, + { + "epoch": 9.41283155746996, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1908, + "step": 5190 + }, + { + "epoch": 9.414645205168895, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1581, + "step": 5191 + }, + { + "epoch": 9.41645885286783, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.2102, + "step": 5192 + }, + { + "epoch": 9.418272500566765, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1324, + "step": 5193 + }, + { + "epoch": 9.4200861482657, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1432, + "step": 5194 + }, + { + "epoch": 9.421899795964634, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1385, + "step": 5195 + }, + { + "epoch": 9.42371344366357, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1827, + "step": 5196 + }, + { + "epoch": 9.425527091362502, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.143, + "step": 5197 + }, + { + "epoch": 9.427340739061437, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1149, + "step": 5198 + }, + { + "epoch": 9.429154386760372, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 5199 + }, + { + "epoch": 9.430968034459307, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1204, + "step": 5200 + }, + { + "epoch": 9.432781682158241, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.153, + "step": 5201 + }, + { + "epoch": 9.434595329857176, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 5202 + }, + { + "epoch": 9.436408977556109, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1476, + "step": 5203 + }, + { + "epoch": 9.438222625255044, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1647, + "step": 5204 + }, + { + "epoch": 9.440036272953979, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1366, + "step": 5205 + }, + { + "epoch": 9.441849920652913, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1622, + "step": 5206 + }, + { + "epoch": 9.443663568351848, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1747, + "step": 5207 + }, + { + "epoch": 9.445477216050783, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.138, + "step": 5208 + }, + { + "epoch": 9.447290863749716, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1623, + "step": 5209 + }, + { + "epoch": 9.44910451144865, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1537, + "step": 5210 + }, + { + "epoch": 9.450918159147585, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1597, + "step": 5211 + }, + { + "epoch": 9.45273180684652, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.2047, + "step": 5212 + }, + { + "epoch": 9.454545454545455, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 5213 + }, + { + "epoch": 9.45635910224439, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 5214 + }, + { + "epoch": 9.458172749943323, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.2781, + "step": 5215 + }, + { + "epoch": 9.459986397642258, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.2363, + "step": 5216 + }, + { + "epoch": 9.461800045341192, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 5217 + }, + { + "epoch": 9.463613693040127, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.3116, + "step": 5218 + }, + { + "epoch": 9.465427340739062, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.2598, + "step": 5219 + }, + { + "epoch": 9.467240988437997, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 5220 + }, + { + "epoch": 9.46905463613693, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 5221 + }, + { + "epoch": 9.470868283835864, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 5222 + }, + { + "epoch": 9.4726819315348, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 5223 + }, + { + "epoch": 9.474495579233734, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.3106, + "step": 5224 + }, + { + "epoch": 9.476309226932669, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.206, + "step": 5225 + }, + { + "epoch": 9.478122874631604, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1824, + "step": 5226 + }, + { + "epoch": 9.479936522330537, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2413, + "step": 5227 + }, + { + "epoch": 9.481750170029471, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2017, + "step": 5228 + }, + { + "epoch": 9.483563817728406, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1857, + "step": 5229 + }, + { + "epoch": 9.48537746542734, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.2662, + "step": 5230 + }, + { + "epoch": 9.487191113126276, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1879, + "step": 5231 + }, + { + "epoch": 9.48900476082521, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2166, + "step": 5232 + }, + { + "epoch": 9.490818408524143, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.2652, + "step": 5233 + }, + { + "epoch": 9.492632056223078, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1656, + "step": 5234 + }, + { + "epoch": 9.494445703922013, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.18, + "step": 5235 + }, + { + "epoch": 9.496259351620948, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1822, + "step": 5236 + }, + { + "epoch": 9.496259351620948, + "eval_loss": 2.056051254272461, + "eval_runtime": 152.5317, + "eval_samples_per_second": 6.556, + "eval_steps_per_second": 6.556, + "step": 5236 + }, + { + "epoch": 9.496259351620948, + "mmlu_eval_accuracy": 0.30759994501024174, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.1111111111111111, + "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_geography": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, + "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.4, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.43023255813953487, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.2571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.5, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.9013636934842884, + "step": 5236 + }, + { + "epoch": 9.498072999319882, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1578, + "step": 5237 + }, + { + "epoch": 9.499886647018817, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1822, + "step": 5238 + }, + { + "epoch": 9.50170029471775, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1426, + "step": 5239 + }, + { + "epoch": 9.503513942416685, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2052, + "step": 5240 + }, + { + "epoch": 9.50532759011562, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1907, + "step": 5241 + }, + { + "epoch": 9.507141237814555, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1743, + "step": 5242 + }, + { + "epoch": 9.50895488551349, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1757, + "step": 5243 + }, + { + "epoch": 9.510768533212424, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.2065, + "step": 5244 + }, + { + "epoch": 9.512582180911359, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1464, + "step": 5245 + }, + { + "epoch": 9.514395828610292, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.149, + "step": 5246 + }, + { + "epoch": 9.516209476309227, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1346, + "step": 5247 + }, + { + "epoch": 9.518023124008161, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1383, + "step": 5248 + }, + { + "epoch": 9.519836771707096, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1233, + "step": 5249 + }, + { + "epoch": 9.521650419406031, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1375, + "step": 5250 + }, + { + "epoch": 9.523464067104964, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1193, + "step": 5251 + }, + { + "epoch": 9.525277714803899, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1615, + "step": 5252 + }, + { + "epoch": 9.527091362502834, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 5253 + }, + { + "epoch": 9.528905010201768, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1497, + "step": 5254 + }, + { + "epoch": 9.530718657900703, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1377, + "step": 5255 + }, + { + "epoch": 9.532532305599638, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.1304, + "step": 5256 + }, + { + "epoch": 9.534345953298573, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1422, + "step": 5257 + }, + { + "epoch": 9.536159600997506, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1753, + "step": 5258 + }, + { + "epoch": 9.53797324869644, + "grad_norm": 0.482421875, + "learning_rate": 0.0002, + "loss": 0.1575, + "step": 5259 + }, + { + "epoch": 9.539786896395375, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1577, + "step": 5260 + }, + { + "epoch": 9.54160054409431, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1587, + "step": 5261 + }, + { + "epoch": 9.543414191793245, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2416, + "step": 5262 + }, + { + "epoch": 9.54522783949218, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 5263 + }, + { + "epoch": 9.547041487191112, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 5264 + }, + { + "epoch": 9.548855134890047, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2855, + "step": 5265 + }, + { + "epoch": 9.550668782588982, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.3749, + "step": 5266 + }, + { + "epoch": 9.552482430287917, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.275, + "step": 5267 + }, + { + "epoch": 9.554296077986852, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.2962, + "step": 5268 + }, + { + "epoch": 9.556109725685786, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2787, + "step": 5269 + }, + { + "epoch": 9.55792337338472, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.2119, + "step": 5270 + }, + { + "epoch": 9.559737021083654, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3153, + "step": 5271 + }, + { + "epoch": 9.561550668782589, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2944, + "step": 5272 + }, + { + "epoch": 9.563364316481524, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2301, + "step": 5273 + }, + { + "epoch": 9.565177964180458, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2262, + "step": 5274 + }, + { + "epoch": 9.566991611879393, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.3502, + "step": 5275 + }, + { + "epoch": 9.568805259578326, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2334, + "step": 5276 + }, + { + "epoch": 9.570618907277261, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 5277 + }, + { + "epoch": 9.572432554976196, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2213, + "step": 5278 + }, + { + "epoch": 9.57424620267513, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1974, + "step": 5279 + }, + { + "epoch": 9.576059850374065, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2351, + "step": 5280 + }, + { + "epoch": 9.577873498073, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2924, + "step": 5281 + }, + { + "epoch": 9.579687145771933, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2122, + "step": 5282 + }, + { + "epoch": 9.581500793470868, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1832, + "step": 5283 + }, + { + "epoch": 9.583314441169803, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2006, + "step": 5284 + }, + { + "epoch": 9.585128088868737, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1879, + "step": 5285 + }, + { + "epoch": 9.586941736567672, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2101, + "step": 5286 + }, + { + "epoch": 9.588755384266607, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1814, + "step": 5287 + }, + { + "epoch": 9.59056903196554, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1711, + "step": 5288 + }, + { + "epoch": 9.592382679664475, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1608, + "step": 5289 + }, + { + "epoch": 9.59419632736341, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2016, + "step": 5290 + }, + { + "epoch": 9.596009975062344, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.201, + "step": 5291 + }, + { + "epoch": 9.597823622761279, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1998, + "step": 5292 + }, + { + "epoch": 9.599637270460214, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.139, + "step": 5293 + }, + { + "epoch": 9.601450918159149, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1877, + "step": 5294 + }, + { + "epoch": 9.603264565858082, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1494, + "step": 5295 + }, + { + "epoch": 9.605078213557016, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.164, + "step": 5296 + }, + { + "epoch": 9.606891861255951, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.1755, + "step": 5297 + }, + { + "epoch": 9.608705508954886, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1244, + "step": 5298 + }, + { + "epoch": 9.61051915665382, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1472, + "step": 5299 + }, + { + "epoch": 9.612332804352754, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1247, + "step": 5300 + }, + { + "epoch": 9.614146452051688, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1441, + "step": 5301 + }, + { + "epoch": 9.615960099750623, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1125, + "step": 5302 + }, + { + "epoch": 9.617773747449558, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.1035, + "step": 5303 + }, + { + "epoch": 9.619587395148493, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1301, + "step": 5304 + }, + { + "epoch": 9.621401042847427, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.1425, + "step": 5305 + }, + { + "epoch": 9.623214690546362, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.14, + "step": 5306 + }, + { + "epoch": 9.625028338245295, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1402, + "step": 5307 + }, + { + "epoch": 9.62684198594423, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1467, + "step": 5308 + }, + { + "epoch": 9.628655633643165, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1701, + "step": 5309 + }, + { + "epoch": 9.6304692813421, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1701, + "step": 5310 + }, + { + "epoch": 9.632282929041034, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2158, + "step": 5311 + }, + { + "epoch": 9.634096576739967, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.2255, + "step": 5312 + }, + { + "epoch": 9.635910224438902, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 5313 + }, + { + "epoch": 9.637723872137837, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.3292, + "step": 5314 + }, + { + "epoch": 9.639537519836772, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.3329, + "step": 5315 + }, + { + "epoch": 9.641351167535706, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.3522, + "step": 5316 + }, + { + "epoch": 9.643164815234641, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 5317 + }, + { + "epoch": 9.644978462933576, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.2409, + "step": 5318 + }, + { + "epoch": 9.646792110632509, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.3315, + "step": 5319 + }, + { + "epoch": 9.648605758331444, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.3064, + "step": 5320 + }, + { + "epoch": 9.650419406030379, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3225, + "step": 5321 + }, + { + "epoch": 9.652233053729313, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 5322 + }, + { + "epoch": 9.654046701428248, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.2311, + "step": 5323 + }, + { + "epoch": 9.655860349127183, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.2787, + "step": 5324 + }, + { + "epoch": 9.657673996826116, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.2124, + "step": 5325 + }, + { + "epoch": 9.65948764452505, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2381, + "step": 5326 + }, + { + "epoch": 9.661301292223985, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2262, + "step": 5327 + }, + { + "epoch": 9.66311493992292, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2097, + "step": 5328 + }, + { + "epoch": 9.664928587621855, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 5329 + }, + { + "epoch": 9.66674223532079, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2895, + "step": 5330 + }, + { + "epoch": 9.668555883019723, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2029, + "step": 5331 + }, + { + "epoch": 9.670369530718657, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2423, + "step": 5332 + }, + { + "epoch": 9.672183178417592, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.2286, + "step": 5333 + }, + { + "epoch": 9.673996826116527, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1926, + "step": 5334 + }, + { + "epoch": 9.675810473815462, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2628, + "step": 5335 + }, + { + "epoch": 9.677624121514397, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 5336 + }, + { + "epoch": 9.67943776921333, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.2287, + "step": 5337 + }, + { + "epoch": 9.681251416912264, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2056, + "step": 5338 + }, + { + "epoch": 9.683065064611199, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1817, + "step": 5339 + }, + { + "epoch": 9.684878712310134, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.174, + "step": 5340 + }, + { + "epoch": 9.686692360009069, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1984, + "step": 5341 + }, + { + "epoch": 9.688506007708003, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1699, + "step": 5342 + }, + { + "epoch": 9.690319655406936, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1727, + "step": 5343 + }, + { + "epoch": 9.692133303105871, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1609, + "step": 5344 + }, + { + "epoch": 9.693946950804806, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1509, + "step": 5345 + }, + { + "epoch": 9.69576059850374, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1754, + "step": 5346 + }, + { + "epoch": 9.697574246202675, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1357, + "step": 5347 + }, + { + "epoch": 9.69938789390161, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1216, + "step": 5348 + }, + { + "epoch": 9.701201541600543, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1541, + "step": 5349 + }, + { + "epoch": 9.703015189299478, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1518, + "step": 5350 + }, + { + "epoch": 9.704828836998413, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1524, + "step": 5351 + }, + { + "epoch": 9.706642484697348, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1314, + "step": 5352 + }, + { + "epoch": 9.708456132396282, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1444, + "step": 5353 + }, + { + "epoch": 9.710269780095217, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1392, + "step": 5354 + }, + { + "epoch": 9.712083427794152, + "grad_norm": 0.5, + "learning_rate": 0.0002, + "loss": 0.1721, + "step": 5355 + }, + { + "epoch": 9.713897075493085, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 5356 + }, + { + "epoch": 9.71571072319202, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1732, + "step": 5357 + }, + { + "epoch": 9.717524370890954, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1635, + "step": 5358 + }, + { + "epoch": 9.71933801858989, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1565, + "step": 5359 + }, + { + "epoch": 9.721151666288824, + "grad_norm": 0.5234375, + "learning_rate": 0.0002, + "loss": 0.1641, + "step": 5360 + }, + { + "epoch": 9.722965313987757, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.147, + "step": 5361 + }, + { + "epoch": 9.724778961686692, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.2259, + "step": 5362 + }, + { + "epoch": 9.726592609385627, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 5363 + }, + { + "epoch": 9.728406257084561, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 5364 + }, + { + "epoch": 9.730219904783496, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.2961, + "step": 5365 + }, + { + "epoch": 9.73203355248243, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.3263, + "step": 5366 + }, + { + "epoch": 9.733847200181366, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2466, + "step": 5367 + }, + { + "epoch": 9.735660847880299, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2672, + "step": 5368 + }, + { + "epoch": 9.737474495579233, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 5369 + }, + { + "epoch": 9.739288143278168, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.305, + "step": 5370 + }, + { + "epoch": 9.741101790977103, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 5371 + }, + { + "epoch": 9.742915438676038, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2921, + "step": 5372 + }, + { + "epoch": 9.744729086374972, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.253, + "step": 5373 + }, + { + "epoch": 9.746542734073905, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2803, + "step": 5374 + }, + { + "epoch": 9.74835638177284, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2285, + "step": 5375 + }, + { + "epoch": 9.750170029471775, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3491, + "step": 5376 + }, + { + "epoch": 9.75198367717071, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.2263, + "step": 5377 + }, + { + "epoch": 9.753797324869645, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1888, + "step": 5378 + }, + { + "epoch": 9.75561097256858, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2543, + "step": 5379 + }, + { + "epoch": 9.757424620267512, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1981, + "step": 5380 + }, + { + "epoch": 9.759238267966447, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1991, + "step": 5381 + }, + { + "epoch": 9.761051915665382, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.201, + "step": 5382 + }, + { + "epoch": 9.762865563364317, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1971, + "step": 5383 + }, + { + "epoch": 9.764679211063251, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1809, + "step": 5384 + }, + { + "epoch": 9.766492858762186, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.2325, + "step": 5385 + }, + { + "epoch": 9.76830650646112, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1944, + "step": 5386 + }, + { + "epoch": 9.770120154160054, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1878, + "step": 5387 + }, + { + "epoch": 9.771933801858989, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1817, + "step": 5388 + }, + { + "epoch": 9.773747449557924, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.203, + "step": 5389 + }, + { + "epoch": 9.775561097256858, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1593, + "step": 5390 + }, + { + "epoch": 9.777374744955793, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1514, + "step": 5391 + }, + { + "epoch": 9.779188392654726, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 5392 + }, + { + "epoch": 9.78100204035366, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1856, + "step": 5393 + }, + { + "epoch": 9.782815688052596, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 5394 + }, + { + "epoch": 9.78462933575153, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1602, + "step": 5395 + }, + { + "epoch": 9.786442983450465, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1411, + "step": 5396 + }, + { + "epoch": 9.7882566311494, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1281, + "step": 5397 + }, + { + "epoch": 9.790070278848333, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1255, + "step": 5398 + }, + { + "epoch": 9.791883926547268, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1611, + "step": 5399 + }, + { + "epoch": 9.793697574246202, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1336, + "step": 5400 + }, + { + "epoch": 9.795511221945137, + "grad_norm": 0.478515625, + "learning_rate": 0.0002, + "loss": 0.2323, + "step": 5401 + }, + { + "epoch": 9.797324869644072, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1245, + "step": 5402 + }, + { + "epoch": 9.799138517343007, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2222, + "step": 5403 + }, + { + "epoch": 9.80095216504194, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 5404 + }, + { + "epoch": 9.802765812740875, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1736, + "step": 5405 + }, + { + "epoch": 9.80457946043981, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1522, + "step": 5406 + }, + { + "epoch": 9.806393108138744, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1675, + "step": 5407 + }, + { + "epoch": 9.808206755837679, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1349, + "step": 5408 + }, + { + "epoch": 9.810020403536614, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1606, + "step": 5409 + }, + { + "epoch": 9.811834051235547, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1528, + "step": 5410 + }, + { + "epoch": 9.813647698934481, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1563, + "step": 5411 + }, + { + "epoch": 9.815461346633416, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.1939, + "step": 5412 + }, + { + "epoch": 9.817274994332351, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2953, + "step": 5413 + }, + { + "epoch": 9.819088642031286, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3857, + "step": 5414 + }, + { + "epoch": 9.82090228973022, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.3539, + "step": 5415 + }, + { + "epoch": 9.822715937429155, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.3213, + "step": 5416 + }, + { + "epoch": 9.824529585128088, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2851, + "step": 5417 + }, + { + "epoch": 9.826343232827023, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.2533, + "step": 5418 + }, + { + "epoch": 9.828156880525958, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.4039, + "step": 5419 + }, + { + "epoch": 9.829970528224893, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3837, + "step": 5420 + }, + { + "epoch": 9.831784175923827, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2499, + "step": 5421 + }, + { + "epoch": 9.83359782362276, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 5422 + }, + { + "epoch": 9.835411471321695, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.242, + "step": 5423 + }, + { + "epoch": 9.835411471321695, + "eval_loss": 2.1229794025421143, + "eval_runtime": 152.7294, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 6.548, + "step": 5423 + }, + { + "epoch": 9.835411471321695, + "mmlu_eval_accuracy": 0.30429546062952284, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, + "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.391304347826087, + "mmlu_eval_accuracy_human_sexuality": 0.0, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.733098420167888, + "step": 5423 + }, + { + "epoch": 9.83722511902063, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 5424 + }, + { + "epoch": 9.839038766719565, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.3053, + "step": 5425 + }, + { + "epoch": 9.8408524144185, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.3028, + "step": 5426 + }, + { + "epoch": 9.842666062117434, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2757, + "step": 5427 + }, + { + "epoch": 9.844479709816369, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2135, + "step": 5428 + }, + { + "epoch": 9.846293357515302, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.2317, + "step": 5429 + }, + { + "epoch": 9.848107005214237, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2757, + "step": 5430 + }, + { + "epoch": 9.849920652913172, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.2296, + "step": 5431 + }, + { + "epoch": 9.851734300612106, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.2667, + "step": 5432 + }, + { + "epoch": 9.853547948311041, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 5433 + }, + { + "epoch": 9.855361596009976, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 5434 + }, + { + "epoch": 9.857175243708909, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1915, + "step": 5435 + }, + { + "epoch": 9.858988891407844, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2128, + "step": 5436 + }, + { + "epoch": 9.860802539106778, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1586, + "step": 5437 + }, + { + "epoch": 9.862616186805713, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.2263, + "step": 5438 + }, + { + "epoch": 9.864429834504648, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1867, + "step": 5439 + }, + { + "epoch": 9.866243482203583, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1593, + "step": 5440 + }, + { + "epoch": 9.868057129902516, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1874, + "step": 5441 + }, + { + "epoch": 9.86987077760145, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1685, + "step": 5442 + }, + { + "epoch": 9.871684425300385, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1574, + "step": 5443 + }, + { + "epoch": 9.87349807299932, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1926, + "step": 5444 + }, + { + "epoch": 9.875311720698255, + "grad_norm": 0.4375, + "learning_rate": 0.0002, + "loss": 0.233, + "step": 5445 + }, + { + "epoch": 9.87712536839719, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1284, + "step": 5446 + }, + { + "epoch": 9.878939016096123, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.1659, + "step": 5447 + }, + { + "epoch": 9.880752663795057, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1565, + "step": 5448 + }, + { + "epoch": 9.882566311493992, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 5449 + }, + { + "epoch": 9.884379959192927, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1503, + "step": 5450 + }, + { + "epoch": 9.886193606891862, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1515, + "step": 5451 + }, + { + "epoch": 9.888007254590796, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1091, + "step": 5452 + }, + { + "epoch": 9.88982090228973, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 5453 + }, + { + "epoch": 9.891634549988664, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1425, + "step": 5454 + }, + { + "epoch": 9.893448197687599, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1322, + "step": 5455 + }, + { + "epoch": 9.895261845386534, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1496, + "step": 5456 + }, + { + "epoch": 9.897075493085469, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1418, + "step": 5457 + }, + { + "epoch": 9.898889140784403, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1526, + "step": 5458 + }, + { + "epoch": 9.900702788483336, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1534, + "step": 5459 + }, + { + "epoch": 9.902516436182271, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1834, + "step": 5460 + }, + { + "epoch": 9.904330083881206, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.214, + "step": 5461 + }, + { + "epoch": 9.90614373158014, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.2085, + "step": 5462 + }, + { + "epoch": 9.907957379279075, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 5463 + }, + { + "epoch": 9.90977102697801, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.3038, + "step": 5464 + }, + { + "epoch": 9.911584674676945, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.3308, + "step": 5465 + }, + { + "epoch": 9.913398322375878, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.2592, + "step": 5466 + }, + { + "epoch": 9.915211970074813, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.2166, + "step": 5467 + }, + { + "epoch": 9.917025617773747, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.4062, + "step": 5468 + }, + { + "epoch": 9.918839265472682, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.3025, + "step": 5469 + }, + { + "epoch": 9.920652913171617, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 5470 + }, + { + "epoch": 9.92246656087055, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2558, + "step": 5471 + }, + { + "epoch": 9.924280208569485, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.3394, + "step": 5472 + }, + { + "epoch": 9.92609385626842, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2989, + "step": 5473 + }, + { + "epoch": 9.927907503967354, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2843, + "step": 5474 + }, + { + "epoch": 9.929721151666289, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2545, + "step": 5475 + }, + { + "epoch": 9.931534799365224, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.235, + "step": 5476 + }, + { + "epoch": 9.933348447064159, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2969, + "step": 5477 + }, + { + "epoch": 9.935162094763092, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2359, + "step": 5478 + }, + { + "epoch": 9.936975742462026, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.251, + "step": 5479 + }, + { + "epoch": 9.938789390160961, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.2227, + "step": 5480 + }, + { + "epoch": 9.940603037859896, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1826, + "step": 5481 + }, + { + "epoch": 9.94241668555883, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1979, + "step": 5482 + }, + { + "epoch": 9.944230333257766, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.2006, + "step": 5483 + }, + { + "epoch": 9.946043980956699, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 5484 + }, + { + "epoch": 9.947857628655633, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.2554, + "step": 5485 + }, + { + "epoch": 9.949671276354568, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.257, + "step": 5486 + }, + { + "epoch": 9.951484924053503, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1894, + "step": 5487 + }, + { + "epoch": 9.953298571752438, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1795, + "step": 5488 + }, + { + "epoch": 9.955112219451372, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1913, + "step": 5489 + }, + { + "epoch": 9.956925867150305, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1941, + "step": 5490 + }, + { + "epoch": 9.95873951484924, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1938, + "step": 5491 + }, + { + "epoch": 9.960553162548175, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.3717, + "step": 5492 + }, + { + "epoch": 9.96236681024711, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1605, + "step": 5493 + }, + { + "epoch": 9.964180457946044, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 5494 + }, + { + "epoch": 9.96599410564498, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1566, + "step": 5495 + }, + { + "epoch": 9.967807753343912, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1461, + "step": 5496 + }, + { + "epoch": 9.969621401042847, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.17, + "step": 5497 + }, + { + "epoch": 9.971435048741782, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1549, + "step": 5498 + }, + { + "epoch": 9.973248696440717, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1217, + "step": 5499 + }, + { + "epoch": 9.975062344139651, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1459, + "step": 5500 + }, + { + "epoch": 9.976875991838586, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 5501 + }, + { + "epoch": 9.978689639537519, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1199, + "step": 5502 + }, + { + "epoch": 9.980503287236454, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1336, + "step": 5503 + }, + { + "epoch": 9.982316934935389, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1362, + "step": 5504 + }, + { + "epoch": 9.984130582634323, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1111, + "step": 5505 + }, + { + "epoch": 9.985944230333258, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.186, + "step": 5506 + }, + { + "epoch": 9.987757878032193, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1508, + "step": 5507 + }, + { + "epoch": 9.989571525731126, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.163, + "step": 5508 + }, + { + "epoch": 9.99138517343006, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1426, + "step": 5509 + }, + { + "epoch": 9.993198821128995, + "grad_norm": 0.4453125, + "learning_rate": 0.0002, + "loss": 0.1697, + "step": 5510 + }, + { + "epoch": 9.99501246882793, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1571, + "step": 5511 + }, + { + "epoch": 9.996826116526865, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2042, + "step": 5512 + }, + { + "epoch": 9.9986397642258, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2712, + "step": 5513 + }, + { + "epoch": 10.000453411924733, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.2311, + "step": 5514 + }, + { + "epoch": 10.002267059623668, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.2324, + "step": 5515 + }, + { + "epoch": 10.004080707322602, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.1609, + "step": 5516 + }, + { + "epoch": 10.005894355021537, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.2098, + "step": 5517 + }, + { + "epoch": 10.007708002720472, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.198, + "step": 5518 + }, + { + "epoch": 10.009521650419407, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1728, + "step": 5519 + }, + { + "epoch": 10.01133529811834, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.2277, + "step": 5520 + }, + { + "epoch": 10.013148945817274, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1501, + "step": 5521 + }, + { + "epoch": 10.01496259351621, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1843, + "step": 5522 + }, + { + "epoch": 10.016776241215144, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1857, + "step": 5523 + }, + { + "epoch": 10.018589888914079, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1231, + "step": 5524 + }, + { + "epoch": 10.020403536613014, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1569, + "step": 5525 + }, + { + "epoch": 10.022217184311947, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1894, + "step": 5526 + }, + { + "epoch": 10.024030832010881, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.174, + "step": 5527 + }, + { + "epoch": 10.025844479709816, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1424, + "step": 5528 + }, + { + "epoch": 10.02765812740875, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1533, + "step": 5529 + }, + { + "epoch": 10.029471775107686, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.13, + "step": 5530 + }, + { + "epoch": 10.03128542280662, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1659, + "step": 5531 + }, + { + "epoch": 10.033099070505555, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1426, + "step": 5532 + }, + { + "epoch": 10.034912718204488, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1353, + "step": 5533 + }, + { + "epoch": 10.036726365903423, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1522, + "step": 5534 + }, + { + "epoch": 10.038540013602358, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1348, + "step": 5535 + }, + { + "epoch": 10.040353661301292, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1265, + "step": 5536 + }, + { + "epoch": 10.042167309000227, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1136, + "step": 5537 + }, + { + "epoch": 10.043980956699162, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1328, + "step": 5538 + }, + { + "epoch": 10.045794604398095, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1216, + "step": 5539 + }, + { + "epoch": 10.04760825209703, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0921, + "step": 5540 + }, + { + "epoch": 10.049421899795965, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1086, + "step": 5541 + }, + { + "epoch": 10.0512355474949, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1273, + "step": 5542 + }, + { + "epoch": 10.053049195193834, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1408, + "step": 5543 + }, + { + "epoch": 10.054862842892769, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1228, + "step": 5544 + }, + { + "epoch": 10.056676490591702, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1014, + "step": 5545 + }, + { + "epoch": 10.058490138290637, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 5546 + }, + { + "epoch": 10.060303785989571, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0992, + "step": 5547 + }, + { + "epoch": 10.062117433688506, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0956, + "step": 5548 + }, + { + "epoch": 10.063931081387441, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 5549 + }, + { + "epoch": 10.065744729086376, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0971, + "step": 5550 + }, + { + "epoch": 10.067558376785309, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 5551 + }, + { + "epoch": 10.069372024484244, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0971, + "step": 5552 + }, + { + "epoch": 10.071185672183178, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.097, + "step": 5553 + }, + { + "epoch": 10.072999319882113, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1088, + "step": 5554 + }, + { + "epoch": 10.074812967581048, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.1085, + "step": 5555 + }, + { + "epoch": 10.076626615279983, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.1032, + "step": 5556 + }, + { + "epoch": 10.078440262978916, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.132, + "step": 5557 + }, + { + "epoch": 10.08025391067785, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.123, + "step": 5558 + }, + { + "epoch": 10.082067558376785, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1321, + "step": 5559 + }, + { + "epoch": 10.08388120607572, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1196, + "step": 5560 + }, + { + "epoch": 10.085694853774655, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1396, + "step": 5561 + }, + { + "epoch": 10.08750850147359, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 5562 + }, + { + "epoch": 10.089322149172522, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.1588, + "step": 5563 + }, + { + "epoch": 10.091135796871457, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2307, + "step": 5564 + }, + { + "epoch": 10.092949444570392, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.2261, + "step": 5565 + }, + { + "epoch": 10.094763092269327, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.2036, + "step": 5566 + }, + { + "epoch": 10.096576739968262, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2888, + "step": 5567 + }, + { + "epoch": 10.098390387667196, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2988, + "step": 5568 + }, + { + "epoch": 10.10020403536613, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.2347, + "step": 5569 + }, + { + "epoch": 10.102017683065064, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1992, + "step": 5570 + }, + { + "epoch": 10.103831330763999, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2209, + "step": 5571 + }, + { + "epoch": 10.105644978462934, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.1742, + "step": 5572 + }, + { + "epoch": 10.107458626161868, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.1428, + "step": 5573 + }, + { + "epoch": 10.109272273860803, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1943, + "step": 5574 + }, + { + "epoch": 10.111085921559736, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1625, + "step": 5575 + }, + { + "epoch": 10.112899569258671, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1931, + "step": 5576 + }, + { + "epoch": 10.114713216957606, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1864, + "step": 5577 + }, + { + "epoch": 10.11652686465654, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1838, + "step": 5578 + }, + { + "epoch": 10.118340512355475, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1909, + "step": 5579 + }, + { + "epoch": 10.12015416005441, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.117, + "step": 5580 + }, + { + "epoch": 10.121967807753343, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1936, + "step": 5581 + }, + { + "epoch": 10.123781455452278, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.156, + "step": 5582 + }, + { + "epoch": 10.125595103151213, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1545, + "step": 5583 + }, + { + "epoch": 10.127408750850147, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.2038, + "step": 5584 + }, + { + "epoch": 10.129222398549082, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1567, + "step": 5585 + }, + { + "epoch": 10.131036046248017, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1281, + "step": 5586 + }, + { + "epoch": 10.132849693946952, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1187, + "step": 5587 + }, + { + "epoch": 10.134663341645885, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1342, + "step": 5588 + }, + { + "epoch": 10.13647698934482, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.153, + "step": 5589 + }, + { + "epoch": 10.138290637043754, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1153, + "step": 5590 + }, + { + "epoch": 10.140104284742689, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1233, + "step": 5591 + }, + { + "epoch": 10.141917932441624, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1289, + "step": 5592 + }, + { + "epoch": 10.143731580140559, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1169, + "step": 5593 + }, + { + "epoch": 10.145545227839492, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0999, + "step": 5594 + }, + { + "epoch": 10.147358875538426, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1101, + "step": 5595 + }, + { + "epoch": 10.149172523237361, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1231, + "step": 5596 + }, + { + "epoch": 10.150986170936296, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1026, + "step": 5597 + }, + { + "epoch": 10.15279981863523, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 5598 + }, + { + "epoch": 10.154613466334165, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 5599 + }, + { + "epoch": 10.156427114033098, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.097, + "step": 5600 + }, + { + "epoch": 10.158240761732033, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1119, + "step": 5601 + }, + { + "epoch": 10.160054409430968, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.144, + "step": 5602 + }, + { + "epoch": 10.161868057129903, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 5603 + }, + { + "epoch": 10.163681704828837, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 5604 + }, + { + "epoch": 10.165495352527772, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1406, + "step": 5605 + }, + { + "epoch": 10.167309000226705, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 5606 + }, + { + "epoch": 10.16912264792564, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1181, + "step": 5607 + }, + { + "epoch": 10.170936295624575, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1155, + "step": 5608 + }, + { + "epoch": 10.17274994332351, + "grad_norm": 0.11669921875, + "learning_rate": 0.0002, + "loss": 0.1087, + "step": 5609 + }, + { + "epoch": 10.174563591022444, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1277, + "step": 5610 + }, + { + "epoch": 10.174563591022444, + "eval_loss": 2.1025922298431396, + "eval_runtime": 152.672, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 6.55, + "step": 5610 + }, + { + "epoch": 10.174563591022444, + "mmlu_eval_accuracy": 0.290846330202936, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.0625, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.391304347826087, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.16666666666666666, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.36363636363636365, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2463768115942029, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.7593372665457598, + "step": 5610 + }, + { + "epoch": 10.17637723872138, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1319, + "step": 5611 + }, + { + "epoch": 10.178190886420312, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.1244, + "step": 5612 + }, + { + "epoch": 10.180004534119247, + "grad_norm": 0.62109375, + "learning_rate": 0.0002, + "loss": 0.2042, + "step": 5613 + }, + { + "epoch": 10.181818181818182, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 5614 + }, + { + "epoch": 10.183631829517116, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 5615 + }, + { + "epoch": 10.185445477216051, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.2111, + "step": 5616 + }, + { + "epoch": 10.187259124914986, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1847, + "step": 5617 + }, + { + "epoch": 10.189072772613919, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.2057, + "step": 5618 + }, + { + "epoch": 10.190886420312854, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1886, + "step": 5619 + }, + { + "epoch": 10.192700068011789, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2659, + "step": 5620 + }, + { + "epoch": 10.194513715710723, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1841, + "step": 5621 + }, + { + "epoch": 10.196327363409658, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.2209, + "step": 5622 + }, + { + "epoch": 10.198141011108593, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 5623 + }, + { + "epoch": 10.199954658807526, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2172, + "step": 5624 + }, + { + "epoch": 10.20176830650646, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1781, + "step": 5625 + }, + { + "epoch": 10.203581954205395, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1967, + "step": 5626 + }, + { + "epoch": 10.20539560190433, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1733, + "step": 5627 + }, + { + "epoch": 10.207209249603265, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1708, + "step": 5628 + }, + { + "epoch": 10.2090228973022, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 5629 + }, + { + "epoch": 10.210836545001133, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1201, + "step": 5630 + }, + { + "epoch": 10.212650192700067, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1467, + "step": 5631 + }, + { + "epoch": 10.214463840399002, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1359, + "step": 5632 + }, + { + "epoch": 10.216277488097937, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.162, + "step": 5633 + }, + { + "epoch": 10.218091135796872, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1429, + "step": 5634 + }, + { + "epoch": 10.219904783495807, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1535, + "step": 5635 + }, + { + "epoch": 10.22171843119474, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 5636 + }, + { + "epoch": 10.223532078893674, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1571, + "step": 5637 + }, + { + "epoch": 10.225345726592609, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1674, + "step": 5638 + }, + { + "epoch": 10.227159374291544, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1015, + "step": 5639 + }, + { + "epoch": 10.228973021990479, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1227, + "step": 5640 + }, + { + "epoch": 10.230786669689413, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1184, + "step": 5641 + }, + { + "epoch": 10.232600317388348, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 5642 + }, + { + "epoch": 10.234413965087281, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1358, + "step": 5643 + }, + { + "epoch": 10.236227612786216, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1276, + "step": 5644 + }, + { + "epoch": 10.23804126048515, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1486, + "step": 5645 + }, + { + "epoch": 10.239854908184086, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1145, + "step": 5646 + }, + { + "epoch": 10.24166855588302, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.113, + "step": 5647 + }, + { + "epoch": 10.243482203581955, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1015, + "step": 5648 + }, + { + "epoch": 10.245295851280888, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1507, + "step": 5649 + }, + { + "epoch": 10.247109498979823, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1047, + "step": 5650 + }, + { + "epoch": 10.248923146678758, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1015, + "step": 5651 + }, + { + "epoch": 10.250736794377692, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 5652 + }, + { + "epoch": 10.252550442076627, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1236, + "step": 5653 + }, + { + "epoch": 10.254364089775562, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1101, + "step": 5654 + }, + { + "epoch": 10.256177737474495, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1352, + "step": 5655 + }, + { + "epoch": 10.25799138517343, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 5656 + }, + { + "epoch": 10.259805032872364, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 5657 + }, + { + "epoch": 10.2616186805713, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1265, + "step": 5658 + }, + { + "epoch": 10.263432328270234, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 5659 + }, + { + "epoch": 10.265245975969169, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 5660 + }, + { + "epoch": 10.267059623668102, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 5661 + }, + { + "epoch": 10.268873271367037, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 0.1344, + "step": 5662 + }, + { + "epoch": 10.270686919065971, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.1626, + "step": 5663 + }, + { + "epoch": 10.272500566764906, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.2769, + "step": 5664 + }, + { + "epoch": 10.27431421446384, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.2058, + "step": 5665 + }, + { + "epoch": 10.276127862162776, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1868, + "step": 5666 + }, + { + "epoch": 10.277941509861709, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.1897, + "step": 5667 + }, + { + "epoch": 10.279755157560643, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 5668 + }, + { + "epoch": 10.281568805259578, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1885, + "step": 5669 + }, + { + "epoch": 10.283382452958513, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1691, + "step": 5670 + }, + { + "epoch": 10.285196100657448, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 5671 + }, + { + "epoch": 10.287009748356382, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2163, + "step": 5672 + }, + { + "epoch": 10.288823396055315, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2028, + "step": 5673 + }, + { + "epoch": 10.29063704375425, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1798, + "step": 5674 + }, + { + "epoch": 10.292450691453185, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1588, + "step": 5675 + }, + { + "epoch": 10.29426433915212, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.175, + "step": 5676 + }, + { + "epoch": 10.296077986851055, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1386, + "step": 5677 + }, + { + "epoch": 10.29789163454999, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2089, + "step": 5678 + }, + { + "epoch": 10.299705282248922, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1506, + "step": 5679 + }, + { + "epoch": 10.301518929947857, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1648, + "step": 5680 + }, + { + "epoch": 10.303332577646792, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.164, + "step": 5681 + }, + { + "epoch": 10.305146225345727, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1686, + "step": 5682 + }, + { + "epoch": 10.306959873044661, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1653, + "step": 5683 + }, + { + "epoch": 10.308773520743596, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.148, + "step": 5684 + }, + { + "epoch": 10.31058716844253, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1248, + "step": 5685 + }, + { + "epoch": 10.312400816141464, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1646, + "step": 5686 + }, + { + "epoch": 10.314214463840399, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1518, + "step": 5687 + }, + { + "epoch": 10.316028111539334, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.116, + "step": 5688 + }, + { + "epoch": 10.317841759238268, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1043, + "step": 5689 + }, + { + "epoch": 10.319655406937203, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.107, + "step": 5690 + }, + { + "epoch": 10.321469054636136, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.156, + "step": 5691 + }, + { + "epoch": 10.32328270233507, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1162, + "step": 5692 + }, + { + "epoch": 10.325096350034006, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1285, + "step": 5693 + }, + { + "epoch": 10.32690999773294, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 5694 + }, + { + "epoch": 10.328723645431875, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1202, + "step": 5695 + }, + { + "epoch": 10.33053729313081, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.12, + "step": 5696 + }, + { + "epoch": 10.332350940829745, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1411, + "step": 5697 + }, + { + "epoch": 10.334164588528678, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.127, + "step": 5698 + }, + { + "epoch": 10.335978236227612, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.103, + "step": 5699 + }, + { + "epoch": 10.337791883926547, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.1109, + "step": 5700 + }, + { + "epoch": 10.339605531625482, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0928, + "step": 5701 + }, + { + "epoch": 10.341419179324417, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1169, + "step": 5702 + }, + { + "epoch": 10.343232827023352, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1055, + "step": 5703 + }, + { + "epoch": 10.345046474722285, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1234, + "step": 5704 + }, + { + "epoch": 10.34686012242122, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 5705 + }, + { + "epoch": 10.348673770120154, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1681, + "step": 5706 + }, + { + "epoch": 10.350487417819089, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1466, + "step": 5707 + }, + { + "epoch": 10.352301065518024, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.1233, + "step": 5708 + }, + { + "epoch": 10.354114713216958, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1211, + "step": 5709 + }, + { + "epoch": 10.355928360915891, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 5710 + }, + { + "epoch": 10.357742008614826, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1405, + "step": 5711 + }, + { + "epoch": 10.359555656313761, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1468, + "step": 5712 + }, + { + "epoch": 10.361369304012696, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.214, + "step": 5713 + }, + { + "epoch": 10.36318295171163, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 5714 + }, + { + "epoch": 10.364996599410565, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.199, + "step": 5715 + }, + { + "epoch": 10.366810247109498, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.2714, + "step": 5716 + }, + { + "epoch": 10.368623894808433, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2102, + "step": 5717 + }, + { + "epoch": 10.370437542507368, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1919, + "step": 5718 + }, + { + "epoch": 10.372251190206303, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.2359, + "step": 5719 + }, + { + "epoch": 10.374064837905237, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1755, + "step": 5720 + }, + { + "epoch": 10.375878485604172, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3523, + "step": 5721 + }, + { + "epoch": 10.377692133303105, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1947, + "step": 5722 + }, + { + "epoch": 10.37950578100204, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2045, + "step": 5723 + }, + { + "epoch": 10.381319428700975, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2133, + "step": 5724 + }, + { + "epoch": 10.38313307639991, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 5725 + }, + { + "epoch": 10.384946724098844, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.197, + "step": 5726 + }, + { + "epoch": 10.386760371797779, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1611, + "step": 5727 + }, + { + "epoch": 10.388574019496712, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1434, + "step": 5728 + }, + { + "epoch": 10.390387667195647, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1913, + "step": 5729 + }, + { + "epoch": 10.392201314894582, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 5730 + }, + { + "epoch": 10.394014962593516, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1479, + "step": 5731 + }, + { + "epoch": 10.395828610292451, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1273, + "step": 5732 + }, + { + "epoch": 10.397642257991386, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1529, + "step": 5733 + }, + { + "epoch": 10.399455905690319, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1462, + "step": 5734 + }, + { + "epoch": 10.401269553389254, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1443, + "step": 5735 + }, + { + "epoch": 10.403083201088188, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.174, + "step": 5736 + }, + { + "epoch": 10.404896848787123, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1696, + "step": 5737 + }, + { + "epoch": 10.406710496486058, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1205, + "step": 5738 + }, + { + "epoch": 10.408524144184993, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1078, + "step": 5739 + }, + { + "epoch": 10.410337791883926, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 5740 + }, + { + "epoch": 10.41215143958286, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.1491, + "step": 5741 + }, + { + "epoch": 10.413965087281795, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.13, + "step": 5742 + }, + { + "epoch": 10.41577873498073, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1177, + "step": 5743 + }, + { + "epoch": 10.417592382679665, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1277, + "step": 5744 + }, + { + "epoch": 10.4194060303786, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1092, + "step": 5745 + }, + { + "epoch": 10.421219678077534, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1405, + "step": 5746 + }, + { + "epoch": 10.423033325776467, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1113, + "step": 5747 + }, + { + "epoch": 10.424846973475402, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 5748 + }, + { + "epoch": 10.426660621174337, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0984, + "step": 5749 + }, + { + "epoch": 10.428474268873272, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.122, + "step": 5750 + }, + { + "epoch": 10.430287916572206, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1049, + "step": 5751 + }, + { + "epoch": 10.43210156427114, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1334, + "step": 5752 + }, + { + "epoch": 10.433915211970074, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1171, + "step": 5753 + }, + { + "epoch": 10.435728859669009, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1124, + "step": 5754 + }, + { + "epoch": 10.437542507367944, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1171, + "step": 5755 + }, + { + "epoch": 10.439356155066879, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.154, + "step": 5756 + }, + { + "epoch": 10.441169802765813, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.1165, + "step": 5757 + }, + { + "epoch": 10.442983450464748, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 5758 + }, + { + "epoch": 10.444797098163681, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.1274, + "step": 5759 + }, + { + "epoch": 10.446610745862616, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 5760 + }, + { + "epoch": 10.44842439356155, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.1558, + "step": 5761 + }, + { + "epoch": 10.450238041260485, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1686, + "step": 5762 + }, + { + "epoch": 10.45205168895942, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.1958, + "step": 5763 + }, + { + "epoch": 10.453865336658355, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.3196, + "step": 5764 + }, + { + "epoch": 10.455678984357288, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.3177, + "step": 5765 + }, + { + "epoch": 10.457492632056223, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.2335, + "step": 5766 + }, + { + "epoch": 10.459306279755157, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 5767 + }, + { + "epoch": 10.461119927454092, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.2489, + "step": 5768 + }, + { + "epoch": 10.462933575153027, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2714, + "step": 5769 + }, + { + "epoch": 10.464747222851962, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.2101, + "step": 5770 + }, + { + "epoch": 10.466560870550895, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1916, + "step": 5771 + }, + { + "epoch": 10.46837451824983, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2333, + "step": 5772 + }, + { + "epoch": 10.470188165948764, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.218, + "step": 5773 + }, + { + "epoch": 10.472001813647699, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1783, + "step": 5774 + }, + { + "epoch": 10.473815461346634, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.175, + "step": 5775 + }, + { + "epoch": 10.475629109045569, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1697, + "step": 5776 + }, + { + "epoch": 10.477442756744502, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 5777 + }, + { + "epoch": 10.479256404443436, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.14, + "step": 5778 + }, + { + "epoch": 10.481070052142371, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1544, + "step": 5779 + }, + { + "epoch": 10.482883699841306, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1386, + "step": 5780 + }, + { + "epoch": 10.48469734754024, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.18, + "step": 5781 + }, + { + "epoch": 10.486510995239176, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1589, + "step": 5782 + }, + { + "epoch": 10.488324642938109, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1736, + "step": 5783 + }, + { + "epoch": 10.490138290637043, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1649, + "step": 5784 + }, + { + "epoch": 10.491951938335978, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1601, + "step": 5785 + }, + { + "epoch": 10.493765586034913, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1376, + "step": 5786 + }, + { + "epoch": 10.495579233733848, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1437, + "step": 5787 + }, + { + "epoch": 10.497392881432782, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.1507, + "step": 5788 + }, + { + "epoch": 10.499206529131715, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.1401, + "step": 5789 + }, + { + "epoch": 10.50102017683065, + "grad_norm": 0.419921875, + "learning_rate": 0.0002, + "loss": 0.1864, + "step": 5790 + }, + { + "epoch": 10.502833824529585, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1497, + "step": 5791 + }, + { + "epoch": 10.50464747222852, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1452, + "step": 5792 + }, + { + "epoch": 10.506461119927454, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1141, + "step": 5793 + }, + { + "epoch": 10.50827476762639, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1657, + "step": 5794 + }, + { + "epoch": 10.510088415325322, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1293, + "step": 5795 + }, + { + "epoch": 10.511902063024257, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 5796 + }, + { + "epoch": 10.513715710723192, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1238, + "step": 5797 + }, + { + "epoch": 10.513715710723192, + "eval_loss": 2.1110692024230957, + "eval_runtime": 150.9863, + "eval_samples_per_second": 6.623, + "eval_steps_per_second": 6.623, + "step": 5797 + }, + { + "epoch": 10.513715710723192, + "mmlu_eval_accuracy": 0.29920312637160384, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.36363636363636365, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.5294117647058824, + "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.21739130434782608, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.36, + "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, + "mmlu_eval_accuracy_miscellaneous": 0.4186046511627907, + "mmlu_eval_accuracy_moral_disputes": 0.2631578947368421, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3333333333333333, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.29411764705882354, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 1.836123676762559, + "step": 5797 + }, + { + "epoch": 10.515529358422127, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1053, + "step": 5798 + }, + { + "epoch": 10.517343006121061, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 5799 + }, + { + "epoch": 10.519156653819996, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.1235, + "step": 5800 + }, + { + "epoch": 10.520970301518929, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.122, + "step": 5801 + }, + { + "epoch": 10.522783949217864, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 5802 + }, + { + "epoch": 10.524597596916799, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1144, + "step": 5803 + }, + { + "epoch": 10.526411244615733, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1412, + "step": 5804 + }, + { + "epoch": 10.528224892314668, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.107, + "step": 5805 + }, + { + "epoch": 10.530038540013603, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1525, + "step": 5806 + }, + { + "epoch": 10.531852187712538, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1627, + "step": 5807 + }, + { + "epoch": 10.53366583541147, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.1334, + "step": 5808 + }, + { + "epoch": 10.535479483110405, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.1504, + "step": 5809 + }, + { + "epoch": 10.53729313080934, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1522, + "step": 5810 + }, + { + "epoch": 10.539106778508275, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.1531, + "step": 5811 + }, + { + "epoch": 10.54092042620721, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1556, + "step": 5812 + }, + { + "epoch": 10.542734073906143, + "grad_norm": 0.474609375, + "learning_rate": 0.0002, + "loss": 0.2258, + "step": 5813 + }, + { + "epoch": 10.544547721605078, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.4113, + "step": 5814 + }, + { + "epoch": 10.546361369304012, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 5815 + }, + { + "epoch": 10.548175017002947, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 5816 + }, + { + "epoch": 10.549988664701882, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.2718, + "step": 5817 + }, + { + "epoch": 10.551802312400817, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1889, + "step": 5818 + }, + { + "epoch": 10.553615960099751, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.2019, + "step": 5819 + }, + { + "epoch": 10.555429607798684, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1538, + "step": 5820 + }, + { + "epoch": 10.55724325549762, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1968, + "step": 5821 + }, + { + "epoch": 10.559056903196554, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.2216, + "step": 5822 + }, + { + "epoch": 10.560870550895489, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1718, + "step": 5823 + }, + { + "epoch": 10.562684198594424, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1999, + "step": 5824 + }, + { + "epoch": 10.564497846293358, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1626, + "step": 5825 + }, + { + "epoch": 10.566311493992291, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2118, + "step": 5826 + }, + { + "epoch": 10.568125141691226, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1835, + "step": 5827 + }, + { + "epoch": 10.56993878939016, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1586, + "step": 5828 + }, + { + "epoch": 10.571752437089096, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1795, + "step": 5829 + }, + { + "epoch": 10.57356608478803, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1871, + "step": 5830 + }, + { + "epoch": 10.575379732486965, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1455, + "step": 5831 + }, + { + "epoch": 10.577193380185898, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1715, + "step": 5832 + }, + { + "epoch": 10.579007027884833, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1713, + "step": 5833 + }, + { + "epoch": 10.580820675583768, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1726, + "step": 5834 + }, + { + "epoch": 10.582634323282702, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1526, + "step": 5835 + }, + { + "epoch": 10.584447970981637, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1295, + "step": 5836 + }, + { + "epoch": 10.586261618680572, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1277, + "step": 5837 + }, + { + "epoch": 10.588075266379505, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1422, + "step": 5838 + }, + { + "epoch": 10.58988891407844, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1155, + "step": 5839 + }, + { + "epoch": 10.591702561777375, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1194, + "step": 5840 + }, + { + "epoch": 10.59351620947631, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1464, + "step": 5841 + }, + { + "epoch": 10.595329857175244, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1181, + "step": 5842 + }, + { + "epoch": 10.597143504874179, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1365, + "step": 5843 + }, + { + "epoch": 10.598957152573112, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1443, + "step": 5844 + }, + { + "epoch": 10.600770800272047, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1136, + "step": 5845 + }, + { + "epoch": 10.602584447970981, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 5846 + }, + { + "epoch": 10.604398095669916, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1559, + "step": 5847 + }, + { + "epoch": 10.606211743368851, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1061, + "step": 5848 + }, + { + "epoch": 10.608025391067786, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1044, + "step": 5849 + }, + { + "epoch": 10.609839038766719, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 5850 + }, + { + "epoch": 10.611652686465654, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1103, + "step": 5851 + }, + { + "epoch": 10.613466334164588, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 5852 + }, + { + "epoch": 10.615279981863523, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1169, + "step": 5853 + }, + { + "epoch": 10.617093629562458, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1094, + "step": 5854 + }, + { + "epoch": 10.618907277261393, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 5855 + }, + { + "epoch": 10.620720924960327, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.144, + "step": 5856 + }, + { + "epoch": 10.62253457265926, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1303, + "step": 5857 + }, + { + "epoch": 10.624348220358195, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1356, + "step": 5858 + }, + { + "epoch": 10.62616186805713, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.1304, + "step": 5859 + }, + { + "epoch": 10.627975515756065, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1524, + "step": 5860 + }, + { + "epoch": 10.629789163455, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.1394, + "step": 5861 + }, + { + "epoch": 10.631602811153932, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.1688, + "step": 5862 + }, + { + "epoch": 10.633416458852867, + "grad_norm": 0.421875, + "learning_rate": 0.0002, + "loss": 0.2003, + "step": 5863 + }, + { + "epoch": 10.635230106551802, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.2517, + "step": 5864 + }, + { + "epoch": 10.637043754250737, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.2246, + "step": 5865 + }, + { + "epoch": 10.638857401949672, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 5866 + }, + { + "epoch": 10.640671049648606, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.2197, + "step": 5867 + }, + { + "epoch": 10.642484697347541, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2017, + "step": 5868 + }, + { + "epoch": 10.644298345046474, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.3208, + "step": 5869 + }, + { + "epoch": 10.646111992745409, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 5870 + }, + { + "epoch": 10.647925640444344, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.2079, + "step": 5871 + }, + { + "epoch": 10.649739288143278, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 5872 + }, + { + "epoch": 10.651552935842213, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 5873 + }, + { + "epoch": 10.653366583541148, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 5874 + }, + { + "epoch": 10.655180231240081, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1607, + "step": 5875 + }, + { + "epoch": 10.656993878939016, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 5876 + }, + { + "epoch": 10.65880752663795, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2209, + "step": 5877 + }, + { + "epoch": 10.660621174336885, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1973, + "step": 5878 + }, + { + "epoch": 10.66243482203582, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1843, + "step": 5879 + }, + { + "epoch": 10.664248469734755, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1915, + "step": 5880 + }, + { + "epoch": 10.666062117433688, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1704, + "step": 5881 + }, + { + "epoch": 10.667875765132623, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1881, + "step": 5882 + }, + { + "epoch": 10.669689412831557, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2119, + "step": 5883 + }, + { + "epoch": 10.671503060530492, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1692, + "step": 5884 + }, + { + "epoch": 10.673316708229427, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1678, + "step": 5885 + }, + { + "epoch": 10.675130355928362, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1177, + "step": 5886 + }, + { + "epoch": 10.676944003627295, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1591, + "step": 5887 + }, + { + "epoch": 10.67875765132623, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 5888 + }, + { + "epoch": 10.680571299025164, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1398, + "step": 5889 + }, + { + "epoch": 10.682384946724099, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1337, + "step": 5890 + }, + { + "epoch": 10.684198594423034, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.1205, + "step": 5891 + }, + { + "epoch": 10.686012242121969, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1214, + "step": 5892 + }, + { + "epoch": 10.687825889820902, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1193, + "step": 5893 + }, + { + "epoch": 10.689639537519836, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1214, + "step": 5894 + }, + { + "epoch": 10.691453185218771, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1367, + "step": 5895 + }, + { + "epoch": 10.693266832917706, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.121, + "step": 5896 + }, + { + "epoch": 10.69508048061664, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1465, + "step": 5897 + }, + { + "epoch": 10.696894128315575, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1322, + "step": 5898 + }, + { + "epoch": 10.698707776014508, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.116, + "step": 5899 + }, + { + "epoch": 10.700521423713443, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1468, + "step": 5900 + }, + { + "epoch": 10.702335071412378, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1123, + "step": 5901 + }, + { + "epoch": 10.704148719111313, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1339, + "step": 5902 + }, + { + "epoch": 10.705962366810247, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1336, + "step": 5903 + }, + { + "epoch": 10.707776014509182, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1476, + "step": 5904 + }, + { + "epoch": 10.709589662208115, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1129, + "step": 5905 + }, + { + "epoch": 10.71140330990705, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 5906 + }, + { + "epoch": 10.713216957605985, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1475, + "step": 5907 + }, + { + "epoch": 10.71503060530492, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1401, + "step": 5908 + }, + { + "epoch": 10.716844253003854, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.134, + "step": 5909 + }, + { + "epoch": 10.71865790070279, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1384, + "step": 5910 + }, + { + "epoch": 10.720471548401722, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1509, + "step": 5911 + }, + { + "epoch": 10.722285196100657, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1478, + "step": 5912 + }, + { + "epoch": 10.724098843799592, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1706, + "step": 5913 + }, + { + "epoch": 10.725912491498526, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.2937, + "step": 5914 + }, + { + "epoch": 10.727726139197461, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.253, + "step": 5915 + }, + { + "epoch": 10.729539786896396, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.2157, + "step": 5916 + }, + { + "epoch": 10.73135343459533, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.229, + "step": 5917 + }, + { + "epoch": 10.733167082294264, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.2429, + "step": 5918 + }, + { + "epoch": 10.734980729993199, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.255, + "step": 5919 + }, + { + "epoch": 10.736794377692133, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.2496, + "step": 5920 + }, + { + "epoch": 10.738608025391068, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2726, + "step": 5921 + }, + { + "epoch": 10.740421673090003, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2958, + "step": 5922 + }, + { + "epoch": 10.742235320788936, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1551, + "step": 5923 + }, + { + "epoch": 10.74404896848787, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2625, + "step": 5924 + }, + { + "epoch": 10.745862616186805, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.2375, + "step": 5925 + }, + { + "epoch": 10.74767626388574, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2214, + "step": 5926 + }, + { + "epoch": 10.749489911584675, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2102, + "step": 5927 + }, + { + "epoch": 10.75130355928361, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1506, + "step": 5928 + }, + { + "epoch": 10.753117206982544, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1559, + "step": 5929 + }, + { + "epoch": 10.754930854681477, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1997, + "step": 5930 + }, + { + "epoch": 10.756744502380412, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1293, + "step": 5931 + }, + { + "epoch": 10.758558150079347, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2052, + "step": 5932 + }, + { + "epoch": 10.760371797778282, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1428, + "step": 5933 + }, + { + "epoch": 10.762185445477217, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.1887, + "step": 5934 + }, + { + "epoch": 10.763999093176151, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1429, + "step": 5935 + }, + { + "epoch": 10.765812740875084, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1597, + "step": 5936 + }, + { + "epoch": 10.767626388574019, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.145, + "step": 5937 + }, + { + "epoch": 10.769440036272954, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1395, + "step": 5938 + }, + { + "epoch": 10.771253683971889, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 5939 + }, + { + "epoch": 10.773067331670823, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.1707, + "step": 5940 + }, + { + "epoch": 10.774880979369758, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1205, + "step": 5941 + }, + { + "epoch": 10.776694627068691, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.2634, + "step": 5942 + }, + { + "epoch": 10.778508274767626, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1518, + "step": 5943 + }, + { + "epoch": 10.78032192246656, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1235, + "step": 5944 + }, + { + "epoch": 10.782135570165496, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1398, + "step": 5945 + }, + { + "epoch": 10.78394921786443, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1295, + "step": 5946 + }, + { + "epoch": 10.785762865563365, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1198, + "step": 5947 + }, + { + "epoch": 10.787576513262298, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1077, + "step": 5948 + }, + { + "epoch": 10.789390160961233, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1017, + "step": 5949 + }, + { + "epoch": 10.791203808660168, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1166, + "step": 5950 + }, + { + "epoch": 10.793017456359102, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1256, + "step": 5951 + }, + { + "epoch": 10.794831104058037, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.136, + "step": 5952 + }, + { + "epoch": 10.796644751756972, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1125, + "step": 5953 + }, + { + "epoch": 10.798458399455905, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.1508, + "step": 5954 + }, + { + "epoch": 10.80027204715484, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1129, + "step": 5955 + }, + { + "epoch": 10.802085694853774, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1173, + "step": 5956 + }, + { + "epoch": 10.80389934255271, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1307, + "step": 5957 + }, + { + "epoch": 10.805712990251644, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.1767, + "step": 5958 + }, + { + "epoch": 10.807526637950579, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.1278, + "step": 5959 + }, + { + "epoch": 10.809340285649512, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1394, + "step": 5960 + }, + { + "epoch": 10.811153933348447, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.1596, + "step": 5961 + }, + { + "epoch": 10.812967581047381, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1608, + "step": 5962 + }, + { + "epoch": 10.814781228746316, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1823, + "step": 5963 + }, + { + "epoch": 10.81659487644525, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.2523, + "step": 5964 + }, + { + "epoch": 10.818408524144186, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 5965 + }, + { + "epoch": 10.82022217184312, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1927, + "step": 5966 + }, + { + "epoch": 10.822035819542053, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.2821, + "step": 5967 + }, + { + "epoch": 10.823849467240988, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2921, + "step": 5968 + }, + { + "epoch": 10.825663114939923, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2264, + "step": 5969 + }, + { + "epoch": 10.827476762638858, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.2103, + "step": 5970 + }, + { + "epoch": 10.829290410337792, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1675, + "step": 5971 + }, + { + "epoch": 10.831104058036725, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2148, + "step": 5972 + }, + { + "epoch": 10.83291770573566, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 5973 + }, + { + "epoch": 10.834731353434595, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 5974 + }, + { + "epoch": 10.83654500113353, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1766, + "step": 5975 + }, + { + "epoch": 10.838358648832465, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1835, + "step": 5976 + }, + { + "epoch": 10.8401722965314, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.2104, + "step": 5977 + }, + { + "epoch": 10.841985944230334, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 5978 + }, + { + "epoch": 10.843799591929267, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 5979 + }, + { + "epoch": 10.845613239628202, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2201, + "step": 5980 + }, + { + "epoch": 10.847426887327137, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1939, + "step": 5981 + }, + { + "epoch": 10.849240535026071, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.2066, + "step": 5982 + }, + { + "epoch": 10.851054182725006, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1621, + "step": 5983 + }, + { + "epoch": 10.85286783042394, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1503, + "step": 5984 + }, + { + "epoch": 10.85286783042394, + "eval_loss": 2.235506296157837, + "eval_runtime": 156.4689, + "eval_samples_per_second": 6.391, + "eval_steps_per_second": 6.391, + "step": 5984 + }, + { + "epoch": 10.85286783042394, + "mmlu_eval_accuracy": 0.298891445952274, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.3, + "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.21739130434782608, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.2727272727272727, + "mmlu_eval_accuracy_marketing": 0.28, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.4186046511627907, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.0648028887929986, + "step": 5984 + }, + { + "epoch": 10.854681478122874, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1409, + "step": 5985 + }, + { + "epoch": 10.856495125821809, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1626, + "step": 5986 + }, + { + "epoch": 10.858308773520744, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1996, + "step": 5987 + }, + { + "epoch": 10.860122421219678, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1834, + "step": 5988 + }, + { + "epoch": 10.861936068918613, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.168, + "step": 5989 + }, + { + "epoch": 10.863749716617548, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.2217, + "step": 5990 + }, + { + "epoch": 10.86556336431648, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.1657, + "step": 5991 + }, + { + "epoch": 10.867377012015416, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 0.1825, + "step": 5992 + }, + { + "epoch": 10.86919065971435, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1409, + "step": 5993 + }, + { + "epoch": 10.871004307413285, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.127, + "step": 5994 + }, + { + "epoch": 10.87281795511222, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1366, + "step": 5995 + }, + { + "epoch": 10.874631602811155, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 5996 + }, + { + "epoch": 10.876445250510088, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.1471, + "step": 5997 + }, + { + "epoch": 10.878258898209022, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1453, + "step": 5998 + }, + { + "epoch": 10.880072545907957, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 5999 + }, + { + "epoch": 10.881886193606892, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1199, + "step": 6000 + }, + { + "epoch": 10.883699841305827, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1201, + "step": 6001 + }, + { + "epoch": 10.885513489004762, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1205, + "step": 6002 + }, + { + "epoch": 10.887327136703695, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1219, + "step": 6003 + }, + { + "epoch": 10.88914078440263, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1501, + "step": 6004 + }, + { + "epoch": 10.890954432101564, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1246, + "step": 6005 + }, + { + "epoch": 10.892768079800499, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1219, + "step": 6006 + }, + { + "epoch": 10.894581727499434, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1445, + "step": 6007 + }, + { + "epoch": 10.896395375198368, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1372, + "step": 6008 + }, + { + "epoch": 10.898209022897301, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1542, + "step": 6009 + }, + { + "epoch": 10.900022670596236, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.1361, + "step": 6010 + }, + { + "epoch": 10.901836318295171, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 6011 + }, + { + "epoch": 10.903649965994106, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1646, + "step": 6012 + }, + { + "epoch": 10.90546361369304, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.1825, + "step": 6013 + }, + { + "epoch": 10.907277261391975, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.2095, + "step": 6014 + }, + { + "epoch": 10.909090909090908, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.2488, + "step": 6015 + }, + { + "epoch": 10.910904556789843, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.3454, + "step": 6016 + }, + { + "epoch": 10.912718204488778, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.3246, + "step": 6017 + }, + { + "epoch": 10.914531852187713, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.2026, + "step": 6018 + }, + { + "epoch": 10.916345499886647, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2111, + "step": 6019 + }, + { + "epoch": 10.918159147585582, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.2357, + "step": 6020 + }, + { + "epoch": 10.919972795284515, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 6021 + }, + { + "epoch": 10.92178644298345, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 6022 + }, + { + "epoch": 10.923600090682385, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1866, + "step": 6023 + }, + { + "epoch": 10.92541373838132, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1859, + "step": 6024 + }, + { + "epoch": 10.927227386080254, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.2722, + "step": 6025 + }, + { + "epoch": 10.929041033779189, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1872, + "step": 6026 + }, + { + "epoch": 10.930854681478124, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1789, + "step": 6027 + }, + { + "epoch": 10.932668329177057, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1814, + "step": 6028 + }, + { + "epoch": 10.934481976875992, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1666, + "step": 6029 + }, + { + "epoch": 10.936295624574926, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1659, + "step": 6030 + }, + { + "epoch": 10.938109272273861, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1477, + "step": 6031 + }, + { + "epoch": 10.939922919972796, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.162, + "step": 6032 + }, + { + "epoch": 10.941736567671729, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1459, + "step": 6033 + }, + { + "epoch": 10.943550215370664, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1773, + "step": 6034 + }, + { + "epoch": 10.945363863069598, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1946, + "step": 6035 + }, + { + "epoch": 10.947177510768533, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1815, + "step": 6036 + }, + { + "epoch": 10.948991158467468, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1576, + "step": 6037 + }, + { + "epoch": 10.950804806166403, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1879, + "step": 6038 + }, + { + "epoch": 10.952618453865338, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1597, + "step": 6039 + }, + { + "epoch": 10.95443210156427, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1531, + "step": 6040 + }, + { + "epoch": 10.956245749263205, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1604, + "step": 6041 + }, + { + "epoch": 10.95805939696214, + "grad_norm": 0.412109375, + "learning_rate": 0.0002, + "loss": 0.1548, + "step": 6042 + }, + { + "epoch": 10.959873044661075, + "grad_norm": 0.4140625, + "learning_rate": 0.0002, + "loss": 0.1884, + "step": 6043 + }, + { + "epoch": 10.96168669236001, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1177, + "step": 6044 + }, + { + "epoch": 10.963500340058944, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1439, + "step": 6045 + }, + { + "epoch": 10.965313987757877, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 6046 + }, + { + "epoch": 10.967127635456812, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1392, + "step": 6047 + }, + { + "epoch": 10.968941283155747, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1112, + "step": 6048 + }, + { + "epoch": 10.970754930854682, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6049 + }, + { + "epoch": 10.972568578553616, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 6050 + }, + { + "epoch": 10.974382226252551, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1321, + "step": 6051 + }, + { + "epoch": 10.976195873951484, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1343, + "step": 6052 + }, + { + "epoch": 10.978009521650419, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1362, + "step": 6053 + }, + { + "epoch": 10.979823169349354, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1263, + "step": 6054 + }, + { + "epoch": 10.981636817048289, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1216, + "step": 6055 + }, + { + "epoch": 10.983450464747223, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1478, + "step": 6056 + }, + { + "epoch": 10.985264112446158, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1132, + "step": 6057 + }, + { + "epoch": 10.987077760145091, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1208, + "step": 6058 + }, + { + "epoch": 10.988891407844026, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1271, + "step": 6059 + }, + { + "epoch": 10.99070505554296, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1336, + "step": 6060 + }, + { + "epoch": 10.992518703241895, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1337, + "step": 6061 + }, + { + "epoch": 10.99433235094083, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1505, + "step": 6062 + }, + { + "epoch": 10.996145998639765, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.1525, + "step": 6063 + }, + { + "epoch": 10.997959646338698, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 6064 + }, + { + "epoch": 10.999773294037633, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1569, + "step": 6065 + }, + { + "epoch": 11.001586941736567, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.1538, + "step": 6066 + }, + { + "epoch": 11.003400589435502, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.1945, + "step": 6067 + }, + { + "epoch": 11.005214237134437, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1918, + "step": 6068 + }, + { + "epoch": 11.007027884833372, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.1146, + "step": 6069 + }, + { + "epoch": 11.008841532532305, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 6070 + }, + { + "epoch": 11.01065518023124, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1688, + "step": 6071 + }, + { + "epoch": 11.012468827930174, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 6072 + }, + { + "epoch": 11.01428247562911, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1775, + "step": 6073 + }, + { + "epoch": 11.016096123328044, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1281, + "step": 6074 + }, + { + "epoch": 11.017909771026979, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1315, + "step": 6075 + }, + { + "epoch": 11.019723418725912, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1505, + "step": 6076 + }, + { + "epoch": 11.021537066424846, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.1273, + "step": 6077 + }, + { + "epoch": 11.023350714123781, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1378, + "step": 6078 + }, + { + "epoch": 11.025164361822716, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1182, + "step": 6079 + }, + { + "epoch": 11.02697800952165, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 6080 + }, + { + "epoch": 11.028791657220586, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1128, + "step": 6081 + }, + { + "epoch": 11.030605304919519, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1257, + "step": 6082 + }, + { + "epoch": 11.032418952618453, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1251, + "step": 6083 + }, + { + "epoch": 11.034232600317388, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.105, + "step": 6084 + }, + { + "epoch": 11.036046248016323, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 6085 + }, + { + "epoch": 11.037859895715258, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1165, + "step": 6086 + }, + { + "epoch": 11.039673543414192, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1161, + "step": 6087 + }, + { + "epoch": 11.041487191113127, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0951, + "step": 6088 + }, + { + "epoch": 11.04330083881206, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1348, + "step": 6089 + }, + { + "epoch": 11.045114486510995, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 6090 + }, + { + "epoch": 11.04692813420993, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1474, + "step": 6091 + }, + { + "epoch": 11.048741781908864, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1172, + "step": 6092 + }, + { + "epoch": 11.0505554296078, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 6093 + }, + { + "epoch": 11.052369077306734, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.105, + "step": 6094 + }, + { + "epoch": 11.054182725005667, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2183, + "step": 6095 + }, + { + "epoch": 11.055996372704602, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.094, + "step": 6096 + }, + { + "epoch": 11.057810020403537, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 6097 + }, + { + "epoch": 11.059623668102471, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0914, + "step": 6098 + }, + { + "epoch": 11.061437315801406, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6099 + }, + { + "epoch": 11.06325096350034, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 6100 + }, + { + "epoch": 11.065064611199274, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 6101 + }, + { + "epoch": 11.066878258898209, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 6102 + }, + { + "epoch": 11.068691906597143, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 6103 + }, + { + "epoch": 11.070505554296078, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 6104 + }, + { + "epoch": 11.072319201995013, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0902, + "step": 6105 + }, + { + "epoch": 11.074132849693948, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 6106 + }, + { + "epoch": 11.07594649739288, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1293, + "step": 6107 + }, + { + "epoch": 11.077760145091816, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.107, + "step": 6108 + }, + { + "epoch": 11.07957379279075, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1159, + "step": 6109 + }, + { + "epoch": 11.081387440489685, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.1055, + "step": 6110 + }, + { + "epoch": 11.08320108818862, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.1032, + "step": 6111 + }, + { + "epoch": 11.085014735887555, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1357, + "step": 6112 + }, + { + "epoch": 11.086828383586488, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 0.1127, + "step": 6113 + }, + { + "epoch": 11.088642031285422, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1675, + "step": 6114 + }, + { + "epoch": 11.090455678984357, + "grad_norm": 0.65625, + "learning_rate": 0.0002, + "loss": 0.2118, + "step": 6115 + }, + { + "epoch": 11.092269326683292, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 6116 + }, + { + "epoch": 11.094082974382227, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.146, + "step": 6117 + }, + { + "epoch": 11.095896622081161, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1405, + "step": 6118 + }, + { + "epoch": 11.097710269780094, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1713, + "step": 6119 + }, + { + "epoch": 11.09952391747903, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2389, + "step": 6120 + }, + { + "epoch": 11.101337565177964, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.1157, + "step": 6121 + }, + { + "epoch": 11.103151212876899, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1504, + "step": 6122 + }, + { + "epoch": 11.104964860575834, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1529, + "step": 6123 + }, + { + "epoch": 11.106778508274768, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.1503, + "step": 6124 + }, + { + "epoch": 11.108592155973701, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1299, + "step": 6125 + }, + { + "epoch": 11.110405803672636, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2247, + "step": 6126 + }, + { + "epoch": 11.11221945137157, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.142, + "step": 6127 + }, + { + "epoch": 11.114033099070506, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1253, + "step": 6128 + }, + { + "epoch": 11.11584674676944, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1651, + "step": 6129 + }, + { + "epoch": 11.117660394468375, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1278, + "step": 6130 + }, + { + "epoch": 11.119474042167308, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1207, + "step": 6131 + }, + { + "epoch": 11.121287689866243, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1017, + "step": 6132 + }, + { + "epoch": 11.123101337565178, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 6133 + }, + { + "epoch": 11.124914985264112, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1124, + "step": 6134 + }, + { + "epoch": 11.126728632963047, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1043, + "step": 6135 + }, + { + "epoch": 11.128542280661982, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1498, + "step": 6136 + }, + { + "epoch": 11.130355928360915, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1074, + "step": 6137 + }, + { + "epoch": 11.13216957605985, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 6138 + }, + { + "epoch": 11.133983223758785, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 6139 + }, + { + "epoch": 11.13579687145772, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1005, + "step": 6140 + }, + { + "epoch": 11.137610519156654, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1002, + "step": 6141 + }, + { + "epoch": 11.139424166855589, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1011, + "step": 6142 + }, + { + "epoch": 11.141237814554522, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1068, + "step": 6143 + }, + { + "epoch": 11.143051462253457, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0979, + "step": 6144 + }, + { + "epoch": 11.144865109952391, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1097, + "step": 6145 + }, + { + "epoch": 11.146678757651326, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 6146 + }, + { + "epoch": 11.148492405350261, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1044, + "step": 6147 + }, + { + "epoch": 11.150306053049196, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 6148 + }, + { + "epoch": 11.15211970074813, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1035, + "step": 6149 + }, + { + "epoch": 11.153933348447064, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0974, + "step": 6150 + }, + { + "epoch": 11.155746996145998, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 6151 + }, + { + "epoch": 11.157560643844933, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 6152 + }, + { + "epoch": 11.159374291543868, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 6153 + }, + { + "epoch": 11.161187939242803, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 6154 + }, + { + "epoch": 11.163001586941737, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 6155 + }, + { + "epoch": 11.16481523464067, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1222, + "step": 6156 + }, + { + "epoch": 11.166628882339605, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1134, + "step": 6157 + }, + { + "epoch": 11.16844253003854, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 6158 + }, + { + "epoch": 11.170256177737475, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.119, + "step": 6159 + }, + { + "epoch": 11.17206982543641, + "grad_norm": 0.609375, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 6160 + }, + { + "epoch": 11.173883473135344, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1151, + "step": 6161 + }, + { + "epoch": 11.175697120834277, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1344, + "step": 6162 + }, + { + "epoch": 11.177510768533212, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1204, + "step": 6163 + }, + { + "epoch": 11.179324416232147, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.1602, + "step": 6164 + }, + { + "epoch": 11.181138063931082, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.1998, + "step": 6165 + }, + { + "epoch": 11.182951711630016, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1972, + "step": 6166 + }, + { + "epoch": 11.184765359328951, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.2219, + "step": 6167 + }, + { + "epoch": 11.186579007027884, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1901, + "step": 6168 + }, + { + "epoch": 11.188392654726819, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.2071, + "step": 6169 + }, + { + "epoch": 11.190206302425754, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2052, + "step": 6170 + }, + { + "epoch": 11.192019950124688, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1341, + "step": 6171 + }, + { + "epoch": 11.192019950124688, + "eval_loss": 2.2268667221069336, + "eval_runtime": 152.8129, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 6.544, + "step": 6171 + }, + { + "epoch": 11.192019950124688, + "mmlu_eval_accuracy": 0.3092034560626582, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.3, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.4, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.5348837209302325, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.23, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27058823529411763, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.8082188657496507, + "step": 6171 + }, + { + "epoch": 11.193833597823623, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1362, + "step": 6172 + }, + { + "epoch": 11.195647245522558, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1314, + "step": 6173 + }, + { + "epoch": 11.197460893221491, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1569, + "step": 6174 + }, + { + "epoch": 11.199274540920426, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1678, + "step": 6175 + }, + { + "epoch": 11.20108818861936, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.146, + "step": 6176 + }, + { + "epoch": 11.202901836318295, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1273, + "step": 6177 + }, + { + "epoch": 11.20471548401723, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.153, + "step": 6178 + }, + { + "epoch": 11.206529131716165, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1247, + "step": 6179 + }, + { + "epoch": 11.208342779415098, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1156, + "step": 6180 + }, + { + "epoch": 11.210156427114033, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1079, + "step": 6181 + }, + { + "epoch": 11.211970074812967, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1263, + "step": 6182 + }, + { + "epoch": 11.213783722511902, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 6183 + }, + { + "epoch": 11.215597370210837, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1284, + "step": 6184 + }, + { + "epoch": 11.217411017909772, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 6185 + }, + { + "epoch": 11.219224665608705, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1158, + "step": 6186 + }, + { + "epoch": 11.22103831330764, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1142, + "step": 6187 + }, + { + "epoch": 11.222851961006574, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1013, + "step": 6188 + }, + { + "epoch": 11.224665608705509, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1278, + "step": 6189 + }, + { + "epoch": 11.226479256404444, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 6190 + }, + { + "epoch": 11.228292904103379, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1105, + "step": 6191 + }, + { + "epoch": 11.230106551802312, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1227, + "step": 6192 + }, + { + "epoch": 11.231920199501246, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1178, + "step": 6193 + }, + { + "epoch": 11.233733847200181, + "grad_norm": 0.41796875, + "learning_rate": 0.0002, + "loss": 0.1131, + "step": 6194 + }, + { + "epoch": 11.235547494899116, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 6195 + }, + { + "epoch": 11.23736114259805, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 6196 + }, + { + "epoch": 11.239174790296985, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.0967, + "step": 6197 + }, + { + "epoch": 11.24098843799592, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.1027, + "step": 6198 + }, + { + "epoch": 11.242802085694853, + "grad_norm": 0.466796875, + "learning_rate": 0.0002, + "loss": 0.1052, + "step": 6199 + }, + { + "epoch": 11.244615733393788, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1189, + "step": 6200 + }, + { + "epoch": 11.246429381092723, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1123, + "step": 6201 + }, + { + "epoch": 11.248243028791657, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 6202 + }, + { + "epoch": 11.250056676490592, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0964, + "step": 6203 + }, + { + "epoch": 11.251870324189527, + "grad_norm": 0.48828125, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 6204 + }, + { + "epoch": 11.25368397188846, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1023, + "step": 6205 + }, + { + "epoch": 11.255497619587395, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0967, + "step": 6206 + }, + { + "epoch": 11.25731126728633, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.116, + "step": 6207 + }, + { + "epoch": 11.259124914985264, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 6208 + }, + { + "epoch": 11.2609385626842, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1257, + "step": 6209 + }, + { + "epoch": 11.262752210383134, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.1174, + "step": 6210 + }, + { + "epoch": 11.264565858082067, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1233, + "step": 6211 + }, + { + "epoch": 11.266379505781002, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1387, + "step": 6212 + }, + { + "epoch": 11.268193153479936, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.1402, + "step": 6213 + }, + { + "epoch": 11.270006801178871, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.1708, + "step": 6214 + }, + { + "epoch": 11.271820448877806, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.223, + "step": 6215 + }, + { + "epoch": 11.27363409657674, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.2398, + "step": 6216 + }, + { + "epoch": 11.275447744275674, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1519, + "step": 6217 + }, + { + "epoch": 11.277261391974609, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.17, + "step": 6218 + }, + { + "epoch": 11.279075039673543, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 6219 + }, + { + "epoch": 11.280888687372478, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.2084, + "step": 6220 + }, + { + "epoch": 11.282702335071413, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1896, + "step": 6221 + }, + { + "epoch": 11.284515982770348, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.144, + "step": 6222 + }, + { + "epoch": 11.28632963046928, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 6223 + }, + { + "epoch": 11.288143278168215, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.193, + "step": 6224 + }, + { + "epoch": 11.28995692586715, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1676, + "step": 6225 + }, + { + "epoch": 11.291770573566085, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.138, + "step": 6226 + }, + { + "epoch": 11.29358422126502, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1378, + "step": 6227 + }, + { + "epoch": 11.295397868963954, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1434, + "step": 6228 + }, + { + "epoch": 11.297211516662887, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1623, + "step": 6229 + }, + { + "epoch": 11.299025164361822, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.149, + "step": 6230 + }, + { + "epoch": 11.300838812060757, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 6231 + }, + { + "epoch": 11.302652459759692, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 6232 + }, + { + "epoch": 11.304466107458627, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1598, + "step": 6233 + }, + { + "epoch": 11.306279755157561, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1458, + "step": 6234 + }, + { + "epoch": 11.308093402856494, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1128, + "step": 6235 + }, + { + "epoch": 11.309907050555429, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1429, + "step": 6236 + }, + { + "epoch": 11.311720698254364, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1183, + "step": 6237 + }, + { + "epoch": 11.313534345953299, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1239, + "step": 6238 + }, + { + "epoch": 11.315347993652233, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1054, + "step": 6239 + }, + { + "epoch": 11.317161641351168, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0958, + "step": 6240 + }, + { + "epoch": 11.318975289050101, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0905, + "step": 6241 + }, + { + "epoch": 11.320788936749036, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1146, + "step": 6242 + }, + { + "epoch": 11.32260258444797, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 6243 + }, + { + "epoch": 11.324416232146906, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0919, + "step": 6244 + }, + { + "epoch": 11.32622987984584, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0994, + "step": 6245 + }, + { + "epoch": 11.328043527544775, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 6246 + }, + { + "epoch": 11.329857175243708, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0982, + "step": 6247 + }, + { + "epoch": 11.331670822942643, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 6248 + }, + { + "epoch": 11.333484470641578, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 6249 + }, + { + "epoch": 11.335298118340512, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1053, + "step": 6250 + }, + { + "epoch": 11.337111766039447, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 6251 + }, + { + "epoch": 11.338925413738382, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1121, + "step": 6252 + }, + { + "epoch": 11.340739061437315, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1126, + "step": 6253 + }, + { + "epoch": 11.34255270913625, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1087, + "step": 6254 + }, + { + "epoch": 11.344366356835184, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 6255 + }, + { + "epoch": 11.34618000453412, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6256 + }, + { + "epoch": 11.347993652233054, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.12, + "step": 6257 + }, + { + "epoch": 11.349807299931989, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 6258 + }, + { + "epoch": 11.351620947630924, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1108, + "step": 6259 + }, + { + "epoch": 11.353434595329857, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 6260 + }, + { + "epoch": 11.355248243028791, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.1123, + "step": 6261 + }, + { + "epoch": 11.357061890727726, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.1404, + "step": 6262 + }, + { + "epoch": 11.35887553842666, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 0.143, + "step": 6263 + }, + { + "epoch": 11.360689186125596, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.1489, + "step": 6264 + }, + { + "epoch": 11.36250283382453, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.278, + "step": 6265 + }, + { + "epoch": 11.364316481523463, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.2767, + "step": 6266 + }, + { + "epoch": 11.366130129222398, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.2078, + "step": 6267 + }, + { + "epoch": 11.367943776921333, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.2089, + "step": 6268 + }, + { + "epoch": 11.369757424620268, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.2219, + "step": 6269 + }, + { + "epoch": 11.371571072319203, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 6270 + }, + { + "epoch": 11.373384720018137, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.1258, + "step": 6271 + }, + { + "epoch": 11.37519836771707, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.2101, + "step": 6272 + }, + { + "epoch": 11.377012015416005, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1895, + "step": 6273 + }, + { + "epoch": 11.37882566311494, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1603, + "step": 6274 + }, + { + "epoch": 11.380639310813875, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2001, + "step": 6275 + }, + { + "epoch": 11.38245295851281, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1363, + "step": 6276 + }, + { + "epoch": 11.384266606211744, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1536, + "step": 6277 + }, + { + "epoch": 11.386080253910677, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1673, + "step": 6278 + }, + { + "epoch": 11.387893901609612, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1251, + "step": 6279 + }, + { + "epoch": 11.389707549308547, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1209, + "step": 6280 + }, + { + "epoch": 11.391521197007481, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1746, + "step": 6281 + }, + { + "epoch": 11.393334844706416, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1161, + "step": 6282 + }, + { + "epoch": 11.395148492405351, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1447, + "step": 6283 + }, + { + "epoch": 11.396962140104284, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1294, + "step": 6284 + }, + { + "epoch": 11.398775787803219, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1166, + "step": 6285 + }, + { + "epoch": 11.400589435502154, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 6286 + }, + { + "epoch": 11.402403083201088, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 6287 + }, + { + "epoch": 11.404216730900023, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1206, + "step": 6288 + }, + { + "epoch": 11.406030378598958, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1238, + "step": 6289 + }, + { + "epoch": 11.40784402629789, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1007, + "step": 6290 + }, + { + "epoch": 11.409657673996826, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1102, + "step": 6291 + }, + { + "epoch": 11.41147132169576, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1153, + "step": 6292 + }, + { + "epoch": 11.413284969394695, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 6293 + }, + { + "epoch": 11.41509861709363, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1128, + "step": 6294 + }, + { + "epoch": 11.416912264792565, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0995, + "step": 6295 + }, + { + "epoch": 11.418725912491498, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1143, + "step": 6296 + }, + { + "epoch": 11.420539560190432, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.0989, + "step": 6297 + }, + { + "epoch": 11.422353207889367, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 6298 + }, + { + "epoch": 11.424166855588302, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1182, + "step": 6299 + }, + { + "epoch": 11.425980503287237, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1032, + "step": 6300 + }, + { + "epoch": 11.427794150986172, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1121, + "step": 6301 + }, + { + "epoch": 11.429607798685105, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 6302 + }, + { + "epoch": 11.43142144638404, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1208, + "step": 6303 + }, + { + "epoch": 11.433235094082974, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 6304 + }, + { + "epoch": 11.435048741781909, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 6305 + }, + { + "epoch": 11.436862389480844, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1005, + "step": 6306 + }, + { + "epoch": 11.438676037179778, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1545, + "step": 6307 + }, + { + "epoch": 11.440489684878713, + "grad_norm": 0.43359375, + "learning_rate": 0.0002, + "loss": 0.1238, + "step": 6308 + }, + { + "epoch": 11.442303332577646, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.1232, + "step": 6309 + }, + { + "epoch": 11.444116980276581, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.1206, + "step": 6310 + }, + { + "epoch": 11.445930627975516, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1198, + "step": 6311 + }, + { + "epoch": 11.44774427567445, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1346, + "step": 6312 + }, + { + "epoch": 11.449557923373385, + "grad_norm": 0.43359375, + "learning_rate": 0.0002, + "loss": 0.1427, + "step": 6313 + }, + { + "epoch": 11.451371571072318, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.156, + "step": 6314 + }, + { + "epoch": 11.453185218771253, + "grad_norm": 0.12890625, + "learning_rate": 0.0002, + "loss": 0.1921, + "step": 6315 + }, + { + "epoch": 11.454998866470188, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2237, + "step": 6316 + }, + { + "epoch": 11.456812514169123, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 6317 + }, + { + "epoch": 11.458626161868057, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.2073, + "step": 6318 + }, + { + "epoch": 11.460439809566992, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1458, + "step": 6319 + }, + { + "epoch": 11.462253457265927, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1741, + "step": 6320 + }, + { + "epoch": 11.46406710496486, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1625, + "step": 6321 + }, + { + "epoch": 11.465880752663795, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.228, + "step": 6322 + }, + { + "epoch": 11.46769440036273, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.1343, + "step": 6323 + }, + { + "epoch": 11.469508048061664, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.2014, + "step": 6324 + }, + { + "epoch": 11.471321695760599, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1702, + "step": 6325 + }, + { + "epoch": 11.473135343459534, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.159, + "step": 6326 + }, + { + "epoch": 11.474948991158467, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1336, + "step": 6327 + }, + { + "epoch": 11.476762638857402, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1715, + "step": 6328 + }, + { + "epoch": 11.478576286556336, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1525, + "step": 6329 + }, + { + "epoch": 11.480389934255271, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1003, + "step": 6330 + }, + { + "epoch": 11.482203581954206, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1206, + "step": 6331 + }, + { + "epoch": 11.48401722965314, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1441, + "step": 6332 + }, + { + "epoch": 11.485830877352074, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 6333 + }, + { + "epoch": 11.487644525051008, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1186, + "step": 6334 + }, + { + "epoch": 11.489458172749943, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1231, + "step": 6335 + }, + { + "epoch": 11.491271820448878, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 6336 + }, + { + "epoch": 11.493085468147813, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1129, + "step": 6337 + }, + { + "epoch": 11.494899115846748, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1194, + "step": 6338 + }, + { + "epoch": 11.49671276354568, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 6339 + }, + { + "epoch": 11.498526411244615, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1222, + "step": 6340 + }, + { + "epoch": 11.50034005894355, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.112, + "step": 6341 + }, + { + "epoch": 11.502153706642485, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 6342 + }, + { + "epoch": 11.50396735434142, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.1463, + "step": 6343 + }, + { + "epoch": 11.505781002040354, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 6344 + }, + { + "epoch": 11.507594649739287, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 6345 + }, + { + "epoch": 11.509408297438222, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1146, + "step": 6346 + }, + { + "epoch": 11.511221945137157, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 6347 + }, + { + "epoch": 11.513035592836092, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 6348 + }, + { + "epoch": 11.514849240535026, + "grad_norm": 0.42578125, + "learning_rate": 0.0002, + "loss": 0.1258, + "step": 6349 + }, + { + "epoch": 11.516662888233961, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 6350 + }, + { + "epoch": 11.518476535932894, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0925, + "step": 6351 + }, + { + "epoch": 11.520290183631829, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 6352 + }, + { + "epoch": 11.522103831330764, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.094, + "step": 6353 + }, + { + "epoch": 11.523917479029699, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 6354 + }, + { + "epoch": 11.525731126728633, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 6355 + }, + { + "epoch": 11.527544774427568, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 6356 + }, + { + "epoch": 11.529358422126503, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 6357 + }, + { + "epoch": 11.531172069825436, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1374, + "step": 6358 + }, + { + "epoch": 11.531172069825436, + "eval_loss": 2.2021710872650146, + "eval_runtime": 152.045, + "eval_samples_per_second": 6.577, + "eval_steps_per_second": 6.577, + "step": 6358 + }, + { + "epoch": 11.531172069825436, + "mmlu_eval_accuracy": 0.30812275257075866, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.5, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_geography": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, + "mmlu_eval_accuracy_high_school_psychology": 0.3, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3333333333333333, + "mmlu_eval_accuracy_philosophy": 0.4411764705882353, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.1935483870967742, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.2475070487744353, + "step": 6358 + }, + { + "epoch": 11.53298571752437, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.1285, + "step": 6359 + }, + { + "epoch": 11.534799365223305, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1199, + "step": 6360 + }, + { + "epoch": 11.53661301292224, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.1258, + "step": 6361 + }, + { + "epoch": 11.538426660621175, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.1512, + "step": 6362 + }, + { + "epoch": 11.540240308320108, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1485, + "step": 6363 + }, + { + "epoch": 11.542053956019043, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1831, + "step": 6364 + }, + { + "epoch": 11.543867603717977, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.2353, + "step": 6365 + }, + { + "epoch": 11.545681251416912, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.186, + "step": 6366 + }, + { + "epoch": 11.547494899115847, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.175, + "step": 6367 + }, + { + "epoch": 11.549308546814782, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1905, + "step": 6368 + }, + { + "epoch": 11.551122194513717, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.2346, + "step": 6369 + }, + { + "epoch": 11.55293584221265, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2136, + "step": 6370 + }, + { + "epoch": 11.554749489911584, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2088, + "step": 6371 + }, + { + "epoch": 11.55656313761052, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1791, + "step": 6372 + }, + { + "epoch": 11.558376785309454, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.165, + "step": 6373 + }, + { + "epoch": 11.560190433008389, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1508, + "step": 6374 + }, + { + "epoch": 11.562004080707322, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1522, + "step": 6375 + }, + { + "epoch": 11.563817728406256, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1343, + "step": 6376 + }, + { + "epoch": 11.565631376105191, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1358, + "step": 6377 + }, + { + "epoch": 11.567445023804126, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1271, + "step": 6378 + }, + { + "epoch": 11.56925867150306, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1409, + "step": 6379 + }, + { + "epoch": 11.571072319201996, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1283, + "step": 6380 + }, + { + "epoch": 11.57288596690093, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1442, + "step": 6381 + }, + { + "epoch": 11.574699614599863, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 6382 + }, + { + "epoch": 11.576513262298798, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.123, + "step": 6383 + }, + { + "epoch": 11.578326909997733, + "grad_norm": 1.0546875, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 6384 + }, + { + "epoch": 11.580140557696668, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1116, + "step": 6385 + }, + { + "epoch": 11.581954205395602, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1185, + "step": 6386 + }, + { + "epoch": 11.583767853094537, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1115, + "step": 6387 + }, + { + "epoch": 11.58558150079347, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0987, + "step": 6388 + }, + { + "epoch": 11.587395148492405, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1171, + "step": 6389 + }, + { + "epoch": 11.58920879619134, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1038, + "step": 6390 + }, + { + "epoch": 11.591022443890274, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1261, + "step": 6391 + }, + { + "epoch": 11.59283609158921, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 6392 + }, + { + "epoch": 11.594649739288144, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.1025, + "step": 6393 + }, + { + "epoch": 11.596463386987077, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.104, + "step": 6394 + }, + { + "epoch": 11.598277034686012, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1057, + "step": 6395 + }, + { + "epoch": 11.600090682384947, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 6396 + }, + { + "epoch": 11.601904330083881, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1109, + "step": 6397 + }, + { + "epoch": 11.603717977782816, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1086, + "step": 6398 + }, + { + "epoch": 11.60553162548175, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 6399 + }, + { + "epoch": 11.607345273180684, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 6400 + }, + { + "epoch": 11.609158920879619, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 6401 + }, + { + "epoch": 11.610972568578553, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1063, + "step": 6402 + }, + { + "epoch": 11.612786216277488, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1508, + "step": 6403 + }, + { + "epoch": 11.614599863976423, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 6404 + }, + { + "epoch": 11.616413511675358, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1185, + "step": 6405 + }, + { + "epoch": 11.61822715937429, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 6406 + }, + { + "epoch": 11.620040807073226, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1224, + "step": 6407 + }, + { + "epoch": 11.62185445477216, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1442, + "step": 6408 + }, + { + "epoch": 11.623668102471095, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 6409 + }, + { + "epoch": 11.62548175017003, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 6410 + }, + { + "epoch": 11.627295397868965, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.122, + "step": 6411 + }, + { + "epoch": 11.629109045567898, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1339, + "step": 6412 + }, + { + "epoch": 11.630922693266832, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.1527, + "step": 6413 + }, + { + "epoch": 11.632736340965767, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.1933, + "step": 6414 + }, + { + "epoch": 11.634549988664702, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2198, + "step": 6415 + }, + { + "epoch": 11.636363636363637, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.172, + "step": 6416 + }, + { + "epoch": 11.638177284062571, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.2069, + "step": 6417 + }, + { + "epoch": 11.639990931761506, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1542, + "step": 6418 + }, + { + "epoch": 11.64180457946044, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1782, + "step": 6419 + }, + { + "epoch": 11.643618227159374, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1721, + "step": 6420 + }, + { + "epoch": 11.645431874858309, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1972, + "step": 6421 + }, + { + "epoch": 11.647245522557244, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1691, + "step": 6422 + }, + { + "epoch": 11.649059170256178, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.2211, + "step": 6423 + }, + { + "epoch": 11.650872817955111, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1548, + "step": 6424 + }, + { + "epoch": 11.652686465654046, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1448, + "step": 6425 + }, + { + "epoch": 11.65450011335298, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1917, + "step": 6426 + }, + { + "epoch": 11.656313761051916, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1681, + "step": 6427 + }, + { + "epoch": 11.65812740875085, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1412, + "step": 6428 + }, + { + "epoch": 11.659941056449785, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1661, + "step": 6429 + }, + { + "epoch": 11.66175470414872, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1538, + "step": 6430 + }, + { + "epoch": 11.663568351847653, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1144, + "step": 6431 + }, + { + "epoch": 11.665381999546588, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 6432 + }, + { + "epoch": 11.667195647245522, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1314, + "step": 6433 + }, + { + "epoch": 11.669009294944457, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.117, + "step": 6434 + }, + { + "epoch": 11.670822942643392, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1394, + "step": 6435 + }, + { + "epoch": 11.672636590342327, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 6436 + }, + { + "epoch": 11.67445023804126, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1165, + "step": 6437 + }, + { + "epoch": 11.676263885740195, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1272, + "step": 6438 + }, + { + "epoch": 11.67807753343913, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1699, + "step": 6439 + }, + { + "epoch": 11.679891181138064, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1332, + "step": 6440 + }, + { + "epoch": 11.681704828836999, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 6441 + }, + { + "epoch": 11.683518476535934, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1054, + "step": 6442 + }, + { + "epoch": 11.685332124234867, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 6443 + }, + { + "epoch": 11.687145771933801, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1276, + "step": 6444 + }, + { + "epoch": 11.688959419632736, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1098, + "step": 6445 + }, + { + "epoch": 11.690773067331671, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1331, + "step": 6446 + }, + { + "epoch": 11.692586715030606, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1076, + "step": 6447 + }, + { + "epoch": 11.69440036272954, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1169, + "step": 6448 + }, + { + "epoch": 11.696214010428474, + "grad_norm": 0.5546875, + "learning_rate": 0.0002, + "loss": 0.1154, + "step": 6449 + }, + { + "epoch": 11.698027658127408, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1176, + "step": 6450 + }, + { + "epoch": 11.699841305826343, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1066, + "step": 6451 + }, + { + "epoch": 11.701654953525278, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 6452 + }, + { + "epoch": 11.703468601224213, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 6453 + }, + { + "epoch": 11.705282248923147, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1077, + "step": 6454 + }, + { + "epoch": 11.70709589662208, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.122, + "step": 6455 + }, + { + "epoch": 11.708909544321015, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0928, + "step": 6456 + }, + { + "epoch": 11.71072319201995, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1134, + "step": 6457 + }, + { + "epoch": 11.712536839718885, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1261, + "step": 6458 + }, + { + "epoch": 11.71435048741782, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1384, + "step": 6459 + }, + { + "epoch": 11.716164135116754, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.139, + "step": 6460 + }, + { + "epoch": 11.717977782815687, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1266, + "step": 6461 + }, + { + "epoch": 11.719791430514622, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1311, + "step": 6462 + }, + { + "epoch": 11.721605078213557, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1436, + "step": 6463 + }, + { + "epoch": 11.723418725912492, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1563, + "step": 6464 + }, + { + "epoch": 11.725232373611426, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.2261, + "step": 6465 + }, + { + "epoch": 11.727046021310361, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.2405, + "step": 6466 + }, + { + "epoch": 11.728859669009296, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1547, + "step": 6467 + }, + { + "epoch": 11.730673316708229, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.2179, + "step": 6468 + }, + { + "epoch": 11.732486964407164, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.183, + "step": 6469 + }, + { + "epoch": 11.734300612106098, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1878, + "step": 6470 + }, + { + "epoch": 11.736114259805033, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1792, + "step": 6471 + }, + { + "epoch": 11.737927907503968, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.217, + "step": 6472 + }, + { + "epoch": 11.739741555202901, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1597, + "step": 6473 + }, + { + "epoch": 11.741555202901836, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.2019, + "step": 6474 + }, + { + "epoch": 11.74336885060077, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1327, + "step": 6475 + }, + { + "epoch": 11.745182498299705, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1719, + "step": 6476 + }, + { + "epoch": 11.74699614599864, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1712, + "step": 6477 + }, + { + "epoch": 11.748809793697575, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.148, + "step": 6478 + }, + { + "epoch": 11.75062344139651, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1453, + "step": 6479 + }, + { + "epoch": 11.752437089095443, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1967, + "step": 6480 + }, + { + "epoch": 11.754250736794377, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1516, + "step": 6481 + }, + { + "epoch": 11.756064384493312, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.157, + "step": 6482 + }, + { + "epoch": 11.757878032192247, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1222, + "step": 6483 + }, + { + "epoch": 11.759691679891182, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1451, + "step": 6484 + }, + { + "epoch": 11.761505327590115, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1047, + "step": 6485 + }, + { + "epoch": 11.76331897528905, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 6486 + }, + { + "epoch": 11.765132622987984, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1248, + "step": 6487 + }, + { + "epoch": 11.766946270686919, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1076, + "step": 6488 + }, + { + "epoch": 11.768759918385854, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1548, + "step": 6489 + }, + { + "epoch": 11.770573566084789, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1193, + "step": 6490 + }, + { + "epoch": 11.772387213783723, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1142, + "step": 6491 + }, + { + "epoch": 11.774200861482656, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1034, + "step": 6492 + }, + { + "epoch": 11.776014509181591, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1059, + "step": 6493 + }, + { + "epoch": 11.777828156880526, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1118, + "step": 6494 + }, + { + "epoch": 11.77964180457946, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.1145, + "step": 6495 + }, + { + "epoch": 11.781455452278395, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 6496 + }, + { + "epoch": 11.78326909997733, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1163, + "step": 6497 + }, + { + "epoch": 11.785082747676263, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1082, + "step": 6498 + }, + { + "epoch": 11.786896395375198, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.1195, + "step": 6499 + }, + { + "epoch": 11.788710043074133, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0995, + "step": 6500 + }, + { + "epoch": 11.790523690773068, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0987, + "step": 6501 + }, + { + "epoch": 11.792337338472002, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1023, + "step": 6502 + }, + { + "epoch": 11.794150986170937, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1003, + "step": 6503 + }, + { + "epoch": 11.79596463386987, + "grad_norm": 0.53125, + "learning_rate": 0.0002, + "loss": 0.1754, + "step": 6504 + }, + { + "epoch": 11.797778281568805, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.1283, + "step": 6505 + }, + { + "epoch": 11.79959192926774, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1162, + "step": 6506 + }, + { + "epoch": 11.801405576966674, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1172, + "step": 6507 + }, + { + "epoch": 11.80321922466561, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1246, + "step": 6508 + }, + { + "epoch": 11.805032872364544, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 6509 + }, + { + "epoch": 11.806846520063477, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1254, + "step": 6510 + }, + { + "epoch": 11.808660167762412, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1234, + "step": 6511 + }, + { + "epoch": 11.810473815461346, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1284, + "step": 6512 + }, + { + "epoch": 11.812287463160281, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.1358, + "step": 6513 + }, + { + "epoch": 11.814101110859216, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.1483, + "step": 6514 + }, + { + "epoch": 11.81591475855815, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.2081, + "step": 6515 + }, + { + "epoch": 11.817728406257084, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3212, + "step": 6516 + }, + { + "epoch": 11.819542053956019, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2201, + "step": 6517 + }, + { + "epoch": 11.821355701654953, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.2302, + "step": 6518 + }, + { + "epoch": 11.823169349353888, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2134, + "step": 6519 + }, + { + "epoch": 11.824982997052823, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.2065, + "step": 6520 + }, + { + "epoch": 11.826796644751758, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.2049, + "step": 6521 + }, + { + "epoch": 11.82861029245069, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1558, + "step": 6522 + }, + { + "epoch": 11.830423940149625, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1759, + "step": 6523 + }, + { + "epoch": 11.83223758784856, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1342, + "step": 6524 + }, + { + "epoch": 11.834051235547495, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1304, + "step": 6525 + }, + { + "epoch": 11.83586488324643, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1394, + "step": 6526 + }, + { + "epoch": 11.837678530945364, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1893, + "step": 6527 + }, + { + "epoch": 11.8394921786443, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1374, + "step": 6528 + }, + { + "epoch": 11.841305826343232, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1884, + "step": 6529 + }, + { + "epoch": 11.843119474042167, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1293, + "step": 6530 + }, + { + "epoch": 11.844933121741102, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1363, + "step": 6531 + }, + { + "epoch": 11.846746769440037, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1455, + "step": 6532 + }, + { + "epoch": 11.848560417138971, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1563, + "step": 6533 + }, + { + "epoch": 11.850374064837904, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1223, + "step": 6534 + }, + { + "epoch": 11.85218771253684, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1468, + "step": 6535 + }, + { + "epoch": 11.854001360235774, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.128, + "step": 6536 + }, + { + "epoch": 11.855815007934709, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1135, + "step": 6537 + }, + { + "epoch": 11.857628655633643, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1366, + "step": 6538 + }, + { + "epoch": 11.859442303332578, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1118, + "step": 6539 + }, + { + "epoch": 11.861255951031513, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1371, + "step": 6540 + }, + { + "epoch": 11.863069598730446, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 6541 + }, + { + "epoch": 11.86488324642938, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1356, + "step": 6542 + }, + { + "epoch": 11.866696894128316, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1059, + "step": 6543 + }, + { + "epoch": 11.86851054182725, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1371, + "step": 6544 + }, + { + "epoch": 11.870324189526185, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1162, + "step": 6545 + }, + { + "epoch": 11.870324189526185, + "eval_loss": 2.3055381774902344, + "eval_runtime": 152.4381, + "eval_samples_per_second": 6.56, + "eval_steps_per_second": 6.56, + "step": 6545 + }, + { + "epoch": 11.870324189526185, + "mmlu_eval_accuracy": 0.30722955283456416, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.25, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.32, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.2631578947368421, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.1935483870967742, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.1817706773905563, + "step": 6545 + }, + { + "epoch": 11.87213783722512, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1024, + "step": 6546 + }, + { + "epoch": 11.873951484924053, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.119, + "step": 6547 + }, + { + "epoch": 11.875765132622988, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1049, + "step": 6548 + }, + { + "epoch": 11.877578780321922, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 6549 + }, + { + "epoch": 11.879392428020857, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 6550 + }, + { + "epoch": 11.881206075719792, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1002, + "step": 6551 + }, + { + "epoch": 11.883019723418727, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1017, + "step": 6552 + }, + { + "epoch": 11.88483337111766, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1095, + "step": 6553 + }, + { + "epoch": 11.886647018816594, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.12, + "step": 6554 + }, + { + "epoch": 11.88846066651553, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1142, + "step": 6555 + }, + { + "epoch": 11.890274314214464, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1138, + "step": 6556 + }, + { + "epoch": 11.892087961913399, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1116, + "step": 6557 + }, + { + "epoch": 11.893901609612334, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1215, + "step": 6558 + }, + { + "epoch": 11.895715257311267, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1155, + "step": 6559 + }, + { + "epoch": 11.897528905010201, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1359, + "step": 6560 + }, + { + "epoch": 11.899342552709136, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1243, + "step": 6561 + }, + { + "epoch": 11.90115620040807, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1564, + "step": 6562 + }, + { + "epoch": 11.902969848107006, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1535, + "step": 6563 + }, + { + "epoch": 11.90478349580594, + "grad_norm": 0.41796875, + "learning_rate": 0.0002, + "loss": 0.1662, + "step": 6564 + }, + { + "epoch": 11.906597143504873, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.2157, + "step": 6565 + }, + { + "epoch": 11.908410791203808, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 6566 + }, + { + "epoch": 11.910224438902743, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 6567 + }, + { + "epoch": 11.912038086601678, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1469, + "step": 6568 + }, + { + "epoch": 11.913851734300613, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.2171, + "step": 6569 + }, + { + "epoch": 11.915665381999547, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 6570 + }, + { + "epoch": 11.91747902969848, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1752, + "step": 6571 + }, + { + "epoch": 11.919292677397415, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1808, + "step": 6572 + }, + { + "epoch": 11.92110632509635, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.2053, + "step": 6573 + }, + { + "epoch": 11.922919972795285, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1874, + "step": 6574 + }, + { + "epoch": 11.92473362049422, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.14, + "step": 6575 + }, + { + "epoch": 11.926547268193154, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1473, + "step": 6576 + }, + { + "epoch": 11.928360915892087, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1594, + "step": 6577 + }, + { + "epoch": 11.930174563591022, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.2042, + "step": 6578 + }, + { + "epoch": 11.931988211289957, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1519, + "step": 6579 + }, + { + "epoch": 11.933801858988891, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1518, + "step": 6580 + }, + { + "epoch": 11.935615506687826, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1262, + "step": 6581 + }, + { + "epoch": 11.937429154386761, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1341, + "step": 6582 + }, + { + "epoch": 11.939242802085694, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1337, + "step": 6583 + }, + { + "epoch": 11.941056449784629, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 6584 + }, + { + "epoch": 11.942870097483564, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 6585 + }, + { + "epoch": 11.944683745182498, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.152, + "step": 6586 + }, + { + "epoch": 11.946497392881433, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 6587 + }, + { + "epoch": 11.948311040580368, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1024, + "step": 6588 + }, + { + "epoch": 11.950124688279303, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 6589 + }, + { + "epoch": 11.951938335978236, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1131, + "step": 6590 + }, + { + "epoch": 11.95375198367717, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1183, + "step": 6591 + }, + { + "epoch": 11.955565631376105, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 6592 + }, + { + "epoch": 11.95737927907504, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 6593 + }, + { + "epoch": 11.959192926773975, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 6594 + }, + { + "epoch": 11.961006574472908, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 6595 + }, + { + "epoch": 11.962820222171842, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.096, + "step": 6596 + }, + { + "epoch": 11.964633869870777, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.118, + "step": 6597 + }, + { + "epoch": 11.966447517569712, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1048, + "step": 6598 + }, + { + "epoch": 11.968261165268647, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0894, + "step": 6599 + }, + { + "epoch": 11.970074812967582, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.1104, + "step": 6600 + }, + { + "epoch": 11.971888460666516, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1266, + "step": 6601 + }, + { + "epoch": 11.97370210836545, + "grad_norm": 0.388671875, + "learning_rate": 0.0002, + "loss": 0.1362, + "step": 6602 + }, + { + "epoch": 11.975515756064384, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.1214, + "step": 6603 + }, + { + "epoch": 11.977329403763319, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.1113, + "step": 6604 + }, + { + "epoch": 11.979143051462254, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1167, + "step": 6605 + }, + { + "epoch": 11.980956699161188, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1135, + "step": 6606 + }, + { + "epoch": 11.982770346860123, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1144, + "step": 6607 + }, + { + "epoch": 11.984583994559056, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1268, + "step": 6608 + }, + { + "epoch": 11.986397642257991, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1328, + "step": 6609 + }, + { + "epoch": 11.988211289956926, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.1381, + "step": 6610 + }, + { + "epoch": 11.99002493765586, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 6611 + }, + { + "epoch": 11.991838585354795, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 6612 + }, + { + "epoch": 11.99365223305373, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1406, + "step": 6613 + }, + { + "epoch": 11.995465880752663, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.188, + "step": 6614 + }, + { + "epoch": 11.997279528451598, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 6615 + }, + { + "epoch": 11.999093176150533, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1654, + "step": 6616 + }, + { + "epoch": 12.000906823849467, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 6617 + }, + { + "epoch": 12.002720471548402, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1184, + "step": 6618 + }, + { + "epoch": 12.004534119247337, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1413, + "step": 6619 + }, + { + "epoch": 12.00634776694627, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.1284, + "step": 6620 + }, + { + "epoch": 12.008161414645205, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.1121, + "step": 6621 + }, + { + "epoch": 12.00997506234414, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.1131, + "step": 6622 + }, + { + "epoch": 12.011788710043074, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0983, + "step": 6623 + }, + { + "epoch": 12.013602357742009, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 6624 + }, + { + "epoch": 12.015416005440944, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.1204, + "step": 6625 + }, + { + "epoch": 12.017229653139877, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0999, + "step": 6626 + }, + { + "epoch": 12.019043300838812, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 6627 + }, + { + "epoch": 12.020856948537746, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1024, + "step": 6628 + }, + { + "epoch": 12.022670596236681, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 6629 + }, + { + "epoch": 12.024484243935616, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 6630 + }, + { + "epoch": 12.02629789163455, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 6631 + }, + { + "epoch": 12.028111539333484, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 6632 + }, + { + "epoch": 12.029925187032418, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 6633 + }, + { + "epoch": 12.031738834731353, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 6634 + }, + { + "epoch": 12.033552482430288, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 6635 + }, + { + "epoch": 12.035366130129223, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0913, + "step": 6636 + }, + { + "epoch": 12.037179777828158, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0954, + "step": 6637 + }, + { + "epoch": 12.03899342552709, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0988, + "step": 6638 + }, + { + "epoch": 12.040807073226025, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 6639 + }, + { + "epoch": 12.04262072092496, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.1053, + "step": 6640 + }, + { + "epoch": 12.044434368623895, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1012, + "step": 6641 + }, + { + "epoch": 12.04624801632283, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 6642 + }, + { + "epoch": 12.048061664021764, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 6643 + }, + { + "epoch": 12.049875311720697, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.105, + "step": 6644 + }, + { + "epoch": 12.051688959419632, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 6645 + }, + { + "epoch": 12.053502607118567, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 6646 + }, + { + "epoch": 12.055316254817502, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 6647 + }, + { + "epoch": 12.057129902516436, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 6648 + }, + { + "epoch": 12.058943550215371, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.0942, + "step": 6649 + }, + { + "epoch": 12.060757197914306, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.0926, + "step": 6650 + }, + { + "epoch": 12.062570845613239, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.0965, + "step": 6651 + }, + { + "epoch": 12.064384493312174, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0659, + "step": 6652 + }, + { + "epoch": 12.066198141011109, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 6653 + }, + { + "epoch": 12.068011788710043, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 6654 + }, + { + "epoch": 12.069825436408978, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 6655 + }, + { + "epoch": 12.071639084107913, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 6656 + }, + { + "epoch": 12.073452731806846, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 6657 + }, + { + "epoch": 12.07526637950578, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.0944, + "step": 6658 + }, + { + "epoch": 12.077080027204715, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0958, + "step": 6659 + }, + { + "epoch": 12.07889367490365, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1048, + "step": 6660 + }, + { + "epoch": 12.080707322602585, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6661 + }, + { + "epoch": 12.08252097030152, + "grad_norm": 0.474609375, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 6662 + }, + { + "epoch": 12.084334618000453, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1073, + "step": 6663 + }, + { + "epoch": 12.086148265699387, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002, + "loss": 0.1158, + "step": 6664 + }, + { + "epoch": 12.087961913398322, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.1384, + "step": 6665 + }, + { + "epoch": 12.089775561097257, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.148, + "step": 6666 + }, + { + "epoch": 12.091589208796192, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.2045, + "step": 6667 + }, + { + "epoch": 12.093402856495127, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1063, + "step": 6668 + }, + { + "epoch": 12.09521650419406, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.202, + "step": 6669 + }, + { + "epoch": 12.097030151892994, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.124, + "step": 6670 + }, + { + "epoch": 12.09884379959193, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1736, + "step": 6671 + }, + { + "epoch": 12.100657447290864, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.1239, + "step": 6672 + }, + { + "epoch": 12.102471094989799, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1186, + "step": 6673 + }, + { + "epoch": 12.104284742688733, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1621, + "step": 6674 + }, + { + "epoch": 12.106098390387666, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1246, + "step": 6675 + }, + { + "epoch": 12.107912038086601, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.1163, + "step": 6676 + }, + { + "epoch": 12.109725685785536, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.1112, + "step": 6677 + }, + { + "epoch": 12.11153933348447, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.1048, + "step": 6678 + }, + { + "epoch": 12.113352981183406, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1039, + "step": 6679 + }, + { + "epoch": 12.11516662888234, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1028, + "step": 6680 + }, + { + "epoch": 12.116980276581273, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1221, + "step": 6681 + }, + { + "epoch": 12.118793924280208, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 6682 + }, + { + "epoch": 12.120607571979143, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0997, + "step": 6683 + }, + { + "epoch": 12.122421219678078, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.104, + "step": 6684 + }, + { + "epoch": 12.124234867377012, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0928, + "step": 6685 + }, + { + "epoch": 12.126048515075947, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 6686 + }, + { + "epoch": 12.12786216277488, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1088, + "step": 6687 + }, + { + "epoch": 12.129675810473815, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0885, + "step": 6688 + }, + { + "epoch": 12.13148945817275, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 6689 + }, + { + "epoch": 12.133303105871684, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1048, + "step": 6690 + }, + { + "epoch": 12.13511675357062, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 6691 + }, + { + "epoch": 12.136930401269554, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 6692 + }, + { + "epoch": 12.138744048968487, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0851, + "step": 6693 + }, + { + "epoch": 12.140557696667422, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0851, + "step": 6694 + }, + { + "epoch": 12.142371344366357, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0899, + "step": 6695 + }, + { + "epoch": 12.144184992065291, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0958, + "step": 6696 + }, + { + "epoch": 12.145998639764226, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 6697 + }, + { + "epoch": 12.14781228746316, + "grad_norm": 0.474609375, + "learning_rate": 0.0002, + "loss": 0.2191, + "step": 6698 + }, + { + "epoch": 12.149625935162096, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 6699 + }, + { + "epoch": 12.151439582861029, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 6700 + }, + { + "epoch": 12.153253230559963, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 6701 + }, + { + "epoch": 12.155066878258898, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0965, + "step": 6702 + }, + { + "epoch": 12.156880525957833, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.0904, + "step": 6703 + }, + { + "epoch": 12.158694173656768, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 6704 + }, + { + "epoch": 12.160507821355703, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 6705 + }, + { + "epoch": 12.162321469054636, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.0846, + "step": 6706 + }, + { + "epoch": 12.16413511675357, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1244, + "step": 6707 + }, + { + "epoch": 12.165948764452505, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0944, + "step": 6708 + }, + { + "epoch": 12.16776241215144, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 6709 + }, + { + "epoch": 12.169576059850375, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 6710 + }, + { + "epoch": 12.17138970754931, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1073, + "step": 6711 + }, + { + "epoch": 12.173203355248242, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6712 + }, + { + "epoch": 12.175017002947177, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1593, + "step": 6713 + }, + { + "epoch": 12.176830650646112, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1437, + "step": 6714 + }, + { + "epoch": 12.178644298345047, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1513, + "step": 6715 + }, + { + "epoch": 12.180457946043981, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.1798, + "step": 6716 + }, + { + "epoch": 12.182271593742916, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1813, + "step": 6717 + }, + { + "epoch": 12.18408524144185, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1351, + "step": 6718 + }, + { + "epoch": 12.185898889140784, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1178, + "step": 6719 + }, + { + "epoch": 12.187712536839719, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1548, + "step": 6720 + }, + { + "epoch": 12.189526184538654, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1895, + "step": 6721 + }, + { + "epoch": 12.191339832237588, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1272, + "step": 6722 + }, + { + "epoch": 12.193153479936523, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1528, + "step": 6723 + }, + { + "epoch": 12.194967127635456, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 6724 + }, + { + "epoch": 12.19678077533439, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1332, + "step": 6725 + }, + { + "epoch": 12.198594423033326, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 6726 + }, + { + "epoch": 12.20040807073226, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1176, + "step": 6727 + }, + { + "epoch": 12.202221718431195, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 6728 + }, + { + "epoch": 12.20403536613013, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1173, + "step": 6729 + }, + { + "epoch": 12.205849013829063, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.114, + "step": 6730 + }, + { + "epoch": 12.207662661527998, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 6731 + }, + { + "epoch": 12.209476309226932, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 6732 + }, + { + "epoch": 12.209476309226932, + "eval_loss": 2.384899854660034, + "eval_runtime": 152.8012, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 6.544, + "step": 6732 + }, + { + "epoch": 12.209476309226932, + "mmlu_eval_accuracy": 0.3079835698668534, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.2631578947368421, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3939393939393939, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.136334472567641, + "step": 6732 + }, + { + "epoch": 12.211289956925867, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 6733 + }, + { + "epoch": 12.213103604624802, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 6734 + }, + { + "epoch": 12.214917252323737, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1049, + "step": 6735 + }, + { + "epoch": 12.21673090002267, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 6736 + }, + { + "epoch": 12.218544547721605, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 6737 + }, + { + "epoch": 12.22035819542054, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 6738 + }, + { + "epoch": 12.222171843119474, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 6739 + }, + { + "epoch": 12.223985490818409, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0962, + "step": 6740 + }, + { + "epoch": 12.225799138517344, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 6741 + }, + { + "epoch": 12.227612786216277, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 6742 + }, + { + "epoch": 12.229426433915211, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 6743 + }, + { + "epoch": 12.231240081614146, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0944, + "step": 6744 + }, + { + "epoch": 12.233053729313081, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0958, + "step": 6745 + }, + { + "epoch": 12.234867377012016, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 6746 + }, + { + "epoch": 12.23668102471095, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0952, + "step": 6747 + }, + { + "epoch": 12.238494672409884, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 6748 + }, + { + "epoch": 12.240308320108818, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 6749 + }, + { + "epoch": 12.242121967807753, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 6750 + }, + { + "epoch": 12.243935615506688, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 6751 + }, + { + "epoch": 12.245749263205623, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 6752 + }, + { + "epoch": 12.247562910904557, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.0974, + "step": 6753 + }, + { + "epoch": 12.24937655860349, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 6754 + }, + { + "epoch": 12.251190206302425, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0937, + "step": 6755 + }, + { + "epoch": 12.25300385400136, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1054, + "step": 6756 + }, + { + "epoch": 12.254817501700295, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 6757 + }, + { + "epoch": 12.25663114939923, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.1008, + "step": 6758 + }, + { + "epoch": 12.258444797098164, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 6759 + }, + { + "epoch": 12.260258444797099, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.094, + "step": 6760 + }, + { + "epoch": 12.262072092496032, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.118, + "step": 6761 + }, + { + "epoch": 12.263885740194967, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.1142, + "step": 6762 + }, + { + "epoch": 12.265699387893902, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.1035, + "step": 6763 + }, + { + "epoch": 12.267513035592836, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.1541, + "step": 6764 + }, + { + "epoch": 12.269326683291771, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1388, + "step": 6765 + }, + { + "epoch": 12.271140330990706, + "grad_norm": 0.54296875, + "learning_rate": 0.0002, + "loss": 0.1916, + "step": 6766 + }, + { + "epoch": 12.272953978689639, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.2403, + "step": 6767 + }, + { + "epoch": 12.274767626388574, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1517, + "step": 6768 + }, + { + "epoch": 12.276581274087508, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1801, + "step": 6769 + }, + { + "epoch": 12.278394921786443, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1318, + "step": 6770 + }, + { + "epoch": 12.280208569485378, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1485, + "step": 6771 + }, + { + "epoch": 12.282022217184313, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1408, + "step": 6772 + }, + { + "epoch": 12.283835864883246, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1134, + "step": 6773 + }, + { + "epoch": 12.28564951258218, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1154, + "step": 6774 + }, + { + "epoch": 12.287463160281115, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1736, + "step": 6775 + }, + { + "epoch": 12.28927680798005, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.1124, + "step": 6776 + }, + { + "epoch": 12.291090455678985, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1171, + "step": 6777 + }, + { + "epoch": 12.29290410337792, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1179, + "step": 6778 + }, + { + "epoch": 12.294717751076853, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1322, + "step": 6779 + }, + { + "epoch": 12.296531398775787, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1025, + "step": 6780 + }, + { + "epoch": 12.298345046474722, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1016, + "step": 6781 + }, + { + "epoch": 12.300158694173657, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 6782 + }, + { + "epoch": 12.301972341872592, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0933, + "step": 6783 + }, + { + "epoch": 12.303785989571526, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1049, + "step": 6784 + }, + { + "epoch": 12.30559963727046, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1052, + "step": 6785 + }, + { + "epoch": 12.307413284969394, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 6786 + }, + { + "epoch": 12.309226932668329, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1042, + "step": 6787 + }, + { + "epoch": 12.311040580367264, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0893, + "step": 6788 + }, + { + "epoch": 12.312854228066199, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 6789 + }, + { + "epoch": 12.314667875765133, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1008, + "step": 6790 + }, + { + "epoch": 12.316481523464066, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1033, + "step": 6791 + }, + { + "epoch": 12.318295171163001, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 6792 + }, + { + "epoch": 12.320108818861936, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 6793 + }, + { + "epoch": 12.32192246656087, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 6794 + }, + { + "epoch": 12.323736114259805, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1104, + "step": 6795 + }, + { + "epoch": 12.32554976195874, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 6796 + }, + { + "epoch": 12.327363409657673, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1019, + "step": 6797 + }, + { + "epoch": 12.329177057356608, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 6798 + }, + { + "epoch": 12.330990705055543, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 6799 + }, + { + "epoch": 12.332804352754478, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 6800 + }, + { + "epoch": 12.334618000453412, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 6801 + }, + { + "epoch": 12.336431648152347, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.09, + "step": 6802 + }, + { + "epoch": 12.33824529585128, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 6803 + }, + { + "epoch": 12.340058943550215, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 6804 + }, + { + "epoch": 12.34187259124915, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1019, + "step": 6805 + }, + { + "epoch": 12.343686238948084, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 6806 + }, + { + "epoch": 12.34549988664702, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.158, + "step": 6807 + }, + { + "epoch": 12.347313534345954, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 6808 + }, + { + "epoch": 12.349127182044889, + "grad_norm": 0.57421875, + "learning_rate": 0.0002, + "loss": 0.1206, + "step": 6809 + }, + { + "epoch": 12.350940829743822, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 6810 + }, + { + "epoch": 12.352754477442756, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1187, + "step": 6811 + }, + { + "epoch": 12.354568125141691, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.1178, + "step": 6812 + }, + { + "epoch": 12.356381772840626, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 6813 + }, + { + "epoch": 12.35819542053956, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.1268, + "step": 6814 + }, + { + "epoch": 12.360009068238494, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.1591, + "step": 6815 + }, + { + "epoch": 12.361822715937429, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1922, + "step": 6816 + }, + { + "epoch": 12.363636363636363, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.2096, + "step": 6817 + }, + { + "epoch": 12.365450011335298, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1656, + "step": 6818 + }, + { + "epoch": 12.367263659034233, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1088, + "step": 6819 + }, + { + "epoch": 12.369077306733168, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1495, + "step": 6820 + }, + { + "epoch": 12.370890954432102, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1379, + "step": 6821 + }, + { + "epoch": 12.372704602131035, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1624, + "step": 6822 + }, + { + "epoch": 12.37451824982997, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.1251, + "step": 6823 + }, + { + "epoch": 12.376331897528905, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.1058, + "step": 6824 + }, + { + "epoch": 12.37814554522784, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1059, + "step": 6825 + }, + { + "epoch": 12.379959192926774, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.1149, + "step": 6826 + }, + { + "epoch": 12.38177284062571, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1246, + "step": 6827 + }, + { + "epoch": 12.383586488324642, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1144, + "step": 6828 + }, + { + "epoch": 12.385400136023577, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1305, + "step": 6829 + }, + { + "epoch": 12.387213783722512, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 6830 + }, + { + "epoch": 12.389027431421447, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 6831 + }, + { + "epoch": 12.390841079120381, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 6832 + }, + { + "epoch": 12.392654726819316, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1075, + "step": 6833 + }, + { + "epoch": 12.39446837451825, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 6834 + }, + { + "epoch": 12.396282022217184, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 6835 + }, + { + "epoch": 12.398095669916119, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1315, + "step": 6836 + }, + { + "epoch": 12.399909317615053, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0947, + "step": 6837 + }, + { + "epoch": 12.401722965313988, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1037, + "step": 6838 + }, + { + "epoch": 12.403536613012923, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 6839 + }, + { + "epoch": 12.405350260711856, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0983, + "step": 6840 + }, + { + "epoch": 12.40716390841079, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 6841 + }, + { + "epoch": 12.408977556109726, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 6842 + }, + { + "epoch": 12.41079120380866, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1094, + "step": 6843 + }, + { + "epoch": 12.412604851507595, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0915, + "step": 6844 + }, + { + "epoch": 12.41441849920653, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0972, + "step": 6845 + }, + { + "epoch": 12.416232146905463, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 6846 + }, + { + "epoch": 12.418045794604398, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 6847 + }, + { + "epoch": 12.419859442303332, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0912, + "step": 6848 + }, + { + "epoch": 12.421673090002267, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 6849 + }, + { + "epoch": 12.423486737701202, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 6850 + }, + { + "epoch": 12.425300385400137, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 6851 + }, + { + "epoch": 12.42711403309907, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 6852 + }, + { + "epoch": 12.428927680798004, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0926, + "step": 6853 + }, + { + "epoch": 12.43074132849694, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 6854 + }, + { + "epoch": 12.432554976195874, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1011, + "step": 6855 + }, + { + "epoch": 12.434368623894809, + "grad_norm": 0.5390625, + "learning_rate": 0.0002, + "loss": 0.1388, + "step": 6856 + }, + { + "epoch": 12.436182271593744, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.103, + "step": 6857 + }, + { + "epoch": 12.437995919292677, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0971, + "step": 6858 + }, + { + "epoch": 12.439809566991611, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 6859 + }, + { + "epoch": 12.441623214690546, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.1415, + "step": 6860 + }, + { + "epoch": 12.44343686238948, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.101, + "step": 6861 + }, + { + "epoch": 12.445250510088416, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.1092, + "step": 6862 + }, + { + "epoch": 12.44706415778735, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1245, + "step": 6863 + }, + { + "epoch": 12.448877805486283, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1452, + "step": 6864 + }, + { + "epoch": 12.450691453185218, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 6865 + }, + { + "epoch": 12.452505100884153, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 0.1695, + "step": 6866 + }, + { + "epoch": 12.454318748583088, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 6867 + }, + { + "epoch": 12.456132396282023, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1582, + "step": 6868 + }, + { + "epoch": 12.457946043980957, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1373, + "step": 6869 + }, + { + "epoch": 12.459759691679892, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1552, + "step": 6870 + }, + { + "epoch": 12.461573339378825, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.195, + "step": 6871 + }, + { + "epoch": 12.46338698707776, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1288, + "step": 6872 + }, + { + "epoch": 12.465200634776695, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 6873 + }, + { + "epoch": 12.46701428247563, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1407, + "step": 6874 + }, + { + "epoch": 12.468827930174564, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1646, + "step": 6875 + }, + { + "epoch": 12.470641577873499, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1301, + "step": 6876 + }, + { + "epoch": 12.472455225572432, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 6877 + }, + { + "epoch": 12.474268873271367, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1136, + "step": 6878 + }, + { + "epoch": 12.476082520970301, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.1097, + "step": 6879 + }, + { + "epoch": 12.477896168669236, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1379, + "step": 6880 + }, + { + "epoch": 12.479709816368171, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 6881 + }, + { + "epoch": 12.481523464067106, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1176, + "step": 6882 + }, + { + "epoch": 12.483337111766039, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 6883 + }, + { + "epoch": 12.485150759464974, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1152, + "step": 6884 + }, + { + "epoch": 12.486964407163908, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1033, + "step": 6885 + }, + { + "epoch": 12.488778054862843, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 6886 + }, + { + "epoch": 12.490591702561778, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 6887 + }, + { + "epoch": 12.492405350260713, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0884, + "step": 6888 + }, + { + "epoch": 12.494218997959646, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0951, + "step": 6889 + }, + { + "epoch": 12.49603264565858, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1019, + "step": 6890 + }, + { + "epoch": 12.497846293357515, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1047, + "step": 6891 + }, + { + "epoch": 12.49965994105645, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 6892 + }, + { + "epoch": 12.501473588755385, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0965, + "step": 6893 + }, + { + "epoch": 12.50328723645432, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 6894 + }, + { + "epoch": 12.505100884153252, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 6895 + }, + { + "epoch": 12.506914531852187, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0853, + "step": 6896 + }, + { + "epoch": 12.508728179551122, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1081, + "step": 6897 + }, + { + "epoch": 12.510541827250057, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 6898 + }, + { + "epoch": 12.512355474948992, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 6899 + }, + { + "epoch": 12.514169122647926, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 6900 + }, + { + "epoch": 12.51598277034686, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 6901 + }, + { + "epoch": 12.517796418045794, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 6902 + }, + { + "epoch": 12.519610065744729, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 6903 + }, + { + "epoch": 12.521423713443664, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 6904 + }, + { + "epoch": 12.523237361142598, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.101, + "step": 6905 + }, + { + "epoch": 12.525051008841533, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 6906 + }, + { + "epoch": 12.526864656540466, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0899, + "step": 6907 + }, + { + "epoch": 12.528678304239401, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1455, + "step": 6908 + }, + { + "epoch": 12.530491951938336, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 6909 + }, + { + "epoch": 12.53230559963727, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 6910 + }, + { + "epoch": 12.534119247336205, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 6911 + }, + { + "epoch": 12.53593289503514, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.1214, + "step": 6912 + }, + { + "epoch": 12.537746542734073, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1137, + "step": 6913 + }, + { + "epoch": 12.539560190433008, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1525, + "step": 6914 + }, + { + "epoch": 12.541373838131943, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 6915 + }, + { + "epoch": 12.543187485830877, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.2107, + "step": 6916 + }, + { + "epoch": 12.545001133529812, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 6917 + }, + { + "epoch": 12.546814781228747, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1456, + "step": 6918 + }, + { + "epoch": 12.548628428927682, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1457, + "step": 6919 + }, + { + "epoch": 12.548628428927682, + "eval_loss": 2.2852742671966553, + "eval_runtime": 152.7, + "eval_samples_per_second": 6.549, + "eval_steps_per_second": 6.549, + "step": 6919 + }, + { + "epoch": 12.548628428927682, + "mmlu_eval_accuracy": 0.2992718733250504, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.18604651162790697, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.4074074074074074, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 1.8853358843216002, + "step": 6919 + }, + { + "epoch": 12.550442076626615, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1533, + "step": 6920 + }, + { + "epoch": 12.55225572432555, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1358, + "step": 6921 + }, + { + "epoch": 12.554069372024484, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 6922 + }, + { + "epoch": 12.555883019723419, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 6923 + }, + { + "epoch": 12.557696667422354, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1445, + "step": 6924 + }, + { + "epoch": 12.559510315121287, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.1088, + "step": 6925 + }, + { + "epoch": 12.561323962820222, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 6926 + }, + { + "epoch": 12.563137610519156, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1545, + "step": 6927 + }, + { + "epoch": 12.564951258218091, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.1128, + "step": 6928 + }, + { + "epoch": 12.566764905917026, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1619, + "step": 6929 + }, + { + "epoch": 12.56857855361596, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 6930 + }, + { + "epoch": 12.570392201314895, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6931 + }, + { + "epoch": 12.572205849013828, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1082, + "step": 6932 + }, + { + "epoch": 12.574019496712763, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 6933 + }, + { + "epoch": 12.575833144411698, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6934 + }, + { + "epoch": 12.577646792110633, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 6935 + }, + { + "epoch": 12.579460439809568, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.104, + "step": 6936 + }, + { + "epoch": 12.581274087508502, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 6937 + }, + { + "epoch": 12.583087735207435, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 6938 + }, + { + "epoch": 12.58490138290637, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 6939 + }, + { + "epoch": 12.586715030605305, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.0927, + "step": 6940 + }, + { + "epoch": 12.58852867830424, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1056, + "step": 6941 + }, + { + "epoch": 12.590342326003174, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.102, + "step": 6942 + }, + { + "epoch": 12.59215597370211, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 6943 + }, + { + "epoch": 12.593969621401042, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1092, + "step": 6944 + }, + { + "epoch": 12.595783269099977, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.0954, + "step": 6945 + }, + { + "epoch": 12.597596916798912, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0997, + "step": 6946 + }, + { + "epoch": 12.599410564497846, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 6947 + }, + { + "epoch": 12.601224212196781, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.0914, + "step": 6948 + }, + { + "epoch": 12.603037859895716, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 6949 + }, + { + "epoch": 12.604851507594649, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0995, + "step": 6950 + }, + { + "epoch": 12.606665155293584, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.0947, + "step": 6951 + }, + { + "epoch": 12.608478802992519, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.0951, + "step": 6952 + }, + { + "epoch": 12.610292450691453, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 6953 + }, + { + "epoch": 12.612106098390388, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1017, + "step": 6954 + }, + { + "epoch": 12.613919746089323, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.103, + "step": 6955 + }, + { + "epoch": 12.615733393788256, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 6956 + }, + { + "epoch": 12.61754704148719, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1007, + "step": 6957 + }, + { + "epoch": 12.619360689186125, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 6958 + }, + { + "epoch": 12.62117433688506, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 6959 + }, + { + "epoch": 12.622987984583995, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 6960 + }, + { + "epoch": 12.62480163228293, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1161, + "step": 6961 + }, + { + "epoch": 12.626615279981863, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1148, + "step": 6962 + }, + { + "epoch": 12.628428927680797, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 6963 + }, + { + "epoch": 12.630242575379732, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.1318, + "step": 6964 + }, + { + "epoch": 12.632056223078667, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1611, + "step": 6965 + }, + { + "epoch": 12.633869870777602, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.1851, + "step": 6966 + }, + { + "epoch": 12.635683518476537, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.2183, + "step": 6967 + }, + { + "epoch": 12.63749716617547, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 6968 + }, + { + "epoch": 12.639310813874404, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1345, + "step": 6969 + }, + { + "epoch": 12.64112446157334, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 6970 + }, + { + "epoch": 12.642938109272274, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1743, + "step": 6971 + }, + { + "epoch": 12.644751756971209, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1585, + "step": 6972 + }, + { + "epoch": 12.646565404670143, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1583, + "step": 6973 + }, + { + "epoch": 12.648379052369076, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1661, + "step": 6974 + }, + { + "epoch": 12.650192700068011, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1757, + "step": 6975 + }, + { + "epoch": 12.652006347766946, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 6976 + }, + { + "epoch": 12.65381999546588, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1177, + "step": 6977 + }, + { + "epoch": 12.655633643164816, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1621, + "step": 6978 + }, + { + "epoch": 12.65744729086375, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1098, + "step": 6979 + }, + { + "epoch": 12.659260938562685, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1121, + "step": 6980 + }, + { + "epoch": 12.661074586261618, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1028, + "step": 6981 + }, + { + "epoch": 12.662888233960553, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1248, + "step": 6982 + }, + { + "epoch": 12.664701881659488, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1258, + "step": 6983 + }, + { + "epoch": 12.666515529358422, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 6984 + }, + { + "epoch": 12.668329177057357, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1289, + "step": 6985 + }, + { + "epoch": 12.67014282475629, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1169, + "step": 6986 + }, + { + "epoch": 12.671956472455225, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.12, + "step": 6987 + }, + { + "epoch": 12.67377012015416, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 6988 + }, + { + "epoch": 12.675583767853094, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1104, + "step": 6989 + }, + { + "epoch": 12.67739741555203, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1162, + "step": 6990 + }, + { + "epoch": 12.679211063250964, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 6991 + }, + { + "epoch": 12.681024710949899, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 6992 + }, + { + "epoch": 12.682838358648832, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1075, + "step": 6993 + }, + { + "epoch": 12.684652006347767, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.094, + "step": 6994 + }, + { + "epoch": 12.686465654046701, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 6995 + }, + { + "epoch": 12.688279301745636, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1004, + "step": 6996 + }, + { + "epoch": 12.690092949444571, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 6997 + }, + { + "epoch": 12.691906597143506, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 6998 + }, + { + "epoch": 12.693720244842439, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 6999 + }, + { + "epoch": 12.695533892541373, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 7000 + }, + { + "epoch": 12.697347540240308, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 7001 + }, + { + "epoch": 12.699161187939243, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1136, + "step": 7002 + }, + { + "epoch": 12.700974835638178, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 7003 + }, + { + "epoch": 12.702788483337113, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 7004 + }, + { + "epoch": 12.704602131036046, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 7005 + }, + { + "epoch": 12.70641577873498, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.109, + "step": 7006 + }, + { + "epoch": 12.708229426433915, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1082, + "step": 7007 + }, + { + "epoch": 12.71004307413285, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1122, + "step": 7008 + }, + { + "epoch": 12.711856721831785, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.1205, + "step": 7009 + }, + { + "epoch": 12.71367036953072, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1194, + "step": 7010 + }, + { + "epoch": 12.715484017229652, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 7011 + }, + { + "epoch": 12.717297664928587, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.1289, + "step": 7012 + }, + { + "epoch": 12.719111312627522, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1483, + "step": 7013 + }, + { + "epoch": 12.720924960326457, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.1327, + "step": 7014 + }, + { + "epoch": 12.722738608025391, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1643, + "step": 7015 + }, + { + "epoch": 12.724552255724326, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.2088, + "step": 7016 + }, + { + "epoch": 12.72636590342326, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.2007, + "step": 7017 + }, + { + "epoch": 12.728179551122194, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1514, + "step": 7018 + }, + { + "epoch": 12.729993198821129, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1857, + "step": 7019 + }, + { + "epoch": 12.731806846520064, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1225, + "step": 7020 + }, + { + "epoch": 12.733620494218998, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1386, + "step": 7021 + }, + { + "epoch": 12.735434141917933, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1494, + "step": 7022 + }, + { + "epoch": 12.737247789616866, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.1096, + "step": 7023 + }, + { + "epoch": 12.7390614373158, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1614, + "step": 7024 + }, + { + "epoch": 12.740875085014736, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.167, + "step": 7025 + }, + { + "epoch": 12.74268873271367, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.154, + "step": 7026 + }, + { + "epoch": 12.744502380412605, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1245, + "step": 7027 + }, + { + "epoch": 12.74631602811154, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.123, + "step": 7028 + }, + { + "epoch": 12.748129675810475, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1312, + "step": 7029 + }, + { + "epoch": 12.749943323509408, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1174, + "step": 7030 + }, + { + "epoch": 12.751756971208343, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.146, + "step": 7031 + }, + { + "epoch": 12.753570618907277, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 7032 + }, + { + "epoch": 12.755384266606212, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1156, + "step": 7033 + }, + { + "epoch": 12.757197914305147, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1274, + "step": 7034 + }, + { + "epoch": 12.75901156200408, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1167, + "step": 7035 + }, + { + "epoch": 12.760825209703015, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 7036 + }, + { + "epoch": 12.76263885740195, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1127, + "step": 7037 + }, + { + "epoch": 12.764452505100884, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 7038 + }, + { + "epoch": 12.766266152799819, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1189, + "step": 7039 + }, + { + "epoch": 12.768079800498754, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 7040 + }, + { + "epoch": 12.769893448197688, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0983, + "step": 7041 + }, + { + "epoch": 12.771707095896621, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 7042 + }, + { + "epoch": 12.773520743595556, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0945, + "step": 7043 + }, + { + "epoch": 12.775334391294491, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0978, + "step": 7044 + }, + { + "epoch": 12.777148038993426, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1049, + "step": 7045 + }, + { + "epoch": 12.77896168669236, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.1034, + "step": 7046 + }, + { + "epoch": 12.780775334391295, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.102, + "step": 7047 + }, + { + "epoch": 12.782588982090228, + "grad_norm": 0.431640625, + "learning_rate": 0.0002, + "loss": 0.1003, + "step": 7048 + }, + { + "epoch": 12.784402629789163, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 7049 + }, + { + "epoch": 12.786216277488098, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1005, + "step": 7050 + }, + { + "epoch": 12.788029925187033, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 7051 + }, + { + "epoch": 12.789843572885967, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1021, + "step": 7052 + }, + { + "epoch": 12.791657220584902, + "grad_norm": 0.419921875, + "learning_rate": 0.0002, + "loss": 0.097, + "step": 7053 + }, + { + "epoch": 12.793470868283835, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0997, + "step": 7054 + }, + { + "epoch": 12.79528451598277, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0955, + "step": 7055 + }, + { + "epoch": 12.797098163681705, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0999, + "step": 7056 + }, + { + "epoch": 12.79891181138064, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.1179, + "step": 7057 + }, + { + "epoch": 12.800725459079574, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1193, + "step": 7058 + }, + { + "epoch": 12.802539106778509, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.118, + "step": 7059 + }, + { + "epoch": 12.804352754477442, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1106, + "step": 7060 + }, + { + "epoch": 12.806166402176377, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.121, + "step": 7061 + }, + { + "epoch": 12.807980049875312, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 7062 + }, + { + "epoch": 12.809793697574246, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.1476, + "step": 7063 + }, + { + "epoch": 12.811607345273181, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1587, + "step": 7064 + }, + { + "epoch": 12.813420992972116, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.1789, + "step": 7065 + }, + { + "epoch": 12.815234640671049, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.2102, + "step": 7066 + }, + { + "epoch": 12.817048288369984, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.2927, + "step": 7067 + }, + { + "epoch": 12.818861936068918, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1436, + "step": 7068 + }, + { + "epoch": 12.820675583767853, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.173, + "step": 7069 + }, + { + "epoch": 12.822489231466788, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1591, + "step": 7070 + }, + { + "epoch": 12.824302879165723, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1776, + "step": 7071 + }, + { + "epoch": 12.826116526864656, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1396, + "step": 7072 + }, + { + "epoch": 12.82793017456359, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1326, + "step": 7073 + }, + { + "epoch": 12.829743822262525, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1729, + "step": 7074 + }, + { + "epoch": 12.83155746996146, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1633, + "step": 7075 + }, + { + "epoch": 12.833371117660395, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1292, + "step": 7076 + }, + { + "epoch": 12.83518476535933, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1119, + "step": 7077 + }, + { + "epoch": 12.836998413058263, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1678, + "step": 7078 + }, + { + "epoch": 12.838812060757197, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1275, + "step": 7079 + }, + { + "epoch": 12.840625708456132, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1434, + "step": 7080 + }, + { + "epoch": 12.842439356155067, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1516, + "step": 7081 + }, + { + "epoch": 12.844253003854002, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.182, + "step": 7082 + }, + { + "epoch": 12.846066651552936, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 7083 + }, + { + "epoch": 12.84788029925187, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1079, + "step": 7084 + }, + { + "epoch": 12.849693946950804, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1105, + "step": 7085 + }, + { + "epoch": 12.851507594649739, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1039, + "step": 7086 + }, + { + "epoch": 12.853321242348674, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.11, + "step": 7087 + }, + { + "epoch": 12.855134890047609, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1034, + "step": 7088 + }, + { + "epoch": 12.856948537746543, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1019, + "step": 7089 + }, + { + "epoch": 12.858762185445478, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1077, + "step": 7090 + }, + { + "epoch": 12.860575833144411, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 7091 + }, + { + "epoch": 12.862389480843346, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.103, + "step": 7092 + }, + { + "epoch": 12.86420312854228, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0918, + "step": 7093 + }, + { + "epoch": 12.866016776241215, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0994, + "step": 7094 + }, + { + "epoch": 12.86783042394015, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1068, + "step": 7095 + }, + { + "epoch": 12.869644071639083, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 7096 + }, + { + "epoch": 12.871457719338018, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 7097 + }, + { + "epoch": 12.873271367036953, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0999, + "step": 7098 + }, + { + "epoch": 12.875085014735888, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1081, + "step": 7099 + }, + { + "epoch": 12.876898662434822, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.1095, + "step": 7100 + }, + { + "epoch": 12.878712310133757, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.12, + "step": 7101 + }, + { + "epoch": 12.880525957832692, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 7102 + }, + { + "epoch": 12.882339605531625, + "grad_norm": 0.35546875, + "learning_rate": 0.0002, + "loss": 0.1017, + "step": 7103 + }, + { + "epoch": 12.88415325323056, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1015, + "step": 7104 + }, + { + "epoch": 12.885966900929494, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.102, + "step": 7105 + }, + { + "epoch": 12.88778054862843, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1185, + "step": 7106 + }, + { + "epoch": 12.88778054862843, + "eval_loss": 2.3575503826141357, + "eval_runtime": 150.0431, + "eval_samples_per_second": 6.665, + "eval_steps_per_second": 6.665, + "step": 7106 + }, + { + "epoch": 12.88778054862843, + "mmlu_eval_accuracy": 0.28566996100588943, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.375, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.25, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.36, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2647058823529412, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.25, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.1245896993198867, + "step": 7106 + }, + { + "epoch": 12.889594196327364, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1251, + "step": 7107 + }, + { + "epoch": 12.891407844026299, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.1193, + "step": 7108 + }, + { + "epoch": 12.893221491725232, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1007, + "step": 7109 + }, + { + "epoch": 12.895035139424166, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1123, + "step": 7110 + }, + { + "epoch": 12.896848787123101, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1204, + "step": 7111 + }, + { + "epoch": 12.898662434822036, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1122, + "step": 7112 + }, + { + "epoch": 12.90047608252097, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1208, + "step": 7113 + }, + { + "epoch": 12.902289730219906, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1426, + "step": 7114 + }, + { + "epoch": 12.904103377918839, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1572, + "step": 7115 + }, + { + "epoch": 12.905917025617773, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.2039, + "step": 7116 + }, + { + "epoch": 12.907730673316708, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2136, + "step": 7117 + }, + { + "epoch": 12.909544321015643, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1703, + "step": 7118 + }, + { + "epoch": 12.911357968714578, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1439, + "step": 7119 + }, + { + "epoch": 12.913171616413512, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1759, + "step": 7120 + }, + { + "epoch": 12.914985264112445, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1617, + "step": 7121 + }, + { + "epoch": 12.91679891181138, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1447, + "step": 7122 + }, + { + "epoch": 12.918612559510315, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.16, + "step": 7123 + }, + { + "epoch": 12.92042620720925, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1358, + "step": 7124 + }, + { + "epoch": 12.922239854908184, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1637, + "step": 7125 + }, + { + "epoch": 12.92405350260712, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1336, + "step": 7126 + }, + { + "epoch": 12.925867150306052, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 7127 + }, + { + "epoch": 12.927680798004987, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1905, + "step": 7128 + }, + { + "epoch": 12.929494445703922, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1419, + "step": 7129 + }, + { + "epoch": 12.931308093402857, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 7130 + }, + { + "epoch": 12.933121741101791, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.124, + "step": 7131 + }, + { + "epoch": 12.934935388800726, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1048, + "step": 7132 + }, + { + "epoch": 12.93674903649966, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 7133 + }, + { + "epoch": 12.938562684198594, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0983, + "step": 7134 + }, + { + "epoch": 12.940376331897529, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 7135 + }, + { + "epoch": 12.942189979596463, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1153, + "step": 7136 + }, + { + "epoch": 12.944003627295398, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 7137 + }, + { + "epoch": 12.945817274994333, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0957, + "step": 7138 + }, + { + "epoch": 12.947630922693268, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 7139 + }, + { + "epoch": 12.9494445703922, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 7140 + }, + { + "epoch": 12.951258218091136, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0997, + "step": 7141 + }, + { + "epoch": 12.95307186579007, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1061, + "step": 7142 + }, + { + "epoch": 12.954885513489005, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 7143 + }, + { + "epoch": 12.95669916118794, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1081, + "step": 7144 + }, + { + "epoch": 12.958512808886873, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 7145 + }, + { + "epoch": 12.960326456585808, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0999, + "step": 7146 + }, + { + "epoch": 12.962140104284742, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.0978, + "step": 7147 + }, + { + "epoch": 12.963953751983677, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1146, + "step": 7148 + }, + { + "epoch": 12.965767399682612, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1052, + "step": 7149 + }, + { + "epoch": 12.967581047381547, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 7150 + }, + { + "epoch": 12.969394695080481, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.095, + "step": 7151 + }, + { + "epoch": 12.971208342779414, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 7152 + }, + { + "epoch": 12.97302199047835, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0905, + "step": 7153 + }, + { + "epoch": 12.974835638177284, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.1185, + "step": 7154 + }, + { + "epoch": 12.976649285876219, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1104, + "step": 7155 + }, + { + "epoch": 12.978462933575154, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1184, + "step": 7156 + }, + { + "epoch": 12.980276581274087, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1071, + "step": 7157 + }, + { + "epoch": 12.982090228973021, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1105, + "step": 7158 + }, + { + "epoch": 12.983903876671956, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1304, + "step": 7159 + }, + { + "epoch": 12.98571752437089, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1191, + "step": 7160 + }, + { + "epoch": 12.987531172069826, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.1303, + "step": 7161 + }, + { + "epoch": 12.98934481976876, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.1265, + "step": 7162 + }, + { + "epoch": 12.991158467467695, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1276, + "step": 7163 + }, + { + "epoch": 12.992972115166628, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.145, + "step": 7164 + }, + { + "epoch": 12.994785762865563, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.1599, + "step": 7165 + }, + { + "epoch": 12.996599410564498, + "grad_norm": 0.5546875, + "learning_rate": 0.0002, + "loss": 0.1817, + "step": 7166 + }, + { + "epoch": 12.998413058263433, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.2228, + "step": 7167 + }, + { + "epoch": 13.000226705962367, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 7168 + }, + { + "epoch": 13.002040353661302, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 7169 + }, + { + "epoch": 13.003854001360235, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1172, + "step": 7170 + }, + { + "epoch": 13.00566764905917, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1033, + "step": 7171 + }, + { + "epoch": 13.007481296758105, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 7172 + }, + { + "epoch": 13.00929494445704, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7173 + }, + { + "epoch": 13.011108592155974, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1262, + "step": 7174 + }, + { + "epoch": 13.012922239854909, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.1132, + "step": 7175 + }, + { + "epoch": 13.014735887553842, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1055, + "step": 7176 + }, + { + "epoch": 13.016549535252777, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 7177 + }, + { + "epoch": 13.018363182951711, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 7178 + }, + { + "epoch": 13.020176830650646, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 7179 + }, + { + "epoch": 13.021990478349581, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 7180 + }, + { + "epoch": 13.023804126048516, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 7181 + }, + { + "epoch": 13.025617773747449, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 7182 + }, + { + "epoch": 13.027431421446384, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1144, + "step": 7183 + }, + { + "epoch": 13.029245069145318, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 7184 + }, + { + "epoch": 13.031058716844253, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 7185 + }, + { + "epoch": 13.032872364543188, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 7186 + }, + { + "epoch": 13.034686012242123, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 7187 + }, + { + "epoch": 13.036499659941056, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 7188 + }, + { + "epoch": 13.03831330763999, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0671, + "step": 7189 + }, + { + "epoch": 13.040126955338925, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 7190 + }, + { + "epoch": 13.04194060303786, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 7191 + }, + { + "epoch": 13.043754250736795, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 7192 + }, + { + "epoch": 13.04556789843573, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 7193 + }, + { + "epoch": 13.047381546134662, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 7194 + }, + { + "epoch": 13.049195193833597, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 7195 + }, + { + "epoch": 13.051008841532532, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 7196 + }, + { + "epoch": 13.052822489231467, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 7197 + }, + { + "epoch": 13.054636136930402, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 7198 + }, + { + "epoch": 13.056449784629336, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 7199 + }, + { + "epoch": 13.05826343232827, + "grad_norm": 0.3671875, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 7200 + }, + { + "epoch": 13.060077080027204, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 7201 + }, + { + "epoch": 13.061890727726139, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 7202 + }, + { + "epoch": 13.063704375425074, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 7203 + }, + { + "epoch": 13.065518023124008, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 7204 + }, + { + "epoch": 13.067331670822943, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 7205 + }, + { + "epoch": 13.069145318521878, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 7206 + }, + { + "epoch": 13.070958966220811, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 7207 + }, + { + "epoch": 13.072772613919746, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 7208 + }, + { + "epoch": 13.07458626161868, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0853, + "step": 7209 + }, + { + "epoch": 13.076399909317615, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 7210 + }, + { + "epoch": 13.07821355701655, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0973, + "step": 7211 + }, + { + "epoch": 13.080027204715485, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 7212 + }, + { + "epoch": 13.081840852414418, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.1004, + "step": 7213 + }, + { + "epoch": 13.083654500113353, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1047, + "step": 7214 + }, + { + "epoch": 13.085468147812287, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 7215 + }, + { + "epoch": 13.087281795511222, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1179, + "step": 7216 + }, + { + "epoch": 13.089095443210157, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1697, + "step": 7217 + }, + { + "epoch": 13.090909090909092, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 7218 + }, + { + "epoch": 13.092722738608025, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 7219 + }, + { + "epoch": 13.09453638630696, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 7220 + }, + { + "epoch": 13.096350034005894, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1842, + "step": 7221 + }, + { + "epoch": 13.098163681704829, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 7222 + }, + { + "epoch": 13.099977329403764, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 7223 + }, + { + "epoch": 13.101790977102699, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 7224 + }, + { + "epoch": 13.103604624801632, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1021, + "step": 7225 + }, + { + "epoch": 13.105418272500566, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1065, + "step": 7226 + }, + { + "epoch": 13.107231920199501, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 7227 + }, + { + "epoch": 13.109045567898436, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 7228 + }, + { + "epoch": 13.11085921559737, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 7229 + }, + { + "epoch": 13.112672863296305, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 7230 + }, + { + "epoch": 13.114486510995238, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 7231 + }, + { + "epoch": 13.116300158694173, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 7232 + }, + { + "epoch": 13.118113806393108, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 7233 + }, + { + "epoch": 13.119927454092043, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 7234 + }, + { + "epoch": 13.121741101790978, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 7235 + }, + { + "epoch": 13.123554749489912, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 7236 + }, + { + "epoch": 13.125368397188845, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 7237 + }, + { + "epoch": 13.12718204488778, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 7238 + }, + { + "epoch": 13.128995692586715, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 7239 + }, + { + "epoch": 13.13080934028565, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 7240 + }, + { + "epoch": 13.132622987984584, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 7241 + }, + { + "epoch": 13.13443663568352, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 7242 + }, + { + "epoch": 13.136250283382452, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 7243 + }, + { + "epoch": 13.138063931081387, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 7244 + }, + { + "epoch": 13.139877578780322, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0912, + "step": 7245 + }, + { + "epoch": 13.141691226479256, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 7246 + }, + { + "epoch": 13.143504874178191, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 7247 + }, + { + "epoch": 13.145318521877126, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 7248 + }, + { + "epoch": 13.147132169576059, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 7249 + }, + { + "epoch": 13.148945817274994, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 7250 + }, + { + "epoch": 13.150759464973929, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 7251 + }, + { + "epoch": 13.152573112672863, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 7252 + }, + { + "epoch": 13.154386760371798, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 7253 + }, + { + "epoch": 13.156200408070733, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 7254 + }, + { + "epoch": 13.158014055769666, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0893, + "step": 7255 + }, + { + "epoch": 13.1598277034686, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 7256 + }, + { + "epoch": 13.161641351167535, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 7257 + }, + { + "epoch": 13.16345499886647, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.102, + "step": 7258 + }, + { + "epoch": 13.165268646565405, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1104, + "step": 7259 + }, + { + "epoch": 13.16708229426434, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0945, + "step": 7260 + }, + { + "epoch": 13.168895941963275, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 7261 + }, + { + "epoch": 13.170709589662208, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1081, + "step": 7262 + }, + { + "epoch": 13.172523237361142, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.1123, + "step": 7263 + }, + { + "epoch": 13.174336885060077, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1054, + "step": 7264 + }, + { + "epoch": 13.176150532759012, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1198, + "step": 7265 + }, + { + "epoch": 13.177964180457947, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.1302, + "step": 7266 + }, + { + "epoch": 13.179777828156881, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.16, + "step": 7267 + }, + { + "epoch": 13.181591475855814, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.1788, + "step": 7268 + }, + { + "epoch": 13.18340512355475, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1314, + "step": 7269 + }, + { + "epoch": 13.185218771253684, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 7270 + }, + { + "epoch": 13.187032418952619, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.114, + "step": 7271 + }, + { + "epoch": 13.188846066651553, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1288, + "step": 7272 + }, + { + "epoch": 13.190659714350488, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1657, + "step": 7273 + }, + { + "epoch": 13.192473362049421, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0952, + "step": 7274 + }, + { + "epoch": 13.194287009748356, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0931, + "step": 7275 + }, + { + "epoch": 13.19610065744729, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1079, + "step": 7276 + }, + { + "epoch": 13.197914305146226, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.1219, + "step": 7277 + }, + { + "epoch": 13.19972795284516, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1167, + "step": 7278 + }, + { + "epoch": 13.201541600544095, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 7279 + }, + { + "epoch": 13.203355248243028, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 7280 + }, + { + "epoch": 13.205168895941963, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 7281 + }, + { + "epoch": 13.206982543640898, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 7282 + }, + { + "epoch": 13.208796191339832, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0903, + "step": 7283 + }, + { + "epoch": 13.210609839038767, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 7284 + }, + { + "epoch": 13.212423486737702, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1024, + "step": 7285 + }, + { + "epoch": 13.214237134436635, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 7286 + }, + { + "epoch": 13.21605078213557, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 7287 + }, + { + "epoch": 13.217864429834504, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 7288 + }, + { + "epoch": 13.21967807753344, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 7289 + }, + { + "epoch": 13.221491725232374, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 7290 + }, + { + "epoch": 13.223305372931309, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 7291 + }, + { + "epoch": 13.225119020630242, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 7292 + }, + { + "epoch": 13.226932668329177, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.0897, + "step": 7293 + }, + { + "epoch": 13.226932668329177, + "eval_loss": 2.465397834777832, + "eval_runtime": 152.4718, + "eval_samples_per_second": 6.559, + "eval_steps_per_second": 6.559, + "step": 7293 + }, + { + "epoch": 13.226932668329177, + "mmlu_eval_accuracy": 0.293571794765002, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.4, + "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, + "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3333333333333333, + "mmlu_eval_accuracy_philosophy": 0.29411764705882354, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.157251998967084, + "step": 7293 + }, + { + "epoch": 13.228746316028111, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 7294 + }, + { + "epoch": 13.230559963727046, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 7295 + }, + { + "epoch": 13.232373611425981, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 7296 + }, + { + "epoch": 13.234187259124916, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 7297 + }, + { + "epoch": 13.236000906823849, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 7298 + }, + { + "epoch": 13.237814554522783, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 7299 + }, + { + "epoch": 13.239628202221718, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 7300 + }, + { + "epoch": 13.241441849920653, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0942, + "step": 7301 + }, + { + "epoch": 13.243255497619588, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 7302 + }, + { + "epoch": 13.245069145318523, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 7303 + }, + { + "epoch": 13.246882793017456, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 7304 + }, + { + "epoch": 13.24869644071639, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 7305 + }, + { + "epoch": 13.250510088415325, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 7306 + }, + { + "epoch": 13.25232373611426, + "grad_norm": 0.4140625, + "learning_rate": 0.0002, + "loss": 0.0983, + "step": 7307 + }, + { + "epoch": 13.254137383813195, + "grad_norm": 0.53515625, + "learning_rate": 0.0002, + "loss": 0.1058, + "step": 7308 + }, + { + "epoch": 13.25595103151213, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 7309 + }, + { + "epoch": 13.257764679211064, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0992, + "step": 7310 + }, + { + "epoch": 13.259578326909997, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 7311 + }, + { + "epoch": 13.261391974608932, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1101, + "step": 7312 + }, + { + "epoch": 13.263205622307867, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1217, + "step": 7313 + }, + { + "epoch": 13.265019270006801, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.1087, + "step": 7314 + }, + { + "epoch": 13.266832917705736, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.1227, + "step": 7315 + }, + { + "epoch": 13.26864656540467, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1283, + "step": 7316 + }, + { + "epoch": 13.270460213103604, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.1774, + "step": 7317 + }, + { + "epoch": 13.272273860802539, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.2404, + "step": 7318 + }, + { + "epoch": 13.274087508501474, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1081, + "step": 7319 + }, + { + "epoch": 13.275901156200408, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.162, + "step": 7320 + }, + { + "epoch": 13.277714803899343, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1034, + "step": 7321 + }, + { + "epoch": 13.279528451598278, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 7322 + }, + { + "epoch": 13.28134209929721, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 7323 + }, + { + "epoch": 13.283155746996146, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 7324 + }, + { + "epoch": 13.28496939469508, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1109, + "step": 7325 + }, + { + "epoch": 13.286783042394015, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7326 + }, + { + "epoch": 13.28859669009295, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1227, + "step": 7327 + }, + { + "epoch": 13.290410337791885, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0988, + "step": 7328 + }, + { + "epoch": 13.292223985490818, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1046, + "step": 7329 + }, + { + "epoch": 13.294037633189753, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 7330 + }, + { + "epoch": 13.295851280888687, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 7331 + }, + { + "epoch": 13.297664928587622, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7332 + }, + { + "epoch": 13.299478576286557, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0861, + "step": 7333 + }, + { + "epoch": 13.301292223985492, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 7334 + }, + { + "epoch": 13.303105871684425, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 7335 + }, + { + "epoch": 13.30491951938336, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 7336 + }, + { + "epoch": 13.306733167082294, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 7337 + }, + { + "epoch": 13.308546814781229, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0933, + "step": 7338 + }, + { + "epoch": 13.310360462480164, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 7339 + }, + { + "epoch": 13.312174110179098, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 7340 + }, + { + "epoch": 13.313987757878031, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 7341 + }, + { + "epoch": 13.315801405576966, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 7342 + }, + { + "epoch": 13.317615053275901, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 7343 + }, + { + "epoch": 13.319428700974836, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 7344 + }, + { + "epoch": 13.32124234867377, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 7345 + }, + { + "epoch": 13.323055996372705, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 7346 + }, + { + "epoch": 13.324869644071638, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 7347 + }, + { + "epoch": 13.326683291770573, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 7348 + }, + { + "epoch": 13.328496939469508, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0905, + "step": 7349 + }, + { + "epoch": 13.330310587168443, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 7350 + }, + { + "epoch": 13.332124234867377, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 7351 + }, + { + "epoch": 13.333937882566312, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 7352 + }, + { + "epoch": 13.335751530265245, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.0937, + "step": 7353 + }, + { + "epoch": 13.33756517796418, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 7354 + }, + { + "epoch": 13.339378825663115, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 7355 + }, + { + "epoch": 13.34119247336205, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 7356 + }, + { + "epoch": 13.343006121060984, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.0971, + "step": 7357 + }, + { + "epoch": 13.344819768759919, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 7358 + }, + { + "epoch": 13.346633416458852, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 7359 + }, + { + "epoch": 13.348447064157787, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 7360 + }, + { + "epoch": 13.350260711856722, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 7361 + }, + { + "epoch": 13.352074359555656, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 7362 + }, + { + "epoch": 13.353888007254591, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1112, + "step": 7363 + }, + { + "epoch": 13.355701654953526, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.1058, + "step": 7364 + }, + { + "epoch": 13.357515302652459, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1098, + "step": 7365 + }, + { + "epoch": 13.359328950351394, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1324, + "step": 7366 + }, + { + "epoch": 13.361142598050328, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1424, + "step": 7367 + }, + { + "epoch": 13.362956245749263, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.1712, + "step": 7368 + }, + { + "epoch": 13.364769893448198, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 7369 + }, + { + "epoch": 13.366583541147133, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 7370 + }, + { + "epoch": 13.368397188846068, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1131, + "step": 7371 + }, + { + "epoch": 13.370210836545, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1132, + "step": 7372 + }, + { + "epoch": 13.372024484243935, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.101, + "step": 7373 + }, + { + "epoch": 13.37383813194287, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.109, + "step": 7374 + }, + { + "epoch": 13.375651779641805, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7375 + }, + { + "epoch": 13.37746542734074, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 7376 + }, + { + "epoch": 13.379279075039674, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 7377 + }, + { + "epoch": 13.381092722738607, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1078, + "step": 7378 + }, + { + "epoch": 13.382906370437542, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7379 + }, + { + "epoch": 13.384720018136477, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.0897, + "step": 7380 + }, + { + "epoch": 13.386533665835412, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.1223, + "step": 7381 + }, + { + "epoch": 13.388347313534346, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 7382 + }, + { + "epoch": 13.390160961233281, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 7383 + }, + { + "epoch": 13.391974608932214, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0992, + "step": 7384 + }, + { + "epoch": 13.393788256631149, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 7385 + }, + { + "epoch": 13.395601904330084, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 7386 + }, + { + "epoch": 13.397415552029019, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 7387 + }, + { + "epoch": 13.399229199727953, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 7388 + }, + { + "epoch": 13.401042847426888, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 7389 + }, + { + "epoch": 13.402856495125821, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 7390 + }, + { + "epoch": 13.404670142824756, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 7391 + }, + { + "epoch": 13.40648379052369, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 7392 + }, + { + "epoch": 13.408297438222625, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 7393 + }, + { + "epoch": 13.41011108592156, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 7394 + }, + { + "epoch": 13.411924733620495, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1003, + "step": 7395 + }, + { + "epoch": 13.413738381319428, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0838, + "step": 7396 + }, + { + "epoch": 13.415552029018363, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 7397 + }, + { + "epoch": 13.417365676717298, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 7398 + }, + { + "epoch": 13.419179324416232, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 7399 + }, + { + "epoch": 13.420992972115167, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 7400 + }, + { + "epoch": 13.422806619814102, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 7401 + }, + { + "epoch": 13.424620267513035, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 7402 + }, + { + "epoch": 13.42643391521197, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 7403 + }, + { + "epoch": 13.428247562910904, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 7404 + }, + { + "epoch": 13.43006121060984, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.1039, + "step": 7405 + }, + { + "epoch": 13.431874858308774, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 7406 + }, + { + "epoch": 13.433688506007709, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1056, + "step": 7407 + }, + { + "epoch": 13.435502153706642, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 7408 + }, + { + "epoch": 13.437315801405576, + "grad_norm": 0.412109375, + "learning_rate": 0.0002, + "loss": 0.1292, + "step": 7409 + }, + { + "epoch": 13.439129449104511, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 7410 + }, + { + "epoch": 13.440943096803446, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1013, + "step": 7411 + }, + { + "epoch": 13.44275674450238, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1167, + "step": 7412 + }, + { + "epoch": 13.444570392201316, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1032, + "step": 7413 + }, + { + "epoch": 13.446384039900249, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.1404, + "step": 7414 + }, + { + "epoch": 13.448197687599183, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.1289, + "step": 7415 + }, + { + "epoch": 13.450011335298118, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.1387, + "step": 7416 + }, + { + "epoch": 13.451824982997053, + "grad_norm": 0.119140625, + "learning_rate": 0.0002, + "loss": 0.1527, + "step": 7417 + }, + { + "epoch": 13.453638630695988, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 7418 + }, + { + "epoch": 13.455452278394922, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1305, + "step": 7419 + }, + { + "epoch": 13.457265926093855, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1132, + "step": 7420 + }, + { + "epoch": 13.45907957379279, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 7421 + }, + { + "epoch": 13.460893221491725, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1229, + "step": 7422 + }, + { + "epoch": 13.46270686919066, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1227, + "step": 7423 + }, + { + "epoch": 13.464520516889595, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1112, + "step": 7424 + }, + { + "epoch": 13.46633416458853, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1011, + "step": 7425 + }, + { + "epoch": 13.468147812287462, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1205, + "step": 7426 + }, + { + "epoch": 13.469961459986397, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0937, + "step": 7427 + }, + { + "epoch": 13.471775107685332, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0953, + "step": 7428 + }, + { + "epoch": 13.473588755384267, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 7429 + }, + { + "epoch": 13.475402403083201, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 7430 + }, + { + "epoch": 13.477216050782136, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 7431 + }, + { + "epoch": 13.479029698481071, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 7432 + }, + { + "epoch": 13.480843346180004, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0964, + "step": 7433 + }, + { + "epoch": 13.482656993878939, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 7434 + }, + { + "epoch": 13.484470641577873, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 7435 + }, + { + "epoch": 13.486284289276808, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0892, + "step": 7436 + }, + { + "epoch": 13.488097936975743, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 7437 + }, + { + "epoch": 13.489911584674678, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0884, + "step": 7438 + }, + { + "epoch": 13.49172523237361, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 7439 + }, + { + "epoch": 13.493538880072546, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0957, + "step": 7440 + }, + { + "epoch": 13.49535252777148, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0913, + "step": 7441 + }, + { + "epoch": 13.497166175470415, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 7442 + }, + { + "epoch": 13.49897982316935, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0945, + "step": 7443 + }, + { + "epoch": 13.500793470868285, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0919, + "step": 7444 + }, + { + "epoch": 13.502607118567218, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 7445 + }, + { + "epoch": 13.504420766266152, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 7446 + }, + { + "epoch": 13.506234413965087, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 7447 + }, + { + "epoch": 13.508048061664022, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 7448 + }, + { + "epoch": 13.509861709362957, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 7449 + }, + { + "epoch": 13.511675357061891, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 7450 + }, + { + "epoch": 13.513489004760824, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 7451 + }, + { + "epoch": 13.51530265245976, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0911, + "step": 7452 + }, + { + "epoch": 13.517116300158694, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 7453 + }, + { + "epoch": 13.518929947857629, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 7454 + }, + { + "epoch": 13.520743595556564, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0931, + "step": 7455 + }, + { + "epoch": 13.522557243255498, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 7456 + }, + { + "epoch": 13.524370890954431, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 7457 + }, + { + "epoch": 13.526184538653366, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1421, + "step": 7458 + }, + { + "epoch": 13.527998186352301, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 7459 + }, + { + "epoch": 13.529811834051236, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1087, + "step": 7460 + }, + { + "epoch": 13.53162548175017, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.1058, + "step": 7461 + }, + { + "epoch": 13.533439129449105, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 7462 + }, + { + "epoch": 13.535252777148038, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 7463 + }, + { + "epoch": 13.537066424846973, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1291, + "step": 7464 + }, + { + "epoch": 13.538880072545908, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1239, + "step": 7465 + }, + { + "epoch": 13.540693720244843, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1544, + "step": 7466 + }, + { + "epoch": 13.542507367943777, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.186, + "step": 7467 + }, + { + "epoch": 13.544321015642712, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.2613, + "step": 7468 + }, + { + "epoch": 13.546134663341645, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1387, + "step": 7469 + }, + { + "epoch": 13.54794831104058, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1254, + "step": 7470 + }, + { + "epoch": 13.549761958739515, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 7471 + }, + { + "epoch": 13.55157560643845, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.137, + "step": 7472 + }, + { + "epoch": 13.553389254137384, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 7473 + }, + { + "epoch": 13.555202901836319, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0962, + "step": 7474 + }, + { + "epoch": 13.557016549535252, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1301, + "step": 7475 + }, + { + "epoch": 13.558830197234187, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 7476 + }, + { + "epoch": 13.560643844933121, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0954, + "step": 7477 + }, + { + "epoch": 13.562457492632056, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1131, + "step": 7478 + }, + { + "epoch": 13.564271140330991, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 7479 + }, + { + "epoch": 13.566084788029926, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1202, + "step": 7480 + }, + { + "epoch": 13.566084788029926, + "eval_loss": 2.3937833309173584, + "eval_runtime": 152.6489, + "eval_samples_per_second": 6.551, + "eval_steps_per_second": 6.551, + "step": 7480 + }, + { + "epoch": 13.566084788029926, + "mmlu_eval_accuracy": 0.2994188402994283, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.375, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.2727272727272727, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.47674418604651164, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.4117647058823529, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.3235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.1935483870967742, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.0594946005759405, + "step": 7480 + }, + { + "epoch": 13.56789843572886, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 7481 + }, + { + "epoch": 13.569712083427794, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1002, + "step": 7482 + }, + { + "epoch": 13.571525731126728, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 7483 + }, + { + "epoch": 13.573339378825663, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 7484 + }, + { + "epoch": 13.575153026524598, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0905, + "step": 7485 + }, + { + "epoch": 13.576966674223533, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 7486 + }, + { + "epoch": 13.578780321922466, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 7487 + }, + { + "epoch": 13.5805939696214, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 7488 + }, + { + "epoch": 13.582407617320335, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 7489 + }, + { + "epoch": 13.58422126501927, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0941, + "step": 7490 + }, + { + "epoch": 13.586034912718205, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7491 + }, + { + "epoch": 13.58784856041714, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0937, + "step": 7492 + }, + { + "epoch": 13.589662208116074, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 7493 + }, + { + "epoch": 13.591475855815007, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 7494 + }, + { + "epoch": 13.593289503513942, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 7495 + }, + { + "epoch": 13.595103151212877, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 7496 + }, + { + "epoch": 13.596916798911812, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.2061, + "step": 7497 + }, + { + "epoch": 13.598730446610746, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 7498 + }, + { + "epoch": 13.600544094309681, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 7499 + }, + { + "epoch": 13.602357742008614, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 7500 + }, + { + "epoch": 13.604171389707549, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.1026, + "step": 7501 + }, + { + "epoch": 13.605985037406484, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0853, + "step": 7502 + }, + { + "epoch": 13.607798685105418, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0894, + "step": 7503 + }, + { + "epoch": 13.609612332804353, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 7504 + }, + { + "epoch": 13.611425980503288, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0951, + "step": 7505 + }, + { + "epoch": 13.613239628202221, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.0997, + "step": 7506 + }, + { + "epoch": 13.615053275901156, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0926, + "step": 7507 + }, + { + "epoch": 13.61686692360009, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 7508 + }, + { + "epoch": 13.618680571299025, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 7509 + }, + { + "epoch": 13.62049421899796, + "grad_norm": 0.380859375, + "learning_rate": 0.0002, + "loss": 0.1096, + "step": 7510 + }, + { + "epoch": 13.622307866696895, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.1078, + "step": 7511 + }, + { + "epoch": 13.624121514395828, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.1052, + "step": 7512 + }, + { + "epoch": 13.625935162094763, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 7513 + }, + { + "epoch": 13.627748809793697, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1208, + "step": 7514 + }, + { + "epoch": 13.629562457492632, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1355, + "step": 7515 + }, + { + "epoch": 13.631376105191567, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.1527, + "step": 7516 + }, + { + "epoch": 13.633189752890502, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.2121, + "step": 7517 + }, + { + "epoch": 13.635003400589435, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.2347, + "step": 7518 + }, + { + "epoch": 13.63681704828837, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1299, + "step": 7519 + }, + { + "epoch": 13.638630695987304, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 7520 + }, + { + "epoch": 13.640444343686239, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 7521 + }, + { + "epoch": 13.642257991385174, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1109, + "step": 7522 + }, + { + "epoch": 13.644071639084109, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1478, + "step": 7523 + }, + { + "epoch": 13.645885286783042, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 7524 + }, + { + "epoch": 13.647698934481976, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.111, + "step": 7525 + }, + { + "epoch": 13.649512582180911, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1052, + "step": 7526 + }, + { + "epoch": 13.651326229879846, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 7527 + }, + { + "epoch": 13.65313987757878, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.1151, + "step": 7528 + }, + { + "epoch": 13.654953525277715, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1181, + "step": 7529 + }, + { + "epoch": 13.65676717297665, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1136, + "step": 7530 + }, + { + "epoch": 13.658580820675583, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.102, + "step": 7531 + }, + { + "epoch": 13.660394468374518, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 7532 + }, + { + "epoch": 13.662208116073453, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 7533 + }, + { + "epoch": 13.664021763772388, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 7534 + }, + { + "epoch": 13.665835411471322, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0945, + "step": 7535 + }, + { + "epoch": 13.667649059170255, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 7536 + }, + { + "epoch": 13.66946270686919, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 7537 + }, + { + "epoch": 13.671276354568125, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 7538 + }, + { + "epoch": 13.67309000226706, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 7539 + }, + { + "epoch": 13.674903649965994, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0846, + "step": 7540 + }, + { + "epoch": 13.67671729766493, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.085, + "step": 7541 + }, + { + "epoch": 13.678530945363864, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0952, + "step": 7542 + }, + { + "epoch": 13.680344593062797, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 7543 + }, + { + "epoch": 13.682158240761732, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 7544 + }, + { + "epoch": 13.683971888460666, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 7545 + }, + { + "epoch": 13.685785536159601, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.0902, + "step": 7546 + }, + { + "epoch": 13.687599183858536, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 7547 + }, + { + "epoch": 13.689412831557469, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 7548 + }, + { + "epoch": 13.691226479256404, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 7549 + }, + { + "epoch": 13.693040126955339, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 7550 + }, + { + "epoch": 13.694853774654273, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 7551 + }, + { + "epoch": 13.696667422353208, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 7552 + }, + { + "epoch": 13.698481070052143, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.0872, + "step": 7553 + }, + { + "epoch": 13.700294717751078, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0997, + "step": 7554 + }, + { + "epoch": 13.70210836545001, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 7555 + }, + { + "epoch": 13.703922013148945, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 7556 + }, + { + "epoch": 13.70573566084788, + "grad_norm": 0.376953125, + "learning_rate": 0.0002, + "loss": 0.0922, + "step": 7557 + }, + { + "epoch": 13.707549308546815, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0919, + "step": 7558 + }, + { + "epoch": 13.70936295624575, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 7559 + }, + { + "epoch": 13.711176603944685, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1087, + "step": 7560 + }, + { + "epoch": 13.712990251643618, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.1172, + "step": 7561 + }, + { + "epoch": 13.714803899342552, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1135, + "step": 7562 + }, + { + "epoch": 13.716617547041487, + "grad_norm": 0.61328125, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 7563 + }, + { + "epoch": 13.718431194740422, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.1181, + "step": 7564 + }, + { + "epoch": 13.720244842439357, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1321, + "step": 7565 + }, + { + "epoch": 13.722058490138291, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.1429, + "step": 7566 + }, + { + "epoch": 13.723872137837224, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.2073, + "step": 7567 + }, + { + "epoch": 13.72568578553616, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.2581, + "step": 7568 + }, + { + "epoch": 13.727499433235094, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1137, + "step": 7569 + }, + { + "epoch": 13.729313080934029, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1258, + "step": 7570 + }, + { + "epoch": 13.731126728632963, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1108, + "step": 7571 + }, + { + "epoch": 13.732940376331898, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1074, + "step": 7572 + }, + { + "epoch": 13.734754024030831, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1102, + "step": 7573 + }, + { + "epoch": 13.736567671729766, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.1464, + "step": 7574 + }, + { + "epoch": 13.7383813194287, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1368, + "step": 7575 + }, + { + "epoch": 13.740194967127636, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1126, + "step": 7576 + }, + { + "epoch": 13.74200861482657, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1448, + "step": 7577 + }, + { + "epoch": 13.743822262525505, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.1057, + "step": 7578 + }, + { + "epoch": 13.745635910224438, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 7579 + }, + { + "epoch": 13.747449557923373, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1063, + "step": 7580 + }, + { + "epoch": 13.749263205622308, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 7581 + }, + { + "epoch": 13.751076853321242, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0983, + "step": 7582 + }, + { + "epoch": 13.752890501020177, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 7583 + }, + { + "epoch": 13.754704148719112, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0973, + "step": 7584 + }, + { + "epoch": 13.756517796418045, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0949, + "step": 7585 + }, + { + "epoch": 13.75833144411698, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1007, + "step": 7586 + }, + { + "epoch": 13.760145091815914, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 7587 + }, + { + "epoch": 13.76195873951485, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0908, + "step": 7588 + }, + { + "epoch": 13.763772387213784, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.09, + "step": 7589 + }, + { + "epoch": 13.765586034912719, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0921, + "step": 7590 + }, + { + "epoch": 13.767399682611654, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 7591 + }, + { + "epoch": 13.769213330310587, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 7592 + }, + { + "epoch": 13.771026978009521, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.1163, + "step": 7593 + }, + { + "epoch": 13.772840625708456, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 7594 + }, + { + "epoch": 13.774654273407391, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0909, + "step": 7595 + }, + { + "epoch": 13.776467921106326, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 7596 + }, + { + "epoch": 13.778281568805259, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1012, + "step": 7597 + }, + { + "epoch": 13.780095216504193, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 7598 + }, + { + "epoch": 13.781908864203128, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 7599 + }, + { + "epoch": 13.783722511902063, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0937, + "step": 7600 + }, + { + "epoch": 13.785536159600998, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 7601 + }, + { + "epoch": 13.787349807299933, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 7602 + }, + { + "epoch": 13.789163454998867, + "grad_norm": 0.357421875, + "learning_rate": 0.0002, + "loss": 0.096, + "step": 7603 + }, + { + "epoch": 13.7909771026978, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 7604 + }, + { + "epoch": 13.792790750396735, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 7605 + }, + { + "epoch": 13.79460439809567, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.0951, + "step": 7606 + }, + { + "epoch": 13.796418045794605, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1032, + "step": 7607 + }, + { + "epoch": 13.79823169349354, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 7608 + }, + { + "epoch": 13.800045341192474, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 7609 + }, + { + "epoch": 13.801858988891407, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0965, + "step": 7610 + }, + { + "epoch": 13.803672636590342, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 7611 + }, + { + "epoch": 13.805486284289277, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 7612 + }, + { + "epoch": 13.807299931988211, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1192, + "step": 7613 + }, + { + "epoch": 13.809113579687146, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1197, + "step": 7614 + }, + { + "epoch": 13.810927227386081, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1764, + "step": 7615 + }, + { + "epoch": 13.812740875085014, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.1275, + "step": 7616 + }, + { + "epoch": 13.814554522783949, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1993, + "step": 7617 + }, + { + "epoch": 13.816368170482884, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2533, + "step": 7618 + }, + { + "epoch": 13.818181818181818, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1455, + "step": 7619 + }, + { + "epoch": 13.819995465880753, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1288, + "step": 7620 + }, + { + "epoch": 13.821809113579688, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1176, + "step": 7621 + }, + { + "epoch": 13.82362276127862, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1874, + "step": 7622 + }, + { + "epoch": 13.825436408977556, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.105, + "step": 7623 + }, + { + "epoch": 13.82725005667649, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0911, + "step": 7624 + }, + { + "epoch": 13.829063704375425, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1077, + "step": 7625 + }, + { + "epoch": 13.83087735207436, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.107, + "step": 7626 + }, + { + "epoch": 13.832690999773295, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1191, + "step": 7627 + }, + { + "epoch": 13.834504647472228, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1143, + "step": 7628 + }, + { + "epoch": 13.836318295171163, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1155, + "step": 7629 + }, + { + "epoch": 13.838131942870097, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 7630 + }, + { + "epoch": 13.839945590569032, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 7631 + }, + { + "epoch": 13.841759238267967, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 7632 + }, + { + "epoch": 13.843572885966902, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0904, + "step": 7633 + }, + { + "epoch": 13.845386533665835, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 7634 + }, + { + "epoch": 13.84720018136477, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 7635 + }, + { + "epoch": 13.849013829063704, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 7636 + }, + { + "epoch": 13.850827476762639, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 7637 + }, + { + "epoch": 13.852641124461574, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.103, + "step": 7638 + }, + { + "epoch": 13.854454772160508, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 7639 + }, + { + "epoch": 13.856268419859443, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 7640 + }, + { + "epoch": 13.858082067558376, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 7641 + }, + { + "epoch": 13.859895715257311, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.0967, + "step": 7642 + }, + { + "epoch": 13.861709362956246, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 7643 + }, + { + "epoch": 13.86352301065518, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 7644 + }, + { + "epoch": 13.865336658354115, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 7645 + }, + { + "epoch": 13.867150306053048, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 7646 + }, + { + "epoch": 13.868963953751983, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 7647 + }, + { + "epoch": 13.870777601450918, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 7648 + }, + { + "epoch": 13.872591249149853, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 7649 + }, + { + "epoch": 13.874404896848787, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 7650 + }, + { + "epoch": 13.876218544547722, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 7651 + }, + { + "epoch": 13.878032192246657, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 7652 + }, + { + "epoch": 13.87984583994559, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 7653 + }, + { + "epoch": 13.881659487644525, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0952, + "step": 7654 + }, + { + "epoch": 13.88347313534346, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 7655 + }, + { + "epoch": 13.885286783042394, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.1028, + "step": 7656 + }, + { + "epoch": 13.887100430741329, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 7657 + }, + { + "epoch": 13.888914078440262, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0967, + "step": 7658 + }, + { + "epoch": 13.890727726139197, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0965, + "step": 7659 + }, + { + "epoch": 13.892541373838132, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1077, + "step": 7660 + }, + { + "epoch": 13.894355021537066, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.1007, + "step": 7661 + }, + { + "epoch": 13.896168669236001, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.1019, + "step": 7662 + }, + { + "epoch": 13.897982316934936, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.11, + "step": 7663 + }, + { + "epoch": 13.89979596463387, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1162, + "step": 7664 + }, + { + "epoch": 13.901609612332804, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1389, + "step": 7665 + }, + { + "epoch": 13.903423260031738, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1573, + "step": 7666 + }, + { + "epoch": 13.905236907730673, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.1729, + "step": 7667 + }, + { + "epoch": 13.905236907730673, + "eval_loss": 2.395604133605957, + "eval_runtime": 154.5695, + "eval_samples_per_second": 6.47, + "eval_steps_per_second": 6.47, + "step": 7667 + }, + { + "epoch": 13.905236907730673, + "mmlu_eval_accuracy": 0.2974309684393886, + "mmlu_eval_accuracy_abstract_algebra": 0.5454545454545454, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.36, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.47674418604651164, + "mmlu_eval_accuracy_moral_disputes": 0.2631578947368421, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.01518896157877, + "step": 7667 + }, + { + "epoch": 13.907050555429608, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.3628, + "step": 7668 + }, + { + "epoch": 13.908864203128543, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1646, + "step": 7669 + }, + { + "epoch": 13.910677850827478, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1605, + "step": 7670 + }, + { + "epoch": 13.91249149852641, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1341, + "step": 7671 + }, + { + "epoch": 13.914305146225345, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1279, + "step": 7672 + }, + { + "epoch": 13.91611879392428, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1352, + "step": 7673 + }, + { + "epoch": 13.917932441623215, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 7674 + }, + { + "epoch": 13.91974608932215, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 7675 + }, + { + "epoch": 13.921559737021084, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 7676 + }, + { + "epoch": 13.923373384720017, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 7677 + }, + { + "epoch": 13.925187032418952, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1196, + "step": 7678 + }, + { + "epoch": 13.927000680117887, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1037, + "step": 7679 + }, + { + "epoch": 13.928814327816822, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1392, + "step": 7680 + }, + { + "epoch": 13.930627975515756, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 7681 + }, + { + "epoch": 13.932441623214691, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1259, + "step": 7682 + }, + { + "epoch": 13.934255270913624, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 7683 + }, + { + "epoch": 13.936068918612559, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 7684 + }, + { + "epoch": 13.937882566311494, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 7685 + }, + { + "epoch": 13.939696214010429, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 7686 + }, + { + "epoch": 13.941509861709363, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 7687 + }, + { + "epoch": 13.943323509408298, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0897, + "step": 7688 + }, + { + "epoch": 13.945137157107231, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 7689 + }, + { + "epoch": 13.946950804806166, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0846, + "step": 7690 + }, + { + "epoch": 13.9487644525051, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 7691 + }, + { + "epoch": 13.950578100204035, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.1145, + "step": 7692 + }, + { + "epoch": 13.95239174790297, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0926, + "step": 7693 + }, + { + "epoch": 13.954205395601905, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 7694 + }, + { + "epoch": 13.956019043300838, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 7695 + }, + { + "epoch": 13.957832690999773, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1079, + "step": 7696 + }, + { + "epoch": 13.959646338698708, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 7697 + }, + { + "epoch": 13.961459986397642, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0953, + "step": 7698 + }, + { + "epoch": 13.963273634096577, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 7699 + }, + { + "epoch": 13.965087281795512, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 7700 + }, + { + "epoch": 13.966900929494447, + "grad_norm": 0.4296875, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 7701 + }, + { + "epoch": 13.96871457719338, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 7702 + }, + { + "epoch": 13.970528224892314, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 7703 + }, + { + "epoch": 13.97234187259125, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0978, + "step": 7704 + }, + { + "epoch": 13.974155520290184, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.0921, + "step": 7705 + }, + { + "epoch": 13.975969167989119, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 7706 + }, + { + "epoch": 13.977782815688052, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.0902, + "step": 7707 + }, + { + "epoch": 13.979596463386986, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 7708 + }, + { + "epoch": 13.981410111085921, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 7709 + }, + { + "epoch": 13.983223758784856, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1026, + "step": 7710 + }, + { + "epoch": 13.98503740648379, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1042, + "step": 7711 + }, + { + "epoch": 13.986851054182726, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1322, + "step": 7712 + }, + { + "epoch": 13.98866470188166, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1068, + "step": 7713 + }, + { + "epoch": 13.990478349580593, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1235, + "step": 7714 + }, + { + "epoch": 13.992291997279528, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1292, + "step": 7715 + }, + { + "epoch": 13.994105644978463, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1554, + "step": 7716 + }, + { + "epoch": 13.995919292677398, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.1573, + "step": 7717 + }, + { + "epoch": 13.997732940376332, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.2422, + "step": 7718 + }, + { + "epoch": 13.999546588075267, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1087, + "step": 7719 + }, + { + "epoch": 14.0013602357742, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.1139, + "step": 7720 + }, + { + "epoch": 14.003173883473135, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0925, + "step": 7721 + }, + { + "epoch": 14.00498753117207, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1236, + "step": 7722 + }, + { + "epoch": 14.006801178871005, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 7723 + }, + { + "epoch": 14.00861482656994, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0988, + "step": 7724 + }, + { + "epoch": 14.010428474268874, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 7725 + }, + { + "epoch": 14.012242121967807, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 7726 + }, + { + "epoch": 14.014055769666742, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 7727 + }, + { + "epoch": 14.015869417365677, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 7728 + }, + { + "epoch": 14.017683065064611, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 7729 + }, + { + "epoch": 14.019496712763546, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 7730 + }, + { + "epoch": 14.021310360462481, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 7731 + }, + { + "epoch": 14.023124008161414, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.109, + "step": 7732 + }, + { + "epoch": 14.024937655860349, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 7733 + }, + { + "epoch": 14.026751303559283, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 7734 + }, + { + "epoch": 14.028564951258218, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 7735 + }, + { + "epoch": 14.030378598957153, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.0547, + "step": 7736 + }, + { + "epoch": 14.032192246656088, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 7737 + }, + { + "epoch": 14.03400589435502, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 7738 + }, + { + "epoch": 14.035819542053956, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 7739 + }, + { + "epoch": 14.03763318975289, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0574, + "step": 7740 + }, + { + "epoch": 14.039446837451825, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 7741 + }, + { + "epoch": 14.04126048515076, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 7742 + }, + { + "epoch": 14.043074132849695, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 7743 + }, + { + "epoch": 14.044887780548628, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 7744 + }, + { + "epoch": 14.046701428247562, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 7745 + }, + { + "epoch": 14.048515075946497, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 7746 + }, + { + "epoch": 14.050328723645432, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 7747 + }, + { + "epoch": 14.052142371344367, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 7748 + }, + { + "epoch": 14.053956019043301, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 7749 + }, + { + "epoch": 14.055769666742234, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 7750 + }, + { + "epoch": 14.05758331444117, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 7751 + }, + { + "epoch": 14.059396962140104, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 7752 + }, + { + "epoch": 14.061210609839039, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 7753 + }, + { + "epoch": 14.063024257537974, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 7754 + }, + { + "epoch": 14.064837905236908, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 7755 + }, + { + "epoch": 14.066651552935841, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 7756 + }, + { + "epoch": 14.068465200634776, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 7757 + }, + { + "epoch": 14.070278848333711, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 7758 + }, + { + "epoch": 14.072092496032646, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0927, + "step": 7759 + }, + { + "epoch": 14.07390614373158, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.1021, + "step": 7760 + }, + { + "epoch": 14.075719791430515, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 7761 + }, + { + "epoch": 14.07753343912945, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 7762 + }, + { + "epoch": 14.079347086828383, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 7763 + }, + { + "epoch": 14.081160734527318, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0954, + "step": 7764 + }, + { + "epoch": 14.082974382226253, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0972, + "step": 7765 + }, + { + "epoch": 14.084788029925187, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1108, + "step": 7766 + }, + { + "epoch": 14.086601677624122, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.1112, + "step": 7767 + }, + { + "epoch": 14.088415325323057, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1347, + "step": 7768 + }, + { + "epoch": 14.09022897302199, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.1808, + "step": 7769 + }, + { + "epoch": 14.092042620720925, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1807, + "step": 7770 + }, + { + "epoch": 14.09385626841986, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 7771 + }, + { + "epoch": 14.095669916118794, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 7772 + }, + { + "epoch": 14.097483563817729, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 7773 + }, + { + "epoch": 14.099297211516664, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 7774 + }, + { + "epoch": 14.101110859215597, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 7775 + }, + { + "epoch": 14.102924506914531, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0892, + "step": 7776 + }, + { + "epoch": 14.104738154613466, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1043, + "step": 7777 + }, + { + "epoch": 14.106551802312401, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0909, + "step": 7778 + }, + { + "epoch": 14.108365450011336, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 7779 + }, + { + "epoch": 14.11017909771027, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 7780 + }, + { + "epoch": 14.111992745409204, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 7781 + }, + { + "epoch": 14.113806393108138, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 7782 + }, + { + "epoch": 14.115620040807073, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 7783 + }, + { + "epoch": 14.117433688506008, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 7784 + }, + { + "epoch": 14.119247336204943, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 7785 + }, + { + "epoch": 14.121060983903877, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 7786 + }, + { + "epoch": 14.12287463160281, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 7787 + }, + { + "epoch": 14.124688279301745, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 7788 + }, + { + "epoch": 14.12650192700068, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 7789 + }, + { + "epoch": 14.128315574699615, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 7790 + }, + { + "epoch": 14.13012922239855, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 7791 + }, + { + "epoch": 14.131942870097484, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 7792 + }, + { + "epoch": 14.133756517796417, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0612, + "step": 7793 + }, + { + "epoch": 14.135570165495352, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0623, + "step": 7794 + }, + { + "epoch": 14.137383813194287, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 7795 + }, + { + "epoch": 14.139197460893222, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 7796 + }, + { + "epoch": 14.141011108592156, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 7797 + }, + { + "epoch": 14.142824756291091, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 7798 + }, + { + "epoch": 14.144638403990024, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 7799 + }, + { + "epoch": 14.146452051688959, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 7800 + }, + { + "epoch": 14.148265699387894, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 7801 + }, + { + "epoch": 14.150079347086828, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 7802 + }, + { + "epoch": 14.151892994785763, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.0892, + "step": 7803 + }, + { + "epoch": 14.153706642484698, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 7804 + }, + { + "epoch": 14.155520290183631, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 7805 + }, + { + "epoch": 14.157333937882566, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 7806 + }, + { + "epoch": 14.1591475855815, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 7807 + }, + { + "epoch": 14.160961233280435, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0931, + "step": 7808 + }, + { + "epoch": 14.16277488097937, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 7809 + }, + { + "epoch": 14.164588528678305, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.0861, + "step": 7810 + }, + { + "epoch": 14.166402176377238, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 7811 + }, + { + "epoch": 14.168215824076173, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0883, + "step": 7812 + }, + { + "epoch": 14.170029471775107, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0973, + "step": 7813 + }, + { + "epoch": 14.171843119474042, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1035, + "step": 7814 + }, + { + "epoch": 14.173656767172977, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.1141, + "step": 7815 + }, + { + "epoch": 14.175470414871912, + "grad_norm": 0.55859375, + "learning_rate": 0.0002, + "loss": 0.1213, + "step": 7816 + }, + { + "epoch": 14.177284062570845, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1132, + "step": 7817 + }, + { + "epoch": 14.17909771026978, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1747, + "step": 7818 + }, + { + "epoch": 14.180911357968714, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.1895, + "step": 7819 + }, + { + "epoch": 14.182725005667649, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 7820 + }, + { + "epoch": 14.184538653366584, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 7821 + }, + { + "epoch": 14.186352301065519, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1118, + "step": 7822 + }, + { + "epoch": 14.188165948764453, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 7823 + }, + { + "epoch": 14.189979596463386, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 7824 + }, + { + "epoch": 14.191793244162321, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1067, + "step": 7825 + }, + { + "epoch": 14.193606891861256, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 7826 + }, + { + "epoch": 14.19542053956019, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0872, + "step": 7827 + }, + { + "epoch": 14.197234187259125, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 7828 + }, + { + "epoch": 14.19904783495806, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 7829 + }, + { + "epoch": 14.200861482656993, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 7830 + }, + { + "epoch": 14.202675130355928, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 7831 + }, + { + "epoch": 14.204488778054863, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 7832 + }, + { + "epoch": 14.206302425753798, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 7833 + }, + { + "epoch": 14.208116073452732, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 7834 + }, + { + "epoch": 14.209929721151667, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 7835 + }, + { + "epoch": 14.2117433688506, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 7836 + }, + { + "epoch": 14.213557016549535, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 7837 + }, + { + "epoch": 14.21537066424847, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 7838 + }, + { + "epoch": 14.217184311947404, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 7839 + }, + { + "epoch": 14.21899795964634, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0817, + "step": 7840 + }, + { + "epoch": 14.220811607345274, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 7841 + }, + { + "epoch": 14.222625255044207, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 7842 + }, + { + "epoch": 14.224438902743142, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 7843 + }, + { + "epoch": 14.226252550442076, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 7844 + }, + { + "epoch": 14.228066198141011, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 7845 + }, + { + "epoch": 14.229879845839946, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 7846 + }, + { + "epoch": 14.23169349353888, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 7847 + }, + { + "epoch": 14.233507141237814, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 7848 + }, + { + "epoch": 14.235320788936749, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 7849 + }, + { + "epoch": 14.237134436635683, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 7850 + }, + { + "epoch": 14.238948084334618, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 7851 + }, + { + "epoch": 14.240761732033553, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 7852 + }, + { + "epoch": 14.242575379732488, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 7853 + }, + { + "epoch": 14.24438902743142, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 7854 + }, + { + "epoch": 14.24438902743142, + "eval_loss": 2.4934141635894775, + "eval_runtime": 152.4581, + "eval_samples_per_second": 6.559, + "eval_steps_per_second": 6.559, + "step": 7854 + }, + { + "epoch": 14.24438902743142, + "mmlu_eval_accuracy": 0.2930142030058774, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.36, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.4883720930232558, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3888888888888889, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.484776864540024, + "step": 7854 + }, + { + "epoch": 14.246202675130355, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 7855 + }, + { + "epoch": 14.24801632282929, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 7856 + }, + { + "epoch": 14.249829970528225, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 7857 + }, + { + "epoch": 14.25164361822716, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 7858 + }, + { + "epoch": 14.253457265926095, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 7859 + }, + { + "epoch": 14.255270913625028, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0903, + "step": 7860 + }, + { + "epoch": 14.257084561323962, + "grad_norm": 0.427734375, + "learning_rate": 0.0002, + "loss": 0.0885, + "step": 7861 + }, + { + "epoch": 14.258898209022897, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.0949, + "step": 7862 + }, + { + "epoch": 14.260711856721832, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 7863 + }, + { + "epoch": 14.262525504420767, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.0962, + "step": 7864 + }, + { + "epoch": 14.264339152119701, + "grad_norm": 0.404296875, + "learning_rate": 0.0002, + "loss": 0.1199, + "step": 7865 + }, + { + "epoch": 14.266152799818634, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1228, + "step": 7866 + }, + { + "epoch": 14.26796644751757, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 7867 + }, + { + "epoch": 14.269780095216504, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.1408, + "step": 7868 + }, + { + "epoch": 14.271593742915439, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.1956, + "step": 7869 + }, + { + "epoch": 14.273407390614373, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.1538, + "step": 7870 + }, + { + "epoch": 14.275221038313308, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0828, + "step": 7871 + }, + { + "epoch": 14.277034686012243, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 7872 + }, + { + "epoch": 14.278848333711176, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1009, + "step": 7873 + }, + { + "epoch": 14.28066198141011, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 7874 + }, + { + "epoch": 14.282475629109046, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 7875 + }, + { + "epoch": 14.28428927680798, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1034, + "step": 7876 + }, + { + "epoch": 14.286102924506915, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0963, + "step": 7877 + }, + { + "epoch": 14.287916572205848, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0937, + "step": 7878 + }, + { + "epoch": 14.289730219904783, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.1027, + "step": 7879 + }, + { + "epoch": 14.291543867603718, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 7880 + }, + { + "epoch": 14.293357515302652, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 7881 + }, + { + "epoch": 14.295171163001587, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 7882 + }, + { + "epoch": 14.296984810700522, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 7883 + }, + { + "epoch": 14.298798458399457, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.1169, + "step": 7884 + }, + { + "epoch": 14.30061210609839, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 7885 + }, + { + "epoch": 14.302425753797325, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 7886 + }, + { + "epoch": 14.30423940149626, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 7887 + }, + { + "epoch": 14.306053049195194, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 7888 + }, + { + "epoch": 14.307866696894129, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 7889 + }, + { + "epoch": 14.309680344593064, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 7890 + }, + { + "epoch": 14.311493992291997, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 7891 + }, + { + "epoch": 14.313307639990931, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 7892 + }, + { + "epoch": 14.315121287689866, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 7893 + }, + { + "epoch": 14.316934935388801, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 7894 + }, + { + "epoch": 14.318748583087736, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 7895 + }, + { + "epoch": 14.32056223078667, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 7896 + }, + { + "epoch": 14.322375878485603, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 7897 + }, + { + "epoch": 14.324189526184538, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 7898 + }, + { + "epoch": 14.326003173883473, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 7899 + }, + { + "epoch": 14.327816821582408, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 7900 + }, + { + "epoch": 14.329630469281343, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 7901 + }, + { + "epoch": 14.331444116980277, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 7902 + }, + { + "epoch": 14.33325776467921, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 7903 + }, + { + "epoch": 14.335071412378145, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 7904 + }, + { + "epoch": 14.33688506007708, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 7905 + }, + { + "epoch": 14.338698707776015, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 7906 + }, + { + "epoch": 14.34051235547495, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 7907 + }, + { + "epoch": 14.342326003173884, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 7908 + }, + { + "epoch": 14.344139650872817, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 7909 + }, + { + "epoch": 14.345953298571752, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 7910 + }, + { + "epoch": 14.347766946270687, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1009, + "step": 7911 + }, + { + "epoch": 14.349580593969621, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 7912 + }, + { + "epoch": 14.351394241668556, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 7913 + }, + { + "epoch": 14.353207889367491, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.1047, + "step": 7914 + }, + { + "epoch": 14.355021537066424, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.1014, + "step": 7915 + }, + { + "epoch": 14.356835184765359, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1271, + "step": 7916 + }, + { + "epoch": 14.358648832464294, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.1252, + "step": 7917 + }, + { + "epoch": 14.360462480163228, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.1491, + "step": 7918 + }, + { + "epoch": 14.362276127862163, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.194, + "step": 7919 + }, + { + "epoch": 14.364089775561098, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1503, + "step": 7920 + }, + { + "epoch": 14.365903423260031, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 7921 + }, + { + "epoch": 14.367717070958966, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 7922 + }, + { + "epoch": 14.3695307186579, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 7923 + }, + { + "epoch": 14.371344366356835, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1226, + "step": 7924 + }, + { + "epoch": 14.37315801405577, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.092, + "step": 7925 + }, + { + "epoch": 14.374971661754705, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.1009, + "step": 7926 + }, + { + "epoch": 14.376785309453638, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 7927 + }, + { + "epoch": 14.378598957152573, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 7928 + }, + { + "epoch": 14.380412604851507, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 7929 + }, + { + "epoch": 14.382226252550442, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 7930 + }, + { + "epoch": 14.384039900249377, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 7931 + }, + { + "epoch": 14.385853547948312, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 7932 + }, + { + "epoch": 14.387667195647246, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 7933 + }, + { + "epoch": 14.38948084334618, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 7934 + }, + { + "epoch": 14.391294491045114, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 7935 + }, + { + "epoch": 14.393108138744049, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 7936 + }, + { + "epoch": 14.394921786442984, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 7937 + }, + { + "epoch": 14.396735434141918, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 7938 + }, + { + "epoch": 14.398549081840853, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 7939 + }, + { + "epoch": 14.400362729539786, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 7940 + }, + { + "epoch": 14.402176377238721, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 7941 + }, + { + "epoch": 14.403990024937656, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 7942 + }, + { + "epoch": 14.40580367263659, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 7943 + }, + { + "epoch": 14.407617320335525, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 7944 + }, + { + "epoch": 14.40943096803446, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 7945 + }, + { + "epoch": 14.411244615733393, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 7946 + }, + { + "epoch": 14.413058263432328, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1039, + "step": 7947 + }, + { + "epoch": 14.414871911131263, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 7948 + }, + { + "epoch": 14.416685558830197, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 7949 + }, + { + "epoch": 14.418499206529132, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 7950 + }, + { + "epoch": 14.420312854228067, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 7951 + }, + { + "epoch": 14.422126501927, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 7952 + }, + { + "epoch": 14.423940149625935, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 7953 + }, + { + "epoch": 14.42575379732487, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 7954 + }, + { + "epoch": 14.427567445023804, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 7955 + }, + { + "epoch": 14.429381092722739, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 7956 + }, + { + "epoch": 14.431194740421674, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 7957 + }, + { + "epoch": 14.433008388120607, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 7958 + }, + { + "epoch": 14.434822035819542, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 7959 + }, + { + "epoch": 14.436635683518476, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 7960 + }, + { + "epoch": 14.438449331217411, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 7961 + }, + { + "epoch": 14.440262978916346, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 7962 + }, + { + "epoch": 14.44207662661528, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0921, + "step": 7963 + }, + { + "epoch": 14.443890274314214, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 7964 + }, + { + "epoch": 14.445703922013148, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.1099, + "step": 7965 + }, + { + "epoch": 14.447517569712083, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.1238, + "step": 7966 + }, + { + "epoch": 14.449331217411018, + "grad_norm": 0.427734375, + "learning_rate": 0.0002, + "loss": 0.1603, + "step": 7967 + }, + { + "epoch": 14.451144865109953, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.163, + "step": 7968 + }, + { + "epoch": 14.452958512808888, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.1967, + "step": 7969 + }, + { + "epoch": 14.45477216050782, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.189, + "step": 7970 + }, + { + "epoch": 14.456585808206755, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 7971 + }, + { + "epoch": 14.45839945590569, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 7972 + }, + { + "epoch": 14.460213103604625, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 7973 + }, + { + "epoch": 14.46202675130356, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 7974 + }, + { + "epoch": 14.463840399002494, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 7975 + }, + { + "epoch": 14.465654046701427, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0982, + "step": 7976 + }, + { + "epoch": 14.467467694400362, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1009, + "step": 7977 + }, + { + "epoch": 14.469281342099297, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0949, + "step": 7978 + }, + { + "epoch": 14.471094989798232, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 7979 + }, + { + "epoch": 14.472908637497166, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 7980 + }, + { + "epoch": 14.474722285196101, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 7981 + }, + { + "epoch": 14.476535932895036, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 7982 + }, + { + "epoch": 14.478349580593969, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 7983 + }, + { + "epoch": 14.480163228292904, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 7984 + }, + { + "epoch": 14.481976875991839, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 7985 + }, + { + "epoch": 14.483790523690773, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 7986 + }, + { + "epoch": 14.485604171389708, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 7987 + }, + { + "epoch": 14.487417819088641, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 7988 + }, + { + "epoch": 14.489231466787576, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 7989 + }, + { + "epoch": 14.49104511448651, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 7990 + }, + { + "epoch": 14.492858762185445, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 7991 + }, + { + "epoch": 14.49467240988438, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 7992 + }, + { + "epoch": 14.496486057583315, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0872, + "step": 7993 + }, + { + "epoch": 14.49829970528225, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 7994 + }, + { + "epoch": 14.500113352981183, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 7995 + }, + { + "epoch": 14.501927000680118, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 7996 + }, + { + "epoch": 14.503740648379052, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 7997 + }, + { + "epoch": 14.505554296077987, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 7998 + }, + { + "epoch": 14.507367943776922, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1525, + "step": 7999 + }, + { + "epoch": 14.509181591475857, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 8000 + }, + { + "epoch": 14.51099523917479, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 8001 + }, + { + "epoch": 14.512808886873724, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 8002 + }, + { + "epoch": 14.51462253457266, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 8003 + }, + { + "epoch": 14.516436182271594, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 8004 + }, + { + "epoch": 14.518249829970529, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 8005 + }, + { + "epoch": 14.520063477669463, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 8006 + }, + { + "epoch": 14.521877125368396, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 8007 + }, + { + "epoch": 14.523690773067331, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 8008 + }, + { + "epoch": 14.525504420766266, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 8009 + }, + { + "epoch": 14.5273180684652, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0933, + "step": 8010 + }, + { + "epoch": 14.529131716164136, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0897, + "step": 8011 + }, + { + "epoch": 14.53094536386307, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.1111, + "step": 8012 + }, + { + "epoch": 14.532759011562003, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1071, + "step": 8013 + }, + { + "epoch": 14.534572659260938, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1186, + "step": 8014 + }, + { + "epoch": 14.536386306959873, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.123, + "step": 8015 + }, + { + "epoch": 14.538199954658808, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 8016 + }, + { + "epoch": 14.540013602357742, + "grad_norm": 0.38671875, + "learning_rate": 0.0002, + "loss": 0.1596, + "step": 8017 + }, + { + "epoch": 14.541827250056677, + "grad_norm": 0.423828125, + "learning_rate": 0.0002, + "loss": 0.1823, + "step": 8018 + }, + { + "epoch": 14.54364089775561, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.1711, + "step": 8019 + }, + { + "epoch": 14.545454545454545, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.1262, + "step": 8020 + }, + { + "epoch": 14.54726819315348, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 8021 + }, + { + "epoch": 14.549081840852415, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1219, + "step": 8022 + }, + { + "epoch": 14.55089548855135, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0908, + "step": 8023 + }, + { + "epoch": 14.552709136250284, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0911, + "step": 8024 + }, + { + "epoch": 14.554522783949217, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 8025 + }, + { + "epoch": 14.556336431648152, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 8026 + }, + { + "epoch": 14.558150079347087, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1035, + "step": 8027 + }, + { + "epoch": 14.559963727046021, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 8028 + }, + { + "epoch": 14.561777374744956, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 8029 + }, + { + "epoch": 14.563591022443891, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1014, + "step": 8030 + }, + { + "epoch": 14.565404670142826, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 8031 + }, + { + "epoch": 14.567218317841759, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 8032 + }, + { + "epoch": 14.569031965540693, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 8033 + }, + { + "epoch": 14.570845613239628, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 8034 + }, + { + "epoch": 14.572659260938563, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 8035 + }, + { + "epoch": 14.574472908637498, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 8036 + }, + { + "epoch": 14.57628655633643, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 8037 + }, + { + "epoch": 14.578100204035366, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 8038 + }, + { + "epoch": 14.5799138517343, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 8039 + }, + { + "epoch": 14.581727499433235, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 8040 + }, + { + "epoch": 14.58354114713217, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 8041 + }, + { + "epoch": 14.58354114713217, + "eval_loss": 2.5021004676818848, + "eval_runtime": 152.6419, + "eval_samples_per_second": 6.551, + "eval_steps_per_second": 6.551, + "step": 8041 + }, + { + "epoch": 14.58354114713217, + "mmlu_eval_accuracy": 0.29720390243045675, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.12903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.1896909020797324, + "step": 8041 + }, + { + "epoch": 14.585354794831105, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0684, + "step": 8042 + }, + { + "epoch": 14.58716844253004, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 8043 + }, + { + "epoch": 14.588982090228972, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 8044 + }, + { + "epoch": 14.590795737927907, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 8045 + }, + { + "epoch": 14.592609385626842, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 8046 + }, + { + "epoch": 14.594423033325777, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 8047 + }, + { + "epoch": 14.596236681024712, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 8048 + }, + { + "epoch": 14.598050328723644, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 8049 + }, + { + "epoch": 14.59986397642258, + "grad_norm": 0.451171875, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 8050 + }, + { + "epoch": 14.601677624121514, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 8051 + }, + { + "epoch": 14.603491271820449, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 8052 + }, + { + "epoch": 14.605304919519384, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 8053 + }, + { + "epoch": 14.607118567218318, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 8054 + }, + { + "epoch": 14.608932214917253, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 8055 + }, + { + "epoch": 14.610745862616186, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 8056 + }, + { + "epoch": 14.612559510315121, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 8057 + }, + { + "epoch": 14.614373158014056, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 8058 + }, + { + "epoch": 14.61618680571299, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 8059 + }, + { + "epoch": 14.618000453411925, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 8060 + }, + { + "epoch": 14.61981410111086, + "grad_norm": 0.39453125, + "learning_rate": 0.0002, + "loss": 0.1069, + "step": 8061 + }, + { + "epoch": 14.621627748809793, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1011, + "step": 8062 + }, + { + "epoch": 14.623441396508728, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1045, + "step": 8063 + }, + { + "epoch": 14.625255044207663, + "grad_norm": 0.4765625, + "learning_rate": 0.0002, + "loss": 0.1276, + "step": 8064 + }, + { + "epoch": 14.627068691906597, + "grad_norm": 0.54296875, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 8065 + }, + { + "epoch": 14.628882339605532, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 8066 + }, + { + "epoch": 14.630695987304467, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.1352, + "step": 8067 + }, + { + "epoch": 14.6325096350034, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1364, + "step": 8068 + }, + { + "epoch": 14.634323282702335, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1934, + "step": 8069 + }, + { + "epoch": 14.63613693040127, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1371, + "step": 8070 + }, + { + "epoch": 14.637950578100204, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 8071 + }, + { + "epoch": 14.639764225799139, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.1013, + "step": 8072 + }, + { + "epoch": 14.641577873498074, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.112, + "step": 8073 + }, + { + "epoch": 14.643391521197007, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 8074 + }, + { + "epoch": 14.645205168895941, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0988, + "step": 8075 + }, + { + "epoch": 14.647018816594876, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 8076 + }, + { + "epoch": 14.648832464293811, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 8077 + }, + { + "epoch": 14.650646111992746, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0885, + "step": 8078 + }, + { + "epoch": 14.65245975969168, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 8079 + }, + { + "epoch": 14.654273407390614, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 8080 + }, + { + "epoch": 14.656087055089548, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 8081 + }, + { + "epoch": 14.657900702788483, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 8082 + }, + { + "epoch": 14.659714350487418, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 8083 + }, + { + "epoch": 14.661527998186353, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 8084 + }, + { + "epoch": 14.663341645885287, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 8085 + }, + { + "epoch": 14.66515529358422, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 8086 + }, + { + "epoch": 14.666968941283155, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 8087 + }, + { + "epoch": 14.66878258898209, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 8088 + }, + { + "epoch": 14.670596236681025, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 8089 + }, + { + "epoch": 14.67240988437996, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 8090 + }, + { + "epoch": 14.674223532078894, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 8091 + }, + { + "epoch": 14.676037179777829, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 8092 + }, + { + "epoch": 14.677850827476762, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 8093 + }, + { + "epoch": 14.679664475175697, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 8094 + }, + { + "epoch": 14.681478122874632, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 8095 + }, + { + "epoch": 14.683291770573566, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 8096 + }, + { + "epoch": 14.685105418272501, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 8097 + }, + { + "epoch": 14.686919065971434, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 8098 + }, + { + "epoch": 14.688732713670369, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 8099 + }, + { + "epoch": 14.690546361369304, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 8100 + }, + { + "epoch": 14.692360009068238, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 8101 + }, + { + "epoch": 14.694173656767173, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 8102 + }, + { + "epoch": 14.695987304466108, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 8103 + }, + { + "epoch": 14.697800952165043, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 8104 + }, + { + "epoch": 14.699614599863976, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 8105 + }, + { + "epoch": 14.70142824756291, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.092, + "step": 8106 + }, + { + "epoch": 14.703241895261845, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 8107 + }, + { + "epoch": 14.70505554296078, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 8108 + }, + { + "epoch": 14.706869190659715, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 8109 + }, + { + "epoch": 14.70868283835865, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.1004, + "step": 8110 + }, + { + "epoch": 14.710496486057583, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0988, + "step": 8111 + }, + { + "epoch": 14.712310133756517, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1122, + "step": 8112 + }, + { + "epoch": 14.714123781455452, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.102, + "step": 8113 + }, + { + "epoch": 14.715937429154387, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1097, + "step": 8114 + }, + { + "epoch": 14.717751076853322, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1086, + "step": 8115 + }, + { + "epoch": 14.719564724552257, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1323, + "step": 8116 + }, + { + "epoch": 14.72137837225119, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1321, + "step": 8117 + }, + { + "epoch": 14.723192019950124, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.1605, + "step": 8118 + }, + { + "epoch": 14.725005667649059, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002, + "loss": 0.1708, + "step": 8119 + }, + { + "epoch": 14.726819315347994, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.3119, + "step": 8120 + }, + { + "epoch": 14.728632963046929, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1135, + "step": 8121 + }, + { + "epoch": 14.730446610745863, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.115, + "step": 8122 + }, + { + "epoch": 14.732260258444796, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0957, + "step": 8123 + }, + { + "epoch": 14.734073906143731, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1116, + "step": 8124 + }, + { + "epoch": 14.735887553842666, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 8125 + }, + { + "epoch": 14.7377012015416, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 8126 + }, + { + "epoch": 14.739514849240535, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.11, + "step": 8127 + }, + { + "epoch": 14.74132849693947, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0883, + "step": 8128 + }, + { + "epoch": 14.743142144638403, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0957, + "step": 8129 + }, + { + "epoch": 14.744955792337338, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 8130 + }, + { + "epoch": 14.746769440036273, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0955, + "step": 8131 + }, + { + "epoch": 14.748583087735208, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0861, + "step": 8132 + }, + { + "epoch": 14.750396735434142, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 8133 + }, + { + "epoch": 14.752210383133077, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0851, + "step": 8134 + }, + { + "epoch": 14.75402403083201, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 8135 + }, + { + "epoch": 14.755837678530945, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 8136 + }, + { + "epoch": 14.75765132622988, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0817, + "step": 8137 + }, + { + "epoch": 14.759464973928814, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 8138 + }, + { + "epoch": 14.76127862162775, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 8139 + }, + { + "epoch": 14.763092269326684, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 8140 + }, + { + "epoch": 14.764905917025617, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 8141 + }, + { + "epoch": 14.766719564724552, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 8142 + }, + { + "epoch": 14.768533212423486, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 8143 + }, + { + "epoch": 14.770346860122421, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 8144 + }, + { + "epoch": 14.772160507821356, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8145 + }, + { + "epoch": 14.77397415552029, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 8146 + }, + { + "epoch": 14.775787803219224, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 8147 + }, + { + "epoch": 14.777601450918159, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 8148 + }, + { + "epoch": 14.779415098617093, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 8149 + }, + { + "epoch": 14.781228746316028, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 8150 + }, + { + "epoch": 14.783042394014963, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 8151 + }, + { + "epoch": 14.784856041713898, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 8152 + }, + { + "epoch": 14.786669689412832, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 8153 + }, + { + "epoch": 14.788483337111765, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 8154 + }, + { + "epoch": 14.7902969848107, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 8155 + }, + { + "epoch": 14.792110632509635, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 8156 + }, + { + "epoch": 14.79392428020857, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0974, + "step": 8157 + }, + { + "epoch": 14.795737927907505, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1045, + "step": 8158 + }, + { + "epoch": 14.797551575606438, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 8159 + }, + { + "epoch": 14.799365223305372, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 8160 + }, + { + "epoch": 14.801178871004307, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1091, + "step": 8161 + }, + { + "epoch": 14.802992518703242, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.1009, + "step": 8162 + }, + { + "epoch": 14.804806166402177, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1071, + "step": 8163 + }, + { + "epoch": 14.806619814101111, + "grad_norm": 0.4140625, + "learning_rate": 0.0002, + "loss": 0.1189, + "step": 8164 + }, + { + "epoch": 14.808433461800046, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.1214, + "step": 8165 + }, + { + "epoch": 14.81024710949898, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1345, + "step": 8166 + }, + { + "epoch": 14.812060757197914, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1322, + "step": 8167 + }, + { + "epoch": 14.813874404896849, + "grad_norm": 0.59375, + "learning_rate": 0.0002, + "loss": 0.168, + "step": 8168 + }, + { + "epoch": 14.815688052595783, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.2239, + "step": 8169 + }, + { + "epoch": 14.817501700294718, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1479, + "step": 8170 + }, + { + "epoch": 14.819315347993653, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1094, + "step": 8171 + }, + { + "epoch": 14.821128995692586, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1278, + "step": 8172 + }, + { + "epoch": 14.82294264339152, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1038, + "step": 8173 + }, + { + "epoch": 14.824756291090456, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1034, + "step": 8174 + }, + { + "epoch": 14.82656993878939, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 8175 + }, + { + "epoch": 14.828383586488325, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 8176 + }, + { + "epoch": 14.83019723418726, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.1002, + "step": 8177 + }, + { + "epoch": 14.832010881886193, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 8178 + }, + { + "epoch": 14.833824529585128, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0952, + "step": 8179 + }, + { + "epoch": 14.835638177284062, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1125, + "step": 8180 + }, + { + "epoch": 14.837451824982997, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 8181 + }, + { + "epoch": 14.839265472681932, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 8182 + }, + { + "epoch": 14.841079120380867, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 8183 + }, + { + "epoch": 14.8428927680798, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 8184 + }, + { + "epoch": 14.844706415778735, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 8185 + }, + { + "epoch": 14.84652006347767, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0814, + "step": 8186 + }, + { + "epoch": 14.848333711176604, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 8187 + }, + { + "epoch": 14.850147358875539, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0893, + "step": 8188 + }, + { + "epoch": 14.851961006574474, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 8189 + }, + { + "epoch": 14.853774654273407, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 8190 + }, + { + "epoch": 14.855588301972341, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 8191 + }, + { + "epoch": 14.857401949671276, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 8192 + }, + { + "epoch": 14.859215597370211, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 8193 + }, + { + "epoch": 14.861029245069146, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 8194 + }, + { + "epoch": 14.86284289276808, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 8195 + }, + { + "epoch": 14.864656540467013, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 8196 + }, + { + "epoch": 14.866470188165948, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0744, + "step": 8197 + }, + { + "epoch": 14.868283835864883, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 8198 + }, + { + "epoch": 14.870097483563818, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 8199 + }, + { + "epoch": 14.871911131262753, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 8200 + }, + { + "epoch": 14.873724778961687, + "grad_norm": 0.41796875, + "learning_rate": 0.0002, + "loss": 0.0974, + "step": 8201 + }, + { + "epoch": 14.875538426660622, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 8202 + }, + { + "epoch": 14.877352074359555, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 8203 + }, + { + "epoch": 14.87916572205849, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0884, + "step": 8204 + }, + { + "epoch": 14.880979369757425, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 8205 + }, + { + "epoch": 14.88279301745636, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.0953, + "step": 8206 + }, + { + "epoch": 14.884606665155294, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 8207 + }, + { + "epoch": 14.886420312854227, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 8208 + }, + { + "epoch": 14.888233960553162, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.1341, + "step": 8209 + }, + { + "epoch": 14.890047608252097, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1135, + "step": 8210 + }, + { + "epoch": 14.891861255951031, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.1413, + "step": 8211 + }, + { + "epoch": 14.893674903649966, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0944, + "step": 8212 + }, + { + "epoch": 14.895488551348901, + "grad_norm": 0.40234375, + "learning_rate": 0.0002, + "loss": 0.1105, + "step": 8213 + }, + { + "epoch": 14.897302199047836, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1111, + "step": 8214 + }, + { + "epoch": 14.899115846746769, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.1177, + "step": 8215 + }, + { + "epoch": 14.900929494445704, + "grad_norm": 0.51171875, + "learning_rate": 0.0002, + "loss": 0.1524, + "step": 8216 + }, + { + "epoch": 14.902743142144638, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.1299, + "step": 8217 + }, + { + "epoch": 14.904556789843573, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.1376, + "step": 8218 + }, + { + "epoch": 14.906370437542508, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.2064, + "step": 8219 + }, + { + "epoch": 14.908184085241443, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1814, + "step": 8220 + }, + { + "epoch": 14.909997732940376, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.1225, + "step": 8221 + }, + { + "epoch": 14.91181138063931, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1089, + "step": 8222 + }, + { + "epoch": 14.913625028338245, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1039, + "step": 8223 + }, + { + "epoch": 14.91543867603718, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0903, + "step": 8224 + }, + { + "epoch": 14.917252323736115, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 8225 + }, + { + "epoch": 14.91906597143505, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 8226 + }, + { + "epoch": 14.920879619133983, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 8227 + }, + { + "epoch": 14.922693266832917, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.1386, + "step": 8228 + }, + { + "epoch": 14.922693266832917, + "eval_loss": 2.4269862174987793, + "eval_runtime": 150.498, + "eval_samples_per_second": 6.645, + "eval_steps_per_second": 6.645, + "step": 8228 + }, + { + "epoch": 14.922693266832917, + "mmlu_eval_accuracy": 0.3072634488495057, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.40625, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, + "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.24242424242424243, + "mmlu_eval_accuracy_philosophy": 0.38235294117647056, + "mmlu_eval_accuracy_prehistory": 0.37142857142857144, + "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, + "mmlu_eval_accuracy_professional_law": 0.25882352941176473, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.224019964162934, + "step": 8228 + }, + { + "epoch": 14.924506914531852, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.1102, + "step": 8229 + }, + { + "epoch": 14.926320562230787, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1009, + "step": 8230 + }, + { + "epoch": 14.928134209929722, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 8231 + }, + { + "epoch": 14.929947857628656, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0949, + "step": 8232 + }, + { + "epoch": 14.93176150532759, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 8233 + }, + { + "epoch": 14.933575153026524, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 8234 + }, + { + "epoch": 14.935388800725459, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.099, + "step": 8235 + }, + { + "epoch": 14.937202448424394, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 8236 + }, + { + "epoch": 14.939016096123328, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.069, + "step": 8237 + }, + { + "epoch": 14.940829743822263, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 8238 + }, + { + "epoch": 14.942643391521196, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 8239 + }, + { + "epoch": 14.944457039220131, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 8240 + }, + { + "epoch": 14.946270686919066, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 8241 + }, + { + "epoch": 14.948084334618, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 8242 + }, + { + "epoch": 14.949897982316935, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 8243 + }, + { + "epoch": 14.95171163001587, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 8244 + }, + { + "epoch": 14.953525277714803, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 8245 + }, + { + "epoch": 14.955338925413738, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 8246 + }, + { + "epoch": 14.957152573112673, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 8247 + }, + { + "epoch": 14.958966220811607, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 8248 + }, + { + "epoch": 14.960779868510542, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 8249 + }, + { + "epoch": 14.962593516209477, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 8250 + }, + { + "epoch": 14.96440716390841, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 8251 + }, + { + "epoch": 14.966220811607345, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 8252 + }, + { + "epoch": 14.96803445930628, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 8253 + }, + { + "epoch": 14.969848107005214, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 8254 + }, + { + "epoch": 14.971661754704149, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 8255 + }, + { + "epoch": 14.973475402403084, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 8256 + }, + { + "epoch": 14.975289050102017, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 8257 + }, + { + "epoch": 14.977102697800952, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 8258 + }, + { + "epoch": 14.978916345499886, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 8259 + }, + { + "epoch": 14.980729993198821, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 8260 + }, + { + "epoch": 14.982543640897756, + "grad_norm": 0.400390625, + "learning_rate": 0.0002, + "loss": 0.1083, + "step": 8261 + }, + { + "epoch": 14.98435728859669, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0987, + "step": 8262 + }, + { + "epoch": 14.986170936295625, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1053, + "step": 8263 + }, + { + "epoch": 14.987984583994558, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1091, + "step": 8264 + }, + { + "epoch": 14.989798231693493, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1117, + "step": 8265 + }, + { + "epoch": 14.991611879392428, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.1425, + "step": 8266 + }, + { + "epoch": 14.993425527091363, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1464, + "step": 8267 + }, + { + "epoch": 14.995239174790298, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1618, + "step": 8268 + }, + { + "epoch": 14.99705282248923, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.2316, + "step": 8269 + }, + { + "epoch": 14.998866470188165, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.1496, + "step": 8270 + }, + { + "epoch": 15.0006801178871, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.1075, + "step": 8271 + }, + { + "epoch": 15.002493765586035, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 8272 + }, + { + "epoch": 15.00430741328497, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 8273 + }, + { + "epoch": 15.006121060983904, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 8274 + }, + { + "epoch": 15.00793470868284, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 8275 + }, + { + "epoch": 15.009748356381772, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 8276 + }, + { + "epoch": 15.011562004080707, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 8277 + }, + { + "epoch": 15.013375651779642, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 8278 + }, + { + "epoch": 15.015189299478577, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 8279 + }, + { + "epoch": 15.017002947177511, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 8280 + }, + { + "epoch": 15.018816594876446, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 8281 + }, + { + "epoch": 15.020630242575379, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 8282 + }, + { + "epoch": 15.022443890274314, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 8283 + }, + { + "epoch": 15.024257537973249, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0963, + "step": 8284 + }, + { + "epoch": 15.026071185672183, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0588, + "step": 8285 + }, + { + "epoch": 15.027884833371118, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.0606, + "step": 8286 + }, + { + "epoch": 15.029698481070053, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002, + "loss": 0.0572, + "step": 8287 + }, + { + "epoch": 15.031512128768986, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 8288 + }, + { + "epoch": 15.03332577646792, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 8289 + }, + { + "epoch": 15.035139424166855, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0623, + "step": 8290 + }, + { + "epoch": 15.03695307186579, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 8291 + }, + { + "epoch": 15.038766719564725, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 8292 + }, + { + "epoch": 15.04058036726366, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 8293 + }, + { + "epoch": 15.042394014962593, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0546, + "step": 8294 + }, + { + "epoch": 15.044207662661528, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 8295 + }, + { + "epoch": 15.046021310360462, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 8296 + }, + { + "epoch": 15.047834958059397, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 8297 + }, + { + "epoch": 15.049648605758332, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 8298 + }, + { + "epoch": 15.051462253457267, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 8299 + }, + { + "epoch": 15.0532759011562, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 8300 + }, + { + "epoch": 15.055089548855134, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 8301 + }, + { + "epoch": 15.05690319655407, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 8302 + }, + { + "epoch": 15.058716844253004, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0616, + "step": 8303 + }, + { + "epoch": 15.060530491951939, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 8304 + }, + { + "epoch": 15.062344139650873, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0604, + "step": 8305 + }, + { + "epoch": 15.064157787349806, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0624, + "step": 8306 + }, + { + "epoch": 15.065971435048741, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 8307 + }, + { + "epoch": 15.067785082747676, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 8308 + }, + { + "epoch": 15.06959873044661, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 8309 + }, + { + "epoch": 15.071412378145546, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 8310 + }, + { + "epoch": 15.07322602584448, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 8311 + }, + { + "epoch": 15.075039673543413, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 8312 + }, + { + "epoch": 15.076853321242348, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 8313 + }, + { + "epoch": 15.078666968941283, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 8314 + }, + { + "epoch": 15.080480616640218, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 8315 + }, + { + "epoch": 15.082294264339152, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.1124, + "step": 8316 + }, + { + "epoch": 15.084107912038087, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 0.1044, + "step": 8317 + }, + { + "epoch": 15.08592155973702, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1308, + "step": 8318 + }, + { + "epoch": 15.087735207435955, + "grad_norm": 0.09765625, + "learning_rate": 0.0002, + "loss": 0.1364, + "step": 8319 + }, + { + "epoch": 15.08954885513489, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.153, + "step": 8320 + }, + { + "epoch": 15.091362502833825, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.1489, + "step": 8321 + }, + { + "epoch": 15.09317615053276, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 8322 + }, + { + "epoch": 15.094989798231694, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 8323 + }, + { + "epoch": 15.096803445930629, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 8324 + }, + { + "epoch": 15.098617093629562, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 8325 + }, + { + "epoch": 15.100430741328497, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 8326 + }, + { + "epoch": 15.102244389027431, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 8327 + }, + { + "epoch": 15.104058036726366, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 8328 + }, + { + "epoch": 15.105871684425301, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 8329 + }, + { + "epoch": 15.107685332124236, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 8330 + }, + { + "epoch": 15.109498979823169, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 8331 + }, + { + "epoch": 15.111312627522103, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 8332 + }, + { + "epoch": 15.113126275221038, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 8333 + }, + { + "epoch": 15.114939922919973, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 8334 + }, + { + "epoch": 15.116753570618908, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 8335 + }, + { + "epoch": 15.118567218317843, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0671, + "step": 8336 + }, + { + "epoch": 15.120380866016776, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0719, + "step": 8337 + }, + { + "epoch": 15.12219451371571, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002, + "loss": 0.0587, + "step": 8338 + }, + { + "epoch": 15.124008161414645, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 8339 + }, + { + "epoch": 15.12582180911358, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0595, + "step": 8340 + }, + { + "epoch": 15.127635456812515, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 8341 + }, + { + "epoch": 15.12944910451145, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.0603, + "step": 8342 + }, + { + "epoch": 15.131262752210382, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 8343 + }, + { + "epoch": 15.133076399909317, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8344 + }, + { + "epoch": 15.134890047608252, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 8345 + }, + { + "epoch": 15.136703695307187, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.0635, + "step": 8346 + }, + { + "epoch": 15.138517343006122, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 8347 + }, + { + "epoch": 15.140330990705056, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 8348 + }, + { + "epoch": 15.14214463840399, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 8349 + }, + { + "epoch": 15.143958286102924, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 8350 + }, + { + "epoch": 15.145771933801859, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0613, + "step": 8351 + }, + { + "epoch": 15.147585581500794, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 8352 + }, + { + "epoch": 15.149399229199728, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 8353 + }, + { + "epoch": 15.151212876898663, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 8354 + }, + { + "epoch": 15.153026524597596, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 8355 + }, + { + "epoch": 15.154840172296531, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0817, + "step": 8356 + }, + { + "epoch": 15.156653819995466, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 8357 + }, + { + "epoch": 15.1584674676944, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 8358 + }, + { + "epoch": 15.160281115393335, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 8359 + }, + { + "epoch": 15.16209476309227, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 8360 + }, + { + "epoch": 15.163908410791203, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 8361 + }, + { + "epoch": 15.165722058490138, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.1172, + "step": 8362 + }, + { + "epoch": 15.167535706189073, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 8363 + }, + { + "epoch": 15.169349353888007, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 8364 + }, + { + "epoch": 15.171163001586942, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.0883, + "step": 8365 + }, + { + "epoch": 15.172976649285877, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.1012, + "step": 8366 + }, + { + "epoch": 15.17479029698481, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1058, + "step": 8367 + }, + { + "epoch": 15.176603944683745, + "grad_norm": 0.099609375, + "learning_rate": 0.0002, + "loss": 0.1048, + "step": 8368 + }, + { + "epoch": 15.17841759238268, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 0.1187, + "step": 8369 + }, + { + "epoch": 15.180231240081614, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1659, + "step": 8370 + }, + { + "epoch": 15.182044887780549, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.1669, + "step": 8371 + }, + { + "epoch": 15.183858535479484, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1016, + "step": 8372 + }, + { + "epoch": 15.185672183178418, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 8373 + }, + { + "epoch": 15.187485830877351, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 8374 + }, + { + "epoch": 15.189299478576286, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 8375 + }, + { + "epoch": 15.191113126275221, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 8376 + }, + { + "epoch": 15.192926773974156, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 8377 + }, + { + "epoch": 15.19474042167309, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 8378 + }, + { + "epoch": 15.196554069372025, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0918, + "step": 8379 + }, + { + "epoch": 15.198367717070958, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 8380 + }, + { + "epoch": 15.200181364769893, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 8381 + }, + { + "epoch": 15.201995012468828, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 8382 + }, + { + "epoch": 15.203808660167763, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 8383 + }, + { + "epoch": 15.205622307866697, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 8384 + }, + { + "epoch": 15.207435955565632, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 8385 + }, + { + "epoch": 15.209249603264565, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 8386 + }, + { + "epoch": 15.2110632509635, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 8387 + }, + { + "epoch": 15.212876898662435, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 8388 + }, + { + "epoch": 15.21469054636137, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 8389 + }, + { + "epoch": 15.216504194060304, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 8390 + }, + { + "epoch": 15.218317841759239, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 8391 + }, + { + "epoch": 15.220131489458172, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 8392 + }, + { + "epoch": 15.221945137157107, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 8393 + }, + { + "epoch": 15.223758784856042, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 8394 + }, + { + "epoch": 15.225572432554976, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 8395 + }, + { + "epoch": 15.227386080253911, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 8396 + }, + { + "epoch": 15.229199727952846, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 8397 + }, + { + "epoch": 15.231013375651779, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 8398 + }, + { + "epoch": 15.232827023350714, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 8399 + }, + { + "epoch": 15.234640671049648, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 8400 + }, + { + "epoch": 15.236454318748583, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 8401 + }, + { + "epoch": 15.238267966447518, + "grad_norm": 0.13671875, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 8402 + }, + { + "epoch": 15.240081614146453, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8403 + }, + { + "epoch": 15.241895261845386, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0665, + "step": 8404 + }, + { + "epoch": 15.24370890954432, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 8405 + }, + { + "epoch": 15.245522557243255, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 8406 + }, + { + "epoch": 15.24733620494219, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 8407 + }, + { + "epoch": 15.249149852641125, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 8408 + }, + { + "epoch": 15.25096350034006, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 8409 + }, + { + "epoch": 15.252777148038993, + "grad_norm": 0.484375, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 8410 + }, + { + "epoch": 15.254590795737927, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0909, + "step": 8411 + }, + { + "epoch": 15.256404443436862, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0909, + "step": 8412 + }, + { + "epoch": 15.258218091135797, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 8413 + }, + { + "epoch": 15.260031738834732, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 8414 + }, + { + "epoch": 15.261845386533667, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.1107, + "step": 8415 + }, + { + "epoch": 15.261845386533667, + "eval_loss": 2.5473625659942627, + "eval_runtime": 152.6544, + "eval_samples_per_second": 6.551, + "eval_steps_per_second": 6.551, + "step": 8415 + }, + { + "epoch": 15.261845386533667, + "mmlu_eval_accuracy": 0.3048136879358609, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, + "mmlu_eval_accuracy_college_physics": 0.45454545454545453, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.25, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.4375, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_geography": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, + "mmlu_eval_accuracy_high_school_physics": 0.5294117647058824, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.4, + "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.21052631578947367, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27058823529411763, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.37037037037037035, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2222222222222222, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.3309767661065077, + "step": 8415 + }, + { + "epoch": 15.2636590342326, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.116, + "step": 8416 + }, + { + "epoch": 15.265472681931534, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1042, + "step": 8417 + }, + { + "epoch": 15.267286329630469, + "grad_norm": 0.70703125, + "learning_rate": 0.0002, + "loss": 0.119, + "step": 8418 + }, + { + "epoch": 15.269099977329404, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.1188, + "step": 8419 + }, + { + "epoch": 15.270913625028339, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1742, + "step": 8420 + }, + { + "epoch": 15.272727272727273, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.1955, + "step": 8421 + }, + { + "epoch": 15.274540920426206, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 8422 + }, + { + "epoch": 15.276354568125141, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 8423 + }, + { + "epoch": 15.278168215824076, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 8424 + }, + { + "epoch": 15.27998186352301, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 8425 + }, + { + "epoch": 15.281795511221945, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 8426 + }, + { + "epoch": 15.28360915892088, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 8427 + }, + { + "epoch": 15.285422806619813, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 8428 + }, + { + "epoch": 15.287236454318748, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 8429 + }, + { + "epoch": 15.289050102017683, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 8430 + }, + { + "epoch": 15.290863749716618, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 8431 + }, + { + "epoch": 15.292677397415552, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 8432 + }, + { + "epoch": 15.294491045114487, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0665, + "step": 8433 + }, + { + "epoch": 15.296304692813422, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 8434 + }, + { + "epoch": 15.298118340512355, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0721, + "step": 8435 + }, + { + "epoch": 15.29993198821129, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 8436 + }, + { + "epoch": 15.301745635910224, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0629, + "step": 8437 + }, + { + "epoch": 15.30355928360916, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0743, + "step": 8438 + }, + { + "epoch": 15.305372931308094, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0671, + "step": 8439 + }, + { + "epoch": 15.307186579007029, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 8440 + }, + { + "epoch": 15.309000226705962, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0589, + "step": 8441 + }, + { + "epoch": 15.310813874404896, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 8442 + }, + { + "epoch": 15.312627522103831, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.0603, + "step": 8443 + }, + { + "epoch": 15.314441169802766, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 8444 + }, + { + "epoch": 15.3162548175017, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.059, + "step": 8445 + }, + { + "epoch": 15.318068465200636, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 8446 + }, + { + "epoch": 15.319882112899569, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 8447 + }, + { + "epoch": 15.321695760598503, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0684, + "step": 8448 + }, + { + "epoch": 15.323509408297438, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 8449 + }, + { + "epoch": 15.325323055996373, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 8450 + }, + { + "epoch": 15.327136703695308, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 8451 + }, + { + "epoch": 15.328950351394242, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 8452 + }, + { + "epoch": 15.330763999093175, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 8453 + }, + { + "epoch": 15.33257764679211, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 8454 + }, + { + "epoch": 15.334391294491045, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 8455 + }, + { + "epoch": 15.33620494218998, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 8456 + }, + { + "epoch": 15.338018589888915, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 8457 + }, + { + "epoch": 15.33983223758785, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 8458 + }, + { + "epoch": 15.341645885286782, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 8459 + }, + { + "epoch": 15.343459532985717, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 8460 + }, + { + "epoch": 15.345273180684652, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0899, + "step": 8461 + }, + { + "epoch": 15.347086828383587, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0894, + "step": 8462 + }, + { + "epoch": 15.348900476082521, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 8463 + }, + { + "epoch": 15.350714123781456, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.1044, + "step": 8464 + }, + { + "epoch": 15.35252777148039, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 8465 + }, + { + "epoch": 15.354341419179324, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 8466 + }, + { + "epoch": 15.356155066878259, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.123, + "step": 8467 + }, + { + "epoch": 15.357968714577193, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002, + "loss": 0.1107, + "step": 8468 + }, + { + "epoch": 15.359782362276128, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.1309, + "step": 8469 + }, + { + "epoch": 15.361596009975063, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.1752, + "step": 8470 + }, + { + "epoch": 15.363409657673996, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.1683, + "step": 8471 + }, + { + "epoch": 15.36522330537293, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.086, + "step": 8472 + }, + { + "epoch": 15.367036953071866, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 8473 + }, + { + "epoch": 15.3688506007708, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 8474 + }, + { + "epoch": 15.370664248469735, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 8475 + }, + { + "epoch": 15.37247789616867, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 8476 + }, + { + "epoch": 15.374291543867603, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 8477 + }, + { + "epoch": 15.376105191566538, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 8478 + }, + { + "epoch": 15.377918839265472, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 8479 + }, + { + "epoch": 15.379732486964407, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 8480 + }, + { + "epoch": 15.381546134663342, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 8481 + }, + { + "epoch": 15.383359782362277, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 8482 + }, + { + "epoch": 15.385173430061212, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 8483 + }, + { + "epoch": 15.386987077760145, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 8484 + }, + { + "epoch": 15.38880072545908, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 8485 + }, + { + "epoch": 15.390614373158014, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 8486 + }, + { + "epoch": 15.392428020856949, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 8487 + }, + { + "epoch": 15.394241668555884, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8488 + }, + { + "epoch": 15.396055316254817, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0612, + "step": 8489 + }, + { + "epoch": 15.397868963953751, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 8490 + }, + { + "epoch": 15.399682611652686, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 8491 + }, + { + "epoch": 15.401496259351621, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 8492 + }, + { + "epoch": 15.403309907050556, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 8493 + }, + { + "epoch": 15.40512355474949, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 8494 + }, + { + "epoch": 15.406937202448425, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 8495 + }, + { + "epoch": 15.408750850147358, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 8496 + }, + { + "epoch": 15.410564497846293, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 8497 + }, + { + "epoch": 15.412378145545228, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8498 + }, + { + "epoch": 15.414191793244163, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 8499 + }, + { + "epoch": 15.416005440943097, + "grad_norm": 0.498046875, + "learning_rate": 0.0002, + "loss": 0.1774, + "step": 8500 + }, + { + "epoch": 15.417819088642032, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1132, + "step": 8501 + }, + { + "epoch": 15.419632736340965, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 8502 + }, + { + "epoch": 15.4214463840399, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0846, + "step": 8503 + }, + { + "epoch": 15.423260031738835, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 8504 + }, + { + "epoch": 15.42507367943777, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 8505 + }, + { + "epoch": 15.426887327136704, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 8506 + }, + { + "epoch": 15.428700974835639, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 8507 + }, + { + "epoch": 15.430514622534572, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 8508 + }, + { + "epoch": 15.432328270233507, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 8509 + }, + { + "epoch": 15.434141917932442, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 8510 + }, + { + "epoch": 15.435955565631376, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 8511 + }, + { + "epoch": 15.437769213330311, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0915, + "step": 8512 + }, + { + "epoch": 15.439582861029246, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 8513 + }, + { + "epoch": 15.441396508728179, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 8514 + }, + { + "epoch": 15.443210156427114, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 8515 + }, + { + "epoch": 15.445023804126048, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 8516 + }, + { + "epoch": 15.446837451824983, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.1078, + "step": 8517 + }, + { + "epoch": 15.448651099523918, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.1209, + "step": 8518 + }, + { + "epoch": 15.450464747222853, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.159, + "step": 8519 + }, + { + "epoch": 15.452278394921786, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.2008, + "step": 8520 + }, + { + "epoch": 15.45409204262072, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.1844, + "step": 8521 + }, + { + "epoch": 15.455905690319655, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 8522 + }, + { + "epoch": 15.45771933801859, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 8523 + }, + { + "epoch": 15.459532985717525, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 8524 + }, + { + "epoch": 15.46134663341646, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 8525 + }, + { + "epoch": 15.463160281115393, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 8526 + }, + { + "epoch": 15.464973928814327, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 8527 + }, + { + "epoch": 15.466787576513262, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 8528 + }, + { + "epoch": 15.468601224212197, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 8529 + }, + { + "epoch": 15.470414871911132, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 8530 + }, + { + "epoch": 15.472228519610066, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 8531 + }, + { + "epoch": 15.474042167309, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 8532 + }, + { + "epoch": 15.475855815007934, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 8533 + }, + { + "epoch": 15.477669462706869, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 8534 + }, + { + "epoch": 15.479483110405804, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0684, + "step": 8535 + }, + { + "epoch": 15.481296758104738, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 8536 + }, + { + "epoch": 15.483110405803673, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 8537 + }, + { + "epoch": 15.484924053502606, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 8538 + }, + { + "epoch": 15.486737701201541, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 8539 + }, + { + "epoch": 15.488551348900476, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 8540 + }, + { + "epoch": 15.49036499659941, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 8541 + }, + { + "epoch": 15.492178644298345, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 8542 + }, + { + "epoch": 15.49399229199728, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 8543 + }, + { + "epoch": 15.495805939696215, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 8544 + }, + { + "epoch": 15.497619587395148, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 8545 + }, + { + "epoch": 15.499433235094083, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 8546 + }, + { + "epoch": 15.501246882793017, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0584, + "step": 8547 + }, + { + "epoch": 15.503060530491952, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 8548 + }, + { + "epoch": 15.504874178190887, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 8549 + }, + { + "epoch": 15.50668782588982, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 8550 + }, + { + "epoch": 15.508501473588755, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 8551 + }, + { + "epoch": 15.51031512128769, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0659, + "step": 8552 + }, + { + "epoch": 15.512128768986624, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 8553 + }, + { + "epoch": 15.513942416685559, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 8554 + }, + { + "epoch": 15.515756064384494, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 8555 + }, + { + "epoch": 15.517569712083429, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 8556 + }, + { + "epoch": 15.519383359782362, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 8557 + }, + { + "epoch": 15.521197007481296, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 8558 + }, + { + "epoch": 15.523010655180231, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0922, + "step": 8559 + }, + { + "epoch": 15.524824302879166, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0876, + "step": 8560 + }, + { + "epoch": 15.5266379505781, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 8561 + }, + { + "epoch": 15.528451598277035, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 8562 + }, + { + "epoch": 15.530265245975968, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1026, + "step": 8563 + }, + { + "epoch": 15.532078893674903, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0973, + "step": 8564 + }, + { + "epoch": 15.533892541373838, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.1005, + "step": 8565 + }, + { + "epoch": 15.535706189072773, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.1114, + "step": 8566 + }, + { + "epoch": 15.537519836771708, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.138, + "step": 8567 + }, + { + "epoch": 15.539333484470642, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.1287, + "step": 8568 + }, + { + "epoch": 15.541147132169575, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 8569 + }, + { + "epoch": 15.54296077986851, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002, + "loss": 0.1748, + "step": 8570 + }, + { + "epoch": 15.544774427567445, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.2005, + "step": 8571 + }, + { + "epoch": 15.54658807526638, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 8572 + }, + { + "epoch": 15.548401722965314, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 8573 + }, + { + "epoch": 15.55021537066425, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 8574 + }, + { + "epoch": 15.552029018363182, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 8575 + }, + { + "epoch": 15.553842666062117, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 8576 + }, + { + "epoch": 15.555656313761052, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0919, + "step": 8577 + }, + { + "epoch": 15.557469961459987, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1033, + "step": 8578 + }, + { + "epoch": 15.559283609158921, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0872, + "step": 8579 + }, + { + "epoch": 15.561097256857856, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 8580 + }, + { + "epoch": 15.562910904556789, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 8581 + }, + { + "epoch": 15.564724552255724, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0719, + "step": 8582 + }, + { + "epoch": 15.566538199954659, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 8583 + }, + { + "epoch": 15.568351847653593, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 8584 + }, + { + "epoch": 15.570165495352528, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 8585 + }, + { + "epoch": 15.571979143051463, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 8586 + }, + { + "epoch": 15.573792790750396, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 8587 + }, + { + "epoch": 15.57560643844933, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 8588 + }, + { + "epoch": 15.577420086148265, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 8589 + }, + { + "epoch": 15.5792337338472, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 8590 + }, + { + "epoch": 15.581047381546135, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0599, + "step": 8591 + }, + { + "epoch": 15.58286102924507, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 8592 + }, + { + "epoch": 15.584674676944005, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 8593 + }, + { + "epoch": 15.586488324642938, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 8594 + }, + { + "epoch": 15.588301972341872, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 8595 + }, + { + "epoch": 15.590115620040807, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 8596 + }, + { + "epoch": 15.591929267739742, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 8597 + }, + { + "epoch": 15.593742915438677, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0643, + "step": 8598 + }, + { + "epoch": 15.59555656313761, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0744, + "step": 8599 + }, + { + "epoch": 15.597370210836544, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 8600 + }, + { + "epoch": 15.59918385853548, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 8601 + }, + { + "epoch": 15.600997506234414, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 8602 + }, + { + "epoch": 15.600997506234414, + "eval_loss": 2.56882381439209, + "eval_runtime": 152.4717, + "eval_samples_per_second": 6.559, + "eval_steps_per_second": 6.559, + "step": 8602 + }, + { + "epoch": 15.600997506234414, + "mmlu_eval_accuracy": 0.30241369638574567, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, + "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.25, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_geography": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.18604651162790697, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.5294117647058824, + "mmlu_eval_accuracy_high_school_psychology": 0.4166666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.5454545454545454, + "mmlu_eval_accuracy_marketing": 0.4, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.4883720930232558, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.3142857142857143, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2463768115942029, + "mmlu_eval_accuracy_public_relations": 0.25, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2222222222222222, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.653066355192451, + "step": 8602 + }, + { + "epoch": 15.602811153933349, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 8603 + }, + { + "epoch": 15.604624801632283, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 8604 + }, + { + "epoch": 15.606438449331218, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 8605 + }, + { + "epoch": 15.608252097030151, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 8606 + }, + { + "epoch": 15.610065744729086, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 8607 + }, + { + "epoch": 15.61187939242802, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 8608 + }, + { + "epoch": 15.613693040126956, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 8609 + }, + { + "epoch": 15.61550668782589, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 8610 + }, + { + "epoch": 15.617320335524825, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.1016, + "step": 8611 + }, + { + "epoch": 15.619133983223758, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0885, + "step": 8612 + }, + { + "epoch": 15.620947630922693, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1023, + "step": 8613 + }, + { + "epoch": 15.622761278621628, + "grad_norm": 0.546875, + "learning_rate": 0.0002, + "loss": 0.119, + "step": 8614 + }, + { + "epoch": 15.624574926320562, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.1126, + "step": 8615 + }, + { + "epoch": 15.626388574019497, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.1059, + "step": 8616 + }, + { + "epoch": 15.628202221718432, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1335, + "step": 8617 + }, + { + "epoch": 15.630015869417365, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1447, + "step": 8618 + }, + { + "epoch": 15.6318295171163, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.162, + "step": 8619 + }, + { + "epoch": 15.633643164815235, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.2164, + "step": 8620 + }, + { + "epoch": 15.63545681251417, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.1943, + "step": 8621 + }, + { + "epoch": 15.637270460213104, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 8622 + }, + { + "epoch": 15.639084107912039, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 8623 + }, + { + "epoch": 15.640897755610972, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 8624 + }, + { + "epoch": 15.642711403309907, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 8625 + }, + { + "epoch": 15.644525051008841, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1061, + "step": 8626 + }, + { + "epoch": 15.646338698707776, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0903, + "step": 8627 + }, + { + "epoch": 15.648152346406711, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 8628 + }, + { + "epoch": 15.649965994105646, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 8629 + }, + { + "epoch": 15.651779641804579, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 8630 + }, + { + "epoch": 15.653593289503513, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0957, + "step": 8631 + }, + { + "epoch": 15.655406937202448, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0884, + "step": 8632 + }, + { + "epoch": 15.657220584901383, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.086, + "step": 8633 + }, + { + "epoch": 15.659034232600318, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 8634 + }, + { + "epoch": 15.660847880299253, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 8635 + }, + { + "epoch": 15.662661527998186, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 8636 + }, + { + "epoch": 15.66447517569712, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 8637 + }, + { + "epoch": 15.666288823396055, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 8638 + }, + { + "epoch": 15.66810247109499, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 8639 + }, + { + "epoch": 15.669916118793925, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 8640 + }, + { + "epoch": 15.67172976649286, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 8641 + }, + { + "epoch": 15.673543414191792, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 8642 + }, + { + "epoch": 15.675357061890727, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 8643 + }, + { + "epoch": 15.677170709589662, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8644 + }, + { + "epoch": 15.678984357288597, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0639, + "step": 8645 + }, + { + "epoch": 15.680798004987532, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 8646 + }, + { + "epoch": 15.682611652686466, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 8647 + }, + { + "epoch": 15.6844253003854, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 8648 + }, + { + "epoch": 15.686238948084334, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 8649 + }, + { + "epoch": 15.688052595783269, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 8650 + }, + { + "epoch": 15.689866243482204, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0721, + "step": 8651 + }, + { + "epoch": 15.691679891181138, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 8652 + }, + { + "epoch": 15.693493538880073, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 8653 + }, + { + "epoch": 15.695307186579008, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 8654 + }, + { + "epoch": 15.697120834277941, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 8655 + }, + { + "epoch": 15.698934481976876, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0873, + "step": 8656 + }, + { + "epoch": 15.70074812967581, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 8657 + }, + { + "epoch": 15.702561777374745, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 8658 + }, + { + "epoch": 15.70437542507368, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 8659 + }, + { + "epoch": 15.706189072772613, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 8660 + }, + { + "epoch": 15.708002720471548, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 8661 + }, + { + "epoch": 15.709816368170483, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0942, + "step": 8662 + }, + { + "epoch": 15.711630015869417, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1043, + "step": 8663 + }, + { + "epoch": 15.713443663568352, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1025, + "step": 8664 + }, + { + "epoch": 15.715257311267287, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 8665 + }, + { + "epoch": 15.717070958966222, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1109, + "step": 8666 + }, + { + "epoch": 15.718884606665155, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.117, + "step": 8667 + }, + { + "epoch": 15.72069825436409, + "grad_norm": 0.50390625, + "learning_rate": 0.0002, + "loss": 0.1575, + "step": 8668 + }, + { + "epoch": 15.722511902063024, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.1411, + "step": 8669 + }, + { + "epoch": 15.724325549761959, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.1418, + "step": 8670 + }, + { + "epoch": 15.726139197460894, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.2311, + "step": 8671 + }, + { + "epoch": 15.727952845159828, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 8672 + }, + { + "epoch": 15.729766492858761, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0925, + "step": 8673 + }, + { + "epoch": 15.731580140557696, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 8674 + }, + { + "epoch": 15.733393788256631, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.105, + "step": 8675 + }, + { + "epoch": 15.735207435955566, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0892, + "step": 8676 + }, + { + "epoch": 15.7370210836545, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0828, + "step": 8677 + }, + { + "epoch": 15.738834731353435, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0944, + "step": 8678 + }, + { + "epoch": 15.740648379052368, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 8679 + }, + { + "epoch": 15.742462026751303, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 8680 + }, + { + "epoch": 15.744275674450238, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 8681 + }, + { + "epoch": 15.746089322149173, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 8682 + }, + { + "epoch": 15.747902969848107, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 8683 + }, + { + "epoch": 15.749716617547042, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 8684 + }, + { + "epoch": 15.751530265245975, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 8685 + }, + { + "epoch": 15.75334391294491, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 8686 + }, + { + "epoch": 15.755157560643845, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 8687 + }, + { + "epoch": 15.75697120834278, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 8688 + }, + { + "epoch": 15.758784856041714, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0613, + "step": 8689 + }, + { + "epoch": 15.760598503740649, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 8690 + }, + { + "epoch": 15.762412151439582, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 8691 + }, + { + "epoch": 15.764225799138517, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8692 + }, + { + "epoch": 15.766039446837452, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 8693 + }, + { + "epoch": 15.767853094536386, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 8694 + }, + { + "epoch": 15.769666742235321, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 8695 + }, + { + "epoch": 15.771480389934256, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 8696 + }, + { + "epoch": 15.773294037633189, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 8697 + }, + { + "epoch": 15.775107685332124, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 8698 + }, + { + "epoch": 15.776921333031058, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 8699 + }, + { + "epoch": 15.778734980729993, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 8700 + }, + { + "epoch": 15.780548628428928, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 8701 + }, + { + "epoch": 15.782362276127863, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 8702 + }, + { + "epoch": 15.784175923826798, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 8703 + }, + { + "epoch": 15.78598957152573, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 8704 + }, + { + "epoch": 15.787803219224665, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 8705 + }, + { + "epoch": 15.7896168669236, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 8706 + }, + { + "epoch": 15.791430514622535, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 8707 + }, + { + "epoch": 15.79324416232147, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 8708 + }, + { + "epoch": 15.795057810020403, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 8709 + }, + { + "epoch": 15.796871457719337, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0861, + "step": 8710 + }, + { + "epoch": 15.798685105418272, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0949, + "step": 8711 + }, + { + "epoch": 15.800498753117207, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.0887, + "step": 8712 + }, + { + "epoch": 15.802312400816142, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.1019, + "step": 8713 + }, + { + "epoch": 15.804126048515077, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 8714 + }, + { + "epoch": 15.805939696214011, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 8715 + }, + { + "epoch": 15.807753343912944, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.101, + "step": 8716 + }, + { + "epoch": 15.809566991611879, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.1038, + "step": 8717 + }, + { + "epoch": 15.811380639310814, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.137, + "step": 8718 + }, + { + "epoch": 15.813194287009749, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002, + "loss": 0.1436, + "step": 8719 + }, + { + "epoch": 15.815007934708683, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.1795, + "step": 8720 + }, + { + "epoch": 15.816821582407618, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.3416, + "step": 8721 + }, + { + "epoch": 15.818635230106551, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0952, + "step": 8722 + }, + { + "epoch": 15.820448877805486, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 8723 + }, + { + "epoch": 15.82226252550442, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 8724 + }, + { + "epoch": 15.824076173203355, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0897, + "step": 8725 + }, + { + "epoch": 15.82588982090229, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0919, + "step": 8726 + }, + { + "epoch": 15.827703468601225, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 8727 + }, + { + "epoch": 15.829517116300158, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 8728 + }, + { + "epoch": 15.831330763999093, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 8729 + }, + { + "epoch": 15.833144411698028, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 8730 + }, + { + "epoch": 15.834958059396962, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 8731 + }, + { + "epoch": 15.836771707095897, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 8732 + }, + { + "epoch": 15.838585354794832, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 8733 + }, + { + "epoch": 15.840399002493765, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 8734 + }, + { + "epoch": 15.8422126501927, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 8735 + }, + { + "epoch": 15.844026297891634, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 8736 + }, + { + "epoch": 15.84583994559057, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 8737 + }, + { + "epoch": 15.847653593289504, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 8738 + }, + { + "epoch": 15.849467240988439, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 8739 + }, + { + "epoch": 15.851280888687372, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 8740 + }, + { + "epoch": 15.853094536386307, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 8741 + }, + { + "epoch": 15.854908184085241, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 8742 + }, + { + "epoch": 15.856721831784176, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 8743 + }, + { + "epoch": 15.85853547948311, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 8744 + }, + { + "epoch": 15.860349127182046, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 8745 + }, + { + "epoch": 15.862162774880979, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 8746 + }, + { + "epoch": 15.863976422579913, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 8747 + }, + { + "epoch": 15.865790070278848, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 8748 + }, + { + "epoch": 15.867603717977783, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 8749 + }, + { + "epoch": 15.869417365676718, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 8750 + }, + { + "epoch": 15.871231013375652, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 8751 + }, + { + "epoch": 15.873044661074585, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 8752 + }, + { + "epoch": 15.87485830877352, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 8753 + }, + { + "epoch": 15.876671956472455, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 8754 + }, + { + "epoch": 15.87848560417139, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 8755 + }, + { + "epoch": 15.880299251870325, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 8756 + }, + { + "epoch": 15.88211289956926, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 8757 + }, + { + "epoch": 15.883926547268192, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 8758 + }, + { + "epoch": 15.885740194967127, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 8759 + }, + { + "epoch": 15.887553842666062, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 8760 + }, + { + "epoch": 15.889367490364997, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 8761 + }, + { + "epoch": 15.891181138063931, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 8762 + }, + { + "epoch": 15.892994785762866, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1031, + "step": 8763 + }, + { + "epoch": 15.894808433461801, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.1014, + "step": 8764 + }, + { + "epoch": 15.896622081160734, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 8765 + }, + { + "epoch": 15.898435728859669, + "grad_norm": 0.30859375, + "learning_rate": 0.0002, + "loss": 0.1155, + "step": 8766 + }, + { + "epoch": 15.900249376558603, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.1199, + "step": 8767 + }, + { + "epoch": 15.902063024257538, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.1482, + "step": 8768 + }, + { + "epoch": 15.903876671956473, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1317, + "step": 8769 + }, + { + "epoch": 15.905690319655406, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1682, + "step": 8770 + }, + { + "epoch": 15.90750396735434, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.2527, + "step": 8771 + }, + { + "epoch": 15.909317615053276, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.09, + "step": 8772 + }, + { + "epoch": 15.91113126275221, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0897, + "step": 8773 + }, + { + "epoch": 15.912944910451145, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0911, + "step": 8774 + }, + { + "epoch": 15.91475855815008, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1003, + "step": 8775 + }, + { + "epoch": 15.916572205849015, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 8776 + }, + { + "epoch": 15.918385853547948, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 8777 + }, + { + "epoch": 15.920199501246882, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1113, + "step": 8778 + }, + { + "epoch": 15.922013148945817, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 8779 + }, + { + "epoch": 15.923826796644752, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 8780 + }, + { + "epoch": 15.925640444343687, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 8781 + }, + { + "epoch": 15.927454092042622, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 8782 + }, + { + "epoch": 15.929267739741555, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 8783 + }, + { + "epoch": 15.93108138744049, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 8784 + }, + { + "epoch": 15.932895035139424, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0931, + "step": 8785 + }, + { + "epoch": 15.934708682838359, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 8786 + }, + { + "epoch": 15.936522330537294, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 8787 + }, + { + "epoch": 15.938335978236228, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 8788 + }, + { + "epoch": 15.940149625935161, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 8789 + }, + { + "epoch": 15.940149625935161, + "eval_loss": 2.532299280166626, + "eval_runtime": 152.7481, + "eval_samples_per_second": 6.547, + "eval_steps_per_second": 6.547, + "step": 8789 + }, + { + "epoch": 15.940149625935161, + "mmlu_eval_accuracy": 0.30193748506564905, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, + "mmlu_eval_accuracy_high_school_mathematics": 0.06896551724137931, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.5294117647058824, + "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.4186046511627907, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.37142857142857144, + "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, + "mmlu_eval_accuracy_professional_law": 0.2823529411764706, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.4166666666666667, + "mmlu_eval_accuracy_security_studies": 0.25925925925925924, + "mmlu_eval_accuracy_sociology": 0.45454545454545453, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.6661016244385305, + "step": 8789 + }, + { + "epoch": 15.941963273634096, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 8790 + }, + { + "epoch": 15.943776921333031, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 8791 + }, + { + "epoch": 15.945590569031966, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 8792 + }, + { + "epoch": 15.9474042167309, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 8793 + }, + { + "epoch": 15.949217864429835, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 8794 + }, + { + "epoch": 15.951031512128768, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 8795 + }, + { + "epoch": 15.952845159827703, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 8796 + }, + { + "epoch": 15.954658807526638, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 8797 + }, + { + "epoch": 15.956472455225573, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 8798 + }, + { + "epoch": 15.958286102924507, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 8799 + }, + { + "epoch": 15.960099750623442, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 8800 + }, + { + "epoch": 15.961913398322375, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 8801 + }, + { + "epoch": 15.96372704602131, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 8802 + }, + { + "epoch": 15.965540693720245, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 8803 + }, + { + "epoch": 15.96735434141918, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 8804 + }, + { + "epoch": 15.969167989118114, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 8805 + }, + { + "epoch": 15.970981636817049, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 8806 + }, + { + "epoch": 15.972795284515982, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 8807 + }, + { + "epoch": 15.974608932214917, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 8808 + }, + { + "epoch": 15.976422579913852, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.099, + "step": 8809 + }, + { + "epoch": 15.978236227612786, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 8810 + }, + { + "epoch": 15.980049875311721, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.09, + "step": 8811 + }, + { + "epoch": 15.981863523010656, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 8812 + }, + { + "epoch": 15.98367717070959, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1043, + "step": 8813 + }, + { + "epoch": 15.985490818408524, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1049, + "step": 8814 + }, + { + "epoch": 15.987304466107458, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 8815 + }, + { + "epoch": 15.989118113806393, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.1106, + "step": 8816 + }, + { + "epoch": 15.990931761505328, + "grad_norm": 0.1328125, + "learning_rate": 0.0002, + "loss": 0.1232, + "step": 8817 + }, + { + "epoch": 15.992745409204263, + "grad_norm": 0.431640625, + "learning_rate": 0.0002, + "loss": 0.1305, + "step": 8818 + }, + { + "epoch": 15.994559056903196, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.1605, + "step": 8819 + }, + { + "epoch": 15.99637270460213, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.196, + "step": 8820 + }, + { + "epoch": 15.998186352301065, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1857, + "step": 8821 + }, + { + "epoch": 16.0, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1053, + "step": 8822 + }, + { + "epoch": 16.001813647698935, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 8823 + }, + { + "epoch": 16.00362729539787, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 8824 + }, + { + "epoch": 16.005440943096804, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 8825 + }, + { + "epoch": 16.00725459079574, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 8826 + }, + { + "epoch": 16.009068238494674, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 8827 + }, + { + "epoch": 16.01088188619361, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8828 + }, + { + "epoch": 16.01269553389254, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 8829 + }, + { + "epoch": 16.014509181591475, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0644, + "step": 8830 + }, + { + "epoch": 16.01632282929041, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 8831 + }, + { + "epoch": 16.018136476989344, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 8832 + }, + { + "epoch": 16.01995012468828, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0597, + "step": 8833 + }, + { + "epoch": 16.021763772387214, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 8834 + }, + { + "epoch": 16.02357742008615, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 8835 + }, + { + "epoch": 16.025391067785083, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 8836 + }, + { + "epoch": 16.027204715484018, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0574, + "step": 8837 + }, + { + "epoch": 16.029018363182953, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0635, + "step": 8838 + }, + { + "epoch": 16.030832010881888, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.0574, + "step": 8839 + }, + { + "epoch": 16.032645658580822, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 8840 + }, + { + "epoch": 16.034459306279754, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 8841 + }, + { + "epoch": 16.03627295397869, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0566, + "step": 8842 + }, + { + "epoch": 16.038086601677623, + "grad_norm": 0.146484375, + "learning_rate": 0.0002, + "loss": 0.0531, + "step": 8843 + }, + { + "epoch": 16.039900249376558, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0592, + "step": 8844 + }, + { + "epoch": 16.041713897075493, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002, + "loss": 0.0548, + "step": 8845 + }, + { + "epoch": 16.043527544774427, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0583, + "step": 8846 + }, + { + "epoch": 16.045341192473362, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0595, + "step": 8847 + }, + { + "epoch": 16.047154840172297, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 8848 + }, + { + "epoch": 16.04896848787123, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8849 + }, + { + "epoch": 16.050782135570167, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 8850 + }, + { + "epoch": 16.0525957832691, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 8851 + }, + { + "epoch": 16.054409430968036, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0606, + "step": 8852 + }, + { + "epoch": 16.056223078666967, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 8853 + }, + { + "epoch": 16.058036726365902, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0578, + "step": 8854 + }, + { + "epoch": 16.059850374064837, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 8855 + }, + { + "epoch": 16.06166402176377, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 8856 + }, + { + "epoch": 16.063477669462706, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 8857 + }, + { + "epoch": 16.06529131716164, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 8858 + }, + { + "epoch": 16.067104964860576, + "grad_norm": 0.322265625, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 8859 + }, + { + "epoch": 16.06891861255951, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 8860 + }, + { + "epoch": 16.070732260258445, + "grad_norm": 0.43359375, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 8861 + }, + { + "epoch": 16.07254590795738, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 8862 + }, + { + "epoch": 16.074359555656315, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.0828, + "step": 8863 + }, + { + "epoch": 16.07617320335525, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 8864 + }, + { + "epoch": 16.07798685105418, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0947, + "step": 8865 + }, + { + "epoch": 16.079800498753116, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 8866 + }, + { + "epoch": 16.08161414645205, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0966, + "step": 8867 + }, + { + "epoch": 16.083427794150985, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.118, + "step": 8868 + }, + { + "epoch": 16.08524144184992, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1108, + "step": 8869 + }, + { + "epoch": 16.087055089548855, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.1248, + "step": 8870 + }, + { + "epoch": 16.08886873724779, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.1155, + "step": 8871 + }, + { + "epoch": 16.090682384946724, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.2199, + "step": 8872 + }, + { + "epoch": 16.09249603264566, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 8873 + }, + { + "epoch": 16.094309680344594, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 8874 + }, + { + "epoch": 16.09612332804353, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 8875 + }, + { + "epoch": 16.097936975742464, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 8876 + }, + { + "epoch": 16.099750623441395, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 8877 + }, + { + "epoch": 16.10156427114033, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 8878 + }, + { + "epoch": 16.103377918839264, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 8879 + }, + { + "epoch": 16.1051915665382, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 8880 + }, + { + "epoch": 16.107005214237134, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0583, + "step": 8881 + }, + { + "epoch": 16.10881886193607, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 8882 + }, + { + "epoch": 16.110632509635003, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 8883 + }, + { + "epoch": 16.112446157333938, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 8884 + }, + { + "epoch": 16.114259805032873, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 8885 + }, + { + "epoch": 16.116073452731808, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 8886 + }, + { + "epoch": 16.117887100430742, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 8887 + }, + { + "epoch": 16.119700748129677, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0577, + "step": 8888 + }, + { + "epoch": 16.121514395828612, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 8889 + }, + { + "epoch": 16.123328043527543, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0578, + "step": 8890 + }, + { + "epoch": 16.125141691226478, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8891 + }, + { + "epoch": 16.126955338925413, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.058, + "step": 8892 + }, + { + "epoch": 16.128768986624348, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 8893 + }, + { + "epoch": 16.130582634323282, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0606, + "step": 8894 + }, + { + "epoch": 16.132396282022217, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 8895 + }, + { + "epoch": 16.134209929721152, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 8896 + }, + { + "epoch": 16.136023577420087, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0612, + "step": 8897 + }, + { + "epoch": 16.13783722511902, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 8898 + }, + { + "epoch": 16.139650872817956, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0578, + "step": 8899 + }, + { + "epoch": 16.14146452051689, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0576, + "step": 8900 + }, + { + "epoch": 16.143278168215826, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1267, + "step": 8901 + }, + { + "epoch": 16.145091815914757, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 8902 + }, + { + "epoch": 16.14690546361369, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 8903 + }, + { + "epoch": 16.148719111312626, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 8904 + }, + { + "epoch": 16.15053275901156, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 8905 + }, + { + "epoch": 16.152346406710496, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8906 + }, + { + "epoch": 16.15416005440943, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8907 + }, + { + "epoch": 16.155973702108366, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 8908 + }, + { + "epoch": 16.1577873498073, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 8909 + }, + { + "epoch": 16.159600997506235, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 8910 + }, + { + "epoch": 16.16141464520517, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 8911 + }, + { + "epoch": 16.163228292904105, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 8912 + }, + { + "epoch": 16.16504194060304, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.1012, + "step": 8913 + }, + { + "epoch": 16.16685558830197, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 8914 + }, + { + "epoch": 16.168669236000905, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.0967, + "step": 8915 + }, + { + "epoch": 16.17048288369984, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0928, + "step": 8916 + }, + { + "epoch": 16.172296531398775, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1043, + "step": 8917 + }, + { + "epoch": 16.17411017909771, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 8918 + }, + { + "epoch": 16.175923826796645, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.1352, + "step": 8919 + }, + { + "epoch": 16.17773747449558, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.1259, + "step": 8920 + }, + { + "epoch": 16.179551122194514, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.1713, + "step": 8921 + }, + { + "epoch": 16.18136476989345, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.201, + "step": 8922 + }, + { + "epoch": 16.183178417592384, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 8923 + }, + { + "epoch": 16.18499206529132, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 8924 + }, + { + "epoch": 16.186805712990253, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0912, + "step": 8925 + }, + { + "epoch": 16.188619360689184, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 8926 + }, + { + "epoch": 16.19043300838812, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 8927 + }, + { + "epoch": 16.192246656087054, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0643, + "step": 8928 + }, + { + "epoch": 16.19406030378599, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 8929 + }, + { + "epoch": 16.195873951484923, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 8930 + }, + { + "epoch": 16.19768759918386, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0595, + "step": 8931 + }, + { + "epoch": 16.199501246882793, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8932 + }, + { + "epoch": 16.201314894581728, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 8933 + }, + { + "epoch": 16.203128542280663, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0578, + "step": 8934 + }, + { + "epoch": 16.204942189979597, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 8935 + }, + { + "epoch": 16.206755837678532, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.0585, + "step": 8936 + }, + { + "epoch": 16.208569485377467, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 8937 + }, + { + "epoch": 16.2103831330764, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 8938 + }, + { + "epoch": 16.212196780775333, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.0601, + "step": 8939 + }, + { + "epoch": 16.214010428474268, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 8940 + }, + { + "epoch": 16.215824076173202, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 8941 + }, + { + "epoch": 16.217637723872137, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.058, + "step": 8942 + }, + { + "epoch": 16.219451371571072, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 8943 + }, + { + "epoch": 16.221265019270007, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0665, + "step": 8944 + }, + { + "epoch": 16.22307866696894, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0545, + "step": 8945 + }, + { + "epoch": 16.224892314667876, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0624, + "step": 8946 + }, + { + "epoch": 16.22670596236681, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 8947 + }, + { + "epoch": 16.228519610065746, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0623, + "step": 8948 + }, + { + "epoch": 16.23033325776468, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 8949 + }, + { + "epoch": 16.232146905463615, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 8950 + }, + { + "epoch": 16.233960553162547, + "grad_norm": 0.3125, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 8951 + }, + { + "epoch": 16.23577420086148, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 8952 + }, + { + "epoch": 16.237587848560416, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 8953 + }, + { + "epoch": 16.23940149625935, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 8954 + }, + { + "epoch": 16.241215143958286, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 8955 + }, + { + "epoch": 16.24302879165722, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 8956 + }, + { + "epoch": 16.244842439356155, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 8957 + }, + { + "epoch": 16.24665608705509, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 8958 + }, + { + "epoch": 16.248469734754025, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 8959 + }, + { + "epoch": 16.25028338245296, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 8960 + }, + { + "epoch": 16.252097030151894, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 8961 + }, + { + "epoch": 16.25391067785083, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 8962 + }, + { + "epoch": 16.25572432554976, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 8963 + }, + { + "epoch": 16.257537973248695, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 8964 + }, + { + "epoch": 16.25935162094763, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.097, + "step": 8965 + }, + { + "epoch": 16.261165268646565, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0924, + "step": 8966 + }, + { + "epoch": 16.2629789163455, + "grad_norm": 0.416015625, + "learning_rate": 0.0002, + "loss": 0.1207, + "step": 8967 + }, + { + "epoch": 16.264792564044434, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 8968 + }, + { + "epoch": 16.26660621174337, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002, + "loss": 0.1178, + "step": 8969 + }, + { + "epoch": 16.268419859442304, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.1253, + "step": 8970 + }, + { + "epoch": 16.27023350714124, + "grad_norm": 0.337890625, + "learning_rate": 0.0002, + "loss": 0.1764, + "step": 8971 + }, + { + "epoch": 16.272047154840173, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.214, + "step": 8972 + }, + { + "epoch": 16.273860802539108, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0928, + "step": 8973 + }, + { + "epoch": 16.275674450238043, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 8974 + }, + { + "epoch": 16.277488097936974, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 8975 + }, + { + "epoch": 16.27930174563591, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0953, + "step": 8976 + }, + { + "epoch": 16.27930174563591, + "eval_loss": 2.575951099395752, + "eval_runtime": 153.064, + "eval_samples_per_second": 6.533, + "eval_steps_per_second": 6.533, + "step": 8976 + }, + { + "epoch": 16.27930174563591, + "mmlu_eval_accuracy": 0.29659346305822426, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, + "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, + "mmlu_eval_accuracy_college_biology": 0.4375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.0625, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.3125, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.06896551724137931, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.3076923076923077, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.3333333333333333, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.4, + "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, + "mmlu_eval_accuracy_professional_law": 0.2529411764705882, + "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.7035428486097723, + "step": 8976 + }, + { + "epoch": 16.281115393334844, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 8977 + }, + { + "epoch": 16.28292904103378, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 8978 + }, + { + "epoch": 16.284742688732713, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 8979 + }, + { + "epoch": 16.286556336431648, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 8980 + }, + { + "epoch": 16.288369984130583, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 8981 + }, + { + "epoch": 16.290183631829517, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 8982 + }, + { + "epoch": 16.291997279528452, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 8983 + }, + { + "epoch": 16.293810927227387, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 8984 + }, + { + "epoch": 16.29562457492632, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 8985 + }, + { + "epoch": 16.297438222625257, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 8986 + }, + { + "epoch": 16.29925187032419, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 8987 + }, + { + "epoch": 16.301065518023123, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0565, + "step": 8988 + }, + { + "epoch": 16.302879165722057, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 8989 + }, + { + "epoch": 16.304692813420992, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 8990 + }, + { + "epoch": 16.306506461119927, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 8991 + }, + { + "epoch": 16.30832010881886, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.0596, + "step": 8992 + }, + { + "epoch": 16.310133756517796, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 8993 + }, + { + "epoch": 16.31194740421673, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 8994 + }, + { + "epoch": 16.313761051915666, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 8995 + }, + { + "epoch": 16.3155746996146, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.0593, + "step": 8996 + }, + { + "epoch": 16.317388347313535, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 8997 + }, + { + "epoch": 16.31920199501247, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 8998 + }, + { + "epoch": 16.321015642711405, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0602, + "step": 8999 + }, + { + "epoch": 16.322829290410336, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 9000 + }, + { + "epoch": 16.32464293810927, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 9001 + }, + { + "epoch": 16.326456585808206, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 9002 + }, + { + "epoch": 16.32827023350714, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 9003 + }, + { + "epoch": 16.330083881206075, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9004 + }, + { + "epoch": 16.33189752890501, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 9005 + }, + { + "epoch": 16.333711176603945, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 9006 + }, + { + "epoch": 16.33552482430288, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 9007 + }, + { + "epoch": 16.337338472001814, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.069, + "step": 9008 + }, + { + "epoch": 16.33915211970075, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 9009 + }, + { + "epoch": 16.340965767399684, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 9010 + }, + { + "epoch": 16.34277941509862, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9011 + }, + { + "epoch": 16.34459306279755, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.0902, + "step": 9012 + }, + { + "epoch": 16.346406710496485, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 9013 + }, + { + "epoch": 16.34822035819542, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9014 + }, + { + "epoch": 16.350034005894354, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 9015 + }, + { + "epoch": 16.35184765359329, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0959, + "step": 9016 + }, + { + "epoch": 16.353661301292224, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.1176, + "step": 9017 + }, + { + "epoch": 16.35547494899116, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.098, + "step": 9018 + }, + { + "epoch": 16.357288596690093, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.115, + "step": 9019 + }, + { + "epoch": 16.359102244389028, + "grad_norm": 0.154296875, + "learning_rate": 0.0002, + "loss": 0.1324, + "step": 9020 + }, + { + "epoch": 16.360915892087963, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.1451, + "step": 9021 + }, + { + "epoch": 16.362729539786898, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.1897, + "step": 9022 + }, + { + "epoch": 16.364543187485832, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 9023 + }, + { + "epoch": 16.366356835184764, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 9024 + }, + { + "epoch": 16.3681704828837, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 9025 + }, + { + "epoch": 16.369984130582633, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 9026 + }, + { + "epoch": 16.371797778281568, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 9027 + }, + { + "epoch": 16.373611425980503, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 9028 + }, + { + "epoch": 16.375425073679438, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0659, + "step": 9029 + }, + { + "epoch": 16.377238721378372, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 9030 + }, + { + "epoch": 16.379052369077307, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 9031 + }, + { + "epoch": 16.380866016776242, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 9032 + }, + { + "epoch": 16.382679664475177, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 9033 + }, + { + "epoch": 16.38449331217411, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 9034 + }, + { + "epoch": 16.386306959873046, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 9035 + }, + { + "epoch": 16.388120607571977, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 9036 + }, + { + "epoch": 16.389934255270912, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 9037 + }, + { + "epoch": 16.391747902969847, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9038 + }, + { + "epoch": 16.39356155066878, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.0547, + "step": 9039 + }, + { + "epoch": 16.395375198367717, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0612, + "step": 9040 + }, + { + "epoch": 16.39718884606665, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 9041 + }, + { + "epoch": 16.399002493765586, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 9042 + }, + { + "epoch": 16.40081614146452, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0665, + "step": 9043 + }, + { + "epoch": 16.402629789163456, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 9044 + }, + { + "epoch": 16.40444343686239, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 9045 + }, + { + "epoch": 16.406257084561325, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 9046 + }, + { + "epoch": 16.40807073226026, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 9047 + }, + { + "epoch": 16.409884379959195, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 9048 + }, + { + "epoch": 16.411698027658126, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 9049 + }, + { + "epoch": 16.41351167535706, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 9050 + }, + { + "epoch": 16.415325323055995, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 9051 + }, + { + "epoch": 16.41713897075493, + "grad_norm": 0.345703125, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 9052 + }, + { + "epoch": 16.418952618453865, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 9053 + }, + { + "epoch": 16.4207662661528, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 9054 + }, + { + "epoch": 16.422579913851735, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 9055 + }, + { + "epoch": 16.42439356155067, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9056 + }, + { + "epoch": 16.426207209249604, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 9057 + }, + { + "epoch": 16.42802085694854, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 9058 + }, + { + "epoch": 16.429834504647474, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 9059 + }, + { + "epoch": 16.43164815234641, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 9060 + }, + { + "epoch": 16.43346180004534, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 9061 + }, + { + "epoch": 16.435275447744274, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 9062 + }, + { + "epoch": 16.43708909544321, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 9063 + }, + { + "epoch": 16.438902743142144, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.0905, + "step": 9064 + }, + { + "epoch": 16.44071639084108, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.1157, + "step": 9065 + }, + { + "epoch": 16.442530038540013, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002, + "loss": 0.0933, + "step": 9066 + }, + { + "epoch": 16.44434368623895, + "grad_norm": 0.12109375, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 9067 + }, + { + "epoch": 16.446157333937883, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1157, + "step": 9068 + }, + { + "epoch": 16.447970981636818, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.128, + "step": 9069 + }, + { + "epoch": 16.449784629335753, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002, + "loss": 0.1254, + "step": 9070 + }, + { + "epoch": 16.451598277034687, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.1721, + "step": 9071 + }, + { + "epoch": 16.453411924733622, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.2405, + "step": 9072 + }, + { + "epoch": 16.455225572432553, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9073 + }, + { + "epoch": 16.457039220131488, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 9074 + }, + { + "epoch": 16.458852867830423, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 9075 + }, + { + "epoch": 16.460666515529358, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 9076 + }, + { + "epoch": 16.462480163228292, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 9077 + }, + { + "epoch": 16.464293810927227, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 9078 + }, + { + "epoch": 16.466107458626162, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 9079 + }, + { + "epoch": 16.467921106325097, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 9080 + }, + { + "epoch": 16.46973475402403, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 9081 + }, + { + "epoch": 16.471548401722966, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 9082 + }, + { + "epoch": 16.4733620494219, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 9083 + }, + { + "epoch": 16.475175697120836, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 9084 + }, + { + "epoch": 16.476989344819767, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 9085 + }, + { + "epoch": 16.478802992518702, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 9086 + }, + { + "epoch": 16.480616640217637, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 9087 + }, + { + "epoch": 16.48243028791657, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0586, + "step": 9088 + }, + { + "epoch": 16.484243935615506, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9089 + }, + { + "epoch": 16.48605758331444, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9090 + }, + { + "epoch": 16.487871231013376, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 9091 + }, + { + "epoch": 16.48968487871231, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 9092 + }, + { + "epoch": 16.491498526411245, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 9093 + }, + { + "epoch": 16.49331217411018, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 9094 + }, + { + "epoch": 16.495125821809115, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 9095 + }, + { + "epoch": 16.49693946950805, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 9096 + }, + { + "epoch": 16.49875311720698, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 9097 + }, + { + "epoch": 16.500566764905916, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 9098 + }, + { + "epoch": 16.50238041260485, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 9099 + }, + { + "epoch": 16.504194060303785, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0639, + "step": 9100 + }, + { + "epoch": 16.50600770800272, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0659, + "step": 9101 + }, + { + "epoch": 16.507821355701655, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 9102 + }, + { + "epoch": 16.50963500340059, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 9103 + }, + { + "epoch": 16.511448651099524, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0629, + "step": 9104 + }, + { + "epoch": 16.51326229879846, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 9105 + }, + { + "epoch": 16.515075946497394, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 9106 + }, + { + "epoch": 16.51688959419633, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 9107 + }, + { + "epoch": 16.518703241895263, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 9108 + }, + { + "epoch": 16.520516889594198, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 9109 + }, + { + "epoch": 16.52233053729313, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 9110 + }, + { + "epoch": 16.524144184992064, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0876, + "step": 9111 + }, + { + "epoch": 16.525957832691, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 9112 + }, + { + "epoch": 16.527771480389934, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 9113 + }, + { + "epoch": 16.52958512808887, + "grad_norm": 0.37890625, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 9114 + }, + { + "epoch": 16.531398775787803, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 9115 + }, + { + "epoch": 16.533212423486738, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0941, + "step": 9116 + }, + { + "epoch": 16.535026071185673, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1112, + "step": 9117 + }, + { + "epoch": 16.536839718884607, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.1253, + "step": 9118 + }, + { + "epoch": 16.538653366583542, + "grad_norm": 0.36328125, + "learning_rate": 0.0002, + "loss": 0.1339, + "step": 9119 + }, + { + "epoch": 16.540467014282477, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.1345, + "step": 9120 + }, + { + "epoch": 16.542280661981412, + "grad_norm": 0.123046875, + "learning_rate": 0.0002, + "loss": 0.1603, + "step": 9121 + }, + { + "epoch": 16.544094309680343, + "grad_norm": 0.431640625, + "learning_rate": 0.0002, + "loss": 0.276, + "step": 9122 + }, + { + "epoch": 16.545907957379278, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.171, + "step": 9123 + }, + { + "epoch": 16.547721605078213, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9124 + }, + { + "epoch": 16.549535252777147, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 9125 + }, + { + "epoch": 16.551348900476082, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0721, + "step": 9126 + }, + { + "epoch": 16.553162548175017, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 9127 + }, + { + "epoch": 16.55497619587395, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 9128 + }, + { + "epoch": 16.556789843572886, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 9129 + }, + { + "epoch": 16.55860349127182, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 9130 + }, + { + "epoch": 16.560417138970756, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 9131 + }, + { + "epoch": 16.56223078666969, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 9132 + }, + { + "epoch": 16.564044434368626, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 9133 + }, + { + "epoch": 16.565858082067557, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 9134 + }, + { + "epoch": 16.56767172976649, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0814, + "step": 9135 + }, + { + "epoch": 16.569485377465426, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 9136 + }, + { + "epoch": 16.57129902516436, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 9137 + }, + { + "epoch": 16.573112672863296, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 9138 + }, + { + "epoch": 16.57492632056223, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0576, + "step": 9139 + }, + { + "epoch": 16.576739968261165, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9140 + }, + { + "epoch": 16.5785536159601, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 9141 + }, + { + "epoch": 16.580367263659035, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.061, + "step": 9142 + }, + { + "epoch": 16.58218091135797, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 9143 + }, + { + "epoch": 16.583994559056904, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 9144 + }, + { + "epoch": 16.58580820675584, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0643, + "step": 9145 + }, + { + "epoch": 16.58762185445477, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 9146 + }, + { + "epoch": 16.589435502153705, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 9147 + }, + { + "epoch": 16.59124914985264, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 9148 + }, + { + "epoch": 16.593062797551575, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 9149 + }, + { + "epoch": 16.59487644525051, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 9150 + }, + { + "epoch": 16.596690092949444, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1068, + "step": 9151 + }, + { + "epoch": 16.59850374064838, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 9152 + }, + { + "epoch": 16.600317388347314, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 9153 + }, + { + "epoch": 16.60213103604625, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 9154 + }, + { + "epoch": 16.603944683745183, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 9155 + }, + { + "epoch": 16.605758331444118, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 9156 + }, + { + "epoch": 16.607571979143053, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 9157 + }, + { + "epoch": 16.609385626841984, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 9158 + }, + { + "epoch": 16.61119927454092, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 9159 + }, + { + "epoch": 16.613012922239854, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 9160 + }, + { + "epoch": 16.61482656993879, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 9161 + }, + { + "epoch": 16.616640217637723, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 9162 + }, + { + "epoch": 16.618453865336658, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 9163 + }, + { + "epoch": 16.618453865336658, + "eval_loss": 2.587026357650757, + "eval_runtime": 151.839, + "eval_samples_per_second": 6.586, + "eval_steps_per_second": 6.586, + "step": 9163 + }, + { + "epoch": 16.618453865336658, + "mmlu_eval_accuracy": 0.30414575372429237, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.5454545454545454, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, + "mmlu_eval_accuracy_high_school_mathematics": 0.06896551724137931, + "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, + "mmlu_eval_accuracy_high_school_physics": 0.5882352941176471, + "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, + "mmlu_eval_accuracy_miscellaneous": 0.47674418604651164, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.29411764705882354, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3870967741935484, + "mmlu_eval_accuracy_professional_law": 0.2647058823529412, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.5909090909090909, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.3157894736842105, + "mmlu_loss": 2.4934391367676834, + "step": 9163 + }, + { + "epoch": 16.620267513035593, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.09, + "step": 9164 + }, + { + "epoch": 16.622081160734528, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 9165 + }, + { + "epoch": 16.623894808433462, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.1053, + "step": 9166 + }, + { + "epoch": 16.625708456132397, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.1016, + "step": 9167 + }, + { + "epoch": 16.627522103831332, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 9168 + }, + { + "epoch": 16.629335751530267, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.1264, + "step": 9169 + }, + { + "epoch": 16.6311493992292, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.1316, + "step": 9170 + }, + { + "epoch": 16.632963046928133, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.1451, + "step": 9171 + }, + { + "epoch": 16.634776694627067, + "grad_norm": 1.2890625, + "learning_rate": 0.0002, + "loss": 0.2489, + "step": 9172 + }, + { + "epoch": 16.636590342326002, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 9173 + }, + { + "epoch": 16.638403990024937, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 9174 + }, + { + "epoch": 16.64021763772387, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 9175 + }, + { + "epoch": 16.642031285422807, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 9176 + }, + { + "epoch": 16.64384493312174, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 9177 + }, + { + "epoch": 16.645658580820676, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 9178 + }, + { + "epoch": 16.64747222851961, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 9179 + }, + { + "epoch": 16.649285876218546, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 9180 + }, + { + "epoch": 16.65109952391748, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 9181 + }, + { + "epoch": 16.652913171616415, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 9182 + }, + { + "epoch": 16.654726819315346, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 9183 + }, + { + "epoch": 16.65654046701428, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 9184 + }, + { + "epoch": 16.658354114713216, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 9185 + }, + { + "epoch": 16.66016776241215, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 9186 + }, + { + "epoch": 16.661981410111085, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 9187 + }, + { + "epoch": 16.66379505781002, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9188 + }, + { + "epoch": 16.665608705508955, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 9189 + }, + { + "epoch": 16.66742235320789, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0612, + "step": 9190 + }, + { + "epoch": 16.669236000906825, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 9191 + }, + { + "epoch": 16.67104964860576, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 9192 + }, + { + "epoch": 16.672863296304694, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 9193 + }, + { + "epoch": 16.67467694400363, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9194 + }, + { + "epoch": 16.67649059170256, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 9195 + }, + { + "epoch": 16.678304239401495, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 9196 + }, + { + "epoch": 16.68011788710043, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9197 + }, + { + "epoch": 16.681931534799364, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 9198 + }, + { + "epoch": 16.6837451824983, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 9199 + }, + { + "epoch": 16.685558830197234, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 9200 + }, + { + "epoch": 16.68737247789617, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 9201 + }, + { + "epoch": 16.689186125595104, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 9202 + }, + { + "epoch": 16.69099977329404, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 9203 + }, + { + "epoch": 16.692813420992973, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 9204 + }, + { + "epoch": 16.694627068691908, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 9205 + }, + { + "epoch": 16.696440716390843, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 9206 + }, + { + "epoch": 16.698254364089777, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 9207 + }, + { + "epoch": 16.70006801178871, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 9208 + }, + { + "epoch": 16.701881659487643, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 9209 + }, + { + "epoch": 16.703695307186578, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 9210 + }, + { + "epoch": 16.705508954885513, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 9211 + }, + { + "epoch": 16.707322602584448, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 9212 + }, + { + "epoch": 16.709136250283382, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.1228, + "step": 9213 + }, + { + "epoch": 16.710949897982317, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 9214 + }, + { + "epoch": 16.712763545681252, + "grad_norm": 0.396484375, + "learning_rate": 0.0002, + "loss": 0.1168, + "step": 9215 + }, + { + "epoch": 16.714577193380187, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1078, + "step": 9216 + }, + { + "epoch": 16.71639084107912, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 9217 + }, + { + "epoch": 16.718204488778056, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.1129, + "step": 9218 + }, + { + "epoch": 16.720018136476988, + "grad_norm": 0.330078125, + "learning_rate": 0.0002, + "loss": 0.1248, + "step": 9219 + }, + { + "epoch": 16.721831784175922, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.15, + "step": 9220 + }, + { + "epoch": 16.723645431874857, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.1518, + "step": 9221 + }, + { + "epoch": 16.725459079573792, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.2091, + "step": 9222 + }, + { + "epoch": 16.727272727272727, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 9223 + }, + { + "epoch": 16.72908637497166, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 9224 + }, + { + "epoch": 16.730900022670596, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 9225 + }, + { + "epoch": 16.73271367036953, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 9226 + }, + { + "epoch": 16.734527318068466, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 9227 + }, + { + "epoch": 16.7363409657674, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 9228 + }, + { + "epoch": 16.738154613466335, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 9229 + }, + { + "epoch": 16.73996826116527, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 9230 + }, + { + "epoch": 16.741781908864205, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 9231 + }, + { + "epoch": 16.743595556563136, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 9232 + }, + { + "epoch": 16.74540920426207, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 9233 + }, + { + "epoch": 16.747222851961006, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 9234 + }, + { + "epoch": 16.74903649965994, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 9235 + }, + { + "epoch": 16.750850147358875, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0719, + "step": 9236 + }, + { + "epoch": 16.75266379505781, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 9237 + }, + { + "epoch": 16.754477442756745, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0597, + "step": 9238 + }, + { + "epoch": 16.75629109045568, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9239 + }, + { + "epoch": 16.758104738154614, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 9240 + }, + { + "epoch": 16.75991838585355, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 9241 + }, + { + "epoch": 16.761732033552484, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0643, + "step": 9242 + }, + { + "epoch": 16.76354568125142, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0665, + "step": 9243 + }, + { + "epoch": 16.76535932895035, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 9244 + }, + { + "epoch": 16.767172976649285, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 9245 + }, + { + "epoch": 16.76898662434822, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 9246 + }, + { + "epoch": 16.770800272047154, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 9247 + }, + { + "epoch": 16.77261391974609, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 9248 + }, + { + "epoch": 16.774427567445024, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 9249 + }, + { + "epoch": 16.77624121514396, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 9250 + }, + { + "epoch": 16.778054862842893, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0721, + "step": 9251 + }, + { + "epoch": 16.779868510541828, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9252 + }, + { + "epoch": 16.781682158240763, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 9253 + }, + { + "epoch": 16.783495805939697, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 9254 + }, + { + "epoch": 16.785309453638632, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 9255 + }, + { + "epoch": 16.787123101337563, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 9256 + }, + { + "epoch": 16.7889367490365, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 9257 + }, + { + "epoch": 16.790750396735433, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 9258 + }, + { + "epoch": 16.792564044434368, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.086, + "step": 9259 + }, + { + "epoch": 16.794377692133303, + "grad_norm": 0.3515625, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 9260 + }, + { + "epoch": 16.796191339832237, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 9261 + }, + { + "epoch": 16.798004987531172, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 9262 + }, + { + "epoch": 16.799818635230107, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0817, + "step": 9263 + }, + { + "epoch": 16.80163228292904, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 9264 + }, + { + "epoch": 16.803445930627976, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 9265 + }, + { + "epoch": 16.80525957832691, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0948, + "step": 9266 + }, + { + "epoch": 16.807073226025846, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.1006, + "step": 9267 + }, + { + "epoch": 16.80888687372478, + "grad_norm": 0.408203125, + "learning_rate": 0.0002, + "loss": 0.1199, + "step": 9268 + }, + { + "epoch": 16.810700521423712, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.1124, + "step": 9269 + }, + { + "epoch": 16.812514169122647, + "grad_norm": 0.453125, + "learning_rate": 0.0002, + "loss": 0.1371, + "step": 9270 + }, + { + "epoch": 16.81432781682158, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.1798, + "step": 9271 + }, + { + "epoch": 16.816141464520516, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.2193, + "step": 9272 + }, + { + "epoch": 16.81795511221945, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 9273 + }, + { + "epoch": 16.819768759918386, + "grad_norm": 0.2373046875, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 9274 + }, + { + "epoch": 16.82158240761732, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 9275 + }, + { + "epoch": 16.823396055316255, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 9276 + }, + { + "epoch": 16.82520970301519, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 9277 + }, + { + "epoch": 16.827023350714125, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 9278 + }, + { + "epoch": 16.82883699841306, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 9279 + }, + { + "epoch": 16.830650646111994, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 9280 + }, + { + "epoch": 16.832464293810926, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 9281 + }, + { + "epoch": 16.83427794150986, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 9282 + }, + { + "epoch": 16.836091589208795, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 9283 + }, + { + "epoch": 16.83790523690773, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 9284 + }, + { + "epoch": 16.839718884606665, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 9285 + }, + { + "epoch": 16.8415325323056, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1056, + "step": 9286 + }, + { + "epoch": 16.843346180004534, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 9287 + }, + { + "epoch": 16.84515982770347, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 9288 + }, + { + "epoch": 16.846973475402404, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 9289 + }, + { + "epoch": 16.84878712310134, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 9290 + }, + { + "epoch": 16.850600770800273, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 9291 + }, + { + "epoch": 16.852414418499208, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 9292 + }, + { + "epoch": 16.85422806619814, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 9293 + }, + { + "epoch": 16.856041713897074, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 9294 + }, + { + "epoch": 16.85785536159601, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 9295 + }, + { + "epoch": 16.859669009294944, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9296 + }, + { + "epoch": 16.86148265699388, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 9297 + }, + { + "epoch": 16.863296304692813, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9298 + }, + { + "epoch": 16.865109952391748, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 9299 + }, + { + "epoch": 16.866923600090683, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 9300 + }, + { + "epoch": 16.868737247789618, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 9301 + }, + { + "epoch": 16.870550895488552, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 9302 + }, + { + "epoch": 16.872364543187487, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 9303 + }, + { + "epoch": 16.874178190886422, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 9304 + }, + { + "epoch": 16.875991838585353, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.1011, + "step": 9305 + }, + { + "epoch": 16.877805486284288, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.0828, + "step": 9306 + }, + { + "epoch": 16.879619133983223, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 9307 + }, + { + "epoch": 16.881432781682157, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 9308 + }, + { + "epoch": 16.883246429381092, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 9309 + }, + { + "epoch": 16.885060077080027, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0911, + "step": 9310 + }, + { + "epoch": 16.88687372477896, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 9311 + }, + { + "epoch": 16.888687372477897, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.0964, + "step": 9312 + }, + { + "epoch": 16.89050102017683, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 9313 + }, + { + "epoch": 16.892314667875766, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.0918, + "step": 9314 + }, + { + "epoch": 16.8941283155747, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1026, + "step": 9315 + }, + { + "epoch": 16.895941963273636, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1145, + "step": 9316 + }, + { + "epoch": 16.897755610972567, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.107, + "step": 9317 + }, + { + "epoch": 16.8995692586715, + "grad_norm": 0.41796875, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 9318 + }, + { + "epoch": 16.901382906370436, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 9319 + }, + { + "epoch": 16.90319655406937, + "grad_norm": 0.369140625, + "learning_rate": 0.0002, + "loss": 0.1548, + "step": 9320 + }, + { + "epoch": 16.905010201768306, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1858, + "step": 9321 + }, + { + "epoch": 16.90682384946724, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.2059, + "step": 9322 + }, + { + "epoch": 16.908637497166175, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 9323 + }, + { + "epoch": 16.91045114486511, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.0962, + "step": 9324 + }, + { + "epoch": 16.912264792564045, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 9325 + }, + { + "epoch": 16.91407844026298, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 9326 + }, + { + "epoch": 16.915892087961915, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0925, + "step": 9327 + }, + { + "epoch": 16.91770573566085, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.094, + "step": 9328 + }, + { + "epoch": 16.919519383359784, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 9329 + }, + { + "epoch": 16.921333031058715, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9330 + }, + { + "epoch": 16.92314667875765, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 9331 + }, + { + "epoch": 16.924960326456585, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 9332 + }, + { + "epoch": 16.92677397415552, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 9333 + }, + { + "epoch": 16.928587621854454, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 9334 + }, + { + "epoch": 16.93040126955339, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 9335 + }, + { + "epoch": 16.932214917252324, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 9336 + }, + { + "epoch": 16.93402856495126, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 9337 + }, + { + "epoch": 16.935842212650194, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 9338 + }, + { + "epoch": 16.93765586034913, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 9339 + }, + { + "epoch": 16.939469508048063, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 9340 + }, + { + "epoch": 16.941283155746998, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 9341 + }, + { + "epoch": 16.94309680344593, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 9342 + }, + { + "epoch": 16.944910451144864, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 9343 + }, + { + "epoch": 16.9467240988438, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 9344 + }, + { + "epoch": 16.948537746542733, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 9345 + }, + { + "epoch": 16.950351394241668, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 9346 + }, + { + "epoch": 16.952165041940603, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0635, + "step": 9347 + }, + { + "epoch": 16.953978689639538, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 9348 + }, + { + "epoch": 16.955792337338472, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0916, + "step": 9349 + }, + { + "epoch": 16.957605985037407, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 9350 + }, + { + "epoch": 16.957605985037407, + "eval_loss": 2.5857815742492676, + "eval_runtime": 152.5417, + "eval_samples_per_second": 6.556, + "eval_steps_per_second": 6.556, + "step": 9350 + }, + { + "epoch": 16.957605985037407, + "mmlu_eval_accuracy": 0.2940650867999455, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.21428571428571427, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, + "mmlu_eval_accuracy_college_physics": 0.2727272727272727, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, + "mmlu_eval_accuracy_econometrics": 0.08333333333333333, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.2, + "mmlu_eval_accuracy_high_school_biology": 0.375, + "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, + "mmlu_eval_accuracy_high_school_mathematics": 0.06896551724137931, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.6470588235294118, + "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.11538461538461539, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.45454545454545453, + "mmlu_eval_accuracy_marketing": 0.44, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.23684210526315788, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.3235294117647059, + "mmlu_eval_accuracy_prehistory": 0.34285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, + "mmlu_eval_accuracy_professional_law": 0.27058823529411763, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.2962962962962963, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.3333333333333333, + "mmlu_eval_accuracy_world_religions": 0.2631578947368421, + "mmlu_loss": 2.405745451081274, + "step": 9350 + }, + { + "epoch": 16.959419632736342, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 9351 + }, + { + "epoch": 16.961233280435277, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 9352 + }, + { + "epoch": 16.96304692813421, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 9353 + }, + { + "epoch": 16.964860575833143, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1038, + "step": 9354 + }, + { + "epoch": 16.966674223532078, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 9355 + }, + { + "epoch": 16.968487871231012, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 9356 + }, + { + "epoch": 16.970301518929947, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 9357 + }, + { + "epoch": 16.972115166628882, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 9358 + }, + { + "epoch": 16.973928814327817, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 9359 + }, + { + "epoch": 16.97574246202675, + "grad_norm": 0.33984375, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 9360 + }, + { + "epoch": 16.977556109725686, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 9361 + }, + { + "epoch": 16.97936975742462, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0981, + "step": 9362 + }, + { + "epoch": 16.981183405123556, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.11, + "step": 9363 + }, + { + "epoch": 16.98299705282249, + "grad_norm": 0.40625, + "learning_rate": 0.0002, + "loss": 0.1267, + "step": 9364 + }, + { + "epoch": 16.984810700521425, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0917, + "step": 9365 + }, + { + "epoch": 16.986624348220356, + "grad_norm": 0.43359375, + "learning_rate": 0.0002, + "loss": 0.1075, + "step": 9366 + }, + { + "epoch": 16.98843799591929, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1136, + "step": 9367 + }, + { + "epoch": 16.990251643618226, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.1143, + "step": 9368 + }, + { + "epoch": 16.99206529131716, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.1305, + "step": 9369 + }, + { + "epoch": 16.993878939016096, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002, + "loss": 0.1434, + "step": 9370 + }, + { + "epoch": 16.99569258671503, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1892, + "step": 9371 + }, + { + "epoch": 16.997506234413965, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.2253, + "step": 9372 + }, + { + "epoch": 16.9993198821129, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 9373 + }, + { + "epoch": 17.001133529811835, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002, + "loss": 0.1185, + "step": 9374 + }, + { + "epoch": 17.00294717751077, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.0605, + "step": 9375 + }, + { + "epoch": 17.004760825209704, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0629, + "step": 9376 + }, + { + "epoch": 17.00657447290864, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 9377 + }, + { + "epoch": 17.00838812060757, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 9378 + }, + { + "epoch": 17.010201768306505, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 9379 + }, + { + "epoch": 17.01201541600544, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 9380 + }, + { + "epoch": 17.013829063704375, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 9381 + }, + { + "epoch": 17.01564271140331, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 9382 + }, + { + "epoch": 17.017456359102244, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0592, + "step": 9383 + }, + { + "epoch": 17.01927000680118, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.0561, + "step": 9384 + }, + { + "epoch": 17.021083654500114, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0595, + "step": 9385 + }, + { + "epoch": 17.02289730219905, + "grad_norm": 0.150390625, + "learning_rate": 0.0002, + "loss": 0.0565, + "step": 9386 + }, + { + "epoch": 17.024710949897983, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 9387 + }, + { + "epoch": 17.026524597596918, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0588, + "step": 9388 + }, + { + "epoch": 17.028338245295853, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0604, + "step": 9389 + }, + { + "epoch": 17.030151892994787, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0554, + "step": 9390 + }, + { + "epoch": 17.03196554069372, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.0481, + "step": 9391 + }, + { + "epoch": 17.033779188392653, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0537, + "step": 9392 + }, + { + "epoch": 17.03559283609159, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.0574, + "step": 9393 + }, + { + "epoch": 17.037406483790523, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 9394 + }, + { + "epoch": 17.039220131489458, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0547, + "step": 9395 + }, + { + "epoch": 17.041033779188393, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0558, + "step": 9396 + }, + { + "epoch": 17.042847426887327, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0574, + "step": 9397 + }, + { + "epoch": 17.044661074586262, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0533, + "step": 9398 + }, + { + "epoch": 17.046474722285197, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0573, + "step": 9399 + }, + { + "epoch": 17.04828836998413, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 9400 + }, + { + "epoch": 17.050102017683066, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0597, + "step": 9401 + }, + { + "epoch": 17.051915665382, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 9402 + }, + { + "epoch": 17.053729313080932, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9403 + }, + { + "epoch": 17.055542960779867, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 9404 + }, + { + "epoch": 17.057356608478802, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 9405 + }, + { + "epoch": 17.059170256177737, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 9406 + }, + { + "epoch": 17.06098390387667, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.0564, + "step": 9407 + }, + { + "epoch": 17.062797551575606, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 9408 + }, + { + "epoch": 17.06461119927454, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 9409 + }, + { + "epoch": 17.066424846973476, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 9410 + }, + { + "epoch": 17.06823849467241, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 9411 + }, + { + "epoch": 17.070052142371345, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 9412 + }, + { + "epoch": 17.07186579007028, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 9413 + }, + { + "epoch": 17.073679437769215, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 9414 + }, + { + "epoch": 17.075493085468146, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 9415 + }, + { + "epoch": 17.07730673316708, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.1005, + "step": 9416 + }, + { + "epoch": 17.079120380866016, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.0918, + "step": 9417 + }, + { + "epoch": 17.08093402856495, + "grad_norm": 0.349609375, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 9418 + }, + { + "epoch": 17.082747676263885, + "grad_norm": 0.1015625, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 9419 + }, + { + "epoch": 17.08456132396282, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.1128, + "step": 9420 + }, + { + "epoch": 17.086374971661755, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002, + "loss": 0.1154, + "step": 9421 + }, + { + "epoch": 17.08818861936069, + "grad_norm": 0.470703125, + "learning_rate": 0.0002, + "loss": 0.1637, + "step": 9422 + }, + { + "epoch": 17.090002267059624, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.1581, + "step": 9423 + }, + { + "epoch": 17.09181591475856, + "grad_norm": 0.18359375, + "learning_rate": 0.0002, + "loss": 0.1312, + "step": 9424 + }, + { + "epoch": 17.093629562457494, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 9425 + }, + { + "epoch": 17.09544321015643, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 9426 + }, + { + "epoch": 17.09725685785536, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0616, + "step": 9427 + }, + { + "epoch": 17.099070505554295, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 9428 + }, + { + "epoch": 17.10088415325323, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.058, + "step": 9429 + }, + { + "epoch": 17.102697800952164, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 9430 + }, + { + "epoch": 17.1045114486511, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9431 + }, + { + "epoch": 17.106325096350034, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 9432 + }, + { + "epoch": 17.10813874404897, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 9433 + }, + { + "epoch": 17.109952391747903, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 9434 + }, + { + "epoch": 17.111766039446838, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0602, + "step": 9435 + }, + { + "epoch": 17.113579687145773, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 9436 + }, + { + "epoch": 17.115393334844708, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0577, + "step": 9437 + }, + { + "epoch": 17.117206982543642, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1196, + "step": 9438 + }, + { + "epoch": 17.119020630242577, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 9439 + }, + { + "epoch": 17.12083427794151, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 9440 + }, + { + "epoch": 17.122647925640443, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.054, + "step": 9441 + }, + { + "epoch": 17.124461573339378, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0508, + "step": 9442 + }, + { + "epoch": 17.126275221038313, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.0585, + "step": 9443 + }, + { + "epoch": 17.128088868737247, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 9444 + }, + { + "epoch": 17.129902516436182, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 9445 + }, + { + "epoch": 17.131716164135117, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 9446 + }, + { + "epoch": 17.13352981183405, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 9447 + }, + { + "epoch": 17.135343459532987, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.0564, + "step": 9448 + }, + { + "epoch": 17.13715710723192, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0592, + "step": 9449 + }, + { + "epoch": 17.138970754930856, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0562, + "step": 9450 + }, + { + "epoch": 17.14078440262979, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.0523, + "step": 9451 + }, + { + "epoch": 17.142598050328722, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0585, + "step": 9452 + }, + { + "epoch": 17.144411698027657, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0568, + "step": 9453 + }, + { + "epoch": 17.14622534572659, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 9454 + }, + { + "epoch": 17.148038993425526, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 9455 + }, + { + "epoch": 17.14985264112446, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 9456 + }, + { + "epoch": 17.151666288823396, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0744, + "step": 9457 + }, + { + "epoch": 17.15347993652233, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 9458 + }, + { + "epoch": 17.155293584221265, + "grad_norm": 0.328125, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 9459 + }, + { + "epoch": 17.1571072319202, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9460 + }, + { + "epoch": 17.158920879619135, + "grad_norm": 0.130859375, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 9461 + }, + { + "epoch": 17.16073452731807, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 9462 + }, + { + "epoch": 17.162548175017005, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 9463 + }, + { + "epoch": 17.164361822715936, + "grad_norm": 0.296875, + "learning_rate": 0.0002, + "loss": 0.1037, + "step": 9464 + }, + { + "epoch": 17.16617547041487, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 9465 + }, + { + "epoch": 17.167989118113805, + "grad_norm": 0.462890625, + "learning_rate": 0.0002, + "loss": 0.1, + "step": 9466 + }, + { + "epoch": 17.16980276581274, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 9467 + }, + { + "epoch": 17.171616413511675, + "grad_norm": 0.361328125, + "learning_rate": 0.0002, + "loss": 0.1134, + "step": 9468 + }, + { + "epoch": 17.17343006121061, + "grad_norm": 0.373046875, + "learning_rate": 0.0002, + "loss": 0.1269, + "step": 9469 + }, + { + "epoch": 17.175243708909544, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1363, + "step": 9470 + }, + { + "epoch": 17.17705735660848, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.134, + "step": 9471 + }, + { + "epoch": 17.178871004307414, + "grad_norm": 0.10107421875, + "learning_rate": 0.0002, + "loss": 0.1294, + "step": 9472 + }, + { + "epoch": 17.18068465200635, + "grad_norm": 0.353515625, + "learning_rate": 0.0002, + "loss": 0.1785, + "step": 9473 + }, + { + "epoch": 17.182498299705284, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 9474 + }, + { + "epoch": 17.18431194740422, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 9475 + }, + { + "epoch": 17.18612559510315, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9476 + }, + { + "epoch": 17.187939242802084, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 9477 + }, + { + "epoch": 17.18975289050102, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 9478 + }, + { + "epoch": 17.191566538199954, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0512, + "step": 9479 + }, + { + "epoch": 17.19338018589889, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0624, + "step": 9480 + }, + { + "epoch": 17.195193833597823, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 9481 + }, + { + "epoch": 17.197007481296758, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 9482 + }, + { + "epoch": 17.198821128995693, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 9483 + }, + { + "epoch": 17.200634776694628, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 9484 + }, + { + "epoch": 17.202448424393562, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 9485 + }, + { + "epoch": 17.204262072092497, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 9486 + }, + { + "epoch": 17.206075719791432, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0585, + "step": 9487 + }, + { + "epoch": 17.207889367490363, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 9488 + }, + { + "epoch": 17.209703015189298, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 9489 + }, + { + "epoch": 17.211516662888233, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002, + "loss": 0.058, + "step": 9490 + }, + { + "epoch": 17.213330310587168, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 9491 + }, + { + "epoch": 17.215143958286102, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 9492 + }, + { + "epoch": 17.216957605985037, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9493 + }, + { + "epoch": 17.218771253683972, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 9494 + }, + { + "epoch": 17.220584901382907, + "grad_norm": 0.1904296875, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9495 + }, + { + "epoch": 17.22239854908184, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0571, + "step": 9496 + }, + { + "epoch": 17.224212196780776, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 9497 + }, + { + "epoch": 17.22602584447971, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0578, + "step": 9498 + }, + { + "epoch": 17.227839492178646, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 9499 + }, + { + "epoch": 17.22965313987758, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 9500 + }, + { + "epoch": 17.23146678757651, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 9501 + }, + { + "epoch": 17.233280435275447, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.1004, + "step": 9502 + }, + { + "epoch": 17.23509408297438, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 9503 + }, + { + "epoch": 17.236907730673316, + "grad_norm": 0.14453125, + "learning_rate": 0.0002, + "loss": 0.0564, + "step": 9504 + }, + { + "epoch": 17.23872137837225, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 9505 + }, + { + "epoch": 17.240535026071186, + "grad_norm": 0.326171875, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 9506 + }, + { + "epoch": 17.24234867377012, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 9507 + }, + { + "epoch": 17.244162321469055, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 9508 + }, + { + "epoch": 17.24597596916799, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 9509 + }, + { + "epoch": 17.247789616866925, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 9510 + }, + { + "epoch": 17.24960326456586, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9511 + }, + { + "epoch": 17.251416912264794, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 9512 + }, + { + "epoch": 17.253230559963725, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0968, + "step": 9513 + }, + { + "epoch": 17.25504420766266, + "grad_norm": 0.162109375, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 9514 + }, + { + "epoch": 17.256857855361595, + "grad_norm": 0.34765625, + "learning_rate": 0.0002, + "loss": 0.096, + "step": 9515 + }, + { + "epoch": 17.25867150306053, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0918, + "step": 9516 + }, + { + "epoch": 17.260485150759465, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 9517 + }, + { + "epoch": 17.2622987984584, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.1123, + "step": 9518 + }, + { + "epoch": 17.264112446157334, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1201, + "step": 9519 + }, + { + "epoch": 17.26592609385627, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002, + "loss": 0.1184, + "step": 9520 + }, + { + "epoch": 17.267739741555204, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.1234, + "step": 9521 + }, + { + "epoch": 17.26955338925414, + "grad_norm": 0.140625, + "learning_rate": 0.0002, + "loss": 0.1427, + "step": 9522 + }, + { + "epoch": 17.271367036953073, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1567, + "step": 9523 + }, + { + "epoch": 17.273180684652008, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.1195, + "step": 9524 + }, + { + "epoch": 17.27499433235094, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 9525 + }, + { + "epoch": 17.276807980049874, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 9526 + }, + { + "epoch": 17.27862162774881, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 9527 + }, + { + "epoch": 17.280435275447743, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 9528 + }, + { + "epoch": 17.28224892314668, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 9529 + }, + { + "epoch": 17.284062570845613, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 9530 + }, + { + "epoch": 17.285876218544548, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 9531 + }, + { + "epoch": 17.287689866243483, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 9532 + }, + { + "epoch": 17.289503513942417, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 9533 + }, + { + "epoch": 17.291317161641352, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 9534 + }, + { + "epoch": 17.293130809340287, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0623, + "step": 9535 + }, + { + "epoch": 17.29494445703922, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 9536 + }, + { + "epoch": 17.296758104738153, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 9537 + }, + { + "epoch": 17.296758104738153, + "eval_loss": 2.558560609817505, + "eval_runtime": 152.4323, + "eval_samples_per_second": 6.56, + "eval_steps_per_second": 6.56, + "step": 9537 + }, + { + "epoch": 17.296758104738153, + "mmlu_eval_accuracy": 0.29845953476389775, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.2857142857142857, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.3125, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.45454545454545453, + "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.1875, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, + "mmlu_eval_accuracy_high_school_mathematics": 0.06896551724137931, + "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, + "mmlu_eval_accuracy_high_school_physics": 0.5294117647058824, + "mmlu_eval_accuracy_high_school_psychology": 0.4, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, + "mmlu_eval_accuracy_human_aging": 0.2608695652173913, + "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.52, + "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.37142857142857144, + "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, + "mmlu_eval_accuracy_professional_law": 0.28823529411764703, + "mmlu_eval_accuracy_professional_medicine": 0.1935483870967742, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.25925925925925924, + "mmlu_eval_accuracy_sociology": 0.4090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, + "mmlu_eval_accuracy_virology": 0.2222222222222222, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.320738100542265, + "step": 9537 + }, + { + "epoch": 17.298571752437088, + "grad_norm": 0.171875, + "learning_rate": 0.0002, + "loss": 0.0593, + "step": 9538 + }, + { + "epoch": 17.300385400136022, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0566, + "step": 9539 + }, + { + "epoch": 17.302199047834957, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 9540 + }, + { + "epoch": 17.304012695533892, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0579, + "step": 9541 + }, + { + "epoch": 17.305826343232827, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0586, + "step": 9542 + }, + { + "epoch": 17.30763999093176, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 9543 + }, + { + "epoch": 17.309453638630696, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0602, + "step": 9544 + }, + { + "epoch": 17.31126728632963, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002, + "loss": 0.061, + "step": 9545 + }, + { + "epoch": 17.313080934028566, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0602, + "step": 9546 + }, + { + "epoch": 17.3148945817275, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0557, + "step": 9547 + }, + { + "epoch": 17.316708229426435, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0599, + "step": 9548 + }, + { + "epoch": 17.31852187712537, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0582, + "step": 9549 + }, + { + "epoch": 17.3203355248243, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0586, + "step": 9550 + }, + { + "epoch": 17.322149172523236, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 9551 + }, + { + "epoch": 17.32396282022217, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9552 + }, + { + "epoch": 17.325776467921106, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 9553 + }, + { + "epoch": 17.32759011562004, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 9554 + }, + { + "epoch": 17.329403763318975, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 9555 + }, + { + "epoch": 17.33121741101791, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 9556 + }, + { + "epoch": 17.333031058716845, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 9557 + }, + { + "epoch": 17.33484470641578, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 9558 + }, + { + "epoch": 17.336658354114714, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 9559 + }, + { + "epoch": 17.33847200181365, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 9560 + }, + { + "epoch": 17.340285649512584, + "grad_norm": 0.25, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 9561 + }, + { + "epoch": 17.342099297211515, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 9562 + }, + { + "epoch": 17.34391294491045, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 9563 + }, + { + "epoch": 17.345726592609385, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 9564 + }, + { + "epoch": 17.34754024030832, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 9565 + }, + { + "epoch": 17.349353888007254, + "grad_norm": 0.447265625, + "learning_rate": 0.0002, + "loss": 0.1125, + "step": 9566 + }, + { + "epoch": 17.35116753570619, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0973, + "step": 9567 + }, + { + "epoch": 17.352981183405124, + "grad_norm": 0.341796875, + "learning_rate": 0.0002, + "loss": 0.1084, + "step": 9568 + }, + { + "epoch": 17.35479483110406, + "grad_norm": 0.138671875, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 9569 + }, + { + "epoch": 17.356608478802993, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.1324, + "step": 9570 + }, + { + "epoch": 17.358422126501928, + "grad_norm": 0.125, + "learning_rate": 0.0002, + "loss": 0.115, + "step": 9571 + }, + { + "epoch": 17.360235774200863, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002, + "loss": 0.1472, + "step": 9572 + }, + { + "epoch": 17.362049421899798, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.1913, + "step": 9573 + }, + { + "epoch": 17.36386306959873, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.1607, + "step": 9574 + }, + { + "epoch": 17.365676717297664, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 9575 + }, + { + "epoch": 17.3674903649966, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9576 + }, + { + "epoch": 17.369304012695533, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9577 + }, + { + "epoch": 17.371117660394468, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 9578 + }, + { + "epoch": 17.372931308093403, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0914, + "step": 9579 + }, + { + "epoch": 17.374744955792337, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 9580 + }, + { + "epoch": 17.376558603491272, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 9581 + }, + { + "epoch": 17.378372251190207, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 9582 + }, + { + "epoch": 17.380185898889142, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 9583 + }, + { + "epoch": 17.381999546588077, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 9584 + }, + { + "epoch": 17.38381319428701, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 9585 + }, + { + "epoch": 17.385626841985943, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 9586 + }, + { + "epoch": 17.387440489684877, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0644, + "step": 9587 + }, + { + "epoch": 17.389254137383812, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 9588 + }, + { + "epoch": 17.391067785082747, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 9589 + }, + { + "epoch": 17.39288143278168, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 9590 + }, + { + "epoch": 17.394695080480616, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 9591 + }, + { + "epoch": 17.39650872817955, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0587, + "step": 9592 + }, + { + "epoch": 17.398322375878486, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 9593 + }, + { + "epoch": 17.40013602357742, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 9594 + }, + { + "epoch": 17.401949671276356, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 9595 + }, + { + "epoch": 17.40376331897529, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9596 + }, + { + "epoch": 17.405576966674225, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 9597 + }, + { + "epoch": 17.407390614373156, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0571, + "step": 9598 + }, + { + "epoch": 17.40920426207209, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 9599 + }, + { + "epoch": 17.411017909771026, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 9600 + }, + { + "epoch": 17.41283155746996, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 9601 + }, + { + "epoch": 17.414645205168895, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9602 + }, + { + "epoch": 17.41645885286783, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0618, + "step": 9603 + }, + { + "epoch": 17.418272500566765, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9604 + }, + { + "epoch": 17.4200861482657, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 9605 + }, + { + "epoch": 17.421899795964634, + "grad_norm": 0.26171875, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 9606 + }, + { + "epoch": 17.42371344366357, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 9607 + }, + { + "epoch": 17.425527091362504, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 9608 + }, + { + "epoch": 17.42734073906144, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 9609 + }, + { + "epoch": 17.429154386760374, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 9610 + }, + { + "epoch": 17.430968034459305, + "grad_norm": 0.294921875, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 9611 + }, + { + "epoch": 17.43278168215824, + "grad_norm": 0.26953125, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 9612 + }, + { + "epoch": 17.434595329857174, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 9613 + }, + { + "epoch": 17.43640897755611, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 9614 + }, + { + "epoch": 17.438222625255044, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 9615 + }, + { + "epoch": 17.44003627295398, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 9616 + }, + { + "epoch": 17.441849920652913, + "grad_norm": 0.419921875, + "learning_rate": 0.0002, + "loss": 0.1149, + "step": 9617 + }, + { + "epoch": 17.443663568351848, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.1021, + "step": 9618 + }, + { + "epoch": 17.445477216050783, + "grad_norm": 0.49609375, + "learning_rate": 0.0002, + "loss": 0.1004, + "step": 9619 + }, + { + "epoch": 17.447290863749718, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.118, + "step": 9620 + }, + { + "epoch": 17.449104511448652, + "grad_norm": 0.462890625, + "learning_rate": 0.0002, + "loss": 0.1537, + "step": 9621 + }, + { + "epoch": 17.450918159147587, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.1406, + "step": 9622 + }, + { + "epoch": 17.45273180684652, + "grad_norm": 0.1064453125, + "learning_rate": 0.0002, + "loss": 0.1579, + "step": 9623 + }, + { + "epoch": 17.454545454545453, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.1624, + "step": 9624 + }, + { + "epoch": 17.456359102244388, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 9625 + }, + { + "epoch": 17.458172749943323, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 9626 + }, + { + "epoch": 17.459986397642258, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 9627 + }, + { + "epoch": 17.461800045341192, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 9628 + }, + { + "epoch": 17.463613693040127, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 9629 + }, + { + "epoch": 17.465427340739062, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 9630 + }, + { + "epoch": 17.467240988437997, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.069, + "step": 9631 + }, + { + "epoch": 17.46905463613693, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0653, + "step": 9632 + }, + { + "epoch": 17.470868283835866, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 9633 + }, + { + "epoch": 17.4726819315348, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9634 + }, + { + "epoch": 17.474495579233732, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 9635 + }, + { + "epoch": 17.476309226932667, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 9636 + }, + { + "epoch": 17.4781228746316, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 9637 + }, + { + "epoch": 17.479936522330537, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 9638 + }, + { + "epoch": 17.48175017002947, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 9639 + }, + { + "epoch": 17.483563817728406, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 9640 + }, + { + "epoch": 17.48537746542734, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 9641 + }, + { + "epoch": 17.487191113126276, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 9642 + }, + { + "epoch": 17.48900476082521, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0601, + "step": 9643 + }, + { + "epoch": 17.490818408524145, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 9644 + }, + { + "epoch": 17.49263205622308, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 9645 + }, + { + "epoch": 17.494445703922015, + "grad_norm": 0.1640625, + "learning_rate": 0.0002, + "loss": 0.0545, + "step": 9646 + }, + { + "epoch": 17.496259351620946, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 9647 + }, + { + "epoch": 17.49807299931988, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 9648 + }, + { + "epoch": 17.499886647018815, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 9649 + }, + { + "epoch": 17.50170029471775, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 9650 + }, + { + "epoch": 17.503513942416685, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 9651 + }, + { + "epoch": 17.50532759011562, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 9652 + }, + { + "epoch": 17.507141237814555, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 9653 + }, + { + "epoch": 17.50895488551349, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 9654 + }, + { + "epoch": 17.510768533212424, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 9655 + }, + { + "epoch": 17.51258218091136, + "grad_norm": 0.306640625, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 9656 + }, + { + "epoch": 17.514395828610294, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 9657 + }, + { + "epoch": 17.51620947630923, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9658 + }, + { + "epoch": 17.51802312400816, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 9659 + }, + { + "epoch": 17.519836771707094, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 9660 + }, + { + "epoch": 17.52165041940603, + "grad_norm": 0.240234375, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 9661 + }, + { + "epoch": 17.523464067104964, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 9662 + }, + { + "epoch": 17.5252777148039, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 9663 + }, + { + "epoch": 17.527091362502834, + "grad_norm": 0.412109375, + "learning_rate": 0.0002, + "loss": 0.1399, + "step": 9664 + }, + { + "epoch": 17.52890501020177, + "grad_norm": 0.29296875, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 9665 + }, + { + "epoch": 17.530718657900703, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 9666 + }, + { + "epoch": 17.532532305599638, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 9667 + }, + { + "epoch": 17.534345953298573, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 9668 + }, + { + "epoch": 17.536159600997507, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1079, + "step": 9669 + }, + { + "epoch": 17.537973248696442, + "grad_norm": 0.15625, + "learning_rate": 0.0002, + "loss": 0.0978, + "step": 9670 + }, + { + "epoch": 17.539786896395377, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.148, + "step": 9671 + }, + { + "epoch": 17.541600544094308, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.1294, + "step": 9672 + }, + { + "epoch": 17.543414191793243, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002, + "loss": 0.1746, + "step": 9673 + }, + { + "epoch": 17.545227839492178, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.1462, + "step": 9674 + }, + { + "epoch": 17.547041487191112, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 9675 + }, + { + "epoch": 17.548855134890047, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 9676 + }, + { + "epoch": 17.550668782588982, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 9677 + }, + { + "epoch": 17.552482430287917, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 9678 + }, + { + "epoch": 17.55429607798685, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 9679 + }, + { + "epoch": 17.556109725685786, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 9680 + }, + { + "epoch": 17.55792337338472, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 9681 + }, + { + "epoch": 17.559737021083656, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 9682 + }, + { + "epoch": 17.56155066878259, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 9683 + }, + { + "epoch": 17.563364316481522, + "grad_norm": 0.177734375, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 9684 + }, + { + "epoch": 17.565177964180457, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 9685 + }, + { + "epoch": 17.56699161187939, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9686 + }, + { + "epoch": 17.568805259578326, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 9687 + }, + { + "epoch": 17.57061890727726, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0629, + "step": 9688 + }, + { + "epoch": 17.572432554976196, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0577, + "step": 9689 + }, + { + "epoch": 17.57424620267513, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 9690 + }, + { + "epoch": 17.576059850374065, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 9691 + }, + { + "epoch": 17.577873498073, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 9692 + }, + { + "epoch": 17.579687145771935, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9693 + }, + { + "epoch": 17.58150079347087, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 9694 + }, + { + "epoch": 17.583314441169804, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0616, + "step": 9695 + }, + { + "epoch": 17.585128088868736, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.0573, + "step": 9696 + }, + { + "epoch": 17.58694173656767, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 9697 + }, + { + "epoch": 17.588755384266605, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9698 + }, + { + "epoch": 17.59056903196554, + "grad_norm": 0.2197265625, + "learning_rate": 0.0002, + "loss": 0.0618, + "step": 9699 + }, + { + "epoch": 17.592382679664475, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 9700 + }, + { + "epoch": 17.59419632736341, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 9701 + }, + { + "epoch": 17.596009975062344, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 9702 + }, + { + "epoch": 17.59782362276128, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 9703 + }, + { + "epoch": 17.599637270460214, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0644, + "step": 9704 + }, + { + "epoch": 17.60145091815915, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9705 + }, + { + "epoch": 17.603264565858083, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 9706 + }, + { + "epoch": 17.605078213557018, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 9707 + }, + { + "epoch": 17.606891861255953, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 9708 + }, + { + "epoch": 17.608705508954884, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9709 + }, + { + "epoch": 17.61051915665382, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 9710 + }, + { + "epoch": 17.612332804352754, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 9711 + }, + { + "epoch": 17.61414645205169, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 9712 + }, + { + "epoch": 17.615960099750623, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 9713 + }, + { + "epoch": 17.617773747449558, + "grad_norm": 0.419921875, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 9714 + }, + { + "epoch": 17.619587395148493, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0892, + "step": 9715 + }, + { + "epoch": 17.621401042847427, + "grad_norm": 0.1796875, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 9716 + }, + { + "epoch": 17.623214690546362, + "grad_norm": 0.302734375, + "learning_rate": 0.0002, + "loss": 0.0956, + "step": 9717 + }, + { + "epoch": 17.625028338245297, + "grad_norm": 0.3984375, + "learning_rate": 0.0002, + "loss": 0.1024, + "step": 9718 + }, + { + "epoch": 17.626841985944232, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.1059, + "step": 9719 + }, + { + "epoch": 17.628655633643163, + "grad_norm": 0.115234375, + "learning_rate": 0.0002, + "loss": 0.0971, + "step": 9720 + }, + { + "epoch": 17.630469281342098, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.1239, + "step": 9721 + }, + { + "epoch": 17.632282929041033, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002, + "loss": 0.1485, + "step": 9722 + }, + { + "epoch": 17.634096576739967, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.1911, + "step": 9723 + }, + { + "epoch": 17.635910224438902, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.131, + "step": 9724 + }, + { + "epoch": 17.635910224438902, + "eval_loss": 2.58010196685791, + "eval_runtime": 152.627, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 6.552, + "step": 9724 + }, + { + "epoch": 17.635910224438902, + "mmlu_eval_accuracy": 0.3101326320552257, + "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, + "mmlu_eval_accuracy_anatomy": 0.35714285714285715, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.125, + "mmlu_eval_accuracy_college_computer_science": 0.0, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, + "mmlu_eval_accuracy_high_school_physics": 0.5882352941176471, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, + "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.30434782608695654, + "mmlu_eval_accuracy_human_sexuality": 0.25, + "mmlu_eval_accuracy_international_law": 0.38461538461538464, + "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.48, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.5, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.2727272727272727, + "mmlu_eval_accuracy_philosophy": 0.35294117647058826, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, + "mmlu_eval_accuracy_professional_law": 0.3176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, + "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5454545454545454, + "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.2833776471176463, + "step": 9724 + }, + { + "epoch": 17.637723872137837, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0671, + "step": 9725 + }, + { + "epoch": 17.63953751983677, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 9726 + }, + { + "epoch": 17.641351167535706, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9727 + }, + { + "epoch": 17.64316481523464, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 9728 + }, + { + "epoch": 17.644978462933576, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 9729 + }, + { + "epoch": 17.64679211063251, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0719, + "step": 9730 + }, + { + "epoch": 17.648605758331446, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 9731 + }, + { + "epoch": 17.65041940603038, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0814, + "step": 9732 + }, + { + "epoch": 17.65223305372931, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 9733 + }, + { + "epoch": 17.654046701428246, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 9734 + }, + { + "epoch": 17.65586034912718, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 9735 + }, + { + "epoch": 17.657673996826116, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 9736 + }, + { + "epoch": 17.65948764452505, + "grad_norm": 0.216796875, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 9737 + }, + { + "epoch": 17.661301292223985, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9738 + }, + { + "epoch": 17.66311493992292, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 9739 + }, + { + "epoch": 17.664928587621855, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 9740 + }, + { + "epoch": 17.66674223532079, + "grad_norm": 0.185546875, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 9741 + }, + { + "epoch": 17.668555883019724, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 9742 + }, + { + "epoch": 17.67036953071866, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 9743 + }, + { + "epoch": 17.672183178417594, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 9744 + }, + { + "epoch": 17.673996826116525, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0566, + "step": 9745 + }, + { + "epoch": 17.67581047381546, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 9746 + }, + { + "epoch": 17.677624121514395, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 9747 + }, + { + "epoch": 17.67943776921333, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 9748 + }, + { + "epoch": 17.681251416912264, + "grad_norm": 0.267578125, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 9749 + }, + { + "epoch": 17.6830650646112, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 9750 + }, + { + "epoch": 17.684878712310134, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 9751 + }, + { + "epoch": 17.68669236000907, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 9752 + }, + { + "epoch": 17.688506007708003, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 9753 + }, + { + "epoch": 17.690319655406938, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 9754 + }, + { + "epoch": 17.692133303105873, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 9755 + }, + { + "epoch": 17.693946950804808, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 9756 + }, + { + "epoch": 17.69576059850374, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 9757 + }, + { + "epoch": 17.697574246202674, + "grad_norm": 0.2255859375, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 9758 + }, + { + "epoch": 17.69938789390161, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9759 + }, + { + "epoch": 17.701201541600543, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 9760 + }, + { + "epoch": 17.703015189299478, + "grad_norm": 0.298828125, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 9761 + }, + { + "epoch": 17.704828836998413, + "grad_norm": 0.33203125, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 9762 + }, + { + "epoch": 17.706642484697348, + "grad_norm": 0.392578125, + "learning_rate": 0.0002, + "loss": 0.0998, + "step": 9763 + }, + { + "epoch": 17.708456132396282, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 9764 + }, + { + "epoch": 17.710269780095217, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0931, + "step": 9765 + }, + { + "epoch": 17.712083427794152, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0982, + "step": 9766 + }, + { + "epoch": 17.713897075493087, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0991, + "step": 9767 + }, + { + "epoch": 17.71571072319202, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.1004, + "step": 9768 + }, + { + "epoch": 17.717524370890956, + "grad_norm": 0.314453125, + "learning_rate": 0.0002, + "loss": 0.1244, + "step": 9769 + }, + { + "epoch": 17.719338018589887, + "grad_norm": 0.318359375, + "learning_rate": 0.0002, + "loss": 0.1232, + "step": 9770 + }, + { + "epoch": 17.721151666288822, + "grad_norm": 0.17578125, + "learning_rate": 0.0002, + "loss": 0.1309, + "step": 9771 + }, + { + "epoch": 17.722965313987757, + "grad_norm": 0.16015625, + "learning_rate": 0.0002, + "loss": 0.122, + "step": 9772 + }, + { + "epoch": 17.72477896168669, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002, + "loss": 0.2105, + "step": 9773 + }, + { + "epoch": 17.726592609385627, + "grad_norm": 0.28125, + "learning_rate": 0.0002, + "loss": 0.161, + "step": 9774 + }, + { + "epoch": 17.72840625708456, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 9775 + }, + { + "epoch": 17.730219904783496, + "grad_norm": 0.234375, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 9776 + }, + { + "epoch": 17.73203355248243, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0671, + "step": 9777 + }, + { + "epoch": 17.733847200181366, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 9778 + }, + { + "epoch": 17.7356608478803, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 9779 + }, + { + "epoch": 17.737474495579235, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 9780 + }, + { + "epoch": 17.739288143278166, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 9781 + }, + { + "epoch": 17.7411017909771, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 9782 + }, + { + "epoch": 17.742915438676036, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 9783 + }, + { + "epoch": 17.74472908637497, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 9784 + }, + { + "epoch": 17.746542734073905, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 9785 + }, + { + "epoch": 17.74835638177284, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 9786 + }, + { + "epoch": 17.750170029471775, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9787 + }, + { + "epoch": 17.75198367717071, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 9788 + }, + { + "epoch": 17.753797324869645, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 9789 + }, + { + "epoch": 17.75561097256858, + "grad_norm": 0.244140625, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 9790 + }, + { + "epoch": 17.757424620267514, + "grad_norm": 0.23046875, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 9791 + }, + { + "epoch": 17.75923826796645, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9792 + }, + { + "epoch": 17.761051915665384, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 9793 + }, + { + "epoch": 17.762865563364315, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 9794 + }, + { + "epoch": 17.76467921106325, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 9795 + }, + { + "epoch": 17.766492858762184, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 9796 + }, + { + "epoch": 17.76830650646112, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 9797 + }, + { + "epoch": 17.770120154160054, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 9798 + }, + { + "epoch": 17.77193380185899, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 9799 + }, + { + "epoch": 17.773747449557924, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 9800 + }, + { + "epoch": 17.77556109725686, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 9801 + }, + { + "epoch": 17.777374744955793, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 9802 + }, + { + "epoch": 17.779188392654728, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0714, + "step": 9803 + }, + { + "epoch": 17.781002040353663, + "grad_norm": 0.30078125, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 9804 + }, + { + "epoch": 17.782815688052597, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 9805 + }, + { + "epoch": 17.78462933575153, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 9806 + }, + { + "epoch": 17.786442983450463, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 9807 + }, + { + "epoch": 17.788256631149398, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0721, + "step": 9808 + }, + { + "epoch": 17.790070278848333, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002, + "loss": 0.069, + "step": 9809 + }, + { + "epoch": 17.791883926547268, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 9810 + }, + { + "epoch": 17.793697574246202, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 9811 + }, + { + "epoch": 17.795511221945137, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 9812 + }, + { + "epoch": 17.797324869644072, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0894, + "step": 9813 + }, + { + "epoch": 17.799138517343007, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 9814 + }, + { + "epoch": 17.80095216504194, + "grad_norm": 0.48828125, + "learning_rate": 0.0002, + "loss": 0.0995, + "step": 9815 + }, + { + "epoch": 17.802765812740876, + "grad_norm": 0.275390625, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 9816 + }, + { + "epoch": 17.80457946043981, + "grad_norm": 0.310546875, + "learning_rate": 0.0002, + "loss": 0.1177, + "step": 9817 + }, + { + "epoch": 17.806393108138742, + "grad_norm": 0.375, + "learning_rate": 0.0002, + "loss": 0.1128, + "step": 9818 + }, + { + "epoch": 17.808206755837677, + "grad_norm": 0.3359375, + "learning_rate": 0.0002, + "loss": 0.1126, + "step": 9819 + }, + { + "epoch": 17.810020403536612, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1092, + "step": 9820 + }, + { + "epoch": 17.811834051235547, + "grad_norm": 0.158203125, + "learning_rate": 0.0002, + "loss": 0.1256, + "step": 9821 + }, + { + "epoch": 17.81364769893448, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.1819, + "step": 9822 + }, + { + "epoch": 17.815461346633416, + "grad_norm": 0.279296875, + "learning_rate": 0.0002, + "loss": 0.2605, + "step": 9823 + }, + { + "epoch": 17.81727499433235, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.1513, + "step": 9824 + }, + { + "epoch": 17.819088642031286, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 9825 + }, + { + "epoch": 17.82090228973022, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9826 + }, + { + "epoch": 17.822715937429155, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9827 + }, + { + "epoch": 17.82452958512809, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 9828 + }, + { + "epoch": 17.826343232827025, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 9829 + }, + { + "epoch": 17.82815688052596, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 9830 + }, + { + "epoch": 17.82997052822489, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 9831 + }, + { + "epoch": 17.831784175923826, + "grad_norm": 0.2041015625, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 9832 + }, + { + "epoch": 17.83359782362276, + "grad_norm": 0.2734375, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 9833 + }, + { + "epoch": 17.835411471321695, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 9834 + }, + { + "epoch": 17.83722511902063, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0743, + "step": 9835 + }, + { + "epoch": 17.839038766719565, + "grad_norm": 0.22265625, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 9836 + }, + { + "epoch": 17.8408524144185, + "grad_norm": 0.224609375, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 9837 + }, + { + "epoch": 17.842666062117434, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 9838 + }, + { + "epoch": 17.84447970981637, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 9839 + }, + { + "epoch": 17.846293357515304, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 9840 + }, + { + "epoch": 17.84810700521424, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9841 + }, + { + "epoch": 17.849920652913173, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 9842 + }, + { + "epoch": 17.851734300612105, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0616, + "step": 9843 + }, + { + "epoch": 17.85354794831104, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 9844 + }, + { + "epoch": 17.855361596009974, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 9845 + }, + { + "epoch": 17.85717524370891, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 9846 + }, + { + "epoch": 17.858988891407844, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 9847 + }, + { + "epoch": 17.86080253910678, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 9848 + }, + { + "epoch": 17.862616186805713, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 9849 + }, + { + "epoch": 17.864429834504648, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 9850 + }, + { + "epoch": 17.866243482203583, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 9851 + }, + { + "epoch": 17.868057129902517, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 9852 + }, + { + "epoch": 17.869870777601452, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 9853 + }, + { + "epoch": 17.871684425300387, + "grad_norm": 0.2265625, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 9854 + }, + { + "epoch": 17.87349807299932, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 9855 + }, + { + "epoch": 17.875311720698253, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 9856 + }, + { + "epoch": 17.877125368397188, + "grad_norm": 0.31640625, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 9857 + }, + { + "epoch": 17.878939016096123, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 9858 + }, + { + "epoch": 17.880752663795057, + "grad_norm": 0.32421875, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 9859 + }, + { + "epoch": 17.882566311493992, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 9860 + }, + { + "epoch": 17.884379959192927, + "grad_norm": 0.287109375, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 9861 + }, + { + "epoch": 17.88619360689186, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 9862 + }, + { + "epoch": 17.888007254590796, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 9863 + }, + { + "epoch": 17.88982090228973, + "grad_norm": 0.259765625, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 9864 + }, + { + "epoch": 17.891634549988666, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0936, + "step": 9865 + }, + { + "epoch": 17.8934481976876, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 9866 + }, + { + "epoch": 17.895261845386532, + "grad_norm": 0.265625, + "learning_rate": 0.0002, + "loss": 0.096, + "step": 9867 + }, + { + "epoch": 17.897075493085467, + "grad_norm": 0.5234375, + "learning_rate": 0.0002, + "loss": 0.1327, + "step": 9868 + }, + { + "epoch": 17.8988891407844, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.1163, + "step": 9869 + }, + { + "epoch": 17.900702788483336, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.1258, + "step": 9870 + }, + { + "epoch": 17.90251643618227, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.1322, + "step": 9871 + }, + { + "epoch": 17.904330083881206, + "grad_norm": 0.3203125, + "learning_rate": 0.0002, + "loss": 0.197, + "step": 9872 + }, + { + "epoch": 17.90614373158014, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.2203, + "step": 9873 + }, + { + "epoch": 17.907957379279075, + "grad_norm": 0.24609375, + "learning_rate": 0.0002, + "loss": 0.187, + "step": 9874 + }, + { + "epoch": 17.90977102697801, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 9875 + }, + { + "epoch": 17.911584674676945, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 9876 + }, + { + "epoch": 17.91339832237588, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 9877 + }, + { + "epoch": 17.915211970074814, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 9878 + }, + { + "epoch": 17.917025617773746, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 9879 + }, + { + "epoch": 17.91883926547268, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 9880 + }, + { + "epoch": 17.920652913171615, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 9881 + }, + { + "epoch": 17.92246656087055, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0743, + "step": 9882 + }, + { + "epoch": 17.924280208569485, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 9883 + }, + { + "epoch": 17.92609385626842, + "grad_norm": 0.232421875, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 9884 + }, + { + "epoch": 17.927907503967354, + "grad_norm": 0.23828125, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9885 + }, + { + "epoch": 17.92972115166629, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 9886 + }, + { + "epoch": 17.931534799365224, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 9887 + }, + { + "epoch": 17.93334844706416, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 9888 + }, + { + "epoch": 17.935162094763093, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 9889 + }, + { + "epoch": 17.936975742462028, + "grad_norm": 0.2333984375, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9890 + }, + { + "epoch": 17.938789390160963, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 9891 + }, + { + "epoch": 17.940603037859894, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 9892 + }, + { + "epoch": 17.94241668555883, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 9893 + }, + { + "epoch": 17.944230333257764, + "grad_norm": 0.21484375, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 9894 + }, + { + "epoch": 17.9460439809567, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 9895 + }, + { + "epoch": 17.947857628655633, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 9896 + }, + { + "epoch": 17.949671276354568, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 9897 + }, + { + "epoch": 17.951484924053503, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 9898 + }, + { + "epoch": 17.953298571752438, + "grad_norm": 0.271484375, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 9899 + }, + { + "epoch": 17.955112219451372, + "grad_norm": 0.2236328125, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 9900 + }, + { + "epoch": 17.956925867150307, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0702, + "step": 9901 + }, + { + "epoch": 17.958739514849242, + "grad_norm": 0.203125, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 9902 + }, + { + "epoch": 17.960553162548177, + "grad_norm": 0.25390625, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 9903 + }, + { + "epoch": 17.962366810247108, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 9904 + }, + { + "epoch": 17.964180457946043, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 9905 + }, + { + "epoch": 17.965994105644977, + "grad_norm": 0.283203125, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 9906 + }, + { + "epoch": 17.967807753343912, + "grad_norm": 0.365234375, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 9907 + }, + { + "epoch": 17.969621401042847, + "grad_norm": 0.3046875, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 9908 + }, + { + "epoch": 17.97143504874178, + "grad_norm": 0.255859375, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 9909 + }, + { + "epoch": 17.973248696440717, + "grad_norm": 0.19921875, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 9910 + }, + { + "epoch": 17.97506234413965, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 9911 + }, + { + "epoch": 17.97506234413965, + "eval_loss": 2.6012446880340576, + "eval_runtime": 152.6266, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 6.552, + "step": 9911 + }, + { + "epoch": 17.97506234413965, + "mmlu_eval_accuracy": 0.30480168772921123, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.42857142857142855, + "mmlu_eval_accuracy_astronomy": 0.25, + "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, + "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, + "mmlu_eval_accuracy_college_biology": 0.375, + "mmlu_eval_accuracy_college_chemistry": 0.0, + "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, + "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, + "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, + "mmlu_eval_accuracy_college_physics": 0.36363636363636365, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, + "mmlu_eval_accuracy_econometrics": 0.16666666666666666, + "mmlu_eval_accuracy_electrical_engineering": 0.125, + "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, + "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, + "mmlu_eval_accuracy_global_facts": 0.3, + "mmlu_eval_accuracy_high_school_biology": 0.34375, + "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, + "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, + "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, + "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, + "mmlu_eval_accuracy_high_school_physics": 0.5294117647058824, + "mmlu_eval_accuracy_high_school_psychology": 0.35, + "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, + "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, + "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, + "mmlu_eval_accuracy_human_aging": 0.34782608695652173, + "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, + "mmlu_eval_accuracy_international_law": 0.46153846153846156, + "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, + "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, + "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, + "mmlu_eval_accuracy_management": 0.36363636363636365, + "mmlu_eval_accuracy_marketing": 0.56, + "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, + "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, + "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.22, + "mmlu_eval_accuracy_nutrition": 0.30303030303030304, + "mmlu_eval_accuracy_philosophy": 0.4117647058823529, + "mmlu_eval_accuracy_prehistory": 0.42857142857142855, + "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, + "mmlu_eval_accuracy_professional_law": 0.27647058823529413, + "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, + "mmlu_eval_accuracy_professional_psychology": 0.2753623188405797, + "mmlu_eval_accuracy_public_relations": 0.3333333333333333, + "mmlu_eval_accuracy_security_studies": 0.3333333333333333, + "mmlu_eval_accuracy_sociology": 0.5, + "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, + "mmlu_eval_accuracy_virology": 0.2777777777777778, + "mmlu_eval_accuracy_world_religions": 0.21052631578947367, + "mmlu_loss": 2.185068111912944, + "step": 9911 + }, + { + "epoch": 17.976875991838586, + "grad_norm": 0.181640625, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 9912 + }, + { + "epoch": 17.97868963953752, + "grad_norm": 0.205078125, + "learning_rate": 0.0002, + "loss": 0.085, + "step": 9913 + }, + { + "epoch": 17.980503287236456, + "grad_norm": 0.166015625, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 9914 + }, + { + "epoch": 17.98231693493539, + "grad_norm": 0.34375, + "learning_rate": 0.0002, + "loss": 0.0933, + "step": 9915 + }, + { + "epoch": 17.98413058263432, + "grad_norm": 0.390625, + "learning_rate": 0.0002, + "loss": 0.1173, + "step": 9916 + }, + { + "epoch": 17.985944230333256, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1173, + "step": 9917 + }, + { + "epoch": 17.98775787803219, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002, + "loss": 0.1022, + "step": 9918 + }, + { + "epoch": 17.989571525731126, + "grad_norm": 0.3828125, + "learning_rate": 0.0002, + "loss": 0.1157, + "step": 9919 + }, + { + "epoch": 17.99138517343006, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 9920 + }, + { + "epoch": 17.993198821128995, + "grad_norm": 0.1484375, + "learning_rate": 0.0002, + "loss": 0.128, + "step": 9921 + }, + { + "epoch": 17.99501246882793, + "grad_norm": 1.3515625, + "learning_rate": 0.0002, + "loss": 0.2287, + "step": 9922 + }, + { + "epoch": 17.996826116526865, + "grad_norm": 0.2353515625, + "learning_rate": 0.0002, + "loss": 0.1875, + "step": 9923 + }, + { + "epoch": 17.9986397642258, + "grad_norm": 0.384765625, + "learning_rate": 0.0002, + "loss": 0.1435, + "step": 9924 + }, + { + "epoch": 18.000453411924735, + "grad_norm": 0.220703125, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 9925 + }, + { + "epoch": 18.00226705962367, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002, + "loss": 0.0589, + "step": 9926 + }, + { + "epoch": 18.004080707322604, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0603, + "step": 9927 + }, + { + "epoch": 18.005894355021535, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0589, + "step": 9928 + }, + { + "epoch": 18.00770800272047, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0591, + "step": 9929 + }, + { + "epoch": 18.009521650419405, + "grad_norm": 0.15234375, + "learning_rate": 0.0002, + "loss": 0.0511, + "step": 9930 + }, + { + "epoch": 18.01133529811834, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 9931 + }, + { + "epoch": 18.013148945817274, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0528, + "step": 9932 + }, + { + "epoch": 18.01496259351621, + "grad_norm": 0.2177734375, + "learning_rate": 0.0002, + "loss": 0.0565, + "step": 9933 + }, + { + "epoch": 18.016776241215144, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002, + "loss": 0.0596, + "step": 9934 + }, + { + "epoch": 18.01858988891408, + "grad_norm": 0.193359375, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 9935 + }, + { + "epoch": 18.020403536613014, + "grad_norm": 0.2470703125, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 9936 + }, + { + "epoch": 18.02221718431195, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002, + "loss": 0.0503, + "step": 9937 + }, + { + "epoch": 18.024030832010883, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 9938 + }, + { + "epoch": 18.025844479709818, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 9939 + }, + { + "epoch": 18.027658127408753, + "grad_norm": 0.2412109375, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 9940 + }, + { + "epoch": 18.029471775107684, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002, + "loss": 0.0546, + "step": 9941 + }, + { + "epoch": 18.03128542280662, + "grad_norm": 0.169921875, + "learning_rate": 0.0002, + "loss": 0.0528, + "step": 9942 + }, + { + "epoch": 18.033099070505553, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 9943 + }, + { + "epoch": 18.034912718204488, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0551, + "step": 9944 + }, + { + "epoch": 18.036726365903423, + "grad_norm": 0.201171875, + "learning_rate": 0.0002, + "loss": 0.055, + "step": 9945 + }, + { + "epoch": 18.038540013602358, + "grad_norm": 0.173828125, + "learning_rate": 0.0002, + "loss": 0.0558, + "step": 9946 + }, + { + "epoch": 18.040353661301292, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002, + "loss": 0.057, + "step": 9947 + }, + { + "epoch": 18.042167309000227, + "grad_norm": 0.19140625, + "learning_rate": 0.0002, + "loss": 0.0569, + "step": 9948 + }, + { + "epoch": 18.043980956699162, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0579, + "step": 9949 + }, + { + "epoch": 18.045794604398097, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002, + "loss": 0.0519, + "step": 9950 + }, + { + "epoch": 18.04760825209703, + "grad_norm": 0.251953125, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 9951 + }, + { + "epoch": 18.049421899795966, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 9952 + }, + { + "epoch": 18.051235547494898, + "grad_norm": 0.2109375, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 9953 + }, + { + "epoch": 18.053049195193832, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 9954 + }, + { + "epoch": 18.054862842892767, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 9955 + }, + { + "epoch": 18.056676490591702, + "grad_norm": 0.28515625, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 9956 + }, + { + "epoch": 18.058490138290637, + "grad_norm": 0.333984375, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 9957 + }, + { + "epoch": 18.06030378598957, + "grad_norm": 0.208984375, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 9958 + }, + { + "epoch": 18.062117433688506, + "grad_norm": 0.21875, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 9959 + }, + { + "epoch": 18.06393108138744, + "grad_norm": 0.248046875, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 9960 + }, + { + "epoch": 18.065744729086376, + "grad_norm": 0.263671875, + "learning_rate": 0.0002, + "loss": 0.0861, + "step": 9961 + }, + { + "epoch": 18.06755837678531, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 9962 + }, + { + "epoch": 18.069372024484245, + "grad_norm": 0.142578125, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 9963 + }, + { + "epoch": 18.07118567218318, + "grad_norm": 0.2021484375, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 9964 + }, + { + "epoch": 18.07299931988211, + "grad_norm": 0.2490234375, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9965 + }, + { + "epoch": 18.074812967581046, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 9966 + }, + { + "epoch": 18.07662661527998, + "grad_norm": 0.27734375, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 9967 + }, + { + "epoch": 18.078440262978916, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 9968 + }, + { + "epoch": 18.08025391067785, + "grad_norm": 0.37109375, + "learning_rate": 0.0002, + "loss": 0.1094, + "step": 9969 + }, + { + "epoch": 18.082067558376785, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 9970 + }, + { + "epoch": 18.08388120607572, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002, + "loss": 0.1226, + "step": 9971 + }, + { + "epoch": 18.085694853774655, + "grad_norm": 0.291015625, + "learning_rate": 0.0002, + "loss": 0.1175, + "step": 9972 + }, + { + "epoch": 18.08750850147359, + "grad_norm": 0.2578125, + "learning_rate": 0.0002, + "loss": 0.1296, + "step": 9973 + }, + { + "epoch": 18.089322149172524, + "grad_norm": 0.2421875, + "learning_rate": 0.0002, + "loss": 0.1799, + "step": 9974 + }, + { + "epoch": 18.09113579687146, + "grad_norm": 0.16796875, + "learning_rate": 0.0002, + "loss": 0.2167, + "step": 9975 + }, + { + "epoch": 18.092949444570394, + "grad_norm": 0.2099609375, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 9976 + }, + { + "epoch": 18.094763092269325, + "grad_norm": 0.236328125, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 9977 + }, + { + "epoch": 18.09657673996826, + "grad_norm": 0.2392578125, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 9978 + }, + { + "epoch": 18.098390387667195, + "grad_norm": 0.2216796875, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 9979 + }, + { + "epoch": 18.10020403536613, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0599, + "step": 9980 + }, + { + "epoch": 18.102017683065064, + "grad_norm": 0.2138671875, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 9981 + }, + { + "epoch": 18.103831330764, + "grad_norm": 0.1865234375, + "learning_rate": 0.0002, + "loss": 0.0606, + "step": 9982 + }, + { + "epoch": 18.105644978462934, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002, + "loss": 0.0563, + "step": 9983 + }, + { + "epoch": 18.10745862616187, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 9984 + }, + { + "epoch": 18.109272273860803, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0623, + "step": 9985 + }, + { + "epoch": 18.111085921559738, + "grad_norm": 0.189453125, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 9986 + }, + { + "epoch": 18.112899569258673, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.0582, + "step": 9987 + }, + { + "epoch": 18.114713216957608, + "grad_norm": 0.1982421875, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 9988 + }, + { + "epoch": 18.11652686465654, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002, + "loss": 0.0571, + "step": 9989 + }, + { + "epoch": 18.118340512355473, + "grad_norm": 0.197265625, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 9990 + }, + { + "epoch": 18.12015416005441, + "grad_norm": 0.212890625, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 9991 + }, + { + "epoch": 18.121967807753343, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002, + "loss": 0.059, + "step": 9992 + }, + { + "epoch": 18.123781455452278, + "grad_norm": 0.228515625, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 9993 + }, + { + "epoch": 18.125595103151213, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 0.0562, + "step": 9994 + }, + { + "epoch": 18.127408750850147, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0589, + "step": 9995 + }, + { + "epoch": 18.129222398549082, + "grad_norm": 0.1875, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9996 + }, + { + "epoch": 18.131036046248017, + "grad_norm": 0.2158203125, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 9997 + }, + { + "epoch": 18.13284969394695, + "grad_norm": 0.20703125, + "learning_rate": 0.0002, + "loss": 0.0561, + "step": 9998 + }, + { + "epoch": 18.134663341645886, + "grad_norm": 0.2890625, + "learning_rate": 0.0002, + "loss": 0.0643, + "step": 9999 + }, + { + "epoch": 18.13647698934482, + "grad_norm": 0.2119140625, + "learning_rate": 0.0002, + "loss": 0.0588, + "step": 10000 + }, + { + "epoch": 18.13647698934482, + "step": 10000, + "total_flos": 2.272703333604655e+18, + "train_loss": 0.43576861339397727, + "train_runtime": 182687.1281, + "train_samples_per_second": 0.876, + "train_steps_per_second": 0.055 + } + ], + "logging_steps": 1, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 19, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.272703333604655e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}