diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..70524eda2366a51e505667bb9f1e80fd36a79c12 --- /dev/null +++ b/all_results.json @@ -0,0 +1,11 @@ +{ + "epoch": 3.4, + "eval_loss": 1.151158332824707, + "eval_runtime": 572.7509, + "eval_samples_per_second": 1.746, + "eval_steps_per_second": 1.746, + "train_loss": 0.8699908837636312, + "train_runtime": 112632.6276, + "train_samples_per_second": 0.266, + "train_steps_per_second": 0.017 +} \ No newline at end of file diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.bin b/checkpoint-1000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..dadc215a63fed58c89df77db1bd8df3d49058b2a --- /dev/null +++ b/checkpoint-1000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc29d1b7b8d9fbb7bdf2819f6d0628cea3d5ab845cc689cb80acece39912a3b +size 1657155522 diff --git a/checkpoint-1000/adapter_model/README.md b/checkpoint-1000/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1000/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1000/adapter_model/adapter_config.json b/checkpoint-1000/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1000/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model/adapter_model.bin b/checkpoint-1000/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..dadc215a63fed58c89df77db1bd8df3d49058b2a --- /dev/null +++ b/checkpoint-1000/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc29d1b7b8d9fbb7bdf2819f6d0628cea3d5ab845cc689cb80acece39912a3b +size 1657155522 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2509cc47164aed9cafb70d5ef512cc700a282cca --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fff8ab4eb57b8e9147ca09f977042d1e861ca602fa63f83f85702beb67365ec +size 6627702922 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..264165dcb2a05b58a99b090b5d58834c28e05bc3 --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e113e075e7ce260c7f7e75bb24de1ff504604347dbc91f90709c86d5a09023f2 +size 14180 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e48445ba4d02f1dfa03918780d1ef0e6a4198b00 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:016164c1868d1353a972df97439a4a6f6ad10c19164c770b2c7d8301f524b82a +size 1064 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c75ae97af80eab73ceca339bae5bd106a62f2cd --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,971 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8136476989344819, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001, + "loss": 0.8382, + "step": 810 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001, + "loss": 0.8489, + "step": 820 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 830 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 840 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 850 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 860 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001, + "loss": 0.8608, + "step": 870 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 880 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001, + "loss": 1.0253, + "step": 890 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001, + "loss": 0.9905, + "step": 900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001, + "loss": 0.8487, + "step": 910 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001, + "loss": 0.8568, + "step": 920 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 1.7, + "eval_loss": 1.0250624418258667, + "eval_runtime": 946.4035, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 1.057, + "step": 935 + }, + { + "epoch": 1.7, + "mmlu_eval_accuracy": 0.7288948695878031, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.243813282909306, + "step": 935 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001, + "loss": 1.0174, + "step": 940 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 950 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 960 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001, + "loss": 0.8447, + "step": 970 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 980 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 1.0331, + "step": 990 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001, + "loss": 1.0412, + "step": 1000 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 1.158485717946876e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1200/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1200/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1200/adapter_model.bin b/checkpoint-1200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..766929c3eeb35ef67e2453a2b8312d3772511c37 --- /dev/null +++ b/checkpoint-1200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad32043ca97c48601084cb3f502e591bccca0879804c4979972d332fc79a801f +size 1657155522 diff --git a/checkpoint-1200/adapter_model/README.md b/checkpoint-1200/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1200/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1200/adapter_model/adapter_config.json b/checkpoint-1200/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1200/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1200/adapter_model/adapter_model.bin b/checkpoint-1200/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..766929c3eeb35ef67e2453a2b8312d3772511c37 --- /dev/null +++ b/checkpoint-1200/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad32043ca97c48601084cb3f502e591bccca0879804c4979972d332fc79a801f +size 1657155522 diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..51a0de5b7f40cbeea6450792d41e1596d1cf84a8 --- /dev/null +++ b/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de2f756f0c8b03dbc310fe2201c53f94e44de0a72ecbe1a58087f0f6916b3c1b +size 6627702922 diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..68e45ee8ecadc1c0d946b3eaccb8f9133cab023e --- /dev/null +++ b/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36539f193dde7f6dd2cc8b72d99a411c97b376a7260c2930cb324081a6c6ee3c +size 14180 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eec2e1f675639f4b8ac1a827e2890c1cc4642b76 --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c5671c0de422189dff152cae166eb49cff39ac2aa88bdb353ba6e07d93451bf +size 1064 diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-1200/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43d8f393b77e880eff76fef57b90bcab7e271e26 --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,1162 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1763772387213782, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001, + "loss": 0.8382, + "step": 810 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001, + "loss": 0.8489, + "step": 820 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 830 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 840 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 850 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 860 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001, + "loss": 0.8608, + "step": 870 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 880 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001, + "loss": 1.0253, + "step": 890 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001, + "loss": 0.9905, + "step": 900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001, + "loss": 0.8487, + "step": 910 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001, + "loss": 0.8568, + "step": 920 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 1.7, + "eval_loss": 1.0250624418258667, + "eval_runtime": 946.4035, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 1.057, + "step": 935 + }, + { + "epoch": 1.7, + "mmlu_eval_accuracy": 0.7288948695878031, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.243813282909306, + "step": 935 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001, + "loss": 1.0174, + "step": 940 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 950 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 960 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001, + "loss": 0.8447, + "step": 970 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 980 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 1.0331, + "step": 990 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001, + "loss": 1.0412, + "step": 1000 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1010 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001, + "loss": 0.8744, + "step": 1020 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 1030 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001, + "loss": 1.0053, + "step": 1040 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 1050 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001, + "loss": 0.8023, + "step": 1060 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001, + "loss": 0.8349, + "step": 1070 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 1080 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001, + "loss": 1.0971, + "step": 1090 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001, + "loss": 1.0728, + "step": 1100 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001, + "loss": 0.7724, + "step": 1110 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 0.7675, + "step": 1120 + }, + { + "epoch": 2.03, + "eval_loss": 1.052681565284729, + "eval_runtime": 942.0722, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 1.061, + "step": 1122 + }, + { + "epoch": 2.03, + "mmlu_eval_accuracy": 0.7373981967098951, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366, + "mmlu_eval_accuracy_formal_logic": 0.7142857142857143, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.62, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.782608695652174, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2340081441760609, + "step": 1122 + }, + { + "epoch": 2.05, + "learning_rate": 0.0001, + "loss": 0.7194, + "step": 1130 + }, + { + "epoch": 2.07, + "learning_rate": 0.0001, + "loss": 0.8236, + "step": 1140 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001, + "loss": 0.6652, + "step": 1150 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001, + "loss": 0.7177, + "step": 1160 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001, + "loss": 0.7788, + "step": 1170 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001, + "loss": 0.8117, + "step": 1180 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001, + "loss": 0.8145, + "step": 1190 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001, + "loss": 0.6984, + "step": 1200 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 1.3906525682785812e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1400/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1400/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1400/adapter_model.bin b/checkpoint-1400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ad5754242cfc89592d1fda9051dc7105f2ce05e --- /dev/null +++ b/checkpoint-1400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1533cea48affa188e9822cffe2f4176c550f93d32a8939add43058d977fbd29 +size 1657155522 diff --git a/checkpoint-1400/adapter_model/README.md b/checkpoint-1400/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1400/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1400/adapter_model/adapter_config.json b/checkpoint-1400/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1400/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1400/adapter_model/adapter_model.bin b/checkpoint-1400/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ad5754242cfc89592d1fda9051dc7105f2ce05e --- /dev/null +++ b/checkpoint-1400/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1533cea48affa188e9822cffe2f4176c550f93d32a8939add43058d977fbd29 +size 1657155522 diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f71fdb8460793afb2967d10919225ca1ef07b2e4 --- /dev/null +++ b/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ab8de049e35c0670e2b97c2dcffcc99fcb8e1c34ca4e4de2f3d6c3717664bf +size 6627702922 diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ba81bae71c24abadf8790a49b1509caec021df5 --- /dev/null +++ b/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bb2503442a3c1d2dd808417344726e9ee1fe213f212edf8a440ebcb1863ef6f +size 14180 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2000daee8a135b01a9047e3a8cd2aff0dbc58155 --- /dev/null +++ b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90d628735beb51fe7df523288b184f458e4b2c3ef9d3bc30ab77bfb1a76e0ba0 +size 1064 diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-1400/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-1400/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2c510dd1f4395f3a3845e79d7d972e971b70f339 --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,1353 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.539106778508275, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001, + "loss": 0.8382, + "step": 810 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001, + "loss": 0.8489, + "step": 820 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 830 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 840 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 850 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 860 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001, + "loss": 0.8608, + "step": 870 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 880 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001, + "loss": 1.0253, + "step": 890 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001, + "loss": 0.9905, + "step": 900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001, + "loss": 0.8487, + "step": 910 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001, + "loss": 0.8568, + "step": 920 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 1.7, + "eval_loss": 1.0250624418258667, + "eval_runtime": 946.4035, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 1.057, + "step": 935 + }, + { + "epoch": 1.7, + "mmlu_eval_accuracy": 0.7288948695878031, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.243813282909306, + "step": 935 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001, + "loss": 1.0174, + "step": 940 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 950 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 960 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001, + "loss": 0.8447, + "step": 970 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 980 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 1.0331, + "step": 990 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001, + "loss": 1.0412, + "step": 1000 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1010 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001, + "loss": 0.8744, + "step": 1020 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 1030 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001, + "loss": 1.0053, + "step": 1040 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 1050 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001, + "loss": 0.8023, + "step": 1060 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001, + "loss": 0.8349, + "step": 1070 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 1080 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001, + "loss": 1.0971, + "step": 1090 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001, + "loss": 1.0728, + "step": 1100 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001, + "loss": 0.7724, + "step": 1110 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 0.7675, + "step": 1120 + }, + { + "epoch": 2.03, + "eval_loss": 1.052681565284729, + "eval_runtime": 942.0722, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 1.061, + "step": 1122 + }, + { + "epoch": 2.03, + "mmlu_eval_accuracy": 0.7373981967098951, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366, + "mmlu_eval_accuracy_formal_logic": 0.7142857142857143, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.62, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.782608695652174, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2340081441760609, + "step": 1122 + }, + { + "epoch": 2.05, + "learning_rate": 0.0001, + "loss": 0.7194, + "step": 1130 + }, + { + "epoch": 2.07, + "learning_rate": 0.0001, + "loss": 0.8236, + "step": 1140 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001, + "loss": 0.6652, + "step": 1150 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001, + "loss": 0.7177, + "step": 1160 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001, + "loss": 0.7788, + "step": 1170 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001, + "loss": 0.8117, + "step": 1180 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001, + "loss": 0.8145, + "step": 1190 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001, + "loss": 0.6984, + "step": 1200 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 0.7011, + "step": 1210 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001, + "loss": 0.769, + "step": 1220 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001, + "loss": 0.7705, + "step": 1230 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1240 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 0.6622, + "step": 1250 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001, + "loss": 0.6641, + "step": 1260 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001, + "loss": 0.7239, + "step": 1270 + }, + { + "epoch": 2.32, + "learning_rate": 0.0001, + "loss": 0.7618, + "step": 1280 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 0.7845, + "step": 1290 + }, + { + "epoch": 2.36, + "learning_rate": 0.0001, + "loss": 0.719, + "step": 1300 + }, + { + "epoch": 2.37, + "eval_loss": 1.1104822158813477, + "eval_runtime": 948.1299, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 1309 + }, + { + "epoch": 2.37, + "mmlu_eval_accuracy": 0.7369285730399766, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146, + "mmlu_eval_accuracy_formal_logic": 0.7857142857142857, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8529411764705882, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6058823529411764, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.0866562834095908, + "step": 1309 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001, + "loss": 0.7093, + "step": 1310 + }, + { + "epoch": 2.39, + "learning_rate": 0.0001, + "loss": 0.7684, + "step": 1320 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001, + "loss": 0.7501, + "step": 1330 + }, + { + "epoch": 2.43, + "learning_rate": 0.0001, + "loss": 0.8043, + "step": 1340 + }, + { + "epoch": 2.45, + "learning_rate": 0.0001, + "loss": 0.6927, + "step": 1350 + }, + { + "epoch": 2.47, + "learning_rate": 0.0001, + "loss": 0.7278, + "step": 1360 + }, + { + "epoch": 2.48, + "learning_rate": 0.0001, + "loss": 0.8095, + "step": 1370 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 0.7463, + "step": 1380 + }, + { + "epoch": 2.52, + "learning_rate": 0.0001, + "loss": 0.7707, + "step": 1390 + }, + { + "epoch": 2.54, + "learning_rate": 0.0001, + "loss": 0.7152, + "step": 1400 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 1.6231629152399524e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1600/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1600/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1600/adapter_model.bin b/checkpoint-1600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c7c854cb374928ff554bac17fa2598f6139b1f5b --- /dev/null +++ b/checkpoint-1600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a0f9e046389866de742b94122dd6dbb44196a645e8a8912ab9becd3b6e9ee2 +size 1657155522 diff --git a/checkpoint-1600/adapter_model/README.md b/checkpoint-1600/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1600/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1600/adapter_model/adapter_config.json b/checkpoint-1600/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1600/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1600/adapter_model/adapter_model.bin b/checkpoint-1600/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c7c854cb374928ff554bac17fa2598f6139b1f5b --- /dev/null +++ b/checkpoint-1600/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a0f9e046389866de742b94122dd6dbb44196a645e8a8912ab9becd3b6e9ee2 +size 1657155522 diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5489cdfdf53254e879064dec7d1893dd750da9ac --- /dev/null +++ b/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:991313c3544fd29570cdd6d6c35cee055932460db308158ecc93c1bf6e12e312 +size 6627702922 diff --git a/checkpoint-1600/rng_state.pth b/checkpoint-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cbfce4ead5f82f01ff08db755454e08003608666 --- /dev/null +++ b/checkpoint-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0a23ca79cae1540f2b114a028cd9472e6869461ba37577bd02be151a5f22a4e +size 14180 diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b878c5736da6c68181e3daa099482e413def6cdf --- /dev/null +++ b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91889afd10b68ec25b037a974a49914c98ca7cbeb59686cdaae2470ce449354d +size 1064 diff --git a/checkpoint-1600/special_tokens_map.json b/checkpoint-1600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-1600/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-1600/tokenizer.model b/checkpoint-1600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-1600/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6bac3e557f68d0cc3646c7838b196f86fb937a04 --- /dev/null +++ b/checkpoint-1600/trainer_state.json @@ -0,0 +1,1544 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.901836318295171, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001, + "loss": 0.8382, + "step": 810 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001, + "loss": 0.8489, + "step": 820 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 830 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 840 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 850 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 860 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001, + "loss": 0.8608, + "step": 870 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 880 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001, + "loss": 1.0253, + "step": 890 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001, + "loss": 0.9905, + "step": 900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001, + "loss": 0.8487, + "step": 910 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001, + "loss": 0.8568, + "step": 920 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 1.7, + "eval_loss": 1.0250624418258667, + "eval_runtime": 946.4035, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 1.057, + "step": 935 + }, + { + "epoch": 1.7, + "mmlu_eval_accuracy": 0.7288948695878031, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.243813282909306, + "step": 935 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001, + "loss": 1.0174, + "step": 940 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 950 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 960 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001, + "loss": 0.8447, + "step": 970 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 980 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 1.0331, + "step": 990 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001, + "loss": 1.0412, + "step": 1000 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1010 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001, + "loss": 0.8744, + "step": 1020 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 1030 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001, + "loss": 1.0053, + "step": 1040 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 1050 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001, + "loss": 0.8023, + "step": 1060 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001, + "loss": 0.8349, + "step": 1070 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 1080 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001, + "loss": 1.0971, + "step": 1090 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001, + "loss": 1.0728, + "step": 1100 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001, + "loss": 0.7724, + "step": 1110 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 0.7675, + "step": 1120 + }, + { + "epoch": 2.03, + "eval_loss": 1.052681565284729, + "eval_runtime": 942.0722, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 1.061, + "step": 1122 + }, + { + "epoch": 2.03, + "mmlu_eval_accuracy": 0.7373981967098951, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366, + "mmlu_eval_accuracy_formal_logic": 0.7142857142857143, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.62, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.782608695652174, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2340081441760609, + "step": 1122 + }, + { + "epoch": 2.05, + "learning_rate": 0.0001, + "loss": 0.7194, + "step": 1130 + }, + { + "epoch": 2.07, + "learning_rate": 0.0001, + "loss": 0.8236, + "step": 1140 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001, + "loss": 0.6652, + "step": 1150 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001, + "loss": 0.7177, + "step": 1160 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001, + "loss": 0.7788, + "step": 1170 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001, + "loss": 0.8117, + "step": 1180 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001, + "loss": 0.8145, + "step": 1190 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001, + "loss": 0.6984, + "step": 1200 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 0.7011, + "step": 1210 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001, + "loss": 0.769, + "step": 1220 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001, + "loss": 0.7705, + "step": 1230 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1240 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 0.6622, + "step": 1250 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001, + "loss": 0.6641, + "step": 1260 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001, + "loss": 0.7239, + "step": 1270 + }, + { + "epoch": 2.32, + "learning_rate": 0.0001, + "loss": 0.7618, + "step": 1280 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 0.7845, + "step": 1290 + }, + { + "epoch": 2.36, + "learning_rate": 0.0001, + "loss": 0.719, + "step": 1300 + }, + { + "epoch": 2.37, + "eval_loss": 1.1104822158813477, + "eval_runtime": 948.1299, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 1309 + }, + { + "epoch": 2.37, + "mmlu_eval_accuracy": 0.7369285730399766, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146, + "mmlu_eval_accuracy_formal_logic": 0.7857142857142857, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8529411764705882, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6058823529411764, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.0866562834095908, + "step": 1309 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001, + "loss": 0.7093, + "step": 1310 + }, + { + "epoch": 2.39, + "learning_rate": 0.0001, + "loss": 0.7684, + "step": 1320 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001, + "loss": 0.7501, + "step": 1330 + }, + { + "epoch": 2.43, + "learning_rate": 0.0001, + "loss": 0.8043, + "step": 1340 + }, + { + "epoch": 2.45, + "learning_rate": 0.0001, + "loss": 0.6927, + "step": 1350 + }, + { + "epoch": 2.47, + "learning_rate": 0.0001, + "loss": 0.7278, + "step": 1360 + }, + { + "epoch": 2.48, + "learning_rate": 0.0001, + "loss": 0.8095, + "step": 1370 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 0.7463, + "step": 1380 + }, + { + "epoch": 2.52, + "learning_rate": 0.0001, + "loss": 0.7707, + "step": 1390 + }, + { + "epoch": 2.54, + "learning_rate": 0.0001, + "loss": 0.7152, + "step": 1400 + }, + { + "epoch": 2.56, + "learning_rate": 0.0001, + "loss": 0.687, + "step": 1410 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 0.7529, + "step": 1420 + }, + { + "epoch": 2.59, + "learning_rate": 0.0001, + "loss": 0.7565, + "step": 1430 + }, + { + "epoch": 2.61, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1440 + }, + { + "epoch": 2.63, + "learning_rate": 0.0001, + "loss": 0.7623, + "step": 1450 + }, + { + "epoch": 2.65, + "learning_rate": 0.0001, + "loss": 0.6947, + "step": 1460 + }, + { + "epoch": 2.67, + "learning_rate": 0.0001, + "loss": 0.7756, + "step": 1470 + }, + { + "epoch": 2.68, + "learning_rate": 0.0001, + "loss": 0.8453, + "step": 1480 + }, + { + "epoch": 2.7, + "learning_rate": 0.0001, + "loss": 0.8306, + "step": 1490 + }, + { + "epoch": 2.71, + "eval_loss": 1.100826621055603, + "eval_runtime": 940.4488, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 1.063, + "step": 1496 + }, + { + "epoch": 2.71, + "mmlu_eval_accuracy": 0.7363077307176445, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.8125, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6585365853658537, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.6, + "mmlu_eval_accuracy_high_school_biology": 0.78125, + "mmlu_eval_accuracy_high_school_chemistry": 0.5909090909090909, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.4827586206896552, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.9166666666666666, + "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.75, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.64, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.7096774193548387, + "mmlu_eval_accuracy_professional_law": 0.6176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7971014492753623, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8518518518518519, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2313211129857853, + "step": 1496 + }, + { + "epoch": 2.72, + "learning_rate": 0.0001, + "loss": 0.6937, + "step": 1500 + }, + { + "epoch": 2.74, + "learning_rate": 0.0001, + "loss": 0.6997, + "step": 1510 + }, + { + "epoch": 2.76, + "learning_rate": 0.0001, + "loss": 0.7588, + "step": 1520 + }, + { + "epoch": 2.77, + "learning_rate": 0.0001, + "loss": 0.7731, + "step": 1530 + }, + { + "epoch": 2.79, + "learning_rate": 0.0001, + "loss": 0.7914, + "step": 1540 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 0.7175, + "step": 1550 + }, + { + "epoch": 2.83, + "learning_rate": 0.0001, + "loss": 0.7046, + "step": 1560 + }, + { + "epoch": 2.85, + "learning_rate": 0.0001, + "loss": 0.7597, + "step": 1570 + }, + { + "epoch": 2.87, + "learning_rate": 0.0001, + "loss": 0.7932, + "step": 1580 + }, + { + "epoch": 2.88, + "learning_rate": 0.0001, + "loss": 0.8059, + "step": 1590 + }, + { + "epoch": 2.9, + "learning_rate": 0.0001, + "loss": 0.7258, + "step": 1600 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 1.854216633063555e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1800/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1800/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1800/adapter_model.bin b/checkpoint-1800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ddb953e82f587734106fd3b3a58fe5249c14e398 --- /dev/null +++ b/checkpoint-1800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5d456fc1c68dcc8b3d0d05d8b272ed31331a38e04f42c6a819c109109e12a3 +size 1657155522 diff --git a/checkpoint-1800/adapter_model/README.md b/checkpoint-1800/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1800/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1800/adapter_model/adapter_config.json b/checkpoint-1800/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1800/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1800/adapter_model/adapter_model.bin b/checkpoint-1800/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ddb953e82f587734106fd3b3a58fe5249c14e398 --- /dev/null +++ b/checkpoint-1800/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5d456fc1c68dcc8b3d0d05d8b272ed31331a38e04f42c6a819c109109e12a3 +size 1657155522 diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8232ee68f1ce9913f58108be2744eb1cfaf40c8 --- /dev/null +++ b/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f1b2b088ddc71268a259283ba00336729affa7b86a0f96ff793428c7a03f22 +size 6627702922 diff --git a/checkpoint-1800/rng_state.pth b/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcfe264a0f0e44db8038326b878cc688729f6ba6 --- /dev/null +++ b/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066b0713c3dfd9c0b1175954102d461fd0b3d344ec2b477efae72ee68a4f2535 +size 14180 diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..91975212481bf6d9e40df716cccd4882750baf1c --- /dev/null +++ b/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110b2f0ce7be25c09ed998ed1965f38a8d3a448ca6aa07e3d4392461b80d705f +size 1064 diff --git a/checkpoint-1800/special_tokens_map.json b/checkpoint-1800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-1800/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-1800/tokenizer.model b/checkpoint-1800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-1800/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33184b197f9161bd888dc38f2ba3cd2f19a3c050 --- /dev/null +++ b/checkpoint-1800/trainer_state.json @@ -0,0 +1,1735 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.264565858082068, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001, + "loss": 0.8382, + "step": 810 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001, + "loss": 0.8489, + "step": 820 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 830 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 840 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 850 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 860 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001, + "loss": 0.8608, + "step": 870 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 880 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001, + "loss": 1.0253, + "step": 890 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001, + "loss": 0.9905, + "step": 900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001, + "loss": 0.8487, + "step": 910 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001, + "loss": 0.8568, + "step": 920 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 1.7, + "eval_loss": 1.0250624418258667, + "eval_runtime": 946.4035, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 1.057, + "step": 935 + }, + { + "epoch": 1.7, + "mmlu_eval_accuracy": 0.7288948695878031, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.243813282909306, + "step": 935 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001, + "loss": 1.0174, + "step": 940 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 950 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 960 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001, + "loss": 0.8447, + "step": 970 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 980 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 1.0331, + "step": 990 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001, + "loss": 1.0412, + "step": 1000 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1010 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001, + "loss": 0.8744, + "step": 1020 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 1030 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001, + "loss": 1.0053, + "step": 1040 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 1050 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001, + "loss": 0.8023, + "step": 1060 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001, + "loss": 0.8349, + "step": 1070 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 1080 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001, + "loss": 1.0971, + "step": 1090 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001, + "loss": 1.0728, + "step": 1100 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001, + "loss": 0.7724, + "step": 1110 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 0.7675, + "step": 1120 + }, + { + "epoch": 2.03, + "eval_loss": 1.052681565284729, + "eval_runtime": 942.0722, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 1.061, + "step": 1122 + }, + { + "epoch": 2.03, + "mmlu_eval_accuracy": 0.7373981967098951, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366, + "mmlu_eval_accuracy_formal_logic": 0.7142857142857143, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.62, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.782608695652174, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2340081441760609, + "step": 1122 + }, + { + "epoch": 2.05, + "learning_rate": 0.0001, + "loss": 0.7194, + "step": 1130 + }, + { + "epoch": 2.07, + "learning_rate": 0.0001, + "loss": 0.8236, + "step": 1140 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001, + "loss": 0.6652, + "step": 1150 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001, + "loss": 0.7177, + "step": 1160 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001, + "loss": 0.7788, + "step": 1170 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001, + "loss": 0.8117, + "step": 1180 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001, + "loss": 0.8145, + "step": 1190 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001, + "loss": 0.6984, + "step": 1200 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 0.7011, + "step": 1210 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001, + "loss": 0.769, + "step": 1220 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001, + "loss": 0.7705, + "step": 1230 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1240 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 0.6622, + "step": 1250 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001, + "loss": 0.6641, + "step": 1260 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001, + "loss": 0.7239, + "step": 1270 + }, + { + "epoch": 2.32, + "learning_rate": 0.0001, + "loss": 0.7618, + "step": 1280 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 0.7845, + "step": 1290 + }, + { + "epoch": 2.36, + "learning_rate": 0.0001, + "loss": 0.719, + "step": 1300 + }, + { + "epoch": 2.37, + "eval_loss": 1.1104822158813477, + "eval_runtime": 948.1299, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 1309 + }, + { + "epoch": 2.37, + "mmlu_eval_accuracy": 0.7369285730399766, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146, + "mmlu_eval_accuracy_formal_logic": 0.7857142857142857, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8529411764705882, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6058823529411764, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.0866562834095908, + "step": 1309 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001, + "loss": 0.7093, + "step": 1310 + }, + { + "epoch": 2.39, + "learning_rate": 0.0001, + "loss": 0.7684, + "step": 1320 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001, + "loss": 0.7501, + "step": 1330 + }, + { + "epoch": 2.43, + "learning_rate": 0.0001, + "loss": 0.8043, + "step": 1340 + }, + { + "epoch": 2.45, + "learning_rate": 0.0001, + "loss": 0.6927, + "step": 1350 + }, + { + "epoch": 2.47, + "learning_rate": 0.0001, + "loss": 0.7278, + "step": 1360 + }, + { + "epoch": 2.48, + "learning_rate": 0.0001, + "loss": 0.8095, + "step": 1370 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 0.7463, + "step": 1380 + }, + { + "epoch": 2.52, + "learning_rate": 0.0001, + "loss": 0.7707, + "step": 1390 + }, + { + "epoch": 2.54, + "learning_rate": 0.0001, + "loss": 0.7152, + "step": 1400 + }, + { + "epoch": 2.56, + "learning_rate": 0.0001, + "loss": 0.687, + "step": 1410 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 0.7529, + "step": 1420 + }, + { + "epoch": 2.59, + "learning_rate": 0.0001, + "loss": 0.7565, + "step": 1430 + }, + { + "epoch": 2.61, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1440 + }, + { + "epoch": 2.63, + "learning_rate": 0.0001, + "loss": 0.7623, + "step": 1450 + }, + { + "epoch": 2.65, + "learning_rate": 0.0001, + "loss": 0.6947, + "step": 1460 + }, + { + "epoch": 2.67, + "learning_rate": 0.0001, + "loss": 0.7756, + "step": 1470 + }, + { + "epoch": 2.68, + "learning_rate": 0.0001, + "loss": 0.8453, + "step": 1480 + }, + { + "epoch": 2.7, + "learning_rate": 0.0001, + "loss": 0.8306, + "step": 1490 + }, + { + "epoch": 2.71, + "eval_loss": 1.100826621055603, + "eval_runtime": 940.4488, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 1.063, + "step": 1496 + }, + { + "epoch": 2.71, + "mmlu_eval_accuracy": 0.7363077307176445, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.8125, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6585365853658537, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.6, + "mmlu_eval_accuracy_high_school_biology": 0.78125, + "mmlu_eval_accuracy_high_school_chemistry": 0.5909090909090909, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.4827586206896552, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.9166666666666666, + "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.75, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.64, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.7096774193548387, + "mmlu_eval_accuracy_professional_law": 0.6176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7971014492753623, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8518518518518519, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2313211129857853, + "step": 1496 + }, + { + "epoch": 2.72, + "learning_rate": 0.0001, + "loss": 0.6937, + "step": 1500 + }, + { + "epoch": 2.74, + "learning_rate": 0.0001, + "loss": 0.6997, + "step": 1510 + }, + { + "epoch": 2.76, + "learning_rate": 0.0001, + "loss": 0.7588, + "step": 1520 + }, + { + "epoch": 2.77, + "learning_rate": 0.0001, + "loss": 0.7731, + "step": 1530 + }, + { + "epoch": 2.79, + "learning_rate": 0.0001, + "loss": 0.7914, + "step": 1540 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 0.7175, + "step": 1550 + }, + { + "epoch": 2.83, + "learning_rate": 0.0001, + "loss": 0.7046, + "step": 1560 + }, + { + "epoch": 2.85, + "learning_rate": 0.0001, + "loss": 0.7597, + "step": 1570 + }, + { + "epoch": 2.87, + "learning_rate": 0.0001, + "loss": 0.7932, + "step": 1580 + }, + { + "epoch": 2.88, + "learning_rate": 0.0001, + "loss": 0.8059, + "step": 1590 + }, + { + "epoch": 2.9, + "learning_rate": 0.0001, + "loss": 0.7258, + "step": 1600 + }, + { + "epoch": 2.92, + "learning_rate": 0.0001, + "loss": 0.7486, + "step": 1610 + }, + { + "epoch": 2.94, + "learning_rate": 0.0001, + "loss": 0.7233, + "step": 1620 + }, + { + "epoch": 2.96, + "learning_rate": 0.0001, + "loss": 0.7945, + "step": 1630 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 0.8324, + "step": 1640 + }, + { + "epoch": 2.99, + "learning_rate": 0.0001, + "loss": 0.7294, + "step": 1650 + }, + { + "epoch": 3.01, + "learning_rate": 0.0001, + "loss": 0.6117, + "step": 1660 + }, + { + "epoch": 3.03, + "learning_rate": 0.0001, + "loss": 0.6464, + "step": 1670 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 0.6156, + "step": 1680 + }, + { + "epoch": 3.05, + "eval_loss": 1.1478718519210815, + "eval_runtime": 932.4225, + "eval_samples_per_second": 1.072, + "eval_steps_per_second": 1.072, + "step": 1683 + }, + { + "epoch": 3.05, + "mmlu_eval_accuracy": 0.745366643285036, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.7857142857142857, + "mmlu_eval_accuracy_global_facts": 0.8, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.5454545454545454, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.75, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 1.0, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.7894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.61, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7647058823529411, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.9032258064516129, + "mmlu_eval_accuracy_professional_psychology": 0.7391304347826086, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.7777777777777778, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.4050485734687297, + "step": 1683 + }, + { + "epoch": 3.07, + "learning_rate": 0.0001, + "loss": 0.5237, + "step": 1690 + }, + { + "epoch": 3.08, + "learning_rate": 0.0001, + "loss": 0.3516, + "step": 1700 + }, + { + "epoch": 3.1, + "learning_rate": 0.0001, + "loss": 0.4976, + "step": 1710 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 0.6535, + "step": 1720 + }, + { + "epoch": 3.14, + "learning_rate": 0.0001, + "loss": 0.5926, + "step": 1730 + }, + { + "epoch": 3.16, + "learning_rate": 0.0001, + "loss": 0.5476, + "step": 1740 + }, + { + "epoch": 3.17, + "learning_rate": 0.0001, + "loss": 0.368, + "step": 1750 + }, + { + "epoch": 3.19, + "learning_rate": 0.0001, + "loss": 0.5043, + "step": 1760 + }, + { + "epoch": 3.21, + "learning_rate": 0.0001, + "loss": 0.5907, + "step": 1770 + }, + { + "epoch": 3.23, + "learning_rate": 0.0001, + "loss": 0.5609, + "step": 1780 + }, + { + "epoch": 3.25, + "learning_rate": 0.0001, + "loss": 0.5272, + "step": 1790 + }, + { + "epoch": 3.26, + "learning_rate": 0.0001, + "loss": 0.3672, + "step": 1800 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 2.0859408156226683e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-1875/adapter_model/README.md b/checkpoint-1875/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-1875/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1875/adapter_model/adapter_config.json b/checkpoint-1875/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-1875/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1875/adapter_model/adapter_model.bin b/checkpoint-1875/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b3cc80572a3a9adf10c365d14473c6e9f58ca36d --- /dev/null +++ b/checkpoint-1875/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a795db67c92d569560b93ef875abfe3a0ccefc1c40b817da09011db27e24b21 +size 1657155522 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.bin b/checkpoint-200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..01ad48986bf743f111ee718296b90b7a8bba1eea --- /dev/null +++ b/checkpoint-200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a8af86f16910a2e20955a36b9674a05f94ca9d2cc31eb37b44236be346de2a +size 1657155522 diff --git a/checkpoint-200/adapter_model/README.md b/checkpoint-200/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-200/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-200/adapter_model/adapter_config.json b/checkpoint-200/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-200/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model/adapter_model.bin b/checkpoint-200/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..01ad48986bf743f111ee718296b90b7a8bba1eea --- /dev/null +++ b/checkpoint-200/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a8af86f16910a2e20955a36b9674a05f94ca9d2cc31eb37b44236be346de2a +size 1657155522 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b758a3fb6e96913ad0c5d937fc764f8749101243 --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec843ab7d5d89384bbcbd10dae3adddc2257138f4bb8515316463b30c8590dd4 +size 6627701834 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..671fa9b31c3de2fc3c9b740145b7bd5c04aa428e --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49dfee8275a21d28d179a51fdac0680234e691d6d101a40d18419fe308e1eab6 +size 14180 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd85ebfd5283eff16808c1da8eb6f40ac2533b35 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da963d3b7106f3b4c395d03ef6897e64220165c23480563310345c76ab7b120d +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eb97380660a81934bf8beacd83d146ea31d1a8d1 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,207 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.36272953978689637, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 2.3241435438071808e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-400/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.bin b/checkpoint-400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b5cbd401da9be76942a5f77e68c6aee19b7377d --- /dev/null +++ b/checkpoint-400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee21fd2086ed33b012216471edb3a6578cae41d61a3e2d70e9c32aa8da111518 +size 1657155522 diff --git a/checkpoint-400/adapter_model/README.md b/checkpoint-400/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-400/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-400/adapter_model/adapter_config.json b/checkpoint-400/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-400/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model/adapter_model.bin b/checkpoint-400/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b5cbd401da9be76942a5f77e68c6aee19b7377d --- /dev/null +++ b/checkpoint-400/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee21fd2086ed33b012216471edb3a6578cae41d61a3e2d70e9c32aa8da111518 +size 1657155522 diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da652808e74d2335e9e0572a7564d59b23f9f729 --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfe0aca92d683f4a2153f482e9b37c816c0eb75dbf95625d79e620a89e37225b +size 6627702922 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..295605bfcf37aa6885f63d5bf46f8d7a4cdc4f3a --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a266b55b76c6083a19d9348b15793530dbea83d95778a2278dedabf1a2bf7cc +size 14180 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..832e690d212928d7077c9c0b5e0914e58918690f --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe875adeb5e81f688f2c1021fea61085edbfb214cacba1a2ad6a420d6d64c64 +size 1064 diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-400/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db088e9f6799e766ca691155cfac614cb6943703 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,398 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7254590795737927, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 4.626609292910592e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-600/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-600/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model.bin b/checkpoint-600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d8e46d626c407af8850f4b6628f927409226368 --- /dev/null +++ b/checkpoint-600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee2928ef56942f3235728ff2c231d6d5861d904136af34d2027741964fe209d +size 1657155522 diff --git a/checkpoint-600/adapter_model/README.md b/checkpoint-600/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-600/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-600/adapter_model/adapter_config.json b/checkpoint-600/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-600/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model/adapter_model.bin b/checkpoint-600/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d8e46d626c407af8850f4b6628f927409226368 --- /dev/null +++ b/checkpoint-600/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee2928ef56942f3235728ff2c231d6d5861d904136af34d2027741964fe209d +size 1657155522 diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..86772dc685541c6933d9c7ce16e075c2a5e76188 --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f963a7064821784cfc371e33d455fde7e372f6be8ceacb32268b28b1464c09b +size 6627702922 diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3252702672ec1e139ba52e6ebef69a1ccfe6a306 --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d994d4a5b7b22ba4c5e1c4afc33f7135ce8ec062ddee21f5c0d47955fa2a6382 +size 14180 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..60116c1c19e8f7c4feaba1cfdd17285e828d28fd --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df3cd0cb1c67a326b465332bde0ecdeac62626bea0e2ea96c60fa6d8786b9f65 +size 1064 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3fbd0179aa011a7591d70c68e28ecb6a15f07367 --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,589 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0881886193606891, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 6.959436668848374e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-800/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-800/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model.bin b/checkpoint-800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..1522fcec552ed71923375ad0d1a2f8299ff17ed8 --- /dev/null +++ b/checkpoint-800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e2d860c21b4cf37dd4e6241f85216aa00c5210b7bcd612e3d4c46a109a66f1 +size 1657155522 diff --git a/checkpoint-800/adapter_model/README.md b/checkpoint-800/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6 --- /dev/null +++ b/checkpoint-800/adapter_model/README.md @@ -0,0 +1,20 @@ +--- +library_name: peft +--- +## Training procedure + + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-800/adapter_model/adapter_config.json b/checkpoint-800/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d --- /dev/null +++ b/checkpoint-800/adapter_model/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "152334H/miqu-1-70b-sf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16.0, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model/adapter_model.bin b/checkpoint-800/adapter_model/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..1522fcec552ed71923375ad0d1a2f8299ff17ed8 --- /dev/null +++ b/checkpoint-800/adapter_model/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e2d860c21b4cf37dd4e6241f85216aa00c5210b7bcd612e3d4c46a109a66f1 +size 1657155522 diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4cdbf8757d30d2ea6758894dda592a06ce51e06 --- /dev/null +++ b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e23f49320e9a9c2e5554c70f1d6e10e281e4f668be89773db1e245dd51e05fd7 +size 6627702922 diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..579ed11183d40da36869ebc636d6b0ab92aecf6a --- /dev/null +++ b/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9da05579932a2ba95d9f4364f034fd07049787d493fafba1013252edc115be8 +size 14180 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d9ca64f73600b59484091bfccba500da841d15 --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7847f5134d705577faee25af150145f5a0a552ec46c6008bc52ad812b9e8f038 +size 1064 diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547 --- /dev/null +++ b/checkpoint-800/special_tokens_map.json @@ -0,0 +1,12 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": "" +} diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055 --- /dev/null +++ b/checkpoint-800/tokenizer_config.json @@ -0,0 +1,71 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": false +} diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1936c7bda19baa859d443136a20bea43bb3b3be9 --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,780 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4509181591475855, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 9.275226951001375e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9 +size 6200 diff --git a/completed b/completed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7b6299ba979e0777e182dadf7015ce4065c50fb6 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.4, + "eval_loss": 1.151158332824707, + "eval_runtime": 572.7509, + "eval_samples_per_second": 1.746, + "eval_steps_per_second": 1.746 +} \ No newline at end of file diff --git a/metrics.json b/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0d9005ae0ee5a590979163b89a80a58a8487b6 --- /dev/null +++ b/metrics.json @@ -0,0 +1 @@ +{"run_name": "./output/miqu-70b", "train_runtime": 112632.6276, "train_samples_per_second": 0.266, "train_steps_per_second": 0.017, "train_loss": 0.8699908837636312, "epoch": 3.4, "eval_loss": 1.151158332824707, "eval_runtime": 572.7509, "eval_samples_per_second": 1.746, "eval_steps_per_second": 1.746} \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..85c0e2c2b53c0e560da22153b6264d6769293d16 --- /dev/null +++ b/train_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.4, + "train_loss": 0.8699908837636312, + "train_runtime": 112632.6276, + "train_samples_per_second": 0.266, + "train_steps_per_second": 0.017 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5436d6886b3aee93c6cd7f4cd19023561fc8abe --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1857 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.4005894355021535, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0001, + "loss": 1.1655, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001, + "loss": 1.001, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001, + "loss": 1.0287, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001, + "loss": 1.1578, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 50 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 60 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 70 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 80 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001, + "loss": 1.1264, + "step": 90 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001, + "loss": 1.2038, + "step": 100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 110 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 120 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 130 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 150 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 160 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 170 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 180 + }, + { + "epoch": 0.34, + "eval_loss": 1.0215636491775513, + "eval_runtime": 950.138, + "eval_samples_per_second": 1.052, + "eval_steps_per_second": 1.052, + "step": 187 + }, + { + "epoch": 0.34, + "mmlu_eval_accuracy": 0.731892294851104, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.5714285714285714, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.59, + "mmlu_eval_accuracy_nutrition": 0.7878787878787878, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.326305795171384, + "step": 187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001, + "loss": 1.1133, + "step": 190 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 210 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 230 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001, + "loss": 1.1425, + "step": 240 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 250 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001, + "loss": 0.872, + "step": 270 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001, + "loss": 1.0152, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001, + "loss": 1.1309, + "step": 290 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001, + "loss": 1.267, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 310 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001, + "loss": 0.9864, + "step": 330 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001, + "loss": 1.2354, + "step": 350 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001, + "loss": 0.9126, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 370 + }, + { + "epoch": 0.68, + "eval_loss": 1.0163359642028809, + "eval_runtime": 948.1151, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 374 + }, + { + "epoch": 0.68, + "mmlu_eval_accuracy": 0.7395476061435284, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7857142857142857, + "mmlu_eval_accuracy_astronomy": 0.75, + "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.5454545454545454, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9545454545454546, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8947368421052632, + "mmlu_loss": 1.2796503596061355, + "step": 374 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 380 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001, + "loss": 1.157, + "step": 390 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001, + "loss": 0.8687, + "step": 410 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 420 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 430 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001, + "loss": 1.2238, + "step": 440 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001, + "loss": 0.8756, + "step": 460 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001, + "loss": 0.8683, + "step": 470 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 480 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001, + "loss": 1.1574, + "step": 490 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 500 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 510 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 520 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 530 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001, + "loss": 1.1517, + "step": 540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 1.1654, + "step": 550 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 560 + }, + { + "epoch": 1.02, + "eval_loss": 1.0150845050811768, + "eval_runtime": 949.8392, + "eval_samples_per_second": 1.053, + "eval_steps_per_second": 1.053, + "step": 561 + }, + { + "epoch": 1.02, + "mmlu_eval_accuracy": 0.7346397374699287, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.7142857142857143, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.8181818181818182, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.56, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.186674291658526, + "step": 561 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 570 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001, + "loss": 1.0431, + "step": 600 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 610 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 620 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001, + "loss": 0.9499, + "step": 630 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001, + "loss": 1.0437, + "step": 640 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 650 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001, + "loss": 0.8405, + "step": 660 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 670 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 680 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 690 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001, + "loss": 1.0218, + "step": 700 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001, + "loss": 0.8763, + "step": 710 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001, + "loss": 0.8789, + "step": 720 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 730 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 740 + }, + { + "epoch": 1.36, + "eval_loss": 1.0305067300796509, + "eval_runtime": 948.7106, + "eval_samples_per_second": 1.054, + "eval_steps_per_second": 1.054, + "step": 748 + }, + { + "epoch": 1.36, + "mmlu_eval_accuracy": 0.7324229372189777, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8285714285714286, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6411764705882353, + "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258, + "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2988067958029479, + "step": 748 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001, + "loss": 1.0735, + "step": 750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 760 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001, + "loss": 0.8716, + "step": 770 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 780 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001, + "loss": 1.0338, + "step": 790 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001, + "loss": 1.0275, + "step": 800 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001, + "loss": 0.8382, + "step": 810 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001, + "loss": 0.8489, + "step": 820 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 830 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 840 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 850 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 860 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001, + "loss": 0.8608, + "step": 870 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 880 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001, + "loss": 1.0253, + "step": 890 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001, + "loss": 0.9905, + "step": 900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001, + "loss": 0.8487, + "step": 910 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001, + "loss": 0.8568, + "step": 920 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 1.7, + "eval_loss": 1.0250624418258667, + "eval_runtime": 946.4035, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 1.057, + "step": 935 + }, + { + "epoch": 1.7, + "mmlu_eval_accuracy": 0.7288948695878031, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.782608695652174, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516, + "mmlu_eval_accuracy_professional_law": 0.6235294117647059, + "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549, + "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.243813282909306, + "step": 935 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001, + "loss": 1.0174, + "step": 940 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 950 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 960 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001, + "loss": 0.8447, + "step": 970 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 980 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001, + "loss": 1.0331, + "step": 990 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001, + "loss": 1.0412, + "step": 1000 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1010 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001, + "loss": 0.8744, + "step": 1020 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 1030 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001, + "loss": 1.0053, + "step": 1040 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 1050 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001, + "loss": 0.8023, + "step": 1060 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001, + "loss": 0.8349, + "step": 1070 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 1080 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001, + "loss": 1.0971, + "step": 1090 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001, + "loss": 1.0728, + "step": 1100 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001, + "loss": 0.7724, + "step": 1110 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001, + "loss": 0.7675, + "step": 1120 + }, + { + "epoch": 2.03, + "eval_loss": 1.052681565284729, + "eval_runtime": 942.0722, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 1.061, + "step": 1122 + }, + { + "epoch": 2.03, + "mmlu_eval_accuracy": 0.7373981967098951, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.5454545454545454, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.8333333333333334, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366, + "mmlu_eval_accuracy_formal_logic": 0.7142857142857143, + "mmlu_eval_accuracy_global_facts": 0.4, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693, + "mmlu_eval_accuracy_human_aging": 0.8260869565217391, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.92, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.62, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6294117647058823, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.782608695652174, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2340081441760609, + "step": 1122 + }, + { + "epoch": 2.05, + "learning_rate": 0.0001, + "loss": 0.7194, + "step": 1130 + }, + { + "epoch": 2.07, + "learning_rate": 0.0001, + "loss": 0.8236, + "step": 1140 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001, + "loss": 0.6652, + "step": 1150 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001, + "loss": 0.7177, + "step": 1160 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001, + "loss": 0.7788, + "step": 1170 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001, + "loss": 0.8117, + "step": 1180 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001, + "loss": 0.8145, + "step": 1190 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001, + "loss": 0.6984, + "step": 1200 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001, + "loss": 0.7011, + "step": 1210 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001, + "loss": 0.769, + "step": 1220 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001, + "loss": 0.7705, + "step": 1230 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1240 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 0.6622, + "step": 1250 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001, + "loss": 0.6641, + "step": 1260 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001, + "loss": 0.7239, + "step": 1270 + }, + { + "epoch": 2.32, + "learning_rate": 0.0001, + "loss": 0.7618, + "step": 1280 + }, + { + "epoch": 2.34, + "learning_rate": 0.0001, + "loss": 0.7845, + "step": 1290 + }, + { + "epoch": 2.36, + "learning_rate": 0.0001, + "loss": 0.719, + "step": 1300 + }, + { + "epoch": 2.37, + "eval_loss": 1.1104822158813477, + "eval_runtime": 948.1299, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 1.055, + "step": 1309 + }, + { + "epoch": 2.37, + "mmlu_eval_accuracy": 0.7369285730399766, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.6428571428571429, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146, + "mmlu_eval_accuracy_formal_logic": 0.7857142857142857, + "mmlu_eval_accuracy_global_facts": 0.5, + "mmlu_eval_accuracy_high_school_biology": 0.84375, + "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091, + "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.95, + "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.57, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8529411764705882, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6058823529411764, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855, + "mmlu_eval_accuracy_public_relations": 0.5833333333333334, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.0866562834095908, + "step": 1309 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001, + "loss": 0.7093, + "step": 1310 + }, + { + "epoch": 2.39, + "learning_rate": 0.0001, + "loss": 0.7684, + "step": 1320 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001, + "loss": 0.7501, + "step": 1330 + }, + { + "epoch": 2.43, + "learning_rate": 0.0001, + "loss": 0.8043, + "step": 1340 + }, + { + "epoch": 2.45, + "learning_rate": 0.0001, + "loss": 0.6927, + "step": 1350 + }, + { + "epoch": 2.47, + "learning_rate": 0.0001, + "loss": 0.7278, + "step": 1360 + }, + { + "epoch": 2.48, + "learning_rate": 0.0001, + "loss": 0.8095, + "step": 1370 + }, + { + "epoch": 2.5, + "learning_rate": 0.0001, + "loss": 0.7463, + "step": 1380 + }, + { + "epoch": 2.52, + "learning_rate": 0.0001, + "loss": 0.7707, + "step": 1390 + }, + { + "epoch": 2.54, + "learning_rate": 0.0001, + "loss": 0.7152, + "step": 1400 + }, + { + "epoch": 2.56, + "learning_rate": 0.0001, + "loss": 0.687, + "step": 1410 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001, + "loss": 0.7529, + "step": 1420 + }, + { + "epoch": 2.59, + "learning_rate": 0.0001, + "loss": 0.7565, + "step": 1430 + }, + { + "epoch": 2.61, + "learning_rate": 0.0001, + "loss": 0.8066, + "step": 1440 + }, + { + "epoch": 2.63, + "learning_rate": 0.0001, + "loss": 0.7623, + "step": 1450 + }, + { + "epoch": 2.65, + "learning_rate": 0.0001, + "loss": 0.6947, + "step": 1460 + }, + { + "epoch": 2.67, + "learning_rate": 0.0001, + "loss": 0.7756, + "step": 1470 + }, + { + "epoch": 2.68, + "learning_rate": 0.0001, + "loss": 0.8453, + "step": 1480 + }, + { + "epoch": 2.7, + "learning_rate": 0.0001, + "loss": 0.8306, + "step": 1490 + }, + { + "epoch": 2.71, + "eval_loss": 1.100826621055603, + "eval_runtime": 940.4488, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 1.063, + "step": 1496 + }, + { + "epoch": 2.71, + "mmlu_eval_accuracy": 0.7363077307176445, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 0.9090909090909091, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.8125, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.8125, + "mmlu_eval_accuracy_elementary_mathematics": 0.6585365853658537, + "mmlu_eval_accuracy_formal_logic": 0.6428571428571429, + "mmlu_eval_accuracy_global_facts": 0.6, + "mmlu_eval_accuracy_high_school_biology": 0.78125, + "mmlu_eval_accuracy_high_school_chemistry": 0.5909090909090909, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9545454545454546, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046, + "mmlu_eval_accuracy_high_school_mathematics": 0.4827586206896552, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.9166666666666666, + "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.75, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.88, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.64, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.8235294117647058, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.7096774193548387, + "mmlu_eval_accuracy_professional_law": 0.6176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7971014492753623, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8518518518518519, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5555555555555556, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.2313211129857853, + "step": 1496 + }, + { + "epoch": 2.72, + "learning_rate": 0.0001, + "loss": 0.6937, + "step": 1500 + }, + { + "epoch": 2.74, + "learning_rate": 0.0001, + "loss": 0.6997, + "step": 1510 + }, + { + "epoch": 2.76, + "learning_rate": 0.0001, + "loss": 0.7588, + "step": 1520 + }, + { + "epoch": 2.77, + "learning_rate": 0.0001, + "loss": 0.7731, + "step": 1530 + }, + { + "epoch": 2.79, + "learning_rate": 0.0001, + "loss": 0.7914, + "step": 1540 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001, + "loss": 0.7175, + "step": 1550 + }, + { + "epoch": 2.83, + "learning_rate": 0.0001, + "loss": 0.7046, + "step": 1560 + }, + { + "epoch": 2.85, + "learning_rate": 0.0001, + "loss": 0.7597, + "step": 1570 + }, + { + "epoch": 2.87, + "learning_rate": 0.0001, + "loss": 0.7932, + "step": 1580 + }, + { + "epoch": 2.88, + "learning_rate": 0.0001, + "loss": 0.8059, + "step": 1590 + }, + { + "epoch": 2.9, + "learning_rate": 0.0001, + "loss": 0.7258, + "step": 1600 + }, + { + "epoch": 2.92, + "learning_rate": 0.0001, + "loss": 0.7486, + "step": 1610 + }, + { + "epoch": 2.94, + "learning_rate": 0.0001, + "loss": 0.7233, + "step": 1620 + }, + { + "epoch": 2.96, + "learning_rate": 0.0001, + "loss": 0.7945, + "step": 1630 + }, + { + "epoch": 2.97, + "learning_rate": 0.0001, + "loss": 0.8324, + "step": 1640 + }, + { + "epoch": 2.99, + "learning_rate": 0.0001, + "loss": 0.7294, + "step": 1650 + }, + { + "epoch": 3.01, + "learning_rate": 0.0001, + "loss": 0.6117, + "step": 1660 + }, + { + "epoch": 3.03, + "learning_rate": 0.0001, + "loss": 0.6464, + "step": 1670 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001, + "loss": 0.6156, + "step": 1680 + }, + { + "epoch": 3.05, + "eval_loss": 1.1478718519210815, + "eval_runtime": 932.4225, + "eval_samples_per_second": 1.072, + "eval_steps_per_second": 1.072, + "step": 1683 + }, + { + "epoch": 3.05, + "mmlu_eval_accuracy": 0.745366643285036, + "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273, + "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, + "mmlu_eval_accuracy_college_medicine": 0.9090909090909091, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.6363636363636364, + "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.875, + "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317, + "mmlu_eval_accuracy_formal_logic": 0.7857142857142857, + "mmlu_eval_accuracy_global_facts": 0.8, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.5454545454545454, + "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231, + "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.75, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.96, + "mmlu_eval_accuracy_medical_genetics": 1.0, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.7894736842105263, + "mmlu_eval_accuracy_moral_scenarios": 0.61, + "mmlu_eval_accuracy_nutrition": 0.7272727272727273, + "mmlu_eval_accuracy_philosophy": 0.7647058823529411, + "mmlu_eval_accuracy_prehistory": 0.8571428571428571, + "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096, + "mmlu_eval_accuracy_professional_law": 0.6176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.9032258064516129, + "mmlu_eval_accuracy_professional_psychology": 0.7391304347826086, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.7777777777777778, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.4050485734687297, + "step": 1683 + }, + { + "epoch": 3.07, + "learning_rate": 0.0001, + "loss": 0.5237, + "step": 1690 + }, + { + "epoch": 3.08, + "learning_rate": 0.0001, + "loss": 0.3516, + "step": 1700 + }, + { + "epoch": 3.1, + "learning_rate": 0.0001, + "loss": 0.4976, + "step": 1710 + }, + { + "epoch": 3.12, + "learning_rate": 0.0001, + "loss": 0.6535, + "step": 1720 + }, + { + "epoch": 3.14, + "learning_rate": 0.0001, + "loss": 0.5926, + "step": 1730 + }, + { + "epoch": 3.16, + "learning_rate": 0.0001, + "loss": 0.5476, + "step": 1740 + }, + { + "epoch": 3.17, + "learning_rate": 0.0001, + "loss": 0.368, + "step": 1750 + }, + { + "epoch": 3.19, + "learning_rate": 0.0001, + "loss": 0.5043, + "step": 1760 + }, + { + "epoch": 3.21, + "learning_rate": 0.0001, + "loss": 0.5907, + "step": 1770 + }, + { + "epoch": 3.23, + "learning_rate": 0.0001, + "loss": 0.5609, + "step": 1780 + }, + { + "epoch": 3.25, + "learning_rate": 0.0001, + "loss": 0.5272, + "step": 1790 + }, + { + "epoch": 3.26, + "learning_rate": 0.0001, + "loss": 0.3672, + "step": 1800 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001, + "loss": 0.4947, + "step": 1810 + }, + { + "epoch": 3.3, + "learning_rate": 0.0001, + "loss": 0.6441, + "step": 1820 + }, + { + "epoch": 3.32, + "learning_rate": 0.0001, + "loss": 0.5989, + "step": 1830 + }, + { + "epoch": 3.34, + "learning_rate": 0.0001, + "loss": 0.5411, + "step": 1840 + }, + { + "epoch": 3.36, + "learning_rate": 0.0001, + "loss": 0.401, + "step": 1850 + }, + { + "epoch": 3.37, + "learning_rate": 0.0001, + "loss": 0.4685, + "step": 1860 + }, + { + "epoch": 3.39, + "learning_rate": 0.0001, + "loss": 0.6234, + "step": 1870 + }, + { + "epoch": 3.39, + "eval_loss": 1.1522600650787354, + "eval_runtime": 572.6447, + "eval_samples_per_second": 1.746, + "eval_steps_per_second": 1.746, + "step": 1870 + }, + { + "epoch": 3.39, + "mmlu_eval_accuracy": 0.7349633316353468, + "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, + "mmlu_eval_accuracy_anatomy": 0.5714285714285714, + "mmlu_eval_accuracy_astronomy": 0.6875, + "mmlu_eval_accuracy_business_ethics": 1.0, + "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413, + "mmlu_eval_accuracy_college_biology": 0.875, + "mmlu_eval_accuracy_college_chemistry": 0.5, + "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, + "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, + "mmlu_eval_accuracy_college_medicine": 0.8636363636363636, + "mmlu_eval_accuracy_college_physics": 0.6363636363636364, + "mmlu_eval_accuracy_computer_security": 0.7272727272727273, + "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154, + "mmlu_eval_accuracy_econometrics": 0.75, + "mmlu_eval_accuracy_electrical_engineering": 0.75, + "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146, + "mmlu_eval_accuracy_formal_logic": 0.7142857142857143, + "mmlu_eval_accuracy_global_facts": 0.6, + "mmlu_eval_accuracy_high_school_biology": 0.8125, + "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453, + "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888, + "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778, + "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523, + "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163, + "mmlu_eval_accuracy_high_school_mathematics": 0.4482758620689655, + "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616, + "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, + "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333, + "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783, + "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091, + "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077, + "mmlu_eval_accuracy_human_aging": 0.7391304347826086, + "mmlu_eval_accuracy_human_sexuality": 0.75, + "mmlu_eval_accuracy_international_law": 1.0, + "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454, + "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778, + "mmlu_eval_accuracy_machine_learning": 0.6363636363636364, + "mmlu_eval_accuracy_management": 0.9090909090909091, + "mmlu_eval_accuracy_marketing": 0.84, + "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091, + "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046, + "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105, + "mmlu_eval_accuracy_moral_scenarios": 0.64, + "mmlu_eval_accuracy_nutrition": 0.7575757575757576, + "mmlu_eval_accuracy_philosophy": 0.7941176470588235, + "mmlu_eval_accuracy_prehistory": 0.8857142857142857, + "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806, + "mmlu_eval_accuracy_professional_law": 0.6176470588235294, + "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839, + "mmlu_eval_accuracy_professional_psychology": 0.7246376811594203, + "mmlu_eval_accuracy_public_relations": 0.6666666666666666, + "mmlu_eval_accuracy_security_studies": 0.8148148148148148, + "mmlu_eval_accuracy_sociology": 0.9090909090909091, + "mmlu_eval_accuracy_us_foreign_policy": 1.0, + "mmlu_eval_accuracy_virology": 0.5, + "mmlu_eval_accuracy_world_religions": 0.8421052631578947, + "mmlu_loss": 1.221846800616253, + "step": 1870 + }, + { + "epoch": 3.4, + "step": 1875, + "total_flos": 2.1784431229955113e+18, + "train_loss": 0.8699908837636312, + "train_runtime": 112632.6276, + "train_samples_per_second": 0.266, + "train_steps_per_second": 0.017 + } + ], + "max_steps": 1875, + "num_train_epochs": 4, + "total_flos": 2.1784431229955113e+18, + "trial_name": null, + "trial_params": null +}