diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..70524eda2366a51e505667bb9f1e80fd36a79c12
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,11 @@
+{
+ "epoch": 3.4,
+ "eval_loss": 1.151158332824707,
+ "eval_runtime": 572.7509,
+ "eval_samples_per_second": 1.746,
+ "eval_steps_per_second": 1.746,
+ "train_loss": 0.8699908837636312,
+ "train_runtime": 112632.6276,
+ "train_samples_per_second": 0.266,
+ "train_steps_per_second": 0.017
+}
\ No newline at end of file
diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1000/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1000/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1000/adapter_model.bin b/checkpoint-1000/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dadc215a63fed58c89df77db1bd8df3d49058b2a
--- /dev/null
+++ b/checkpoint-1000/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bc29d1b7b8d9fbb7bdf2819f6d0628cea3d5ab845cc689cb80acece39912a3b
+size 1657155522
diff --git a/checkpoint-1000/adapter_model/README.md b/checkpoint-1000/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1000/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1000/adapter_model/adapter_config.json b/checkpoint-1000/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1000/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1000/adapter_model/adapter_model.bin b/checkpoint-1000/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dadc215a63fed58c89df77db1bd8df3d49058b2a
--- /dev/null
+++ b/checkpoint-1000/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bc29d1b7b8d9fbb7bdf2819f6d0628cea3d5ab845cc689cb80acece39912a3b
+size 1657155522
diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2509cc47164aed9cafb70d5ef512cc700a282cca
--- /dev/null
+++ b/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fff8ab4eb57b8e9147ca09f977042d1e861ca602fa63f83f85702beb67365ec
+size 6627702922
diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..264165dcb2a05b58a99b090b5d58834c28e05bc3
--- /dev/null
+++ b/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e113e075e7ce260c7f7e75bb24de1ff504604347dbc91f90709c86d5a09023f2
+size 14180
diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e48445ba4d02f1dfa03918780d1ef0e6a4198b00
--- /dev/null
+++ b/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:016164c1868d1353a972df97439a4a6f6ad10c19164c770b2c7d8301f524b82a
+size 1064
diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-1000/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-1000/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-1000/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c75ae97af80eab73ceca339bae5bd106a62f2cd
--- /dev/null
+++ b/checkpoint-1000/trainer_state.json
@@ -0,0 +1,971 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.8136476989344819,
+ "global_step": 1000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ },
+ {
+ "epoch": 1.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8382,
+ "step": 810
+ },
+ {
+ "epoch": 1.49,
+ "learning_rate": 0.0001,
+ "loss": 0.8489,
+ "step": 820
+ },
+ {
+ "epoch": 1.51,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 830
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0001,
+ "loss": 1.0515,
+ "step": 840
+ },
+ {
+ "epoch": 1.54,
+ "learning_rate": 0.0001,
+ "loss": 1.0965,
+ "step": 850
+ },
+ {
+ "epoch": 1.56,
+ "learning_rate": 0.0001,
+ "loss": 0.8928,
+ "step": 860
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 0.0001,
+ "loss": 0.8608,
+ "step": 870
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0001,
+ "loss": 0.8831,
+ "step": 880
+ },
+ {
+ "epoch": 1.61,
+ "learning_rate": 0.0001,
+ "loss": 1.0253,
+ "step": 890
+ },
+ {
+ "epoch": 1.63,
+ "learning_rate": 0.0001,
+ "loss": 0.9905,
+ "step": 900
+ },
+ {
+ "epoch": 1.65,
+ "learning_rate": 0.0001,
+ "loss": 0.8487,
+ "step": 910
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 0.0001,
+ "loss": 0.8568,
+ "step": 920
+ },
+ {
+ "epoch": 1.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9047,
+ "step": 930
+ },
+ {
+ "epoch": 1.7,
+ "eval_loss": 1.0250624418258667,
+ "eval_runtime": 946.4035,
+ "eval_samples_per_second": 1.057,
+ "eval_steps_per_second": 1.057,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "mmlu_eval_accuracy": 0.7288948695878031,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6235294117647059,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.243813282909306,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "learning_rate": 0.0001,
+ "loss": 1.0174,
+ "step": 940
+ },
+ {
+ "epoch": 1.72,
+ "learning_rate": 0.0001,
+ "loss": 1.0302,
+ "step": 950
+ },
+ {
+ "epoch": 1.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8799,
+ "step": 960
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8447,
+ "step": 970
+ },
+ {
+ "epoch": 1.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9053,
+ "step": 980
+ },
+ {
+ "epoch": 1.8,
+ "learning_rate": 0.0001,
+ "loss": 1.0331,
+ "step": 990
+ },
+ {
+ "epoch": 1.81,
+ "learning_rate": 0.0001,
+ "loss": 1.0412,
+ "step": 1000
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 1.158485717946876e+18,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1200/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1200/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_model.bin b/checkpoint-1200/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..766929c3eeb35ef67e2453a2b8312d3772511c37
--- /dev/null
+++ b/checkpoint-1200/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad32043ca97c48601084cb3f502e591bccca0879804c4979972d332fc79a801f
+size 1657155522
diff --git a/checkpoint-1200/adapter_model/README.md b/checkpoint-1200/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1200/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1200/adapter_model/adapter_config.json b/checkpoint-1200/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1200/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_model/adapter_model.bin b/checkpoint-1200/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..766929c3eeb35ef67e2453a2b8312d3772511c37
--- /dev/null
+++ b/checkpoint-1200/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad32043ca97c48601084cb3f502e591bccca0879804c4979972d332fc79a801f
+size 1657155522
diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..51a0de5b7f40cbeea6450792d41e1596d1cf84a8
--- /dev/null
+++ b/checkpoint-1200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de2f756f0c8b03dbc310fe2201c53f94e44de0a72ecbe1a58087f0f6916b3c1b
+size 6627702922
diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..68e45ee8ecadc1c0d946b3eaccb8f9133cab023e
--- /dev/null
+++ b/checkpoint-1200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36539f193dde7f6dd2cc8b72d99a411c97b376a7260c2930cb324081a6c6ee3c
+size 14180
diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..eec2e1f675639f4b8ac1a827e2890c1cc4642b76
--- /dev/null
+++ b/checkpoint-1200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c5671c0de422189dff152cae166eb49cff39ac2aa88bdb353ba6e07d93451bf
+size 1064
diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-1200/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-1200/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-1200/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..43d8f393b77e880eff76fef57b90bcab7e271e26
--- /dev/null
+++ b/checkpoint-1200/trainer_state.json
@@ -0,0 +1,1162 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.1763772387213782,
+ "global_step": 1200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ },
+ {
+ "epoch": 1.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8382,
+ "step": 810
+ },
+ {
+ "epoch": 1.49,
+ "learning_rate": 0.0001,
+ "loss": 0.8489,
+ "step": 820
+ },
+ {
+ "epoch": 1.51,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 830
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0001,
+ "loss": 1.0515,
+ "step": 840
+ },
+ {
+ "epoch": 1.54,
+ "learning_rate": 0.0001,
+ "loss": 1.0965,
+ "step": 850
+ },
+ {
+ "epoch": 1.56,
+ "learning_rate": 0.0001,
+ "loss": 0.8928,
+ "step": 860
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 0.0001,
+ "loss": 0.8608,
+ "step": 870
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0001,
+ "loss": 0.8831,
+ "step": 880
+ },
+ {
+ "epoch": 1.61,
+ "learning_rate": 0.0001,
+ "loss": 1.0253,
+ "step": 890
+ },
+ {
+ "epoch": 1.63,
+ "learning_rate": 0.0001,
+ "loss": 0.9905,
+ "step": 900
+ },
+ {
+ "epoch": 1.65,
+ "learning_rate": 0.0001,
+ "loss": 0.8487,
+ "step": 910
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 0.0001,
+ "loss": 0.8568,
+ "step": 920
+ },
+ {
+ "epoch": 1.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9047,
+ "step": 930
+ },
+ {
+ "epoch": 1.7,
+ "eval_loss": 1.0250624418258667,
+ "eval_runtime": 946.4035,
+ "eval_samples_per_second": 1.057,
+ "eval_steps_per_second": 1.057,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "mmlu_eval_accuracy": 0.7288948695878031,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6235294117647059,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.243813282909306,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "learning_rate": 0.0001,
+ "loss": 1.0174,
+ "step": 940
+ },
+ {
+ "epoch": 1.72,
+ "learning_rate": 0.0001,
+ "loss": 1.0302,
+ "step": 950
+ },
+ {
+ "epoch": 1.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8799,
+ "step": 960
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8447,
+ "step": 970
+ },
+ {
+ "epoch": 1.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9053,
+ "step": 980
+ },
+ {
+ "epoch": 1.8,
+ "learning_rate": 0.0001,
+ "loss": 1.0331,
+ "step": 990
+ },
+ {
+ "epoch": 1.81,
+ "learning_rate": 0.0001,
+ "loss": 1.0412,
+ "step": 1000
+ },
+ {
+ "epoch": 1.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8753,
+ "step": 1010
+ },
+ {
+ "epoch": 1.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8744,
+ "step": 1020
+ },
+ {
+ "epoch": 1.87,
+ "learning_rate": 0.0001,
+ "loss": 0.8899,
+ "step": 1030
+ },
+ {
+ "epoch": 1.89,
+ "learning_rate": 0.0001,
+ "loss": 1.0053,
+ "step": 1040
+ },
+ {
+ "epoch": 1.9,
+ "learning_rate": 0.0001,
+ "loss": 1.0127,
+ "step": 1050
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8023,
+ "step": 1060
+ },
+ {
+ "epoch": 1.94,
+ "learning_rate": 0.0001,
+ "loss": 0.8349,
+ "step": 1070
+ },
+ {
+ "epoch": 1.96,
+ "learning_rate": 0.0001,
+ "loss": 0.9742,
+ "step": 1080
+ },
+ {
+ "epoch": 1.98,
+ "learning_rate": 0.0001,
+ "loss": 1.0971,
+ "step": 1090
+ },
+ {
+ "epoch": 2.0,
+ "learning_rate": 0.0001,
+ "loss": 1.0728,
+ "step": 1100
+ },
+ {
+ "epoch": 2.01,
+ "learning_rate": 0.0001,
+ "loss": 0.7724,
+ "step": 1110
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 0.0001,
+ "loss": 0.7675,
+ "step": 1120
+ },
+ {
+ "epoch": 2.03,
+ "eval_loss": 1.052681565284729,
+ "eval_runtime": 942.0722,
+ "eval_samples_per_second": 1.061,
+ "eval_steps_per_second": 1.061,
+ "step": 1122
+ },
+ {
+ "epoch": 2.03,
+ "mmlu_eval_accuracy": 0.7373981967098951,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366,
+ "mmlu_eval_accuracy_formal_logic": 0.7142857142857143,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.62,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.782608695652174,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2340081441760609,
+ "step": 1122
+ },
+ {
+ "epoch": 2.05,
+ "learning_rate": 0.0001,
+ "loss": 0.7194,
+ "step": 1130
+ },
+ {
+ "epoch": 2.07,
+ "learning_rate": 0.0001,
+ "loss": 0.8236,
+ "step": 1140
+ },
+ {
+ "epoch": 2.09,
+ "learning_rate": 0.0001,
+ "loss": 0.6652,
+ "step": 1150
+ },
+ {
+ "epoch": 2.1,
+ "learning_rate": 0.0001,
+ "loss": 0.7177,
+ "step": 1160
+ },
+ {
+ "epoch": 2.12,
+ "learning_rate": 0.0001,
+ "loss": 0.7788,
+ "step": 1170
+ },
+ {
+ "epoch": 2.14,
+ "learning_rate": 0.0001,
+ "loss": 0.8117,
+ "step": 1180
+ },
+ {
+ "epoch": 2.16,
+ "learning_rate": 0.0001,
+ "loss": 0.8145,
+ "step": 1190
+ },
+ {
+ "epoch": 2.18,
+ "learning_rate": 0.0001,
+ "loss": 0.6984,
+ "step": 1200
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 1.3906525682785812e+18,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-1200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1400/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1400/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_model.bin b/checkpoint-1400/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7ad5754242cfc89592d1fda9051dc7105f2ce05e
--- /dev/null
+++ b/checkpoint-1400/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1533cea48affa188e9822cffe2f4176c550f93d32a8939add43058d977fbd29
+size 1657155522
diff --git a/checkpoint-1400/adapter_model/README.md b/checkpoint-1400/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1400/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1400/adapter_model/adapter_config.json b/checkpoint-1400/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1400/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_model/adapter_model.bin b/checkpoint-1400/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7ad5754242cfc89592d1fda9051dc7105f2ce05e
--- /dev/null
+++ b/checkpoint-1400/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1533cea48affa188e9822cffe2f4176c550f93d32a8939add43058d977fbd29
+size 1657155522
diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f71fdb8460793afb2967d10919225ca1ef07b2e4
--- /dev/null
+++ b/checkpoint-1400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55ab8de049e35c0670e2b97c2dcffcc99fcb8e1c34ca4e4de2f3d6c3717664bf
+size 6627702922
diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0ba81bae71c24abadf8790a49b1509caec021df5
--- /dev/null
+++ b/checkpoint-1400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bb2503442a3c1d2dd808417344726e9ee1fe213f212edf8a440ebcb1863ef6f
+size 14180
diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2000daee8a135b01a9047e3a8cd2aff0dbc58155
--- /dev/null
+++ b/checkpoint-1400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90d628735beb51fe7df523288b184f458e4b2c3ef9d3bc30ab77bfb1a76e0ba0
+size 1064
diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-1400/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-1400/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-1400/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c510dd1f4395f3a3845e79d7d972e971b70f339
--- /dev/null
+++ b/checkpoint-1400/trainer_state.json
@@ -0,0 +1,1353 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.539106778508275,
+ "global_step": 1400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ },
+ {
+ "epoch": 1.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8382,
+ "step": 810
+ },
+ {
+ "epoch": 1.49,
+ "learning_rate": 0.0001,
+ "loss": 0.8489,
+ "step": 820
+ },
+ {
+ "epoch": 1.51,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 830
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0001,
+ "loss": 1.0515,
+ "step": 840
+ },
+ {
+ "epoch": 1.54,
+ "learning_rate": 0.0001,
+ "loss": 1.0965,
+ "step": 850
+ },
+ {
+ "epoch": 1.56,
+ "learning_rate": 0.0001,
+ "loss": 0.8928,
+ "step": 860
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 0.0001,
+ "loss": 0.8608,
+ "step": 870
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0001,
+ "loss": 0.8831,
+ "step": 880
+ },
+ {
+ "epoch": 1.61,
+ "learning_rate": 0.0001,
+ "loss": 1.0253,
+ "step": 890
+ },
+ {
+ "epoch": 1.63,
+ "learning_rate": 0.0001,
+ "loss": 0.9905,
+ "step": 900
+ },
+ {
+ "epoch": 1.65,
+ "learning_rate": 0.0001,
+ "loss": 0.8487,
+ "step": 910
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 0.0001,
+ "loss": 0.8568,
+ "step": 920
+ },
+ {
+ "epoch": 1.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9047,
+ "step": 930
+ },
+ {
+ "epoch": 1.7,
+ "eval_loss": 1.0250624418258667,
+ "eval_runtime": 946.4035,
+ "eval_samples_per_second": 1.057,
+ "eval_steps_per_second": 1.057,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "mmlu_eval_accuracy": 0.7288948695878031,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6235294117647059,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.243813282909306,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "learning_rate": 0.0001,
+ "loss": 1.0174,
+ "step": 940
+ },
+ {
+ "epoch": 1.72,
+ "learning_rate": 0.0001,
+ "loss": 1.0302,
+ "step": 950
+ },
+ {
+ "epoch": 1.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8799,
+ "step": 960
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8447,
+ "step": 970
+ },
+ {
+ "epoch": 1.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9053,
+ "step": 980
+ },
+ {
+ "epoch": 1.8,
+ "learning_rate": 0.0001,
+ "loss": 1.0331,
+ "step": 990
+ },
+ {
+ "epoch": 1.81,
+ "learning_rate": 0.0001,
+ "loss": 1.0412,
+ "step": 1000
+ },
+ {
+ "epoch": 1.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8753,
+ "step": 1010
+ },
+ {
+ "epoch": 1.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8744,
+ "step": 1020
+ },
+ {
+ "epoch": 1.87,
+ "learning_rate": 0.0001,
+ "loss": 0.8899,
+ "step": 1030
+ },
+ {
+ "epoch": 1.89,
+ "learning_rate": 0.0001,
+ "loss": 1.0053,
+ "step": 1040
+ },
+ {
+ "epoch": 1.9,
+ "learning_rate": 0.0001,
+ "loss": 1.0127,
+ "step": 1050
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8023,
+ "step": 1060
+ },
+ {
+ "epoch": 1.94,
+ "learning_rate": 0.0001,
+ "loss": 0.8349,
+ "step": 1070
+ },
+ {
+ "epoch": 1.96,
+ "learning_rate": 0.0001,
+ "loss": 0.9742,
+ "step": 1080
+ },
+ {
+ "epoch": 1.98,
+ "learning_rate": 0.0001,
+ "loss": 1.0971,
+ "step": 1090
+ },
+ {
+ "epoch": 2.0,
+ "learning_rate": 0.0001,
+ "loss": 1.0728,
+ "step": 1100
+ },
+ {
+ "epoch": 2.01,
+ "learning_rate": 0.0001,
+ "loss": 0.7724,
+ "step": 1110
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 0.0001,
+ "loss": 0.7675,
+ "step": 1120
+ },
+ {
+ "epoch": 2.03,
+ "eval_loss": 1.052681565284729,
+ "eval_runtime": 942.0722,
+ "eval_samples_per_second": 1.061,
+ "eval_steps_per_second": 1.061,
+ "step": 1122
+ },
+ {
+ "epoch": 2.03,
+ "mmlu_eval_accuracy": 0.7373981967098951,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366,
+ "mmlu_eval_accuracy_formal_logic": 0.7142857142857143,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.62,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.782608695652174,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2340081441760609,
+ "step": 1122
+ },
+ {
+ "epoch": 2.05,
+ "learning_rate": 0.0001,
+ "loss": 0.7194,
+ "step": 1130
+ },
+ {
+ "epoch": 2.07,
+ "learning_rate": 0.0001,
+ "loss": 0.8236,
+ "step": 1140
+ },
+ {
+ "epoch": 2.09,
+ "learning_rate": 0.0001,
+ "loss": 0.6652,
+ "step": 1150
+ },
+ {
+ "epoch": 2.1,
+ "learning_rate": 0.0001,
+ "loss": 0.7177,
+ "step": 1160
+ },
+ {
+ "epoch": 2.12,
+ "learning_rate": 0.0001,
+ "loss": 0.7788,
+ "step": 1170
+ },
+ {
+ "epoch": 2.14,
+ "learning_rate": 0.0001,
+ "loss": 0.8117,
+ "step": 1180
+ },
+ {
+ "epoch": 2.16,
+ "learning_rate": 0.0001,
+ "loss": 0.8145,
+ "step": 1190
+ },
+ {
+ "epoch": 2.18,
+ "learning_rate": 0.0001,
+ "loss": 0.6984,
+ "step": 1200
+ },
+ {
+ "epoch": 2.19,
+ "learning_rate": 0.0001,
+ "loss": 0.7011,
+ "step": 1210
+ },
+ {
+ "epoch": 2.21,
+ "learning_rate": 0.0001,
+ "loss": 0.769,
+ "step": 1220
+ },
+ {
+ "epoch": 2.23,
+ "learning_rate": 0.0001,
+ "loss": 0.7705,
+ "step": 1230
+ },
+ {
+ "epoch": 2.25,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1240
+ },
+ {
+ "epoch": 2.27,
+ "learning_rate": 0.0001,
+ "loss": 0.6622,
+ "step": 1250
+ },
+ {
+ "epoch": 2.29,
+ "learning_rate": 0.0001,
+ "loss": 0.6641,
+ "step": 1260
+ },
+ {
+ "epoch": 2.3,
+ "learning_rate": 0.0001,
+ "loss": 0.7239,
+ "step": 1270
+ },
+ {
+ "epoch": 2.32,
+ "learning_rate": 0.0001,
+ "loss": 0.7618,
+ "step": 1280
+ },
+ {
+ "epoch": 2.34,
+ "learning_rate": 0.0001,
+ "loss": 0.7845,
+ "step": 1290
+ },
+ {
+ "epoch": 2.36,
+ "learning_rate": 0.0001,
+ "loss": 0.719,
+ "step": 1300
+ },
+ {
+ "epoch": 2.37,
+ "eval_loss": 1.1104822158813477,
+ "eval_runtime": 948.1299,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 1309
+ },
+ {
+ "epoch": 2.37,
+ "mmlu_eval_accuracy": 0.7369285730399766,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.7272727272727273,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146,
+ "mmlu_eval_accuracy_formal_logic": 0.7857142857142857,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8529411764705882,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6058823529411764,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.0866562834095908,
+ "step": 1309
+ },
+ {
+ "epoch": 2.38,
+ "learning_rate": 0.0001,
+ "loss": 0.7093,
+ "step": 1310
+ },
+ {
+ "epoch": 2.39,
+ "learning_rate": 0.0001,
+ "loss": 0.7684,
+ "step": 1320
+ },
+ {
+ "epoch": 2.41,
+ "learning_rate": 0.0001,
+ "loss": 0.7501,
+ "step": 1330
+ },
+ {
+ "epoch": 2.43,
+ "learning_rate": 0.0001,
+ "loss": 0.8043,
+ "step": 1340
+ },
+ {
+ "epoch": 2.45,
+ "learning_rate": 0.0001,
+ "loss": 0.6927,
+ "step": 1350
+ },
+ {
+ "epoch": 2.47,
+ "learning_rate": 0.0001,
+ "loss": 0.7278,
+ "step": 1360
+ },
+ {
+ "epoch": 2.48,
+ "learning_rate": 0.0001,
+ "loss": 0.8095,
+ "step": 1370
+ },
+ {
+ "epoch": 2.5,
+ "learning_rate": 0.0001,
+ "loss": 0.7463,
+ "step": 1380
+ },
+ {
+ "epoch": 2.52,
+ "learning_rate": 0.0001,
+ "loss": 0.7707,
+ "step": 1390
+ },
+ {
+ "epoch": 2.54,
+ "learning_rate": 0.0001,
+ "loss": 0.7152,
+ "step": 1400
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 1.6231629152399524e+18,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-1400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1600/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1600/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1600/adapter_model.bin b/checkpoint-1600/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c7c854cb374928ff554bac17fa2598f6139b1f5b
--- /dev/null
+++ b/checkpoint-1600/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9a0f9e046389866de742b94122dd6dbb44196a645e8a8912ab9becd3b6e9ee2
+size 1657155522
diff --git a/checkpoint-1600/adapter_model/README.md b/checkpoint-1600/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1600/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1600/adapter_model/adapter_config.json b/checkpoint-1600/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1600/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1600/adapter_model/adapter_model.bin b/checkpoint-1600/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c7c854cb374928ff554bac17fa2598f6139b1f5b
--- /dev/null
+++ b/checkpoint-1600/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9a0f9e046389866de742b94122dd6dbb44196a645e8a8912ab9becd3b6e9ee2
+size 1657155522
diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5489cdfdf53254e879064dec7d1893dd750da9ac
--- /dev/null
+++ b/checkpoint-1600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:991313c3544fd29570cdd6d6c35cee055932460db308158ecc93c1bf6e12e312
+size 6627702922
diff --git a/checkpoint-1600/rng_state.pth b/checkpoint-1600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cbfce4ead5f82f01ff08db755454e08003608666
--- /dev/null
+++ b/checkpoint-1600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a23ca79cae1540f2b114a028cd9472e6869461ba37577bd02be151a5f22a4e
+size 14180
diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b878c5736da6c68181e3daa099482e413def6cdf
--- /dev/null
+++ b/checkpoint-1600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91889afd10b68ec25b037a974a49914c98ca7cbeb59686cdaae2470ce449354d
+size 1064
diff --git a/checkpoint-1600/special_tokens_map.json b/checkpoint-1600/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-1600/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-1600/tokenizer.model b/checkpoint-1600/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-1600/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-1600/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6bac3e557f68d0cc3646c7838b196f86fb937a04
--- /dev/null
+++ b/checkpoint-1600/trainer_state.json
@@ -0,0 +1,1544 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.901836318295171,
+ "global_step": 1600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ },
+ {
+ "epoch": 1.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8382,
+ "step": 810
+ },
+ {
+ "epoch": 1.49,
+ "learning_rate": 0.0001,
+ "loss": 0.8489,
+ "step": 820
+ },
+ {
+ "epoch": 1.51,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 830
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0001,
+ "loss": 1.0515,
+ "step": 840
+ },
+ {
+ "epoch": 1.54,
+ "learning_rate": 0.0001,
+ "loss": 1.0965,
+ "step": 850
+ },
+ {
+ "epoch": 1.56,
+ "learning_rate": 0.0001,
+ "loss": 0.8928,
+ "step": 860
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 0.0001,
+ "loss": 0.8608,
+ "step": 870
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0001,
+ "loss": 0.8831,
+ "step": 880
+ },
+ {
+ "epoch": 1.61,
+ "learning_rate": 0.0001,
+ "loss": 1.0253,
+ "step": 890
+ },
+ {
+ "epoch": 1.63,
+ "learning_rate": 0.0001,
+ "loss": 0.9905,
+ "step": 900
+ },
+ {
+ "epoch": 1.65,
+ "learning_rate": 0.0001,
+ "loss": 0.8487,
+ "step": 910
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 0.0001,
+ "loss": 0.8568,
+ "step": 920
+ },
+ {
+ "epoch": 1.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9047,
+ "step": 930
+ },
+ {
+ "epoch": 1.7,
+ "eval_loss": 1.0250624418258667,
+ "eval_runtime": 946.4035,
+ "eval_samples_per_second": 1.057,
+ "eval_steps_per_second": 1.057,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "mmlu_eval_accuracy": 0.7288948695878031,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6235294117647059,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.243813282909306,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "learning_rate": 0.0001,
+ "loss": 1.0174,
+ "step": 940
+ },
+ {
+ "epoch": 1.72,
+ "learning_rate": 0.0001,
+ "loss": 1.0302,
+ "step": 950
+ },
+ {
+ "epoch": 1.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8799,
+ "step": 960
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8447,
+ "step": 970
+ },
+ {
+ "epoch": 1.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9053,
+ "step": 980
+ },
+ {
+ "epoch": 1.8,
+ "learning_rate": 0.0001,
+ "loss": 1.0331,
+ "step": 990
+ },
+ {
+ "epoch": 1.81,
+ "learning_rate": 0.0001,
+ "loss": 1.0412,
+ "step": 1000
+ },
+ {
+ "epoch": 1.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8753,
+ "step": 1010
+ },
+ {
+ "epoch": 1.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8744,
+ "step": 1020
+ },
+ {
+ "epoch": 1.87,
+ "learning_rate": 0.0001,
+ "loss": 0.8899,
+ "step": 1030
+ },
+ {
+ "epoch": 1.89,
+ "learning_rate": 0.0001,
+ "loss": 1.0053,
+ "step": 1040
+ },
+ {
+ "epoch": 1.9,
+ "learning_rate": 0.0001,
+ "loss": 1.0127,
+ "step": 1050
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8023,
+ "step": 1060
+ },
+ {
+ "epoch": 1.94,
+ "learning_rate": 0.0001,
+ "loss": 0.8349,
+ "step": 1070
+ },
+ {
+ "epoch": 1.96,
+ "learning_rate": 0.0001,
+ "loss": 0.9742,
+ "step": 1080
+ },
+ {
+ "epoch": 1.98,
+ "learning_rate": 0.0001,
+ "loss": 1.0971,
+ "step": 1090
+ },
+ {
+ "epoch": 2.0,
+ "learning_rate": 0.0001,
+ "loss": 1.0728,
+ "step": 1100
+ },
+ {
+ "epoch": 2.01,
+ "learning_rate": 0.0001,
+ "loss": 0.7724,
+ "step": 1110
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 0.0001,
+ "loss": 0.7675,
+ "step": 1120
+ },
+ {
+ "epoch": 2.03,
+ "eval_loss": 1.052681565284729,
+ "eval_runtime": 942.0722,
+ "eval_samples_per_second": 1.061,
+ "eval_steps_per_second": 1.061,
+ "step": 1122
+ },
+ {
+ "epoch": 2.03,
+ "mmlu_eval_accuracy": 0.7373981967098951,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366,
+ "mmlu_eval_accuracy_formal_logic": 0.7142857142857143,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.62,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.782608695652174,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2340081441760609,
+ "step": 1122
+ },
+ {
+ "epoch": 2.05,
+ "learning_rate": 0.0001,
+ "loss": 0.7194,
+ "step": 1130
+ },
+ {
+ "epoch": 2.07,
+ "learning_rate": 0.0001,
+ "loss": 0.8236,
+ "step": 1140
+ },
+ {
+ "epoch": 2.09,
+ "learning_rate": 0.0001,
+ "loss": 0.6652,
+ "step": 1150
+ },
+ {
+ "epoch": 2.1,
+ "learning_rate": 0.0001,
+ "loss": 0.7177,
+ "step": 1160
+ },
+ {
+ "epoch": 2.12,
+ "learning_rate": 0.0001,
+ "loss": 0.7788,
+ "step": 1170
+ },
+ {
+ "epoch": 2.14,
+ "learning_rate": 0.0001,
+ "loss": 0.8117,
+ "step": 1180
+ },
+ {
+ "epoch": 2.16,
+ "learning_rate": 0.0001,
+ "loss": 0.8145,
+ "step": 1190
+ },
+ {
+ "epoch": 2.18,
+ "learning_rate": 0.0001,
+ "loss": 0.6984,
+ "step": 1200
+ },
+ {
+ "epoch": 2.19,
+ "learning_rate": 0.0001,
+ "loss": 0.7011,
+ "step": 1210
+ },
+ {
+ "epoch": 2.21,
+ "learning_rate": 0.0001,
+ "loss": 0.769,
+ "step": 1220
+ },
+ {
+ "epoch": 2.23,
+ "learning_rate": 0.0001,
+ "loss": 0.7705,
+ "step": 1230
+ },
+ {
+ "epoch": 2.25,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1240
+ },
+ {
+ "epoch": 2.27,
+ "learning_rate": 0.0001,
+ "loss": 0.6622,
+ "step": 1250
+ },
+ {
+ "epoch": 2.29,
+ "learning_rate": 0.0001,
+ "loss": 0.6641,
+ "step": 1260
+ },
+ {
+ "epoch": 2.3,
+ "learning_rate": 0.0001,
+ "loss": 0.7239,
+ "step": 1270
+ },
+ {
+ "epoch": 2.32,
+ "learning_rate": 0.0001,
+ "loss": 0.7618,
+ "step": 1280
+ },
+ {
+ "epoch": 2.34,
+ "learning_rate": 0.0001,
+ "loss": 0.7845,
+ "step": 1290
+ },
+ {
+ "epoch": 2.36,
+ "learning_rate": 0.0001,
+ "loss": 0.719,
+ "step": 1300
+ },
+ {
+ "epoch": 2.37,
+ "eval_loss": 1.1104822158813477,
+ "eval_runtime": 948.1299,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 1309
+ },
+ {
+ "epoch": 2.37,
+ "mmlu_eval_accuracy": 0.7369285730399766,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.7272727272727273,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146,
+ "mmlu_eval_accuracy_formal_logic": 0.7857142857142857,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8529411764705882,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6058823529411764,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.0866562834095908,
+ "step": 1309
+ },
+ {
+ "epoch": 2.38,
+ "learning_rate": 0.0001,
+ "loss": 0.7093,
+ "step": 1310
+ },
+ {
+ "epoch": 2.39,
+ "learning_rate": 0.0001,
+ "loss": 0.7684,
+ "step": 1320
+ },
+ {
+ "epoch": 2.41,
+ "learning_rate": 0.0001,
+ "loss": 0.7501,
+ "step": 1330
+ },
+ {
+ "epoch": 2.43,
+ "learning_rate": 0.0001,
+ "loss": 0.8043,
+ "step": 1340
+ },
+ {
+ "epoch": 2.45,
+ "learning_rate": 0.0001,
+ "loss": 0.6927,
+ "step": 1350
+ },
+ {
+ "epoch": 2.47,
+ "learning_rate": 0.0001,
+ "loss": 0.7278,
+ "step": 1360
+ },
+ {
+ "epoch": 2.48,
+ "learning_rate": 0.0001,
+ "loss": 0.8095,
+ "step": 1370
+ },
+ {
+ "epoch": 2.5,
+ "learning_rate": 0.0001,
+ "loss": 0.7463,
+ "step": 1380
+ },
+ {
+ "epoch": 2.52,
+ "learning_rate": 0.0001,
+ "loss": 0.7707,
+ "step": 1390
+ },
+ {
+ "epoch": 2.54,
+ "learning_rate": 0.0001,
+ "loss": 0.7152,
+ "step": 1400
+ },
+ {
+ "epoch": 2.56,
+ "learning_rate": 0.0001,
+ "loss": 0.687,
+ "step": 1410
+ },
+ {
+ "epoch": 2.58,
+ "learning_rate": 0.0001,
+ "loss": 0.7529,
+ "step": 1420
+ },
+ {
+ "epoch": 2.59,
+ "learning_rate": 0.0001,
+ "loss": 0.7565,
+ "step": 1430
+ },
+ {
+ "epoch": 2.61,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1440
+ },
+ {
+ "epoch": 2.63,
+ "learning_rate": 0.0001,
+ "loss": 0.7623,
+ "step": 1450
+ },
+ {
+ "epoch": 2.65,
+ "learning_rate": 0.0001,
+ "loss": 0.6947,
+ "step": 1460
+ },
+ {
+ "epoch": 2.67,
+ "learning_rate": 0.0001,
+ "loss": 0.7756,
+ "step": 1470
+ },
+ {
+ "epoch": 2.68,
+ "learning_rate": 0.0001,
+ "loss": 0.8453,
+ "step": 1480
+ },
+ {
+ "epoch": 2.7,
+ "learning_rate": 0.0001,
+ "loss": 0.8306,
+ "step": 1490
+ },
+ {
+ "epoch": 2.71,
+ "eval_loss": 1.100826621055603,
+ "eval_runtime": 940.4488,
+ "eval_samples_per_second": 1.063,
+ "eval_steps_per_second": 1.063,
+ "step": 1496
+ },
+ {
+ "epoch": 2.71,
+ "mmlu_eval_accuracy": 0.7363077307176445,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.8125,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6585365853658537,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.6,
+ "mmlu_eval_accuracy_high_school_biology": 0.78125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.5909090909090909,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.4827586206896552,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9166666666666666,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.75,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.64,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.7096774193548387,
+ "mmlu_eval_accuracy_professional_law": 0.6176470588235294,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7971014492753623,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8518518518518519,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2313211129857853,
+ "step": 1496
+ },
+ {
+ "epoch": 2.72,
+ "learning_rate": 0.0001,
+ "loss": 0.6937,
+ "step": 1500
+ },
+ {
+ "epoch": 2.74,
+ "learning_rate": 0.0001,
+ "loss": 0.6997,
+ "step": 1510
+ },
+ {
+ "epoch": 2.76,
+ "learning_rate": 0.0001,
+ "loss": 0.7588,
+ "step": 1520
+ },
+ {
+ "epoch": 2.77,
+ "learning_rate": 0.0001,
+ "loss": 0.7731,
+ "step": 1530
+ },
+ {
+ "epoch": 2.79,
+ "learning_rate": 0.0001,
+ "loss": 0.7914,
+ "step": 1540
+ },
+ {
+ "epoch": 2.81,
+ "learning_rate": 0.0001,
+ "loss": 0.7175,
+ "step": 1550
+ },
+ {
+ "epoch": 2.83,
+ "learning_rate": 0.0001,
+ "loss": 0.7046,
+ "step": 1560
+ },
+ {
+ "epoch": 2.85,
+ "learning_rate": 0.0001,
+ "loss": 0.7597,
+ "step": 1570
+ },
+ {
+ "epoch": 2.87,
+ "learning_rate": 0.0001,
+ "loss": 0.7932,
+ "step": 1580
+ },
+ {
+ "epoch": 2.88,
+ "learning_rate": 0.0001,
+ "loss": 0.8059,
+ "step": 1590
+ },
+ {
+ "epoch": 2.9,
+ "learning_rate": 0.0001,
+ "loss": 0.7258,
+ "step": 1600
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 1.854216633063555e+18,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-1600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1800/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1800/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1800/adapter_model.bin b/checkpoint-1800/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ddb953e82f587734106fd3b3a58fe5249c14e398
--- /dev/null
+++ b/checkpoint-1800/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f5d456fc1c68dcc8b3d0d05d8b272ed31331a38e04f42c6a819c109109e12a3
+size 1657155522
diff --git a/checkpoint-1800/adapter_model/README.md b/checkpoint-1800/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1800/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1800/adapter_model/adapter_config.json b/checkpoint-1800/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1800/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1800/adapter_model/adapter_model.bin b/checkpoint-1800/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ddb953e82f587734106fd3b3a58fe5249c14e398
--- /dev/null
+++ b/checkpoint-1800/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f5d456fc1c68dcc8b3d0d05d8b272ed31331a38e04f42c6a819c109109e12a3
+size 1657155522
diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e8232ee68f1ce9913f58108be2744eb1cfaf40c8
--- /dev/null
+++ b/checkpoint-1800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6f1b2b088ddc71268a259283ba00336729affa7b86a0f96ff793428c7a03f22
+size 6627702922
diff --git a/checkpoint-1800/rng_state.pth b/checkpoint-1800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..bcfe264a0f0e44db8038326b878cc688729f6ba6
--- /dev/null
+++ b/checkpoint-1800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:066b0713c3dfd9c0b1175954102d461fd0b3d344ec2b477efae72ee68a4f2535
+size 14180
diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..91975212481bf6d9e40df716cccd4882750baf1c
--- /dev/null
+++ b/checkpoint-1800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:110b2f0ce7be25c09ed998ed1965f38a8d3a448ca6aa07e3d4392461b80d705f
+size 1064
diff --git a/checkpoint-1800/special_tokens_map.json b/checkpoint-1800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-1800/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-1800/tokenizer.model b/checkpoint-1800/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-1800/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-1800/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..33184b197f9161bd888dc38f2ba3cd2f19a3c050
--- /dev/null
+++ b/checkpoint-1800/trainer_state.json
@@ -0,0 +1,1735 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.264565858082068,
+ "global_step": 1800,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ },
+ {
+ "epoch": 1.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8382,
+ "step": 810
+ },
+ {
+ "epoch": 1.49,
+ "learning_rate": 0.0001,
+ "loss": 0.8489,
+ "step": 820
+ },
+ {
+ "epoch": 1.51,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 830
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0001,
+ "loss": 1.0515,
+ "step": 840
+ },
+ {
+ "epoch": 1.54,
+ "learning_rate": 0.0001,
+ "loss": 1.0965,
+ "step": 850
+ },
+ {
+ "epoch": 1.56,
+ "learning_rate": 0.0001,
+ "loss": 0.8928,
+ "step": 860
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 0.0001,
+ "loss": 0.8608,
+ "step": 870
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0001,
+ "loss": 0.8831,
+ "step": 880
+ },
+ {
+ "epoch": 1.61,
+ "learning_rate": 0.0001,
+ "loss": 1.0253,
+ "step": 890
+ },
+ {
+ "epoch": 1.63,
+ "learning_rate": 0.0001,
+ "loss": 0.9905,
+ "step": 900
+ },
+ {
+ "epoch": 1.65,
+ "learning_rate": 0.0001,
+ "loss": 0.8487,
+ "step": 910
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 0.0001,
+ "loss": 0.8568,
+ "step": 920
+ },
+ {
+ "epoch": 1.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9047,
+ "step": 930
+ },
+ {
+ "epoch": 1.7,
+ "eval_loss": 1.0250624418258667,
+ "eval_runtime": 946.4035,
+ "eval_samples_per_second": 1.057,
+ "eval_steps_per_second": 1.057,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "mmlu_eval_accuracy": 0.7288948695878031,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6235294117647059,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.243813282909306,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "learning_rate": 0.0001,
+ "loss": 1.0174,
+ "step": 940
+ },
+ {
+ "epoch": 1.72,
+ "learning_rate": 0.0001,
+ "loss": 1.0302,
+ "step": 950
+ },
+ {
+ "epoch": 1.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8799,
+ "step": 960
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8447,
+ "step": 970
+ },
+ {
+ "epoch": 1.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9053,
+ "step": 980
+ },
+ {
+ "epoch": 1.8,
+ "learning_rate": 0.0001,
+ "loss": 1.0331,
+ "step": 990
+ },
+ {
+ "epoch": 1.81,
+ "learning_rate": 0.0001,
+ "loss": 1.0412,
+ "step": 1000
+ },
+ {
+ "epoch": 1.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8753,
+ "step": 1010
+ },
+ {
+ "epoch": 1.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8744,
+ "step": 1020
+ },
+ {
+ "epoch": 1.87,
+ "learning_rate": 0.0001,
+ "loss": 0.8899,
+ "step": 1030
+ },
+ {
+ "epoch": 1.89,
+ "learning_rate": 0.0001,
+ "loss": 1.0053,
+ "step": 1040
+ },
+ {
+ "epoch": 1.9,
+ "learning_rate": 0.0001,
+ "loss": 1.0127,
+ "step": 1050
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8023,
+ "step": 1060
+ },
+ {
+ "epoch": 1.94,
+ "learning_rate": 0.0001,
+ "loss": 0.8349,
+ "step": 1070
+ },
+ {
+ "epoch": 1.96,
+ "learning_rate": 0.0001,
+ "loss": 0.9742,
+ "step": 1080
+ },
+ {
+ "epoch": 1.98,
+ "learning_rate": 0.0001,
+ "loss": 1.0971,
+ "step": 1090
+ },
+ {
+ "epoch": 2.0,
+ "learning_rate": 0.0001,
+ "loss": 1.0728,
+ "step": 1100
+ },
+ {
+ "epoch": 2.01,
+ "learning_rate": 0.0001,
+ "loss": 0.7724,
+ "step": 1110
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 0.0001,
+ "loss": 0.7675,
+ "step": 1120
+ },
+ {
+ "epoch": 2.03,
+ "eval_loss": 1.052681565284729,
+ "eval_runtime": 942.0722,
+ "eval_samples_per_second": 1.061,
+ "eval_steps_per_second": 1.061,
+ "step": 1122
+ },
+ {
+ "epoch": 2.03,
+ "mmlu_eval_accuracy": 0.7373981967098951,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366,
+ "mmlu_eval_accuracy_formal_logic": 0.7142857142857143,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.62,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.782608695652174,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2340081441760609,
+ "step": 1122
+ },
+ {
+ "epoch": 2.05,
+ "learning_rate": 0.0001,
+ "loss": 0.7194,
+ "step": 1130
+ },
+ {
+ "epoch": 2.07,
+ "learning_rate": 0.0001,
+ "loss": 0.8236,
+ "step": 1140
+ },
+ {
+ "epoch": 2.09,
+ "learning_rate": 0.0001,
+ "loss": 0.6652,
+ "step": 1150
+ },
+ {
+ "epoch": 2.1,
+ "learning_rate": 0.0001,
+ "loss": 0.7177,
+ "step": 1160
+ },
+ {
+ "epoch": 2.12,
+ "learning_rate": 0.0001,
+ "loss": 0.7788,
+ "step": 1170
+ },
+ {
+ "epoch": 2.14,
+ "learning_rate": 0.0001,
+ "loss": 0.8117,
+ "step": 1180
+ },
+ {
+ "epoch": 2.16,
+ "learning_rate": 0.0001,
+ "loss": 0.8145,
+ "step": 1190
+ },
+ {
+ "epoch": 2.18,
+ "learning_rate": 0.0001,
+ "loss": 0.6984,
+ "step": 1200
+ },
+ {
+ "epoch": 2.19,
+ "learning_rate": 0.0001,
+ "loss": 0.7011,
+ "step": 1210
+ },
+ {
+ "epoch": 2.21,
+ "learning_rate": 0.0001,
+ "loss": 0.769,
+ "step": 1220
+ },
+ {
+ "epoch": 2.23,
+ "learning_rate": 0.0001,
+ "loss": 0.7705,
+ "step": 1230
+ },
+ {
+ "epoch": 2.25,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1240
+ },
+ {
+ "epoch": 2.27,
+ "learning_rate": 0.0001,
+ "loss": 0.6622,
+ "step": 1250
+ },
+ {
+ "epoch": 2.29,
+ "learning_rate": 0.0001,
+ "loss": 0.6641,
+ "step": 1260
+ },
+ {
+ "epoch": 2.3,
+ "learning_rate": 0.0001,
+ "loss": 0.7239,
+ "step": 1270
+ },
+ {
+ "epoch": 2.32,
+ "learning_rate": 0.0001,
+ "loss": 0.7618,
+ "step": 1280
+ },
+ {
+ "epoch": 2.34,
+ "learning_rate": 0.0001,
+ "loss": 0.7845,
+ "step": 1290
+ },
+ {
+ "epoch": 2.36,
+ "learning_rate": 0.0001,
+ "loss": 0.719,
+ "step": 1300
+ },
+ {
+ "epoch": 2.37,
+ "eval_loss": 1.1104822158813477,
+ "eval_runtime": 948.1299,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 1309
+ },
+ {
+ "epoch": 2.37,
+ "mmlu_eval_accuracy": 0.7369285730399766,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.7272727272727273,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146,
+ "mmlu_eval_accuracy_formal_logic": 0.7857142857142857,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8529411764705882,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6058823529411764,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.0866562834095908,
+ "step": 1309
+ },
+ {
+ "epoch": 2.38,
+ "learning_rate": 0.0001,
+ "loss": 0.7093,
+ "step": 1310
+ },
+ {
+ "epoch": 2.39,
+ "learning_rate": 0.0001,
+ "loss": 0.7684,
+ "step": 1320
+ },
+ {
+ "epoch": 2.41,
+ "learning_rate": 0.0001,
+ "loss": 0.7501,
+ "step": 1330
+ },
+ {
+ "epoch": 2.43,
+ "learning_rate": 0.0001,
+ "loss": 0.8043,
+ "step": 1340
+ },
+ {
+ "epoch": 2.45,
+ "learning_rate": 0.0001,
+ "loss": 0.6927,
+ "step": 1350
+ },
+ {
+ "epoch": 2.47,
+ "learning_rate": 0.0001,
+ "loss": 0.7278,
+ "step": 1360
+ },
+ {
+ "epoch": 2.48,
+ "learning_rate": 0.0001,
+ "loss": 0.8095,
+ "step": 1370
+ },
+ {
+ "epoch": 2.5,
+ "learning_rate": 0.0001,
+ "loss": 0.7463,
+ "step": 1380
+ },
+ {
+ "epoch": 2.52,
+ "learning_rate": 0.0001,
+ "loss": 0.7707,
+ "step": 1390
+ },
+ {
+ "epoch": 2.54,
+ "learning_rate": 0.0001,
+ "loss": 0.7152,
+ "step": 1400
+ },
+ {
+ "epoch": 2.56,
+ "learning_rate": 0.0001,
+ "loss": 0.687,
+ "step": 1410
+ },
+ {
+ "epoch": 2.58,
+ "learning_rate": 0.0001,
+ "loss": 0.7529,
+ "step": 1420
+ },
+ {
+ "epoch": 2.59,
+ "learning_rate": 0.0001,
+ "loss": 0.7565,
+ "step": 1430
+ },
+ {
+ "epoch": 2.61,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1440
+ },
+ {
+ "epoch": 2.63,
+ "learning_rate": 0.0001,
+ "loss": 0.7623,
+ "step": 1450
+ },
+ {
+ "epoch": 2.65,
+ "learning_rate": 0.0001,
+ "loss": 0.6947,
+ "step": 1460
+ },
+ {
+ "epoch": 2.67,
+ "learning_rate": 0.0001,
+ "loss": 0.7756,
+ "step": 1470
+ },
+ {
+ "epoch": 2.68,
+ "learning_rate": 0.0001,
+ "loss": 0.8453,
+ "step": 1480
+ },
+ {
+ "epoch": 2.7,
+ "learning_rate": 0.0001,
+ "loss": 0.8306,
+ "step": 1490
+ },
+ {
+ "epoch": 2.71,
+ "eval_loss": 1.100826621055603,
+ "eval_runtime": 940.4488,
+ "eval_samples_per_second": 1.063,
+ "eval_steps_per_second": 1.063,
+ "step": 1496
+ },
+ {
+ "epoch": 2.71,
+ "mmlu_eval_accuracy": 0.7363077307176445,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.8125,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6585365853658537,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.6,
+ "mmlu_eval_accuracy_high_school_biology": 0.78125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.5909090909090909,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.4827586206896552,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9166666666666666,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.75,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.64,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.7096774193548387,
+ "mmlu_eval_accuracy_professional_law": 0.6176470588235294,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7971014492753623,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8518518518518519,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2313211129857853,
+ "step": 1496
+ },
+ {
+ "epoch": 2.72,
+ "learning_rate": 0.0001,
+ "loss": 0.6937,
+ "step": 1500
+ },
+ {
+ "epoch": 2.74,
+ "learning_rate": 0.0001,
+ "loss": 0.6997,
+ "step": 1510
+ },
+ {
+ "epoch": 2.76,
+ "learning_rate": 0.0001,
+ "loss": 0.7588,
+ "step": 1520
+ },
+ {
+ "epoch": 2.77,
+ "learning_rate": 0.0001,
+ "loss": 0.7731,
+ "step": 1530
+ },
+ {
+ "epoch": 2.79,
+ "learning_rate": 0.0001,
+ "loss": 0.7914,
+ "step": 1540
+ },
+ {
+ "epoch": 2.81,
+ "learning_rate": 0.0001,
+ "loss": 0.7175,
+ "step": 1550
+ },
+ {
+ "epoch": 2.83,
+ "learning_rate": 0.0001,
+ "loss": 0.7046,
+ "step": 1560
+ },
+ {
+ "epoch": 2.85,
+ "learning_rate": 0.0001,
+ "loss": 0.7597,
+ "step": 1570
+ },
+ {
+ "epoch": 2.87,
+ "learning_rate": 0.0001,
+ "loss": 0.7932,
+ "step": 1580
+ },
+ {
+ "epoch": 2.88,
+ "learning_rate": 0.0001,
+ "loss": 0.8059,
+ "step": 1590
+ },
+ {
+ "epoch": 2.9,
+ "learning_rate": 0.0001,
+ "loss": 0.7258,
+ "step": 1600
+ },
+ {
+ "epoch": 2.92,
+ "learning_rate": 0.0001,
+ "loss": 0.7486,
+ "step": 1610
+ },
+ {
+ "epoch": 2.94,
+ "learning_rate": 0.0001,
+ "loss": 0.7233,
+ "step": 1620
+ },
+ {
+ "epoch": 2.96,
+ "learning_rate": 0.0001,
+ "loss": 0.7945,
+ "step": 1630
+ },
+ {
+ "epoch": 2.97,
+ "learning_rate": 0.0001,
+ "loss": 0.8324,
+ "step": 1640
+ },
+ {
+ "epoch": 2.99,
+ "learning_rate": 0.0001,
+ "loss": 0.7294,
+ "step": 1650
+ },
+ {
+ "epoch": 3.01,
+ "learning_rate": 0.0001,
+ "loss": 0.6117,
+ "step": 1660
+ },
+ {
+ "epoch": 3.03,
+ "learning_rate": 0.0001,
+ "loss": 0.6464,
+ "step": 1670
+ },
+ {
+ "epoch": 3.05,
+ "learning_rate": 0.0001,
+ "loss": 0.6156,
+ "step": 1680
+ },
+ {
+ "epoch": 3.05,
+ "eval_loss": 1.1478718519210815,
+ "eval_runtime": 932.4225,
+ "eval_samples_per_second": 1.072,
+ "eval_steps_per_second": 1.072,
+ "step": 1683
+ },
+ {
+ "epoch": 3.05,
+ "mmlu_eval_accuracy": 0.745366643285036,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.7857142857142857,
+ "mmlu_eval_accuracy_global_facts": 0.8,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.5454545454545454,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.75,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 1.0,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.7894736842105263,
+ "mmlu_eval_accuracy_moral_scenarios": 0.61,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7647058823529411,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6176470588235294,
+ "mmlu_eval_accuracy_professional_medicine": 0.9032258064516129,
+ "mmlu_eval_accuracy_professional_psychology": 0.7391304347826086,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.7777777777777778,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.4050485734687297,
+ "step": 1683
+ },
+ {
+ "epoch": 3.07,
+ "learning_rate": 0.0001,
+ "loss": 0.5237,
+ "step": 1690
+ },
+ {
+ "epoch": 3.08,
+ "learning_rate": 0.0001,
+ "loss": 0.3516,
+ "step": 1700
+ },
+ {
+ "epoch": 3.1,
+ "learning_rate": 0.0001,
+ "loss": 0.4976,
+ "step": 1710
+ },
+ {
+ "epoch": 3.12,
+ "learning_rate": 0.0001,
+ "loss": 0.6535,
+ "step": 1720
+ },
+ {
+ "epoch": 3.14,
+ "learning_rate": 0.0001,
+ "loss": 0.5926,
+ "step": 1730
+ },
+ {
+ "epoch": 3.16,
+ "learning_rate": 0.0001,
+ "loss": 0.5476,
+ "step": 1740
+ },
+ {
+ "epoch": 3.17,
+ "learning_rate": 0.0001,
+ "loss": 0.368,
+ "step": 1750
+ },
+ {
+ "epoch": 3.19,
+ "learning_rate": 0.0001,
+ "loss": 0.5043,
+ "step": 1760
+ },
+ {
+ "epoch": 3.21,
+ "learning_rate": 0.0001,
+ "loss": 0.5907,
+ "step": 1770
+ },
+ {
+ "epoch": 3.23,
+ "learning_rate": 0.0001,
+ "loss": 0.5609,
+ "step": 1780
+ },
+ {
+ "epoch": 3.25,
+ "learning_rate": 0.0001,
+ "loss": 0.5272,
+ "step": 1790
+ },
+ {
+ "epoch": 3.26,
+ "learning_rate": 0.0001,
+ "loss": 0.3672,
+ "step": 1800
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 2.0859408156226683e+18,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-1800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-1875/adapter_model/README.md b/checkpoint-1875/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-1875/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-1875/adapter_model/adapter_config.json b/checkpoint-1875/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-1875/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-1875/adapter_model/adapter_model.bin b/checkpoint-1875/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b3cc80572a3a9adf10c365d14473c6e9f58ca36d
--- /dev/null
+++ b/checkpoint-1875/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a795db67c92d569560b93ef875abfe3a0ccefc1c40b817da09011db27e24b21
+size 1657155522
diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-200/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-200/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model.bin b/checkpoint-200/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..01ad48986bf743f111ee718296b90b7a8bba1eea
--- /dev/null
+++ b/checkpoint-200/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95a8af86f16910a2e20955a36b9674a05f94ca9d2cc31eb37b44236be346de2a
+size 1657155522
diff --git a/checkpoint-200/adapter_model/README.md b/checkpoint-200/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-200/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-200/adapter_model/adapter_config.json b/checkpoint-200/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-200/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model/adapter_model.bin b/checkpoint-200/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..01ad48986bf743f111ee718296b90b7a8bba1eea
--- /dev/null
+++ b/checkpoint-200/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95a8af86f16910a2e20955a36b9674a05f94ca9d2cc31eb37b44236be346de2a
+size 1657155522
diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b758a3fb6e96913ad0c5d937fc764f8749101243
--- /dev/null
+++ b/checkpoint-200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec843ab7d5d89384bbcbd10dae3adddc2257138f4bb8515316463b30c8590dd4
+size 6627701834
diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..671fa9b31c3de2fc3c9b740145b7bd5c04aa428e
--- /dev/null
+++ b/checkpoint-200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49dfee8275a21d28d179a51fdac0680234e691d6d101a40d18419fe308e1eab6
+size 14180
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cd85ebfd5283eff16808c1da8eb6f40ac2533b35
--- /dev/null
+++ b/checkpoint-200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da963d3b7106f3b4c395d03ef6897e64220165c23480563310345c76ab7b120d
+size 1064
diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-200/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-200/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-200/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb97380660a81934bf8beacd83d146ea31d1a8d1
--- /dev/null
+++ b/checkpoint-200/trainer_state.json
@@ -0,0 +1,207 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.36272953978689637,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 2.3241435438071808e+17,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-400/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-400/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-400/adapter_model.bin b/checkpoint-400/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5b5cbd401da9be76942a5f77e68c6aee19b7377d
--- /dev/null
+++ b/checkpoint-400/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee21fd2086ed33b012216471edb3a6578cae41d61a3e2d70e9c32aa8da111518
+size 1657155522
diff --git a/checkpoint-400/adapter_model/README.md b/checkpoint-400/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-400/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-400/adapter_model/adapter_config.json b/checkpoint-400/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-400/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-400/adapter_model/adapter_model.bin b/checkpoint-400/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5b5cbd401da9be76942a5f77e68c6aee19b7377d
--- /dev/null
+++ b/checkpoint-400/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee21fd2086ed33b012216471edb3a6578cae41d61a3e2d70e9c32aa8da111518
+size 1657155522
diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..da652808e74d2335e9e0572a7564d59b23f9f729
--- /dev/null
+++ b/checkpoint-400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfe0aca92d683f4a2153f482e9b37c816c0eb75dbf95625d79e620a89e37225b
+size 6627702922
diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..295605bfcf37aa6885f63d5bf46f8d7a4cdc4f3a
--- /dev/null
+++ b/checkpoint-400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a266b55b76c6083a19d9348b15793530dbea83d95778a2278dedabf1a2bf7cc
+size 14180
diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..832e690d212928d7077c9c0b5e0914e58918690f
--- /dev/null
+++ b/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fe875adeb5e81f688f2c1021fea61085edbfb214cacba1a2ad6a420d6d64c64
+size 1064
diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-400/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-400/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-400/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..db088e9f6799e766ca691155cfac614cb6943703
--- /dev/null
+++ b/checkpoint-400/trainer_state.json
@@ -0,0 +1,398 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.7254590795737927,
+ "global_step": 400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 4.626609292910592e+17,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-600/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-600/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-600/adapter_model.bin b/checkpoint-600/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d8e46d626c407af8850f4b6628f927409226368
--- /dev/null
+++ b/checkpoint-600/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cee2928ef56942f3235728ff2c231d6d5861d904136af34d2027741964fe209d
+size 1657155522
diff --git a/checkpoint-600/adapter_model/README.md b/checkpoint-600/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-600/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-600/adapter_model/adapter_config.json b/checkpoint-600/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-600/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-600/adapter_model/adapter_model.bin b/checkpoint-600/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d8e46d626c407af8850f4b6628f927409226368
--- /dev/null
+++ b/checkpoint-600/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cee2928ef56942f3235728ff2c231d6d5861d904136af34d2027741964fe209d
+size 1657155522
diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..86772dc685541c6933d9c7ce16e075c2a5e76188
--- /dev/null
+++ b/checkpoint-600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f963a7064821784cfc371e33d455fde7e372f6be8ceacb32268b28b1464c09b
+size 6627702922
diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3252702672ec1e139ba52e6ebef69a1ccfe6a306
--- /dev/null
+++ b/checkpoint-600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d994d4a5b7b22ba4c5e1c4afc33f7135ce8ec062ddee21f5c0d47955fa2a6382
+size 14180
diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..60116c1c19e8f7c4feaba1cfdd17285e828d28fd
--- /dev/null
+++ b/checkpoint-600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df3cd0cb1c67a326b465332bde0ecdeac62626bea0e2ea96c60fa6d8786b9f65
+size 1064
diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-600/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-600/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-600/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3fbd0179aa011a7591d70c68e28ecb6a15f07367
--- /dev/null
+++ b/checkpoint-600/trainer_state.json
@@ -0,0 +1,589 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0881886193606891,
+ "global_step": 600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 6.959436668848374e+17,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-800/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-800/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-800/adapter_model.bin b/checkpoint-800/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1522fcec552ed71923375ad0d1a2f8299ff17ed8
--- /dev/null
+++ b/checkpoint-800/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98e2d860c21b4cf37dd4e6241f85216aa00c5210b7bcd612e3d4c46a109a66f1
+size 1657155522
diff --git a/checkpoint-800/adapter_model/README.md b/checkpoint-800/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..82793f73e61dbb024e11fc6697bba1622d4d0db6
--- /dev/null
+++ b/checkpoint-800/adapter_model/README.md
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
diff --git a/checkpoint-800/adapter_model/adapter_config.json b/checkpoint-800/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6f42652a8f2f3cf02936153fca1af5d04e7983d
--- /dev/null
+++ b/checkpoint-800/adapter_model/adapter_config.json
@@ -0,0 +1,26 @@
+{
+ "auto_mapping": null,
+ "base_model_name_or_path": "152334H/miqu-1-70b-sf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+}
\ No newline at end of file
diff --git a/checkpoint-800/adapter_model/adapter_model.bin b/checkpoint-800/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1522fcec552ed71923375ad0d1a2f8299ff17ed8
--- /dev/null
+++ b/checkpoint-800/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98e2d860c21b4cf37dd4e6241f85216aa00c5210b7bcd612e3d4c46a109a66f1
+size 1657155522
diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c4cdbf8757d30d2ea6758894dda592a06ce51e06
--- /dev/null
+++ b/checkpoint-800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e23f49320e9a9c2e5554c70f1d6e10e281e4f668be89773db1e245dd51e05fd7
+size 6627702922
diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..579ed11183d40da36869ebc636d6b0ab92aecf6a
--- /dev/null
+++ b/checkpoint-800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9da05579932a2ba95d9f4364f034fd07049787d493fafba1013252edc115be8
+size 14180
diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a0d9ca64f73600b59484091bfccba500da841d15
--- /dev/null
+++ b/checkpoint-800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7847f5134d705577faee25af150145f5a0a552ec46c6008bc52ad812b9e8f038
+size 1064
diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..55431d8ca0ffd8e996c646341b6136e749142547
--- /dev/null
+++ b/checkpoint-800/special_tokens_map.json
@@ -0,0 +1,12 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": ""
+}
diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-800/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9160e6f055447e1b1927482cf0ea394366055
--- /dev/null
+++ b/checkpoint-800/tokenizer_config.json
@@ -0,0 +1,71 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1936c7bda19baa859d443136a20bea43bb3b3be9
--- /dev/null
+++ b/checkpoint-800/trainer_state.json
@@ -0,0 +1,780 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.4509181591475855,
+ "global_step": 800,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 9.275226951001375e+17,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cab5478c763bfedd6ea1ae136d922355ec23f73d
--- /dev/null
+++ b/checkpoint-800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3739b542a2914edbd2d7eb3b727d6fa2a7752c75b0d7a856f5e87dd807fa1ef9
+size 6200
diff --git a/completed b/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/eval_results.json b/eval_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b6299ba979e0777e182dadf7015ce4065c50fb6
--- /dev/null
+++ b/eval_results.json
@@ -0,0 +1,7 @@
+{
+ "epoch": 3.4,
+ "eval_loss": 1.151158332824707,
+ "eval_runtime": 572.7509,
+ "eval_samples_per_second": 1.746,
+ "eval_steps_per_second": 1.746
+}
\ No newline at end of file
diff --git a/metrics.json b/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c0d9005ae0ee5a590979163b89a80a58a8487b6
--- /dev/null
+++ b/metrics.json
@@ -0,0 +1 @@
+{"run_name": "./output/miqu-70b", "train_runtime": 112632.6276, "train_samples_per_second": 0.266, "train_steps_per_second": 0.017, "train_loss": 0.8699908837636312, "epoch": 3.4, "eval_loss": 1.151158332824707, "eval_runtime": 572.7509, "eval_samples_per_second": 1.746, "eval_steps_per_second": 1.746}
\ No newline at end of file
diff --git a/train_results.json b/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..85c0e2c2b53c0e560da22153b6264d6769293d16
--- /dev/null
+++ b/train_results.json
@@ -0,0 +1,7 @@
+{
+ "epoch": 3.4,
+ "train_loss": 0.8699908837636312,
+ "train_runtime": 112632.6276,
+ "train_samples_per_second": 0.266,
+ "train_steps_per_second": 0.017
+}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c5436d6886b3aee93c6cd7f4cd19023561fc8abe
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,1857 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.4005894355021535,
+ "global_step": 1875,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0001,
+ "loss": 1.1655,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 0.0001,
+ "loss": 1.001,
+ "step": 20
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 0.0001,
+ "loss": 1.0287,
+ "step": 30
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 0.0001,
+ "loss": 1.1578,
+ "step": 40
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 0.0001,
+ "loss": 1.2146,
+ "step": 50
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 0.0001,
+ "loss": 0.997,
+ "step": 60
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 0.0001,
+ "loss": 0.9024,
+ "step": 70
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 80
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0001,
+ "loss": 1.1264,
+ "step": 90
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 0.0001,
+ "loss": 1.2038,
+ "step": 100
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8935,
+ "step": 110
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 0.0001,
+ "loss": 0.9178,
+ "step": 120
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0001,
+ "loss": 0.9746,
+ "step": 130
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 0.0001,
+ "loss": 1.1566,
+ "step": 140
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 0.0001,
+ "loss": 1.2877,
+ "step": 150
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 0.0001,
+ "loss": 0.9146,
+ "step": 160
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8895,
+ "step": 170
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 0.0001,
+ "loss": 1.0121,
+ "step": 180
+ },
+ {
+ "epoch": 0.34,
+ "eval_loss": 1.0215636491775513,
+ "eval_runtime": 950.138,
+ "eval_samples_per_second": 1.052,
+ "eval_steps_per_second": 1.052,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "mmlu_eval_accuracy": 0.731892294851104,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.5714285714285714,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7441860465116279,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.59,
+ "mmlu_eval_accuracy_nutrition": 0.7878787878787878,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.326305795171384,
+ "step": 187
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 0.0001,
+ "loss": 1.1133,
+ "step": 190
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 0.0001,
+ "loss": 1.2485,
+ "step": 200
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9653,
+ "step": 210
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0001,
+ "loss": 0.9455,
+ "step": 220
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 0.0001,
+ "loss": 1.0373,
+ "step": 230
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0001,
+ "loss": 1.1425,
+ "step": 240
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0001,
+ "loss": 1.3136,
+ "step": 250
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8695,
+ "step": 260
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 0.0001,
+ "loss": 0.872,
+ "step": 270
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 0.0001,
+ "loss": 1.0152,
+ "step": 280
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 0.0001,
+ "loss": 1.1309,
+ "step": 290
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 0.0001,
+ "loss": 1.267,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0001,
+ "loss": 0.9249,
+ "step": 310
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 0.0001,
+ "loss": 0.9148,
+ "step": 320
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 0.0001,
+ "loss": 0.9864,
+ "step": 330
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 0.0001,
+ "loss": 1.2312,
+ "step": 340
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 0.0001,
+ "loss": 1.2354,
+ "step": 350
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 0.0001,
+ "loss": 0.9126,
+ "step": 360
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 0.0001,
+ "loss": 0.9213,
+ "step": 370
+ },
+ {
+ "epoch": 0.68,
+ "eval_loss": 1.0163359642028809,
+ "eval_runtime": 948.1151,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 374
+ },
+ {
+ "epoch": 0.68,
+ "mmlu_eval_accuracy": 0.7395476061435284,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7857142857142857,
+ "mmlu_eval_accuracy_astronomy": 0.75,
+ "mmlu_eval_accuracy_business_ethics": 0.7272727272727273,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.5769230769230769,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.5454545454545454,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9545454545454546,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8947368421052632,
+ "mmlu_loss": 1.2796503596061355,
+ "step": 374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9737,
+ "step": 380
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 0.0001,
+ "loss": 1.157,
+ "step": 390
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 0.0001,
+ "loss": 1.2106,
+ "step": 400
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8687,
+ "step": 410
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8742,
+ "step": 420
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9901,
+ "step": 430
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0001,
+ "loss": 1.2238,
+ "step": 440
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 0.0001,
+ "loss": 1.2604,
+ "step": 450
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8756,
+ "step": 460
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8683,
+ "step": 470
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 0.0001,
+ "loss": 0.9824,
+ "step": 480
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 0.0001,
+ "loss": 1.1574,
+ "step": 490
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 0.0001,
+ "loss": 1.2687,
+ "step": 500
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8657,
+ "step": 510
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 0.0001,
+ "loss": 0.9207,
+ "step": 520
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0001,
+ "loss": 1.012,
+ "step": 530
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 0.0001,
+ "loss": 1.1517,
+ "step": 540
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0001,
+ "loss": 1.1654,
+ "step": 550
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 560
+ },
+ {
+ "epoch": 1.02,
+ "eval_loss": 1.0150845050811768,
+ "eval_runtime": 949.8392,
+ "eval_samples_per_second": 1.053,
+ "eval_steps_per_second": 1.053,
+ "step": 561
+ },
+ {
+ "epoch": 1.02,
+ "mmlu_eval_accuracy": 0.7346397374699287,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.7142857142857143,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.8181818181818182,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6829268292682927,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9666666666666667,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7558139534883721,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.56,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.186674291658526,
+ "step": 561
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 0.0001,
+ "loss": 0.8507,
+ "step": 570
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 0.0001,
+ "loss": 0.9164,
+ "step": 580
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 0.0001,
+ "loss": 1.0908,
+ "step": 590
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 0.0001,
+ "loss": 1.0431,
+ "step": 600
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 0.0001,
+ "loss": 0.8567,
+ "step": 610
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 620
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 0.0001,
+ "loss": 0.9499,
+ "step": 630
+ },
+ {
+ "epoch": 1.16,
+ "learning_rate": 0.0001,
+ "loss": 1.0437,
+ "step": 640
+ },
+ {
+ "epoch": 1.18,
+ "learning_rate": 0.0001,
+ "loss": 1.0487,
+ "step": 650
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0001,
+ "loss": 0.8405,
+ "step": 660
+ },
+ {
+ "epoch": 1.22,
+ "learning_rate": 0.0001,
+ "loss": 0.8818,
+ "step": 670
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 0.0001,
+ "loss": 0.9619,
+ "step": 680
+ },
+ {
+ "epoch": 1.25,
+ "learning_rate": 0.0001,
+ "loss": 1.0753,
+ "step": 690
+ },
+ {
+ "epoch": 1.27,
+ "learning_rate": 0.0001,
+ "loss": 1.0218,
+ "step": 700
+ },
+ {
+ "epoch": 1.29,
+ "learning_rate": 0.0001,
+ "loss": 0.8763,
+ "step": 710
+ },
+ {
+ "epoch": 1.31,
+ "learning_rate": 0.0001,
+ "loss": 0.8789,
+ "step": 720
+ },
+ {
+ "epoch": 1.32,
+ "learning_rate": 0.0001,
+ "loss": 0.8631,
+ "step": 730
+ },
+ {
+ "epoch": 1.34,
+ "learning_rate": 0.0001,
+ "loss": 0.9846,
+ "step": 740
+ },
+ {
+ "epoch": 1.36,
+ "eval_loss": 1.0305067300796509,
+ "eval_runtime": 948.7106,
+ "eval_samples_per_second": 1.054,
+ "eval_steps_per_second": 1.054,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "mmlu_eval_accuracy": 0.7324229372189777,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3448275862068966,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7441860465116279,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8285714285714286,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6411764705882353,
+ "mmlu_eval_accuracy_professional_medicine": 0.8064516129032258,
+ "mmlu_eval_accuracy_professional_psychology": 0.8260869565217391,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2988067958029479,
+ "step": 748
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0001,
+ "loss": 1.0735,
+ "step": 750
+ },
+ {
+ "epoch": 1.38,
+ "learning_rate": 0.0001,
+ "loss": 0.9066,
+ "step": 760
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 0.0001,
+ "loss": 0.8716,
+ "step": 770
+ },
+ {
+ "epoch": 1.41,
+ "learning_rate": 0.0001,
+ "loss": 0.9144,
+ "step": 780
+ },
+ {
+ "epoch": 1.43,
+ "learning_rate": 0.0001,
+ "loss": 1.0338,
+ "step": 790
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 0.0001,
+ "loss": 1.0275,
+ "step": 800
+ },
+ {
+ "epoch": 1.47,
+ "learning_rate": 0.0001,
+ "loss": 0.8382,
+ "step": 810
+ },
+ {
+ "epoch": 1.49,
+ "learning_rate": 0.0001,
+ "loss": 0.8489,
+ "step": 820
+ },
+ {
+ "epoch": 1.51,
+ "learning_rate": 0.0001,
+ "loss": 0.8931,
+ "step": 830
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0001,
+ "loss": 1.0515,
+ "step": 840
+ },
+ {
+ "epoch": 1.54,
+ "learning_rate": 0.0001,
+ "loss": 1.0965,
+ "step": 850
+ },
+ {
+ "epoch": 1.56,
+ "learning_rate": 0.0001,
+ "loss": 0.8928,
+ "step": 860
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 0.0001,
+ "loss": 0.8608,
+ "step": 870
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0001,
+ "loss": 0.8831,
+ "step": 880
+ },
+ {
+ "epoch": 1.61,
+ "learning_rate": 0.0001,
+ "loss": 1.0253,
+ "step": 890
+ },
+ {
+ "epoch": 1.63,
+ "learning_rate": 0.0001,
+ "loss": 0.9905,
+ "step": 900
+ },
+ {
+ "epoch": 1.65,
+ "learning_rate": 0.0001,
+ "loss": 0.8487,
+ "step": 910
+ },
+ {
+ "epoch": 1.67,
+ "learning_rate": 0.0001,
+ "loss": 0.8568,
+ "step": 920
+ },
+ {
+ "epoch": 1.69,
+ "learning_rate": 0.0001,
+ "loss": 0.9047,
+ "step": 930
+ },
+ {
+ "epoch": 1.7,
+ "eval_loss": 1.0250624418258667,
+ "eval_runtime": 946.4035,
+ "eval_samples_per_second": 1.057,
+ "eval_steps_per_second": 1.057,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "mmlu_eval_accuracy": 0.7288948695878031,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6097560975609756,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.7391304347826086,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.782608695652174,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7790697674418605,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6129032258064516,
+ "mmlu_eval_accuracy_professional_law": 0.6235294117647059,
+ "mmlu_eval_accuracy_professional_medicine": 0.8387096774193549,
+ "mmlu_eval_accuracy_professional_psychology": 0.8115942028985508,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.243813282909306,
+ "step": 935
+ },
+ {
+ "epoch": 1.7,
+ "learning_rate": 0.0001,
+ "loss": 1.0174,
+ "step": 940
+ },
+ {
+ "epoch": 1.72,
+ "learning_rate": 0.0001,
+ "loss": 1.0302,
+ "step": 950
+ },
+ {
+ "epoch": 1.74,
+ "learning_rate": 0.0001,
+ "loss": 0.8799,
+ "step": 960
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0001,
+ "loss": 0.8447,
+ "step": 970
+ },
+ {
+ "epoch": 1.78,
+ "learning_rate": 0.0001,
+ "loss": 0.9053,
+ "step": 980
+ },
+ {
+ "epoch": 1.8,
+ "learning_rate": 0.0001,
+ "loss": 1.0331,
+ "step": 990
+ },
+ {
+ "epoch": 1.81,
+ "learning_rate": 0.0001,
+ "loss": 1.0412,
+ "step": 1000
+ },
+ {
+ "epoch": 1.83,
+ "learning_rate": 0.0001,
+ "loss": 0.8753,
+ "step": 1010
+ },
+ {
+ "epoch": 1.85,
+ "learning_rate": 0.0001,
+ "loss": 0.8744,
+ "step": 1020
+ },
+ {
+ "epoch": 1.87,
+ "learning_rate": 0.0001,
+ "loss": 0.8899,
+ "step": 1030
+ },
+ {
+ "epoch": 1.89,
+ "learning_rate": 0.0001,
+ "loss": 1.0053,
+ "step": 1040
+ },
+ {
+ "epoch": 1.9,
+ "learning_rate": 0.0001,
+ "loss": 1.0127,
+ "step": 1050
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 0.0001,
+ "loss": 0.8023,
+ "step": 1060
+ },
+ {
+ "epoch": 1.94,
+ "learning_rate": 0.0001,
+ "loss": 0.8349,
+ "step": 1070
+ },
+ {
+ "epoch": 1.96,
+ "learning_rate": 0.0001,
+ "loss": 0.9742,
+ "step": 1080
+ },
+ {
+ "epoch": 1.98,
+ "learning_rate": 0.0001,
+ "loss": 1.0971,
+ "step": 1090
+ },
+ {
+ "epoch": 2.0,
+ "learning_rate": 0.0001,
+ "loss": 1.0728,
+ "step": 1100
+ },
+ {
+ "epoch": 2.01,
+ "learning_rate": 0.0001,
+ "loss": 0.7724,
+ "step": 1110
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 0.0001,
+ "loss": 0.7675,
+ "step": 1120
+ },
+ {
+ "epoch": 2.03,
+ "eval_loss": 1.052681565284729,
+ "eval_runtime": 942.0722,
+ "eval_samples_per_second": 1.061,
+ "eval_steps_per_second": 1.061,
+ "step": 1122
+ },
+ {
+ "epoch": 2.03,
+ "mmlu_eval_accuracy": 0.7373981967098951,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.896551724137931,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.8333333333333334,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.5853658536585366,
+ "mmlu_eval_accuracy_formal_logic": 0.7142857142857143,
+ "mmlu_eval_accuracy_global_facts": 0.4,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.41379310344827586,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.782608695652174,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7692307692307693,
+ "mmlu_eval_accuracy_human_aging": 0.8260869565217391,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.6363636363636364,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.92,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.62,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6294117647058823,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.782608695652174,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2340081441760609,
+ "step": 1122
+ },
+ {
+ "epoch": 2.05,
+ "learning_rate": 0.0001,
+ "loss": 0.7194,
+ "step": 1130
+ },
+ {
+ "epoch": 2.07,
+ "learning_rate": 0.0001,
+ "loss": 0.8236,
+ "step": 1140
+ },
+ {
+ "epoch": 2.09,
+ "learning_rate": 0.0001,
+ "loss": 0.6652,
+ "step": 1150
+ },
+ {
+ "epoch": 2.1,
+ "learning_rate": 0.0001,
+ "loss": 0.7177,
+ "step": 1160
+ },
+ {
+ "epoch": 2.12,
+ "learning_rate": 0.0001,
+ "loss": 0.7788,
+ "step": 1170
+ },
+ {
+ "epoch": 2.14,
+ "learning_rate": 0.0001,
+ "loss": 0.8117,
+ "step": 1180
+ },
+ {
+ "epoch": 2.16,
+ "learning_rate": 0.0001,
+ "loss": 0.8145,
+ "step": 1190
+ },
+ {
+ "epoch": 2.18,
+ "learning_rate": 0.0001,
+ "loss": 0.6984,
+ "step": 1200
+ },
+ {
+ "epoch": 2.19,
+ "learning_rate": 0.0001,
+ "loss": 0.7011,
+ "step": 1210
+ },
+ {
+ "epoch": 2.21,
+ "learning_rate": 0.0001,
+ "loss": 0.769,
+ "step": 1220
+ },
+ {
+ "epoch": 2.23,
+ "learning_rate": 0.0001,
+ "loss": 0.7705,
+ "step": 1230
+ },
+ {
+ "epoch": 2.25,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1240
+ },
+ {
+ "epoch": 2.27,
+ "learning_rate": 0.0001,
+ "loss": 0.6622,
+ "step": 1250
+ },
+ {
+ "epoch": 2.29,
+ "learning_rate": 0.0001,
+ "loss": 0.6641,
+ "step": 1260
+ },
+ {
+ "epoch": 2.3,
+ "learning_rate": 0.0001,
+ "loss": 0.7239,
+ "step": 1270
+ },
+ {
+ "epoch": 2.32,
+ "learning_rate": 0.0001,
+ "loss": 0.7618,
+ "step": 1280
+ },
+ {
+ "epoch": 2.34,
+ "learning_rate": 0.0001,
+ "loss": 0.7845,
+ "step": 1290
+ },
+ {
+ "epoch": 2.36,
+ "learning_rate": 0.0001,
+ "loss": 0.719,
+ "step": 1300
+ },
+ {
+ "epoch": 2.37,
+ "eval_loss": 1.1104822158813477,
+ "eval_runtime": 948.1299,
+ "eval_samples_per_second": 1.055,
+ "eval_steps_per_second": 1.055,
+ "step": 1309
+ },
+ {
+ "epoch": 2.37,
+ "mmlu_eval_accuracy": 0.7369285730399766,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.6428571428571429,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8275862068965517,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.7272727272727273,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6923076923076923,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146,
+ "mmlu_eval_accuracy_formal_logic": 0.7857142857142857,
+ "mmlu_eval_accuracy_global_facts": 0.5,
+ "mmlu_eval_accuracy_high_school_biology": 0.84375,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.4090909090909091,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.95,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6956521739130435,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.7307692307692307,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.6666666666666666,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7222222222222222,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.57,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8529411764705882,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6058823529411764,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7681159420289855,
+ "mmlu_eval_accuracy_public_relations": 0.5833333333333334,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.0866562834095908,
+ "step": 1309
+ },
+ {
+ "epoch": 2.38,
+ "learning_rate": 0.0001,
+ "loss": 0.7093,
+ "step": 1310
+ },
+ {
+ "epoch": 2.39,
+ "learning_rate": 0.0001,
+ "loss": 0.7684,
+ "step": 1320
+ },
+ {
+ "epoch": 2.41,
+ "learning_rate": 0.0001,
+ "loss": 0.7501,
+ "step": 1330
+ },
+ {
+ "epoch": 2.43,
+ "learning_rate": 0.0001,
+ "loss": 0.8043,
+ "step": 1340
+ },
+ {
+ "epoch": 2.45,
+ "learning_rate": 0.0001,
+ "loss": 0.6927,
+ "step": 1350
+ },
+ {
+ "epoch": 2.47,
+ "learning_rate": 0.0001,
+ "loss": 0.7278,
+ "step": 1360
+ },
+ {
+ "epoch": 2.48,
+ "learning_rate": 0.0001,
+ "loss": 0.8095,
+ "step": 1370
+ },
+ {
+ "epoch": 2.5,
+ "learning_rate": 0.0001,
+ "loss": 0.7463,
+ "step": 1380
+ },
+ {
+ "epoch": 2.52,
+ "learning_rate": 0.0001,
+ "loss": 0.7707,
+ "step": 1390
+ },
+ {
+ "epoch": 2.54,
+ "learning_rate": 0.0001,
+ "loss": 0.7152,
+ "step": 1400
+ },
+ {
+ "epoch": 2.56,
+ "learning_rate": 0.0001,
+ "loss": 0.687,
+ "step": 1410
+ },
+ {
+ "epoch": 2.58,
+ "learning_rate": 0.0001,
+ "loss": 0.7529,
+ "step": 1420
+ },
+ {
+ "epoch": 2.59,
+ "learning_rate": 0.0001,
+ "loss": 0.7565,
+ "step": 1430
+ },
+ {
+ "epoch": 2.61,
+ "learning_rate": 0.0001,
+ "loss": 0.8066,
+ "step": 1440
+ },
+ {
+ "epoch": 2.63,
+ "learning_rate": 0.0001,
+ "loss": 0.7623,
+ "step": 1450
+ },
+ {
+ "epoch": 2.65,
+ "learning_rate": 0.0001,
+ "loss": 0.6947,
+ "step": 1460
+ },
+ {
+ "epoch": 2.67,
+ "learning_rate": 0.0001,
+ "loss": 0.7756,
+ "step": 1470
+ },
+ {
+ "epoch": 2.68,
+ "learning_rate": 0.0001,
+ "loss": 0.8453,
+ "step": 1480
+ },
+ {
+ "epoch": 2.7,
+ "learning_rate": 0.0001,
+ "loss": 0.8306,
+ "step": 1490
+ },
+ {
+ "epoch": 2.71,
+ "eval_loss": 1.100826621055603,
+ "eval_runtime": 940.4488,
+ "eval_samples_per_second": 1.063,
+ "eval_steps_per_second": 1.063,
+ "step": 1496
+ },
+ {
+ "epoch": 2.71,
+ "mmlu_eval_accuracy": 0.7363077307176445,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 0.9090909090909091,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.8125,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.8125,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6585365853658537,
+ "mmlu_eval_accuracy_formal_logic": 0.6428571428571429,
+ "mmlu_eval_accuracy_global_facts": 0.6,
+ "mmlu_eval_accuracy_high_school_biology": 0.78125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.5909090909090909,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9545454545454546,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7906976744186046,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.4827586206896552,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9166666666666666,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.75,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.88,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.64,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.8235294117647058,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.7096774193548387,
+ "mmlu_eval_accuracy_professional_law": 0.6176470588235294,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7971014492753623,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8518518518518519,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5555555555555556,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.2313211129857853,
+ "step": 1496
+ },
+ {
+ "epoch": 2.72,
+ "learning_rate": 0.0001,
+ "loss": 0.6937,
+ "step": 1500
+ },
+ {
+ "epoch": 2.74,
+ "learning_rate": 0.0001,
+ "loss": 0.6997,
+ "step": 1510
+ },
+ {
+ "epoch": 2.76,
+ "learning_rate": 0.0001,
+ "loss": 0.7588,
+ "step": 1520
+ },
+ {
+ "epoch": 2.77,
+ "learning_rate": 0.0001,
+ "loss": 0.7731,
+ "step": 1530
+ },
+ {
+ "epoch": 2.79,
+ "learning_rate": 0.0001,
+ "loss": 0.7914,
+ "step": 1540
+ },
+ {
+ "epoch": 2.81,
+ "learning_rate": 0.0001,
+ "loss": 0.7175,
+ "step": 1550
+ },
+ {
+ "epoch": 2.83,
+ "learning_rate": 0.0001,
+ "loss": 0.7046,
+ "step": 1560
+ },
+ {
+ "epoch": 2.85,
+ "learning_rate": 0.0001,
+ "loss": 0.7597,
+ "step": 1570
+ },
+ {
+ "epoch": 2.87,
+ "learning_rate": 0.0001,
+ "loss": 0.7932,
+ "step": 1580
+ },
+ {
+ "epoch": 2.88,
+ "learning_rate": 0.0001,
+ "loss": 0.8059,
+ "step": 1590
+ },
+ {
+ "epoch": 2.9,
+ "learning_rate": 0.0001,
+ "loss": 0.7258,
+ "step": 1600
+ },
+ {
+ "epoch": 2.92,
+ "learning_rate": 0.0001,
+ "loss": 0.7486,
+ "step": 1610
+ },
+ {
+ "epoch": 2.94,
+ "learning_rate": 0.0001,
+ "loss": 0.7233,
+ "step": 1620
+ },
+ {
+ "epoch": 2.96,
+ "learning_rate": 0.0001,
+ "loss": 0.7945,
+ "step": 1630
+ },
+ {
+ "epoch": 2.97,
+ "learning_rate": 0.0001,
+ "loss": 0.8324,
+ "step": 1640
+ },
+ {
+ "epoch": 2.99,
+ "learning_rate": 0.0001,
+ "loss": 0.7294,
+ "step": 1650
+ },
+ {
+ "epoch": 3.01,
+ "learning_rate": 0.0001,
+ "loss": 0.6117,
+ "step": 1660
+ },
+ {
+ "epoch": 3.03,
+ "learning_rate": 0.0001,
+ "loss": 0.6464,
+ "step": 1670
+ },
+ {
+ "epoch": 3.05,
+ "learning_rate": 0.0001,
+ "loss": 0.6156,
+ "step": 1680
+ },
+ {
+ "epoch": 3.05,
+ "eval_loss": 1.1478718519210815,
+ "eval_runtime": 932.4225,
+ "eval_samples_per_second": 1.072,
+ "eval_steps_per_second": 1.072,
+ "step": 1683
+ },
+ {
+ "epoch": 3.05,
+ "mmlu_eval_accuracy": 0.745366643285036,
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.7272727272727273,
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
+ "mmlu_eval_accuracy_college_medicine": 0.9090909090909091,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6538461538461539,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.875,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.7073170731707317,
+ "mmlu_eval_accuracy_formal_logic": 0.7857142857142857,
+ "mmlu_eval_accuracy_global_facts": 0.8,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.5454545454545454,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.3793103448275862,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9230769230769231,
+ "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.75,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.96,
+ "mmlu_eval_accuracy_medical_genetics": 1.0,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.7894736842105263,
+ "mmlu_eval_accuracy_moral_scenarios": 0.61,
+ "mmlu_eval_accuracy_nutrition": 0.7272727272727273,
+ "mmlu_eval_accuracy_philosophy": 0.7647058823529411,
+ "mmlu_eval_accuracy_prehistory": 0.8571428571428571,
+ "mmlu_eval_accuracy_professional_accounting": 0.6774193548387096,
+ "mmlu_eval_accuracy_professional_law": 0.6176470588235294,
+ "mmlu_eval_accuracy_professional_medicine": 0.9032258064516129,
+ "mmlu_eval_accuracy_professional_psychology": 0.7391304347826086,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.7777777777777778,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.4050485734687297,
+ "step": 1683
+ },
+ {
+ "epoch": 3.07,
+ "learning_rate": 0.0001,
+ "loss": 0.5237,
+ "step": 1690
+ },
+ {
+ "epoch": 3.08,
+ "learning_rate": 0.0001,
+ "loss": 0.3516,
+ "step": 1700
+ },
+ {
+ "epoch": 3.1,
+ "learning_rate": 0.0001,
+ "loss": 0.4976,
+ "step": 1710
+ },
+ {
+ "epoch": 3.12,
+ "learning_rate": 0.0001,
+ "loss": 0.6535,
+ "step": 1720
+ },
+ {
+ "epoch": 3.14,
+ "learning_rate": 0.0001,
+ "loss": 0.5926,
+ "step": 1730
+ },
+ {
+ "epoch": 3.16,
+ "learning_rate": 0.0001,
+ "loss": 0.5476,
+ "step": 1740
+ },
+ {
+ "epoch": 3.17,
+ "learning_rate": 0.0001,
+ "loss": 0.368,
+ "step": 1750
+ },
+ {
+ "epoch": 3.19,
+ "learning_rate": 0.0001,
+ "loss": 0.5043,
+ "step": 1760
+ },
+ {
+ "epoch": 3.21,
+ "learning_rate": 0.0001,
+ "loss": 0.5907,
+ "step": 1770
+ },
+ {
+ "epoch": 3.23,
+ "learning_rate": 0.0001,
+ "loss": 0.5609,
+ "step": 1780
+ },
+ {
+ "epoch": 3.25,
+ "learning_rate": 0.0001,
+ "loss": 0.5272,
+ "step": 1790
+ },
+ {
+ "epoch": 3.26,
+ "learning_rate": 0.0001,
+ "loss": 0.3672,
+ "step": 1800
+ },
+ {
+ "epoch": 3.28,
+ "learning_rate": 0.0001,
+ "loss": 0.4947,
+ "step": 1810
+ },
+ {
+ "epoch": 3.3,
+ "learning_rate": 0.0001,
+ "loss": 0.6441,
+ "step": 1820
+ },
+ {
+ "epoch": 3.32,
+ "learning_rate": 0.0001,
+ "loss": 0.5989,
+ "step": 1830
+ },
+ {
+ "epoch": 3.34,
+ "learning_rate": 0.0001,
+ "loss": 0.5411,
+ "step": 1840
+ },
+ {
+ "epoch": 3.36,
+ "learning_rate": 0.0001,
+ "loss": 0.401,
+ "step": 1850
+ },
+ {
+ "epoch": 3.37,
+ "learning_rate": 0.0001,
+ "loss": 0.4685,
+ "step": 1860
+ },
+ {
+ "epoch": 3.39,
+ "learning_rate": 0.0001,
+ "loss": 0.6234,
+ "step": 1870
+ },
+ {
+ "epoch": 3.39,
+ "eval_loss": 1.1522600650787354,
+ "eval_runtime": 572.6447,
+ "eval_samples_per_second": 1.746,
+ "eval_steps_per_second": 1.746,
+ "step": 1870
+ },
+ {
+ "epoch": 3.39,
+ "mmlu_eval_accuracy": 0.7349633316353468,
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
+ "mmlu_eval_accuracy_anatomy": 0.5714285714285714,
+ "mmlu_eval_accuracy_astronomy": 0.6875,
+ "mmlu_eval_accuracy_business_ethics": 1.0,
+ "mmlu_eval_accuracy_clinical_knowledge": 0.8620689655172413,
+ "mmlu_eval_accuracy_college_biology": 0.875,
+ "mmlu_eval_accuracy_college_chemistry": 0.5,
+ "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364,
+ "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727,
+ "mmlu_eval_accuracy_college_medicine": 0.8636363636363636,
+ "mmlu_eval_accuracy_college_physics": 0.6363636363636364,
+ "mmlu_eval_accuracy_computer_security": 0.7272727272727273,
+ "mmlu_eval_accuracy_conceptual_physics": 0.6153846153846154,
+ "mmlu_eval_accuracy_econometrics": 0.75,
+ "mmlu_eval_accuracy_electrical_engineering": 0.75,
+ "mmlu_eval_accuracy_elementary_mathematics": 0.6341463414634146,
+ "mmlu_eval_accuracy_formal_logic": 0.7142857142857143,
+ "mmlu_eval_accuracy_global_facts": 0.6,
+ "mmlu_eval_accuracy_high_school_biology": 0.8125,
+ "mmlu_eval_accuracy_high_school_chemistry": 0.45454545454545453,
+ "mmlu_eval_accuracy_high_school_computer_science": 0.8888888888888888,
+ "mmlu_eval_accuracy_high_school_european_history": 0.7777777777777778,
+ "mmlu_eval_accuracy_high_school_geography": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.9523809523809523,
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.7674418604651163,
+ "mmlu_eval_accuracy_high_school_mathematics": 0.4482758620689655,
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.9615384615384616,
+ "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413,
+ "mmlu_eval_accuracy_high_school_psychology": 0.9333333333333333,
+ "mmlu_eval_accuracy_high_school_statistics": 0.6521739130434783,
+ "mmlu_eval_accuracy_high_school_us_history": 0.9090909090909091,
+ "mmlu_eval_accuracy_high_school_world_history": 0.8076923076923077,
+ "mmlu_eval_accuracy_human_aging": 0.7391304347826086,
+ "mmlu_eval_accuracy_human_sexuality": 0.75,
+ "mmlu_eval_accuracy_international_law": 1.0,
+ "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
+ "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
+ "mmlu_eval_accuracy_machine_learning": 0.6363636363636364,
+ "mmlu_eval_accuracy_management": 0.9090909090909091,
+ "mmlu_eval_accuracy_marketing": 0.84,
+ "mmlu_eval_accuracy_medical_genetics": 0.9090909090909091,
+ "mmlu_eval_accuracy_miscellaneous": 0.7906976744186046,
+ "mmlu_eval_accuracy_moral_disputes": 0.8157894736842105,
+ "mmlu_eval_accuracy_moral_scenarios": 0.64,
+ "mmlu_eval_accuracy_nutrition": 0.7575757575757576,
+ "mmlu_eval_accuracy_philosophy": 0.7941176470588235,
+ "mmlu_eval_accuracy_prehistory": 0.8857142857142857,
+ "mmlu_eval_accuracy_professional_accounting": 0.6451612903225806,
+ "mmlu_eval_accuracy_professional_law": 0.6176470588235294,
+ "mmlu_eval_accuracy_professional_medicine": 0.8709677419354839,
+ "mmlu_eval_accuracy_professional_psychology": 0.7246376811594203,
+ "mmlu_eval_accuracy_public_relations": 0.6666666666666666,
+ "mmlu_eval_accuracy_security_studies": 0.8148148148148148,
+ "mmlu_eval_accuracy_sociology": 0.9090909090909091,
+ "mmlu_eval_accuracy_us_foreign_policy": 1.0,
+ "mmlu_eval_accuracy_virology": 0.5,
+ "mmlu_eval_accuracy_world_religions": 0.8421052631578947,
+ "mmlu_loss": 1.221846800616253,
+ "step": 1870
+ },
+ {
+ "epoch": 3.4,
+ "step": 1875,
+ "total_flos": 2.1784431229955113e+18,
+ "train_loss": 0.8699908837636312,
+ "train_runtime": 112632.6276,
+ "train_samples_per_second": 0.266,
+ "train_steps_per_second": 0.017
+ }
+ ],
+ "max_steps": 1875,
+ "num_train_epochs": 4,
+ "total_flos": 2.1784431229955113e+18,
+ "trial_name": null,
+ "trial_params": null
+}