diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|arc:challenge|25_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|arc:challenge|25_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a09bb323f15bf8ba4c6b737b1a446bbf625fdb6f --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|arc:challenge|25_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14637928a9d15e58408a7a61891aa1d13baecedf2112320f8bc7fac43efb8795 +size 5638914 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|gsm8k|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|gsm8k|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8654109f0e9d0e89291df700484098c327daa962 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|gsm8k|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534b66c16ea82ab0e937fb166d5e51f568c07eee2b202d6a3f8f203ce4c05f81 +size 5046628 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|hellaswag|10_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|hellaswag|10_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e8afcf57f6a06f43b6c512790572888d12175caf --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|hellaswag|10_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5e088713503f00cc00a2d8bdfa5db4c4cf95cf87d2857e35754a3d6fd5579b3 +size 57616465 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:abstract_algebra|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:abstract_algebra|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bef687720cc199b35c79dad595f597452cca31c0 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:abstract_algebra|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc93417b804c74c7003ff5c9beb167faccb3dbd94a17de73c2d0535eacfa1b33 +size 72502 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:anatomy|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:anatomy|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c26278e59be1278df29edce87fb4c21f101e80e7 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:anatomy|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61834bb1c232ed7e26b53fe9495e41983e03f0eea882ddc603145a89d76469fb +size 115311 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:astronomy|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:astronomy|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7458e1e55420645141e50be212cc10c2de3028db --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:astronomy|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb669694af160029b8de00766bf04476daeef904d9b34a49f2904a414f4a8bd +size 180550 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:business_ethics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:business_ethics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..550fdfc05ad2aec1f9fb637148fb4698f1584be7 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:business_ethics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c92cf7d87bb2d3a54af9cb6110e902f8ce46dee8c5d4bf1e20a9e8e2b32f30 +size 132583 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:clinical_knowledge|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:clinical_knowledge|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..218526cca71fb19ba428ac28b10899734f4338ce --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:clinical_knowledge|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b960702e6c17d5c5f15b1652069b01f9ac0603b27ce4f855e2920cac858c35ec +size 221384 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_biology|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_biology|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b3d71cd8b3b2f66ee9b6906677a4204d20055a7f --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_biology|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341790f3aa67365368cdc2f921f4101defd8b8becbf5849b6139f1ffa20cccd4 +size 167395 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_chemistry|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_chemistry|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ea2c86b049799326f06fb9c4a52df05c043857e8 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_chemistry|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b476e0b51c3c1b3686d7f22edeae08857726735bf639ab7f4209e0a027a71f7a +size 112834 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_computer_science|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_computer_science|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3a5dbfd1061c02dd9fe610cb6a63c4a66160c25c --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_computer_science|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba74d99caa24ffbaebc7ff52ad6bb453cedbad94d04fcee05f3d7ce130e9da87 +size 173789 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_mathematics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_mathematics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..606596d07805321af1ff76eeae9f9fa7d6c438a7 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_mathematics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efcb3e2e0d31bdcfd6752dcfbda1b6a8dd9baa8ff09da6fa836e0a074ece03f +size 108482 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_medicine|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_medicine|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5b7a24934f4815d0bd416505ca56af4a328a4fd0 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_medicine|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0fc14025a215de3fbd1a9189a47c8d61e6297846c4a7d7c390bca6c9e0d889 +size 235055 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_physics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_physics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0f9f31aa03da72dba0fbc65832bc12ca8b95da42 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:college_physics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8990564f710e76e0f4212734f61d9ee0705c5d4b9ebef2254cdbb16fba3a17ce +size 111093 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:computer_security|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:computer_security|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..de002ec71f0ea1015b7d809a299a0b72e86f1904 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:computer_security|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:039588d50415dffd41ea70022d7b8e7f012bb5ae70f7222d6a9a425c652f2a3e +size 107025 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:conceptual_physics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:conceptual_physics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..947613d9c3b00a5926e91cc2bfd10637a8a63eca --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:conceptual_physics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45a3f64b3900b9f41f8bf84c6cad9da0c0338109c0e53631b225bd19d736ac0d +size 150436 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:econometrics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:econometrics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..08aeed753b92fef697fb45a3e6db65d3e001c989 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:econometrics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc35593667df65e5c62ac4b29922e0e39b4b50eafec540308415086a9b7218 +size 151753 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:electrical_engineering|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:electrical_engineering|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..19f535aaa433fac4fd00743a67ec57b99360b9b2 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:electrical_engineering|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18faa93f02668ae335bede72596ad08db84aa0fab466a094b826a4850bdfaaa4 +size 112884 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:elementary_mathematics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:elementary_mathematics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b4f75c260b99649683a664655ee26ec07b91bade --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:elementary_mathematics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:806c4d24cde4a19ba92f353f4b7d1cb641994733f7509f565a5be89adbf2bdf0 +size 293203 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:formal_logic|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:formal_logic|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5c68978128dba5d83fa82b87db4847a060efacab --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:formal_logic|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9e965d8d0873277fb4a0ec07abb141c325d2a2fbe5b08ba352db8203b0dd555 +size 145187 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:global_facts|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:global_facts|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4e306c3f681571c1d0cff8ae598e1bcb3311d28c --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:global_facts|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7039d0fa39221e15eec577151ed8f15f64f90c55a4e0e75739e4c92660b1a5b5 +size 85135 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_biology|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_biology|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f8826f8fa5b5d09dbecdfff7d163d865b6d94bec --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_biology|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a516dcaaf7f85da1323801b12f0de192d3a0307681541899ca01135992bd74c8 +size 331956 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_chemistry|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_chemistry|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..927228431706141e7727681fd6de8133ec905f78 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_chemistry|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e92ea72746da539da9034c15acc7f6098bacc33413acf2bbab1c2ca892cb6070 +size 190148 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_computer_science|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_computer_science|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fa6d9447d8a6edaec45e7c80a864a5c356be73db --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_computer_science|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:064bffad31690395b3213a89eac598049ad3ff772941a44b08845fe18059defd +size 168879 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_european_history|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_european_history|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ba70e2c6ebe7ad4090bd92f80396bfe514809cc1 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_european_history|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb43cb346d271a1a782093872aaeb19383ffc1c7e0594c95d6eb5dae70c880b +size 1336154 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_geography|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_geography|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..66d879aa5c3d385f10e07049d853103c9a4d83d5 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_geography|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8647007fb6c39fb28db4ff29ba9d3b5fb90cbba1ec052fe08cdc5574a52bb1c4 +size 164887 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_government_and_politics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_government_and_politics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d860a265cb230b51223b53dc72a406bda7cfa5d5 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_government_and_politics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dcc5ee8458a6dee87019fedd5894b1b48cf267965ae28d6cacc5c803d92dbd0 +size 215026 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_macroeconomics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_macroeconomics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cb2d88af866b42206844eba3ce57eb7e02da3825 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_macroeconomics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a3c522794b3c7d6800aa7ddd1f0031f4ba390cd9a49ce125414b097e07ca01 +size 308310 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_mathematics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_mathematics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fad9c16fd09ec625e603251ab707aece109103eb --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_mathematics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7aa608a739c43acfc1b04735bdf5a823e731bfcd60f909e414220bd19131db9 +size 229303 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_microeconomics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_microeconomics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e3d2923272430673199eae81283c01f6b65a93e1 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_microeconomics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61fd00e20f649108b8706c4b6312654e07aedd2e7c53153174dd1654bcadcc94 +size 222430 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_physics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_physics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..85ab0e60d22c4dd2f167485d2a06879f2997f0fa --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_physics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea18c16d91666a5f6fbb8bc15c07ec40d7fabaa38df1b7a73585cda2b2d1e61 +size 183131 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_psychology|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_psychology|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..824689ceed97b8f4b5f294844055723ac9a192bb --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_psychology|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caeaa7fafaa34fdb6c55f3dc9af9dcd5f8ad74ebf9796288d0603ffcc8c46f67 +size 539644 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_statistics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_statistics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..130b4382ef18cdfe1d5f709215ffa2508f4e2529 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_statistics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7bd86f631f7e3fd8e714f4f6b221a80bf3fc0f109493f17af0b3ff750d9ff22 +size 329454 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_us_history|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_us_history|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8b2cff689132b9c9f49bc455d09c00c837505d00 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_us_history|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08b4cc42d5818b7c3a40793d121a396225dc5dd4e110a27530e88f8b568988f5 +size 1212460 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_world_history|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_world_history|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ed8530ce93e829b0b2248fa08c85299872193ed3 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:high_school_world_history|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f997e62f863fed93c39119f763bc045a3418620311c04b0b6c786ef60a02e09f +size 962037 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:human_aging|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:human_aging|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3794f597ef4b9754d4bab53a6ac5315434f13ac0 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:human_aging|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df04aeddea524eb724d66bca2c0ff796b70e96f27e9e73a0e83e1ec33e754d6f +size 168200 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:human_sexuality|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:human_sexuality|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ab6bdf341eec5c516f30440db3815d4e661fc278 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:human_sexuality|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4bdd8660acfc270608456c67d1a86f58b23b43c23f753b0a576184c0ed6c197 +size 119551 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:international_law|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:international_law|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ef9e3ce0d77619a699dcad64c76493ca5ea1f103 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:international_law|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46043e1c7ccc16316dfd6d27a683151ab6e5d1d02fb480e703fefc6a094b1340 +size 173828 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:jurisprudence|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:jurisprudence|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c8e076ffd7dcf2a72891e758bf3df553a0b55187 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:jurisprudence|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95063de5d314eaacc3fe8a96d352f74fa7ed182b184edc98a3fb08c76c5717f5 +size 128228 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:logical_fallacies|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:logical_fallacies|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..633fc00b96916879193b3ff4047fa6722b4416a1 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:logical_fallacies|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88a7a17f8079de0b7cb3c1e395adc3d2f8ac0652fe18031986639c32e2cfaf93 +size 156189 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:machine_learning|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:machine_learning|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6f94a88735d3efd7945fe55e13446f6439c84cac --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:machine_learning|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb90ec4009946f34ef1933b034982626c1398f8c9d0fe3d19ee78c61e650c49 +size 156363 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:management|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:management|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7a95d19f56b2572796bf18ef5c6f69e3e4bcfccf --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:management|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5cea640f03077e2f3150118da3fe56748a1144087ac523d653b20cb3a207d81 +size 82594 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:marketing|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:marketing|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..606634a2927a532fda9649f0c95621726540121d --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:marketing|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d8fa0ed042508510221a68d83f1abc622bf6ca3282af12fb828a87031ebfcf +size 213558 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:medical_genetics|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:medical_genetics|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c642596d1fd6102e6777f2509b38e7a50b7f7013 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:medical_genetics|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4844324a604ed9344265a79d640917e9df7e09446fef51f017ef1913cbe21518 +size 93043 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:miscellaneous|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:miscellaneous|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3a58a69fe0e7f27a86cf442c12b0432c7b8618d3 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:miscellaneous|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65ec67fc09560c55c0aec2dd32d6338bc9d7f4e13bf04aff347f9e615123e6b4 +size 520746 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:moral_disputes|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:moral_disputes|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7aa48cce6a6f01137c84cc59e7ac1938dba5d128 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:moral_disputes|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:603fc1fb22a70fb9bcc7a13a231c8083fef9e437abb799dfe4bcf727349f5e43 +size 339742 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:moral_scenarios|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:moral_scenarios|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5f31cccea52e0c535afeeb4d3e78810a5122061d --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:moral_scenarios|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85eb82bd3d49677e2910708c7e89618f06bd0b5b64ae3fb8088e36f9baaa4aca +size 699259 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:nutrition|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:nutrition|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f829a91d69098bb750bcd3560a701ff7732dbe22 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:nutrition|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:693c174eec6416e49a4efcc84571f67765903770e7cea81932f4e7ff1b717b57 +size 328953 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:philosophy|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:philosophy|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..134b81aba0a7740e2282387477b7484bd7ce81f6 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:philosophy|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3525335c11173bbbcc90ede07dfa15e60d48ff8ef18be37bf56b5f7fcdeacdd9 +size 244041 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:prehistory|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:prehistory|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..94b657566f41dbbe4ba5782784085763e73ca919 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:prehistory|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e495bc606bcf1b58add411268b18667b1e3ee256d54361364c0d94c7075c77 +size 329182 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_accounting|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_accounting|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c89e5eafb58b1ffcea636138a95d152de721fbe9 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_accounting|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dc1a8ddfbab0216f90a13d0be0eb428e8592953c07694e3f79851f8f24b2af +size 386650 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_law|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_law|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a9d3ff20b13c9e4df0a9d326f21ee878d161b0d --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_law|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87af5ad150daf1aff6c7519bd406c704bd7b7daa7062c7f29cbca5c99c3230aa +size 6079433 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_medicine|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_medicine|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f66e38a676ecda59137b88922ceb57dee3710a6b --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_medicine|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e4b0267d43905660b35718d3a2f64b9a3a855718937696b75e029290c3e997 +size 664145 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_psychology|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_psychology|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..47a1a9c34b044446161e195b9db71de524cc5fef --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:professional_psychology|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3441b311caa699e33f4a34c390b67d422df0c8aa1e03202d7f601eddc35074c +size 726214 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:public_relations|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:public_relations|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dcc280396f4b5d7c9c35db1e23db7f4cb14109bc --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:public_relations|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe178658cb744ae83a78bdecead9911a5964d3cc751a6cfc6938a511c1140cfc +size 127287 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:security_studies|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:security_studies|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..14c53513212095f5385eb33272397f73864b365a --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:security_studies|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95340d8ab9f0b9ea9a998f3f39a3b47c6e8410b612be37d6004c75ba6802351a +size 638418 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:sociology|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:sociology|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca967b61ba0ebac7b3946df493aaf9d29583aad2 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:sociology|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f26040b0cdda08963aae90cfd943486a70cd080aeb4b0ec84af60c3c9fb4f9b +size 224793 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:us_foreign_policy|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:us_foreign_policy|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e6c4618ed324107ab28a87440d2886b1e2c3fad --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:us_foreign_policy|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e16794d4b88361e4fb256b533764ce70e02427a4182b167bf4735e89513305 +size 119754 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:virology|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:virology|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5c56a359dddccca90dccbfea1992d95cb77b4b6d --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:virology|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42976e339853dfae63e0d9e7f57a69aa3fccc381ccdf015889df0e3016ae26ee +size 145512 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:world_religions|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:world_religions|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7cd91813b5b2178b7e3a1c6b8720d8b0ba2e00bc --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|mmlu:world_religions|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c315f775d32b614c0f169ea735e0782751ed2e5168bd884db0d9c337d47a04 +size 108994 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|truthfulqa:mc|0_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|truthfulqa:mc|0_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8b5973464dcbfed3edc37d9274997001f78c3df9 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|truthfulqa:mc|0_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e87e00b61bbc50ff72cad8204c985c729154511547622aa08778d6d8ca75fdca +size 883938 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|winogrande|5_2024-05-24T18-12-54.142897.parquet b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|winogrande|5_2024-05-24T18-12-54.142897.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f72340fc8bfe39d5303fd79f3db1276821383266 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/2024-05-24T18-12-54.142897/details_leaderboard|winogrande|5_2024-05-24T18-12-54.142897.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30578fd42be38bede6bfca275fbb26271b4dcafaef991feb4c18060777fdd9a3 +size 1149966 diff --git a/details/dreamgen/llama3-8b-instruct-align-test2-kto/results_2024-05-24T18-12-54.142897.json b/details/dreamgen/llama3-8b-instruct-align-test2-kto/results_2024-05-24T18-12-54.142897.json new file mode 100644 index 0000000000000000000000000000000000000000..fdca4e4417f5af68f241ba460c8a0acf0f966bb0 --- /dev/null +++ b/details/dreamgen/llama3-8b-instruct-align-test2-kto/results_2024-05-24T18-12-54.142897.json @@ -0,0 +1,3461 @@ +{ + "config_general": { + "lighteval_sha": "a98210fd3a2d1e8bface1c32b72ebd5017173a4c", + "num_fewshot_seeds": 1, + "override_batch_size": -1, + "max_samples": null, + "job_id": "", + "start_time": 2236461.789611765, + "end_time": 2259996.300803602, + "total_evaluation_time_secondes": "23534.511191837024", + "model_name": "dreamgen/llama3-8b-instruct-align-test2-kto", + "model_sha": "9d61ac436978ee7007b966c145c2537cbe2fc994", + "model_dtype": "torch.bfloat16", + "model_size": "14.96 GB", + "config": null + }, + "results": { + "leaderboard|arc:challenge|25": { + "acc": 0.5554607508532423, + "acc_stderr": 0.014521226405627075, + "acc_norm": 0.5622866894197952, + "acc_norm_stderr": 0.014497573881108282 + }, + "leaderboard|hellaswag|10": { + "acc": 0.5598486357299343, + "acc_stderr": 0.004953907062096603, + "acc_norm": 0.7204740091615216, + "acc_norm_stderr": 0.004478491697891239 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045 + }, + "leaderboard|mmlu:anatomy|5": { + "acc": 0.6592592592592592, + "acc_stderr": 0.040943762699967926 + }, + "leaderboard|mmlu:astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898 + }, + "leaderboard|mmlu:business_ethics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "acc": 0.7320754716981132, + "acc_stderr": 0.027257260322494845 + }, + "leaderboard|mmlu:college_biology|5": { + "acc": 0.7708333333333334, + "acc_stderr": 0.03514697467862388 + }, + "leaderboard|mmlu:college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911 + }, + "leaderboard|mmlu:college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332 + }, + "leaderboard|mmlu:college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975 + }, + "leaderboard|mmlu:college_medicine|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.0372424959581773 + }, + "leaderboard|mmlu:college_physics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.04966570903978529 + }, + "leaderboard|mmlu:computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.03223276266711712 + }, + "leaderboard|mmlu:econometrics|5": { + "acc": 0.5, + "acc_stderr": 0.047036043419179864 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.02568056464005688 + }, + "leaderboard|mmlu:formal_logic|5": { + "acc": 0.5238095238095238, + "acc_stderr": 0.04467062628403273 + }, + "leaderboard|mmlu:global_facts|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428 + }, + "leaderboard|mmlu:high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.024362599693031076 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806 + }, + "leaderboard|mmlu:high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267042 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593563 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "acc": 0.658974358974359, + "acc_stderr": 0.024035489676335068 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131137 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380548 + }, + "leaderboard|mmlu:high_school_physics|5": { + "acc": 0.4503311258278146, + "acc_stderr": 0.04062290018683775 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "acc": 0.8293577981651377, + "acc_stderr": 0.016129271025099878 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "acc": 0.8185654008438819, + "acc_stderr": 0.025085961144579654 + }, + "leaderboard|mmlu:human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.03089861088247752 + }, + "leaderboard|mmlu:human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469 + }, + "leaderboard|mmlu:international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947 + }, + "leaderboard|mmlu:jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.038260763248848646 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623 + }, + "leaderboard|mmlu:machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123 + }, + "leaderboard|mmlu:management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026622 + }, + "leaderboard|mmlu:marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333 + }, + "leaderboard|mmlu:medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "leaderboard|mmlu:miscellaneous|5": { + "acc": 0.822477650063857, + "acc_stderr": 0.013664230995834834 + }, + "leaderboard|mmlu:moral_disputes|5": { + "acc": 0.7052023121387283, + "acc_stderr": 0.02454761779480383 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "acc": 0.41899441340782123, + "acc_stderr": 0.016501579306861677 + }, + "leaderboard|mmlu:nutrition|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.025261691219729484 + }, + "leaderboard|mmlu:philosophy|5": { + "acc": 0.7202572347266881, + "acc_stderr": 0.02549425935069491 + }, + "leaderboard|mmlu:prehistory|5": { + "acc": 0.6975308641975309, + "acc_stderr": 0.025557653981868055 + }, + "leaderboard|mmlu:professional_accounting|5": { + "acc": 0.5035460992907801, + "acc_stderr": 0.02982674915328092 + }, + "leaderboard|mmlu:professional_law|5": { + "acc": 0.4576271186440678, + "acc_stderr": 0.012724296550980188 + }, + "leaderboard|mmlu:professional_medicine|5": { + "acc": 0.7132352941176471, + "acc_stderr": 0.027472274473233818 + }, + "leaderboard|mmlu:professional_psychology|5": { + "acc": 0.6879084967320261, + "acc_stderr": 0.018745011201277657 + }, + "leaderboard|mmlu:public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054 + }, + "leaderboard|mmlu:security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142773 + }, + "leaderboard|mmlu:sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.027403859410786848 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637 + }, + "leaderboard|mmlu:virology|5": { + "acc": 0.5, + "acc_stderr": 0.03892494720807614 + }, + "leaderboard|mmlu:world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.029913127232368032 + }, + "leaderboard|truthfulqa:mc|0": { + "truthfulqa_mc1": 0.36964504283965727, + "truthfulqa_mc1_stderr": 0.01689818070697389, + "truthfulqa_mc2": 0.5305070824753251, + "truthfulqa_mc2_stderr": 0.015944782750315033 + }, + "leaderboard|winogrande|5": { + "acc": 0.6937647987371744, + "acc_stderr": 0.01295438597280247 + }, + "leaderboard|gsm8k|5": { + "qem": 0.4670204700530705, + "qem_stderr": 0.013742492794163421 + }, + "leaderboard|mmlu:_average|5": { + "acc": 0.6526569493505927, + "acc_stderr": 0.03380504750509661 + }, + "all": { + "acc": 0.6501753383050689, + "acc_stderr": 0.03265528712051721, + "acc_norm": 0.6413803492906585, + "acc_norm_stderr": 0.009488032789499761, + "truthfulqa_mc1": 0.36964504283965727, + "truthfulqa_mc1_stderr": 0.01689818070697389, + "truthfulqa_mc2": 0.5305070824753251, + "truthfulqa_mc2_stderr": 0.015944782750315033, + "qem": 0.4670204700530705, + "qem_stderr": 0.013742492794163421 + } + }, + "versions": { + "leaderboard|arc:challenge|25": 0, + "leaderboard|gsm8k|5": 0, + "leaderboard|hellaswag|10": 0, + "leaderboard|mmlu:abstract_algebra|5": 0, + "leaderboard|mmlu:anatomy|5": 0, + "leaderboard|mmlu:astronomy|5": 0, + "leaderboard|mmlu:business_ethics|5": 0, + "leaderboard|mmlu:clinical_knowledge|5": 0, + "leaderboard|mmlu:college_biology|5": 0, + "leaderboard|mmlu:college_chemistry|5": 0, + "leaderboard|mmlu:college_computer_science|5": 0, + "leaderboard|mmlu:college_mathematics|5": 0, + "leaderboard|mmlu:college_medicine|5": 0, + "leaderboard|mmlu:college_physics|5": 0, + "leaderboard|mmlu:computer_security|5": 0, + "leaderboard|mmlu:conceptual_physics|5": 0, + "leaderboard|mmlu:econometrics|5": 0, + "leaderboard|mmlu:electrical_engineering|5": 0, + "leaderboard|mmlu:elementary_mathematics|5": 0, + "leaderboard|mmlu:formal_logic|5": 0, + "leaderboard|mmlu:global_facts|5": 0, + "leaderboard|mmlu:high_school_biology|5": 0, + "leaderboard|mmlu:high_school_chemistry|5": 0, + "leaderboard|mmlu:high_school_computer_science|5": 0, + "leaderboard|mmlu:high_school_european_history|5": 0, + "leaderboard|mmlu:high_school_geography|5": 0, + "leaderboard|mmlu:high_school_government_and_politics|5": 0, + "leaderboard|mmlu:high_school_macroeconomics|5": 0, + "leaderboard|mmlu:high_school_mathematics|5": 0, + "leaderboard|mmlu:high_school_microeconomics|5": 0, + "leaderboard|mmlu:high_school_physics|5": 0, + "leaderboard|mmlu:high_school_psychology|5": 0, + "leaderboard|mmlu:high_school_statistics|5": 0, + "leaderboard|mmlu:high_school_us_history|5": 0, + "leaderboard|mmlu:high_school_world_history|5": 0, + "leaderboard|mmlu:human_aging|5": 0, + "leaderboard|mmlu:human_sexuality|5": 0, + "leaderboard|mmlu:international_law|5": 0, + "leaderboard|mmlu:jurisprudence|5": 0, + "leaderboard|mmlu:logical_fallacies|5": 0, + "leaderboard|mmlu:machine_learning|5": 0, + "leaderboard|mmlu:management|5": 0, + "leaderboard|mmlu:marketing|5": 0, + "leaderboard|mmlu:medical_genetics|5": 0, + "leaderboard|mmlu:miscellaneous|5": 0, + "leaderboard|mmlu:moral_disputes|5": 0, + "leaderboard|mmlu:moral_scenarios|5": 0, + "leaderboard|mmlu:nutrition|5": 0, + "leaderboard|mmlu:philosophy|5": 0, + "leaderboard|mmlu:prehistory|5": 0, + "leaderboard|mmlu:professional_accounting|5": 0, + "leaderboard|mmlu:professional_law|5": 0, + "leaderboard|mmlu:professional_medicine|5": 0, + "leaderboard|mmlu:professional_psychology|5": 0, + "leaderboard|mmlu:public_relations|5": 0, + "leaderboard|mmlu:security_studies|5": 0, + "leaderboard|mmlu:sociology|5": 0, + "leaderboard|mmlu:us_foreign_policy|5": 0, + "leaderboard|mmlu:virology|5": 0, + "leaderboard|mmlu:world_religions|5": 0, + "leaderboard|truthfulqa:mc|0": 0, + "leaderboard|winogrande|5": 0 + }, + "config_tasks": { + "leaderboard|arc:challenge": { + "name": "arc:challenge", + "prompt_function": "arc", + "hf_repo": "ai2_arc", + "hf_subset": "ARC-Challenge", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm_nospace" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "arc" + ], + "original_num_docs": 1172, + "effective_num_docs": 1172, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|gsm8k": { + "name": "gsm8k", + "prompt_function": "gsm8k", + "hf_repo": "gsm8k", + "hf_subset": "main", + "metric": [ + "quasi_exact_match_gsm8k" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 256, + "stop_sequence": [ + "Question:", + "Question", + ":" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1319, + "effective_num_docs": 1319, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|hellaswag": { + "name": "hellaswag", + "prompt_function": "hellaswag_harness", + "hf_repo": "hellaswag", + "hf_subset": "default", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 10042, + "effective_num_docs": 10042, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:abstract_algebra": { + "name": "mmlu:abstract_algebra", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "abstract_algebra", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:anatomy": { + "name": "mmlu:anatomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "anatomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 135, + "effective_num_docs": 135, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:astronomy": { + "name": "mmlu:astronomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "astronomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 152, + "effective_num_docs": 152, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:business_ethics": { + "name": "mmlu:business_ethics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "business_ethics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:clinical_knowledge": { + "name": "mmlu:clinical_knowledge", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "clinical_knowledge", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 265, + "effective_num_docs": 265, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_biology": { + "name": "mmlu:college_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 144, + "effective_num_docs": 144, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_chemistry": { + "name": "mmlu:college_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_computer_science": { + "name": "mmlu:college_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_mathematics": { + "name": "mmlu:college_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_medicine": { + "name": "mmlu:college_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 173, + "effective_num_docs": 173, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_physics": { + "name": "mmlu:college_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 102, + "effective_num_docs": 102, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:computer_security": { + "name": "mmlu:computer_security", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "computer_security", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:conceptual_physics": { + "name": "mmlu:conceptual_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "conceptual_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 235, + "effective_num_docs": 235, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:econometrics": { + "name": "mmlu:econometrics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "econometrics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 114, + "effective_num_docs": 114, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:electrical_engineering": { + "name": "mmlu:electrical_engineering", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "electrical_engineering", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 145, + "effective_num_docs": 145, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:elementary_mathematics": { + "name": "mmlu:elementary_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "elementary_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 378, + "effective_num_docs": 378, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:formal_logic": { + "name": "mmlu:formal_logic", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "formal_logic", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 126, + "effective_num_docs": 126, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:global_facts": { + "name": "mmlu:global_facts", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "global_facts", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_biology": { + "name": "mmlu:high_school_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 310, + "effective_num_docs": 310, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_chemistry": { + "name": "mmlu:high_school_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 203, + "effective_num_docs": 203, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_computer_science": { + "name": "mmlu:high_school_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_european_history": { + "name": "mmlu:high_school_european_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_european_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 165, + "effective_num_docs": 165, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_geography": { + "name": "mmlu:high_school_geography", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_geography", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 198, + "effective_num_docs": 198, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics": { + "name": "mmlu:high_school_government_and_politics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_government_and_politics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 193, + "effective_num_docs": 193, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics": { + "name": "mmlu:high_school_macroeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_macroeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 390, + "effective_num_docs": 390, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_mathematics": { + "name": "mmlu:high_school_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 270, + "effective_num_docs": 270, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_microeconomics": { + "name": "mmlu:high_school_microeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_microeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 238, + "effective_num_docs": 238, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_physics": { + "name": "mmlu:high_school_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 151, + "effective_num_docs": 151, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_psychology": { + "name": "mmlu:high_school_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 545, + "effective_num_docs": 545, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_statistics": { + "name": "mmlu:high_school_statistics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_statistics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 216, + "effective_num_docs": 216, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_us_history": { + "name": "mmlu:high_school_us_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_us_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 204, + "effective_num_docs": 204, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_world_history": { + "name": "mmlu:high_school_world_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_world_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 237, + "effective_num_docs": 237, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_aging": { + "name": "mmlu:human_aging", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_aging", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 223, + "effective_num_docs": 223, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_sexuality": { + "name": "mmlu:human_sexuality", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_sexuality", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 131, + "effective_num_docs": 131, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:international_law": { + "name": "mmlu:international_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "international_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 121, + "effective_num_docs": 121, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:jurisprudence": { + "name": "mmlu:jurisprudence", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "jurisprudence", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 108, + "effective_num_docs": 108, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:logical_fallacies": { + "name": "mmlu:logical_fallacies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "logical_fallacies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 163, + "effective_num_docs": 163, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:machine_learning": { + "name": "mmlu:machine_learning", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "machine_learning", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 112, + "effective_num_docs": 112, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:management": { + "name": "mmlu:management", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "management", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 103, + "effective_num_docs": 103, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:marketing": { + "name": "mmlu:marketing", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "marketing", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 234, + "effective_num_docs": 234, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:medical_genetics": { + "name": "mmlu:medical_genetics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "medical_genetics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:miscellaneous": { + "name": "mmlu:miscellaneous", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "miscellaneous", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 783, + "effective_num_docs": 783, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_disputes": { + "name": "mmlu:moral_disputes", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_disputes", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 346, + "effective_num_docs": 346, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_scenarios": { + "name": "mmlu:moral_scenarios", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_scenarios", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 895, + "effective_num_docs": 895, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:nutrition": { + "name": "mmlu:nutrition", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "nutrition", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 306, + "effective_num_docs": 306, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:philosophy": { + "name": "mmlu:philosophy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "philosophy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 311, + "effective_num_docs": 311, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:prehistory": { + "name": "mmlu:prehistory", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "prehistory", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 324, + "effective_num_docs": 324, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_accounting": { + "name": "mmlu:professional_accounting", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_accounting", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 282, + "effective_num_docs": 282, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_law": { + "name": "mmlu:professional_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 1534, + "effective_num_docs": 1534, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_medicine": { + "name": "mmlu:professional_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 272, + "effective_num_docs": 272, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_psychology": { + "name": "mmlu:professional_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 612, + "effective_num_docs": 612, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:public_relations": { + "name": "mmlu:public_relations", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "public_relations", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 110, + "effective_num_docs": 110, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:security_studies": { + "name": "mmlu:security_studies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "security_studies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 245, + "effective_num_docs": 245, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:sociology": { + "name": "mmlu:sociology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "sociology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 201, + "effective_num_docs": 201, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:us_foreign_policy": { + "name": "mmlu:us_foreign_policy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "us_foreign_policy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:virology": { + "name": "mmlu:virology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "virology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 166, + "effective_num_docs": 166, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:world_religions": { + "name": "mmlu:world_religions", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "world_religions", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 171, + "effective_num_docs": 171, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|truthfulqa:mc": { + "name": "truthfulqa:mc", + "prompt_function": "truthful_qa_multiple_choice", + "hf_repo": "truthful_qa", + "hf_subset": "multiple_choice", + "metric": [ + "truthfulqa_mc_metrics" + ], + "hf_avail_splits": [ + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 817, + "effective_num_docs": 817, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|winogrande": { + "name": "winogrande", + "prompt_function": "winogrande", + "hf_repo": "winogrande", + "hf_subset": "winogrande_xl", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1267, + "effective_num_docs": 1267, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + } + }, + "summary_tasks": { + "leaderboard|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "4aeb23a740784b86", + "hash_input_tokens": "2e9e18067d1f8ad8", + "hash_cont_tokens": "19baa8a044eaaac8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|hellaswag|10": { + "hashes": { + "hash_examples": "31985c805c3a737e", + "hash_full_prompts": "3c2d3440e190b07b", + "hash_input_tokens": "412fc1d29623282b", + "hash_cont_tokens": "823c88a16c837063" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40105, + "non_padded": 63, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "hashes": { + "hash_examples": "4c76229e00c9c0e9", + "hash_full_prompts": "faefa0cccb952fe0", + "hash_input_tokens": "e7380c35f0e2c4b3", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:anatomy|5": { + "hashes": { + "hash_examples": "6a1f8104dccbd33b", + "hash_full_prompts": "eacd03e46972fa59", + "hash_input_tokens": "2ee8bc2ef4561b6b", + "hash_cont_tokens": "9be31d13c42ead00" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:astronomy|5": { + "hashes": { + "hash_examples": "1302effa3a76ce4c", + "hash_full_prompts": "826cacbdf1f6bfd0", + "hash_input_tokens": "6ab8d24255ff03b3", + "hash_cont_tokens": "30cc2b2fc1294aac" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:business_ethics|5": { + "hashes": { + "hash_examples": "03cb8bce5336419a", + "hash_full_prompts": "518511169382ac39", + "hash_input_tokens": "8be4f0cc9ce448e1", + "hash_cont_tokens": "4e9d83c717b7deb8" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "hashes": { + "hash_examples": "ffbb9c7b2be257f9", + "hash_full_prompts": "0b07b0bc774fdfd9", + "hash_input_tokens": "413166c01db52a72", + "hash_cont_tokens": "40dd7263ce5af5de" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_biology|5": { + "hashes": { + "hash_examples": "3ee77f176f38eb8e", + "hash_full_prompts": "22cbe0e8dabf98b1", + "hash_input_tokens": "0dcd583202383d43", + "hash_cont_tokens": "1892d80e82b394c0" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_chemistry|5": { + "hashes": { + "hash_examples": "ce61a69c46d47aeb", + "hash_full_prompts": "9c1288940a4afb59", + "hash_input_tokens": "59a4f0d36881d644", + "hash_cont_tokens": "b6bb78fb2d7e4e6f" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_computer_science|5": { + "hashes": { + "hash_examples": "32805b52d7d5daab", + "hash_full_prompts": "9522781d0cdf1a43", + "hash_input_tokens": "302a2f1d05b53513", + "hash_cont_tokens": "6a5da979260e607c" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_mathematics|5": { + "hashes": { + "hash_examples": "55da1a0a0bd33722", + "hash_full_prompts": "72fe6f46a57e6ca4", + "hash_input_tokens": "042f1988f13b8f9a", + "hash_cont_tokens": "62df3b0447bd3b12" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_medicine|5": { + "hashes": { + "hash_examples": "c33e143163049176", + "hash_full_prompts": "dee0989b2c8993f4", + "hash_input_tokens": "6dd81075c8e816e9", + "hash_cont_tokens": "933c01711a0757a0" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_physics|5": { + "hashes": { + "hash_examples": "ebdab1cdb7e555df", + "hash_full_prompts": "a1be6b64ea1948c3", + "hash_input_tokens": "37818fa59254732b", + "hash_cont_tokens": "d36569ab90faad7c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:computer_security|5": { + "hashes": { + "hash_examples": "a24fd7d08a560921", + "hash_full_prompts": "01bc3fdfdefe67a4", + "hash_input_tokens": "d4957d5a9d5e83ec", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "hashes": { + "hash_examples": "8300977a79386993", + "hash_full_prompts": "b39315a8ada3ca79", + "hash_input_tokens": "c146a84803f78c9e", + "hash_cont_tokens": "6408f70f3d9ada31" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:econometrics|5": { + "hashes": { + "hash_examples": "ddde36788a04a46f", + "hash_full_prompts": "70bab37ca5fcc48f", + "hash_input_tokens": "086bc025be133096", + "hash_cont_tokens": "3befa885ca6e4b97" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "hashes": { + "hash_examples": "acbc5def98c19b3f", + "hash_full_prompts": "86a4747481c11c61", + "hash_input_tokens": "b83507ac94ded59b", + "hash_cont_tokens": "e75df8f470aa4973" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "hashes": { + "hash_examples": "146e61d07497a9bd", + "hash_full_prompts": "1fe56333735325fa", + "hash_input_tokens": "8c3c868b34bad37b", + "hash_cont_tokens": "f09c97e7f7f9af71" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:formal_logic|5": { + "hashes": { + "hash_examples": "8635216e1909a03f", + "hash_full_prompts": "cc83c1ede45f974c", + "hash_input_tokens": "bb0616a24585501c", + "hash_cont_tokens": "df96e75b4eb1d7b0" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:global_facts|5": { + "hashes": { + "hash_examples": "30b315aa6353ee47", + "hash_full_prompts": "3a2ec1e2785c69a5", + "hash_input_tokens": "5e840dc7f1c55a67", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_biology|5": { + "hashes": { + "hash_examples": "c9136373af2180de", + "hash_full_prompts": "27646a569cf2a6f8", + "hash_input_tokens": "1dce672a00c5cbe1", + "hash_cont_tokens": "c6d11e73dc85157f" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "hashes": { + "hash_examples": "b0661bfa1add6404", + "hash_full_prompts": "6905c6ca76f7b2b7", + "hash_input_tokens": "7fb2dd590b34e445", + "hash_cont_tokens": "208aff39cfca671a" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "hashes": { + "hash_examples": "80fc1d623a3d665f", + "hash_full_prompts": "b80092241e8b6c06", + "hash_input_tokens": "b2a9091fd8d00b66", + "hash_cont_tokens": "150a6d581009fbe0" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "hashes": { + "hash_examples": "854da6e5af0fe1a1", + "hash_full_prompts": "a3bc32a5dc022ce7", + "hash_input_tokens": "393e215e8667fde4", + "hash_cont_tokens": "7b6f4c22b304c3cc" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_geography|5": { + "hashes": { + "hash_examples": "7dc963c7acd19ad8", + "hash_full_prompts": "53f91beae305905d", + "hash_input_tokens": "439ac435fc478534", + "hash_cont_tokens": "1a85c9e696d91a66" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "1f675dcdebc9758f", + "hash_full_prompts": "623fd7e3495f243f", + "hash_input_tokens": "2c5757b8545f7cf8", + "hash_cont_tokens": "a47a4530b8790081" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "2fb32cf2d80f0b35", + "hash_full_prompts": "378ac13c8abb6c5f", + "hash_input_tokens": "afea2ca30b1622ff", + "hash_cont_tokens": "e71e7c6acf44c3e5" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "hashes": { + "hash_examples": "fd6646fdb5d58a1f", + "hash_full_prompts": "14d34e0b34750627", + "hash_input_tokens": "34e63b0902b32a2c", + "hash_cont_tokens": "e36b5624bdbe96b0" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "hashes": { + "hash_examples": "2118f21f71d87d84", + "hash_full_prompts": "9ac09e5d4da991c9", + "hash_input_tokens": "93d1c1ba5fe0bcbd", + "hash_cont_tokens": "a5f61d5beba13cc2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_physics|5": { + "hashes": { + "hash_examples": "dc3ce06378548565", + "hash_full_prompts": "b4832a554d47d224", + "hash_input_tokens": "f5bf59bc9f6839fe", + "hash_cont_tokens": "df1d218ccbc258e8" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "hashes": { + "hash_examples": "c8d1d98a40e11f2f", + "hash_full_prompts": "1e8cd27064546274", + "hash_input_tokens": "329851f26db67226", + "hash_cont_tokens": "6fb549a4eb8e6c47" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "hashes": { + "hash_examples": "666c8759b98ee4ff", + "hash_full_prompts": "e05ab41077ec0afa", + "hash_input_tokens": "7abad93393993e44", + "hash_cont_tokens": "d9528c65af653d67" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "hashes": { + "hash_examples": "95fef1c4b7d3f81e", + "hash_full_prompts": "a4b275996a416b4a", + "hash_input_tokens": "e5def820604ad889", + "hash_cont_tokens": "8b827fc7dfd3c1c5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "hashes": { + "hash_examples": "7e5085b6184b0322", + "hash_full_prompts": "8adf16361f0f320a", + "hash_input_tokens": "aa85ae4eba20e53f", + "hash_cont_tokens": "82f19c159c69a66d" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_aging|5": { + "hashes": { + "hash_examples": "c17333e7c7c10797", + "hash_full_prompts": "918d91a3141aac4d", + "hash_input_tokens": "297fceccf01a2c64", + "hash_cont_tokens": "ca87074f1dc39668" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_sexuality|5": { + "hashes": { + "hash_examples": "4edd1e9045df5e3d", + "hash_full_prompts": "bcee39ecea32fcc8", + "hash_input_tokens": "7c66a375881d6788", + "hash_cont_tokens": "491a0ab53f54aeb9" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:international_law|5": { + "hashes": { + "hash_examples": "db2fa00d771a062a", + "hash_full_prompts": "ffe12a3b5bf350c2", + "hash_input_tokens": "dc0250213736abca", + "hash_cont_tokens": "e3d257d7ea257fc8" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:jurisprudence|5": { + "hashes": { + "hash_examples": "e956f86b124076fe", + "hash_full_prompts": "b4293c3c08bebaf7", + "hash_input_tokens": "c9ed773ed04cff64", + "hash_cont_tokens": "4c69d7671fa1ab1c" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "hashes": { + "hash_examples": "956e0e6365ab79f1", + "hash_full_prompts": "8c1b7733e98cbe81", + "hash_input_tokens": "a4f6df541a56c41a", + "hash_cont_tokens": "57e78d3d09b7db81" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:machine_learning|5": { + "hashes": { + "hash_examples": "397997cc6f4d581e", + "hash_full_prompts": "24a206a1c639ab8d", + "hash_input_tokens": "f0dfd08579d1f727", + "hash_cont_tokens": "94d2ec6c52bb7b53" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:management|5": { + "hashes": { + "hash_examples": "2bcbe6f6ca63d740", + "hash_full_prompts": "77e1c79d988beecc", + "hash_input_tokens": "15925fd62ddd3ca4", + "hash_cont_tokens": "79499fecb18f1cb1" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:marketing|5": { + "hashes": { + "hash_examples": "8ddb20d964a1b065", + "hash_full_prompts": "83cec2fa6b681d9d", + "hash_input_tokens": "6eb177c438da2061", + "hash_cont_tokens": "c5e9cd86b1a58fac" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:medical_genetics|5": { + "hashes": { + "hash_examples": "182a71f4763d2cea", + "hash_full_prompts": "195eb7ff99749730", + "hash_input_tokens": "5adeca0d34767f29", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:miscellaneous|5": { + "hashes": { + "hash_examples": "4c404fdbb4ca57fc", + "hash_full_prompts": "33539955c9a96851", + "hash_input_tokens": "52aee92a69c2b698", + "hash_cont_tokens": "8578b82c42cc7026" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_disputes|5": { + "hashes": { + "hash_examples": "60cbd2baa3fea5c9", + "hash_full_prompts": "009b7d0e7f819eff", + "hash_input_tokens": "f24c046b105c5e03", + "hash_cont_tokens": "26b0f808ec46464d" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "hashes": { + "hash_examples": "fd8b0431fbdd75ef", + "hash_full_prompts": "f6e63c9fb9d3bff0", + "hash_input_tokens": "08eee0e3d8e89710", + "hash_cont_tokens": "52fe77d28aefc1b3" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:nutrition|5": { + "hashes": { + "hash_examples": "71e55e2b829b6528", + "hash_full_prompts": "8294d5e3ad435377", + "hash_input_tokens": "5b2c6686c8fc5e83", + "hash_cont_tokens": "25850a01b4a11b53" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:philosophy|5": { + "hashes": { + "hash_examples": "a6d489a8d208fa4b", + "hash_full_prompts": "db68c0f4503e4793", + "hash_input_tokens": "7108ad04b556854f", + "hash_cont_tokens": "8c34ab2fa65c3b6e" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:prehistory|5": { + "hashes": { + "hash_examples": "6cc50f032a19acaa", + "hash_full_prompts": "3972bcfa8c80e964", + "hash_input_tokens": "65cb6b1efc71921b", + "hash_cont_tokens": "89f21e5f9c7d81f2" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_accounting|5": { + "hashes": { + "hash_examples": "50f57ab32f5f6cea", + "hash_full_prompts": "25f0becc2483bd32", + "hash_input_tokens": "c1b1c1e1f1ca4a85", + "hash_cont_tokens": "c7c4930a659ca843" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1120, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_law|5": { + "hashes": { + "hash_examples": "a8fdc85c64f4b215", + "hash_full_prompts": "7a6f6c5706f00c7d", + "hash_input_tokens": "e7517115da0204cd", + "hash_cont_tokens": "6f36bd560ae36f02" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_medicine|5": { + "hashes": { + "hash_examples": "c373a28a3050a73a", + "hash_full_prompts": "a74b6ac7c5c545d2", + "hash_input_tokens": "da6af6d03e682017", + "hash_cont_tokens": "ca4398b4ad3db5f1" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_psychology|5": { + "hashes": { + "hash_examples": "bf5254fe818356af", + "hash_full_prompts": "c53fa139ec25f502", + "hash_input_tokens": "c6dbaf3c7103ebe9", + "hash_cont_tokens": "ce4bb75e80359fe4" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:public_relations|5": { + "hashes": { + "hash_examples": "b66d52e28e7d14e0", + "hash_full_prompts": "55b5eff05aa6bf13", + "hash_input_tokens": "deea75b6eec5b782", + "hash_cont_tokens": "680235f5ede0b353" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:security_studies|5": { + "hashes": { + "hash_examples": "514c14feaf000ad9", + "hash_full_prompts": "6690ecdc054f7b0c", + "hash_input_tokens": "deef3d39896aca43", + "hash_cont_tokens": "189956efcec12818" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:sociology|5": { + "hashes": { + "hash_examples": "f6c9bc9d18c80870", + "hash_full_prompts": "945fbdd091c72d64", + "hash_input_tokens": "330fffbccabf89e4", + "hash_cont_tokens": "2178ff937c0c1a29" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "hashes": { + "hash_examples": "ed7b78629db6678f", + "hash_full_prompts": "ebba6ea6eca4ae53", + "hash_input_tokens": "0ec87fa768a47632", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 392, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:virology|5": { + "hashes": { + "hash_examples": "bc52ffdc3f9b994a", + "hash_full_prompts": "a2ee4984d6877fe3", + "hash_input_tokens": "cc264818195d14da", + "hash_cont_tokens": "ec5c187546c7c842" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 660, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:world_religions|5": { + "hashes": { + "hash_examples": "ecdb4a4f94f62930", + "hash_full_prompts": "a89c8dddd1d8ced0", + "hash_input_tokens": "e7e781ba363743eb", + "hash_cont_tokens": "e52b573046cdfc5c" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "36a6d90e75d92d4a", + "hash_full_prompts": "8d9ca0a8bd458a1c", + "hash_input_tokens": "4aad1a3bfe70acfc", + "hash_cont_tokens": "b0f64f6659d8c230" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|winogrande|5": { + "hashes": { + "hash_examples": "087d5d1a1afd4c7b", + "hash_full_prompts": "35da55e47222e0e1", + "hash_input_tokens": "881c630a9e0034f7", + "hash_cont_tokens": "c466f4c92e3879cb" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|gsm8k|5": { + "hashes": { + "hash_examples": "0ed016e24e7512fd", + "hash_full_prompts": "f7ab209f6467841e", + "hash_input_tokens": "deccfe61ad5cb3d5", + "hash_cont_tokens": "b7442470c79a7028" + }, + "truncated": 1319, + "non_truncated": 0, + "padded": 1074, + "non_padded": 245, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "670666fa3a90ce5d", + "hash_full_prompts": "56c005e427046302", + "hash_input_tokens": "2a51da62c271a1a0", + "hash_cont_tokens": "aba89b730029cc1f" + }, + "truncated": 1319, + "non_truncated": 27340, + "padded": 114540, + "non_padded": 332, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/results/dreamgen/llama3-8b-instruct-align-test2-kto/results_2024-05-24T18-12-54.142897.json b/results/dreamgen/llama3-8b-instruct-align-test2-kto/results_2024-05-24T18-12-54.142897.json new file mode 100644 index 0000000000000000000000000000000000000000..fdca4e4417f5af68f241ba460c8a0acf0f966bb0 --- /dev/null +++ b/results/dreamgen/llama3-8b-instruct-align-test2-kto/results_2024-05-24T18-12-54.142897.json @@ -0,0 +1,3461 @@ +{ + "config_general": { + "lighteval_sha": "a98210fd3a2d1e8bface1c32b72ebd5017173a4c", + "num_fewshot_seeds": 1, + "override_batch_size": -1, + "max_samples": null, + "job_id": "", + "start_time": 2236461.789611765, + "end_time": 2259996.300803602, + "total_evaluation_time_secondes": "23534.511191837024", + "model_name": "dreamgen/llama3-8b-instruct-align-test2-kto", + "model_sha": "9d61ac436978ee7007b966c145c2537cbe2fc994", + "model_dtype": "torch.bfloat16", + "model_size": "14.96 GB", + "config": null + }, + "results": { + "leaderboard|arc:challenge|25": { + "acc": 0.5554607508532423, + "acc_stderr": 0.014521226405627075, + "acc_norm": 0.5622866894197952, + "acc_norm_stderr": 0.014497573881108282 + }, + "leaderboard|hellaswag|10": { + "acc": 0.5598486357299343, + "acc_stderr": 0.004953907062096603, + "acc_norm": 0.7204740091615216, + "acc_norm_stderr": 0.004478491697891239 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045 + }, + "leaderboard|mmlu:anatomy|5": { + "acc": 0.6592592592592592, + "acc_stderr": 0.040943762699967926 + }, + "leaderboard|mmlu:astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898 + }, + "leaderboard|mmlu:business_ethics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "acc": 0.7320754716981132, + "acc_stderr": 0.027257260322494845 + }, + "leaderboard|mmlu:college_biology|5": { + "acc": 0.7708333333333334, + "acc_stderr": 0.03514697467862388 + }, + "leaderboard|mmlu:college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911 + }, + "leaderboard|mmlu:college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332 + }, + "leaderboard|mmlu:college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975 + }, + "leaderboard|mmlu:college_medicine|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.0372424959581773 + }, + "leaderboard|mmlu:college_physics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.04966570903978529 + }, + "leaderboard|mmlu:computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.03223276266711712 + }, + "leaderboard|mmlu:econometrics|5": { + "acc": 0.5, + "acc_stderr": 0.047036043419179864 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.02568056464005688 + }, + "leaderboard|mmlu:formal_logic|5": { + "acc": 0.5238095238095238, + "acc_stderr": 0.04467062628403273 + }, + "leaderboard|mmlu:global_facts|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428 + }, + "leaderboard|mmlu:high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.024362599693031076 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806 + }, + "leaderboard|mmlu:high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267042 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593563 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "acc": 0.658974358974359, + "acc_stderr": 0.024035489676335068 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131137 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380548 + }, + "leaderboard|mmlu:high_school_physics|5": { + "acc": 0.4503311258278146, + "acc_stderr": 0.04062290018683775 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "acc": 0.8293577981651377, + "acc_stderr": 0.016129271025099878 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "acc": 0.8185654008438819, + "acc_stderr": 0.025085961144579654 + }, + "leaderboard|mmlu:human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.03089861088247752 + }, + "leaderboard|mmlu:human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469 + }, + "leaderboard|mmlu:international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947 + }, + "leaderboard|mmlu:jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.038260763248848646 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623 + }, + "leaderboard|mmlu:machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123 + }, + "leaderboard|mmlu:management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026622 + }, + "leaderboard|mmlu:marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333 + }, + "leaderboard|mmlu:medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "leaderboard|mmlu:miscellaneous|5": { + "acc": 0.822477650063857, + "acc_stderr": 0.013664230995834834 + }, + "leaderboard|mmlu:moral_disputes|5": { + "acc": 0.7052023121387283, + "acc_stderr": 0.02454761779480383 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "acc": 0.41899441340782123, + "acc_stderr": 0.016501579306861677 + }, + "leaderboard|mmlu:nutrition|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.025261691219729484 + }, + "leaderboard|mmlu:philosophy|5": { + "acc": 0.7202572347266881, + "acc_stderr": 0.02549425935069491 + }, + "leaderboard|mmlu:prehistory|5": { + "acc": 0.6975308641975309, + "acc_stderr": 0.025557653981868055 + }, + "leaderboard|mmlu:professional_accounting|5": { + "acc": 0.5035460992907801, + "acc_stderr": 0.02982674915328092 + }, + "leaderboard|mmlu:professional_law|5": { + "acc": 0.4576271186440678, + "acc_stderr": 0.012724296550980188 + }, + "leaderboard|mmlu:professional_medicine|5": { + "acc": 0.7132352941176471, + "acc_stderr": 0.027472274473233818 + }, + "leaderboard|mmlu:professional_psychology|5": { + "acc": 0.6879084967320261, + "acc_stderr": 0.018745011201277657 + }, + "leaderboard|mmlu:public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054 + }, + "leaderboard|mmlu:security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142773 + }, + "leaderboard|mmlu:sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.027403859410786848 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637 + }, + "leaderboard|mmlu:virology|5": { + "acc": 0.5, + "acc_stderr": 0.03892494720807614 + }, + "leaderboard|mmlu:world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.029913127232368032 + }, + "leaderboard|truthfulqa:mc|0": { + "truthfulqa_mc1": 0.36964504283965727, + "truthfulqa_mc1_stderr": 0.01689818070697389, + "truthfulqa_mc2": 0.5305070824753251, + "truthfulqa_mc2_stderr": 0.015944782750315033 + }, + "leaderboard|winogrande|5": { + "acc": 0.6937647987371744, + "acc_stderr": 0.01295438597280247 + }, + "leaderboard|gsm8k|5": { + "qem": 0.4670204700530705, + "qem_stderr": 0.013742492794163421 + }, + "leaderboard|mmlu:_average|5": { + "acc": 0.6526569493505927, + "acc_stderr": 0.03380504750509661 + }, + "all": { + "acc": 0.6501753383050689, + "acc_stderr": 0.03265528712051721, + "acc_norm": 0.6413803492906585, + "acc_norm_stderr": 0.009488032789499761, + "truthfulqa_mc1": 0.36964504283965727, + "truthfulqa_mc1_stderr": 0.01689818070697389, + "truthfulqa_mc2": 0.5305070824753251, + "truthfulqa_mc2_stderr": 0.015944782750315033, + "qem": 0.4670204700530705, + "qem_stderr": 0.013742492794163421 + } + }, + "versions": { + "leaderboard|arc:challenge|25": 0, + "leaderboard|gsm8k|5": 0, + "leaderboard|hellaswag|10": 0, + "leaderboard|mmlu:abstract_algebra|5": 0, + "leaderboard|mmlu:anatomy|5": 0, + "leaderboard|mmlu:astronomy|5": 0, + "leaderboard|mmlu:business_ethics|5": 0, + "leaderboard|mmlu:clinical_knowledge|5": 0, + "leaderboard|mmlu:college_biology|5": 0, + "leaderboard|mmlu:college_chemistry|5": 0, + "leaderboard|mmlu:college_computer_science|5": 0, + "leaderboard|mmlu:college_mathematics|5": 0, + "leaderboard|mmlu:college_medicine|5": 0, + "leaderboard|mmlu:college_physics|5": 0, + "leaderboard|mmlu:computer_security|5": 0, + "leaderboard|mmlu:conceptual_physics|5": 0, + "leaderboard|mmlu:econometrics|5": 0, + "leaderboard|mmlu:electrical_engineering|5": 0, + "leaderboard|mmlu:elementary_mathematics|5": 0, + "leaderboard|mmlu:formal_logic|5": 0, + "leaderboard|mmlu:global_facts|5": 0, + "leaderboard|mmlu:high_school_biology|5": 0, + "leaderboard|mmlu:high_school_chemistry|5": 0, + "leaderboard|mmlu:high_school_computer_science|5": 0, + "leaderboard|mmlu:high_school_european_history|5": 0, + "leaderboard|mmlu:high_school_geography|5": 0, + "leaderboard|mmlu:high_school_government_and_politics|5": 0, + "leaderboard|mmlu:high_school_macroeconomics|5": 0, + "leaderboard|mmlu:high_school_mathematics|5": 0, + "leaderboard|mmlu:high_school_microeconomics|5": 0, + "leaderboard|mmlu:high_school_physics|5": 0, + "leaderboard|mmlu:high_school_psychology|5": 0, + "leaderboard|mmlu:high_school_statistics|5": 0, + "leaderboard|mmlu:high_school_us_history|5": 0, + "leaderboard|mmlu:high_school_world_history|5": 0, + "leaderboard|mmlu:human_aging|5": 0, + "leaderboard|mmlu:human_sexuality|5": 0, + "leaderboard|mmlu:international_law|5": 0, + "leaderboard|mmlu:jurisprudence|5": 0, + "leaderboard|mmlu:logical_fallacies|5": 0, + "leaderboard|mmlu:machine_learning|5": 0, + "leaderboard|mmlu:management|5": 0, + "leaderboard|mmlu:marketing|5": 0, + "leaderboard|mmlu:medical_genetics|5": 0, + "leaderboard|mmlu:miscellaneous|5": 0, + "leaderboard|mmlu:moral_disputes|5": 0, + "leaderboard|mmlu:moral_scenarios|5": 0, + "leaderboard|mmlu:nutrition|5": 0, + "leaderboard|mmlu:philosophy|5": 0, + "leaderboard|mmlu:prehistory|5": 0, + "leaderboard|mmlu:professional_accounting|5": 0, + "leaderboard|mmlu:professional_law|5": 0, + "leaderboard|mmlu:professional_medicine|5": 0, + "leaderboard|mmlu:professional_psychology|5": 0, + "leaderboard|mmlu:public_relations|5": 0, + "leaderboard|mmlu:security_studies|5": 0, + "leaderboard|mmlu:sociology|5": 0, + "leaderboard|mmlu:us_foreign_policy|5": 0, + "leaderboard|mmlu:virology|5": 0, + "leaderboard|mmlu:world_religions|5": 0, + "leaderboard|truthfulqa:mc|0": 0, + "leaderboard|winogrande|5": 0 + }, + "config_tasks": { + "leaderboard|arc:challenge": { + "name": "arc:challenge", + "prompt_function": "arc", + "hf_repo": "ai2_arc", + "hf_subset": "ARC-Challenge", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm_nospace" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "arc" + ], + "original_num_docs": 1172, + "effective_num_docs": 1172, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|gsm8k": { + "name": "gsm8k", + "prompt_function": "gsm8k", + "hf_repo": "gsm8k", + "hf_subset": "main", + "metric": [ + "quasi_exact_match_gsm8k" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 256, + "stop_sequence": [ + "Question:", + "Question", + ":" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1319, + "effective_num_docs": 1319, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|hellaswag": { + "name": "hellaswag", + "prompt_function": "hellaswag_harness", + "hf_repo": "hellaswag", + "hf_subset": "default", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 10042, + "effective_num_docs": 10042, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:abstract_algebra": { + "name": "mmlu:abstract_algebra", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "abstract_algebra", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:anatomy": { + "name": "mmlu:anatomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "anatomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 135, + "effective_num_docs": 135, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:astronomy": { + "name": "mmlu:astronomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "astronomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 152, + "effective_num_docs": 152, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:business_ethics": { + "name": "mmlu:business_ethics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "business_ethics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:clinical_knowledge": { + "name": "mmlu:clinical_knowledge", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "clinical_knowledge", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 265, + "effective_num_docs": 265, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_biology": { + "name": "mmlu:college_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 144, + "effective_num_docs": 144, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_chemistry": { + "name": "mmlu:college_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_computer_science": { + "name": "mmlu:college_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_mathematics": { + "name": "mmlu:college_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_medicine": { + "name": "mmlu:college_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 173, + "effective_num_docs": 173, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_physics": { + "name": "mmlu:college_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 102, + "effective_num_docs": 102, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:computer_security": { + "name": "mmlu:computer_security", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "computer_security", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:conceptual_physics": { + "name": "mmlu:conceptual_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "conceptual_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 235, + "effective_num_docs": 235, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:econometrics": { + "name": "mmlu:econometrics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "econometrics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 114, + "effective_num_docs": 114, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:electrical_engineering": { + "name": "mmlu:electrical_engineering", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "electrical_engineering", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 145, + "effective_num_docs": 145, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:elementary_mathematics": { + "name": "mmlu:elementary_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "elementary_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 378, + "effective_num_docs": 378, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:formal_logic": { + "name": "mmlu:formal_logic", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "formal_logic", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 126, + "effective_num_docs": 126, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:global_facts": { + "name": "mmlu:global_facts", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "global_facts", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_biology": { + "name": "mmlu:high_school_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 310, + "effective_num_docs": 310, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_chemistry": { + "name": "mmlu:high_school_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 203, + "effective_num_docs": 203, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_computer_science": { + "name": "mmlu:high_school_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_european_history": { + "name": "mmlu:high_school_european_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_european_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 165, + "effective_num_docs": 165, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_geography": { + "name": "mmlu:high_school_geography", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_geography", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 198, + "effective_num_docs": 198, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics": { + "name": "mmlu:high_school_government_and_politics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_government_and_politics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 193, + "effective_num_docs": 193, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics": { + "name": "mmlu:high_school_macroeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_macroeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 390, + "effective_num_docs": 390, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_mathematics": { + "name": "mmlu:high_school_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 270, + "effective_num_docs": 270, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_microeconomics": { + "name": "mmlu:high_school_microeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_microeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 238, + "effective_num_docs": 238, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_physics": { + "name": "mmlu:high_school_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 151, + "effective_num_docs": 151, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_psychology": { + "name": "mmlu:high_school_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 545, + "effective_num_docs": 545, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_statistics": { + "name": "mmlu:high_school_statistics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_statistics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 216, + "effective_num_docs": 216, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_us_history": { + "name": "mmlu:high_school_us_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_us_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 204, + "effective_num_docs": 204, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_world_history": { + "name": "mmlu:high_school_world_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_world_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 237, + "effective_num_docs": 237, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_aging": { + "name": "mmlu:human_aging", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_aging", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 223, + "effective_num_docs": 223, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_sexuality": { + "name": "mmlu:human_sexuality", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_sexuality", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 131, + "effective_num_docs": 131, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:international_law": { + "name": "mmlu:international_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "international_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 121, + "effective_num_docs": 121, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:jurisprudence": { + "name": "mmlu:jurisprudence", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "jurisprudence", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 108, + "effective_num_docs": 108, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:logical_fallacies": { + "name": "mmlu:logical_fallacies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "logical_fallacies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 163, + "effective_num_docs": 163, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:machine_learning": { + "name": "mmlu:machine_learning", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "machine_learning", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 112, + "effective_num_docs": 112, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:management": { + "name": "mmlu:management", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "management", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 103, + "effective_num_docs": 103, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:marketing": { + "name": "mmlu:marketing", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "marketing", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 234, + "effective_num_docs": 234, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:medical_genetics": { + "name": "mmlu:medical_genetics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "medical_genetics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:miscellaneous": { + "name": "mmlu:miscellaneous", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "miscellaneous", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 783, + "effective_num_docs": 783, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_disputes": { + "name": "mmlu:moral_disputes", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_disputes", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 346, + "effective_num_docs": 346, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_scenarios": { + "name": "mmlu:moral_scenarios", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_scenarios", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 895, + "effective_num_docs": 895, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:nutrition": { + "name": "mmlu:nutrition", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "nutrition", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 306, + "effective_num_docs": 306, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:philosophy": { + "name": "mmlu:philosophy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "philosophy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 311, + "effective_num_docs": 311, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:prehistory": { + "name": "mmlu:prehistory", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "prehistory", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 324, + "effective_num_docs": 324, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_accounting": { + "name": "mmlu:professional_accounting", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_accounting", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 282, + "effective_num_docs": 282, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_law": { + "name": "mmlu:professional_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 1534, + "effective_num_docs": 1534, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_medicine": { + "name": "mmlu:professional_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 272, + "effective_num_docs": 272, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_psychology": { + "name": "mmlu:professional_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 612, + "effective_num_docs": 612, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:public_relations": { + "name": "mmlu:public_relations", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "public_relations", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 110, + "effective_num_docs": 110, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:security_studies": { + "name": "mmlu:security_studies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "security_studies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 245, + "effective_num_docs": 245, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:sociology": { + "name": "mmlu:sociology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "sociology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 201, + "effective_num_docs": 201, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:us_foreign_policy": { + "name": "mmlu:us_foreign_policy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "us_foreign_policy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:virology": { + "name": "mmlu:virology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "virology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 166, + "effective_num_docs": 166, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:world_religions": { + "name": "mmlu:world_religions", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "world_religions", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 171, + "effective_num_docs": 171, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|truthfulqa:mc": { + "name": "truthfulqa:mc", + "prompt_function": "truthful_qa_multiple_choice", + "hf_repo": "truthful_qa", + "hf_subset": "multiple_choice", + "metric": [ + "truthfulqa_mc_metrics" + ], + "hf_avail_splits": [ + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 817, + "effective_num_docs": 817, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|winogrande": { + "name": "winogrande", + "prompt_function": "winogrande", + "hf_repo": "winogrande", + "hf_subset": "winogrande_xl", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1267, + "effective_num_docs": 1267, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + } + }, + "summary_tasks": { + "leaderboard|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "4aeb23a740784b86", + "hash_input_tokens": "2e9e18067d1f8ad8", + "hash_cont_tokens": "19baa8a044eaaac8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|hellaswag|10": { + "hashes": { + "hash_examples": "31985c805c3a737e", + "hash_full_prompts": "3c2d3440e190b07b", + "hash_input_tokens": "412fc1d29623282b", + "hash_cont_tokens": "823c88a16c837063" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40105, + "non_padded": 63, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "hashes": { + "hash_examples": "4c76229e00c9c0e9", + "hash_full_prompts": "faefa0cccb952fe0", + "hash_input_tokens": "e7380c35f0e2c4b3", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:anatomy|5": { + "hashes": { + "hash_examples": "6a1f8104dccbd33b", + "hash_full_prompts": "eacd03e46972fa59", + "hash_input_tokens": "2ee8bc2ef4561b6b", + "hash_cont_tokens": "9be31d13c42ead00" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:astronomy|5": { + "hashes": { + "hash_examples": "1302effa3a76ce4c", + "hash_full_prompts": "826cacbdf1f6bfd0", + "hash_input_tokens": "6ab8d24255ff03b3", + "hash_cont_tokens": "30cc2b2fc1294aac" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:business_ethics|5": { + "hashes": { + "hash_examples": "03cb8bce5336419a", + "hash_full_prompts": "518511169382ac39", + "hash_input_tokens": "8be4f0cc9ce448e1", + "hash_cont_tokens": "4e9d83c717b7deb8" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "hashes": { + "hash_examples": "ffbb9c7b2be257f9", + "hash_full_prompts": "0b07b0bc774fdfd9", + "hash_input_tokens": "413166c01db52a72", + "hash_cont_tokens": "40dd7263ce5af5de" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_biology|5": { + "hashes": { + "hash_examples": "3ee77f176f38eb8e", + "hash_full_prompts": "22cbe0e8dabf98b1", + "hash_input_tokens": "0dcd583202383d43", + "hash_cont_tokens": "1892d80e82b394c0" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_chemistry|5": { + "hashes": { + "hash_examples": "ce61a69c46d47aeb", + "hash_full_prompts": "9c1288940a4afb59", + "hash_input_tokens": "59a4f0d36881d644", + "hash_cont_tokens": "b6bb78fb2d7e4e6f" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_computer_science|5": { + "hashes": { + "hash_examples": "32805b52d7d5daab", + "hash_full_prompts": "9522781d0cdf1a43", + "hash_input_tokens": "302a2f1d05b53513", + "hash_cont_tokens": "6a5da979260e607c" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_mathematics|5": { + "hashes": { + "hash_examples": "55da1a0a0bd33722", + "hash_full_prompts": "72fe6f46a57e6ca4", + "hash_input_tokens": "042f1988f13b8f9a", + "hash_cont_tokens": "62df3b0447bd3b12" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_medicine|5": { + "hashes": { + "hash_examples": "c33e143163049176", + "hash_full_prompts": "dee0989b2c8993f4", + "hash_input_tokens": "6dd81075c8e816e9", + "hash_cont_tokens": "933c01711a0757a0" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_physics|5": { + "hashes": { + "hash_examples": "ebdab1cdb7e555df", + "hash_full_prompts": "a1be6b64ea1948c3", + "hash_input_tokens": "37818fa59254732b", + "hash_cont_tokens": "d36569ab90faad7c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:computer_security|5": { + "hashes": { + "hash_examples": "a24fd7d08a560921", + "hash_full_prompts": "01bc3fdfdefe67a4", + "hash_input_tokens": "d4957d5a9d5e83ec", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "hashes": { + "hash_examples": "8300977a79386993", + "hash_full_prompts": "b39315a8ada3ca79", + "hash_input_tokens": "c146a84803f78c9e", + "hash_cont_tokens": "6408f70f3d9ada31" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:econometrics|5": { + "hashes": { + "hash_examples": "ddde36788a04a46f", + "hash_full_prompts": "70bab37ca5fcc48f", + "hash_input_tokens": "086bc025be133096", + "hash_cont_tokens": "3befa885ca6e4b97" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "hashes": { + "hash_examples": "acbc5def98c19b3f", + "hash_full_prompts": "86a4747481c11c61", + "hash_input_tokens": "b83507ac94ded59b", + "hash_cont_tokens": "e75df8f470aa4973" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "hashes": { + "hash_examples": "146e61d07497a9bd", + "hash_full_prompts": "1fe56333735325fa", + "hash_input_tokens": "8c3c868b34bad37b", + "hash_cont_tokens": "f09c97e7f7f9af71" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:formal_logic|5": { + "hashes": { + "hash_examples": "8635216e1909a03f", + "hash_full_prompts": "cc83c1ede45f974c", + "hash_input_tokens": "bb0616a24585501c", + "hash_cont_tokens": "df96e75b4eb1d7b0" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:global_facts|5": { + "hashes": { + "hash_examples": "30b315aa6353ee47", + "hash_full_prompts": "3a2ec1e2785c69a5", + "hash_input_tokens": "5e840dc7f1c55a67", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_biology|5": { + "hashes": { + "hash_examples": "c9136373af2180de", + "hash_full_prompts": "27646a569cf2a6f8", + "hash_input_tokens": "1dce672a00c5cbe1", + "hash_cont_tokens": "c6d11e73dc85157f" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "hashes": { + "hash_examples": "b0661bfa1add6404", + "hash_full_prompts": "6905c6ca76f7b2b7", + "hash_input_tokens": "7fb2dd590b34e445", + "hash_cont_tokens": "208aff39cfca671a" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "hashes": { + "hash_examples": "80fc1d623a3d665f", + "hash_full_prompts": "b80092241e8b6c06", + "hash_input_tokens": "b2a9091fd8d00b66", + "hash_cont_tokens": "150a6d581009fbe0" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "hashes": { + "hash_examples": "854da6e5af0fe1a1", + "hash_full_prompts": "a3bc32a5dc022ce7", + "hash_input_tokens": "393e215e8667fde4", + "hash_cont_tokens": "7b6f4c22b304c3cc" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_geography|5": { + "hashes": { + "hash_examples": "7dc963c7acd19ad8", + "hash_full_prompts": "53f91beae305905d", + "hash_input_tokens": "439ac435fc478534", + "hash_cont_tokens": "1a85c9e696d91a66" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "1f675dcdebc9758f", + "hash_full_prompts": "623fd7e3495f243f", + "hash_input_tokens": "2c5757b8545f7cf8", + "hash_cont_tokens": "a47a4530b8790081" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "2fb32cf2d80f0b35", + "hash_full_prompts": "378ac13c8abb6c5f", + "hash_input_tokens": "afea2ca30b1622ff", + "hash_cont_tokens": "e71e7c6acf44c3e5" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "hashes": { + "hash_examples": "fd6646fdb5d58a1f", + "hash_full_prompts": "14d34e0b34750627", + "hash_input_tokens": "34e63b0902b32a2c", + "hash_cont_tokens": "e36b5624bdbe96b0" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "hashes": { + "hash_examples": "2118f21f71d87d84", + "hash_full_prompts": "9ac09e5d4da991c9", + "hash_input_tokens": "93d1c1ba5fe0bcbd", + "hash_cont_tokens": "a5f61d5beba13cc2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_physics|5": { + "hashes": { + "hash_examples": "dc3ce06378548565", + "hash_full_prompts": "b4832a554d47d224", + "hash_input_tokens": "f5bf59bc9f6839fe", + "hash_cont_tokens": "df1d218ccbc258e8" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "hashes": { + "hash_examples": "c8d1d98a40e11f2f", + "hash_full_prompts": "1e8cd27064546274", + "hash_input_tokens": "329851f26db67226", + "hash_cont_tokens": "6fb549a4eb8e6c47" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "hashes": { + "hash_examples": "666c8759b98ee4ff", + "hash_full_prompts": "e05ab41077ec0afa", + "hash_input_tokens": "7abad93393993e44", + "hash_cont_tokens": "d9528c65af653d67" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "hashes": { + "hash_examples": "95fef1c4b7d3f81e", + "hash_full_prompts": "a4b275996a416b4a", + "hash_input_tokens": "e5def820604ad889", + "hash_cont_tokens": "8b827fc7dfd3c1c5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "hashes": { + "hash_examples": "7e5085b6184b0322", + "hash_full_prompts": "8adf16361f0f320a", + "hash_input_tokens": "aa85ae4eba20e53f", + "hash_cont_tokens": "82f19c159c69a66d" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_aging|5": { + "hashes": { + "hash_examples": "c17333e7c7c10797", + "hash_full_prompts": "918d91a3141aac4d", + "hash_input_tokens": "297fceccf01a2c64", + "hash_cont_tokens": "ca87074f1dc39668" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_sexuality|5": { + "hashes": { + "hash_examples": "4edd1e9045df5e3d", + "hash_full_prompts": "bcee39ecea32fcc8", + "hash_input_tokens": "7c66a375881d6788", + "hash_cont_tokens": "491a0ab53f54aeb9" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:international_law|5": { + "hashes": { + "hash_examples": "db2fa00d771a062a", + "hash_full_prompts": "ffe12a3b5bf350c2", + "hash_input_tokens": "dc0250213736abca", + "hash_cont_tokens": "e3d257d7ea257fc8" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:jurisprudence|5": { + "hashes": { + "hash_examples": "e956f86b124076fe", + "hash_full_prompts": "b4293c3c08bebaf7", + "hash_input_tokens": "c9ed773ed04cff64", + "hash_cont_tokens": "4c69d7671fa1ab1c" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "hashes": { + "hash_examples": "956e0e6365ab79f1", + "hash_full_prompts": "8c1b7733e98cbe81", + "hash_input_tokens": "a4f6df541a56c41a", + "hash_cont_tokens": "57e78d3d09b7db81" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:machine_learning|5": { + "hashes": { + "hash_examples": "397997cc6f4d581e", + "hash_full_prompts": "24a206a1c639ab8d", + "hash_input_tokens": "f0dfd08579d1f727", + "hash_cont_tokens": "94d2ec6c52bb7b53" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:management|5": { + "hashes": { + "hash_examples": "2bcbe6f6ca63d740", + "hash_full_prompts": "77e1c79d988beecc", + "hash_input_tokens": "15925fd62ddd3ca4", + "hash_cont_tokens": "79499fecb18f1cb1" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:marketing|5": { + "hashes": { + "hash_examples": "8ddb20d964a1b065", + "hash_full_prompts": "83cec2fa6b681d9d", + "hash_input_tokens": "6eb177c438da2061", + "hash_cont_tokens": "c5e9cd86b1a58fac" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:medical_genetics|5": { + "hashes": { + "hash_examples": "182a71f4763d2cea", + "hash_full_prompts": "195eb7ff99749730", + "hash_input_tokens": "5adeca0d34767f29", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:miscellaneous|5": { + "hashes": { + "hash_examples": "4c404fdbb4ca57fc", + "hash_full_prompts": "33539955c9a96851", + "hash_input_tokens": "52aee92a69c2b698", + "hash_cont_tokens": "8578b82c42cc7026" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_disputes|5": { + "hashes": { + "hash_examples": "60cbd2baa3fea5c9", + "hash_full_prompts": "009b7d0e7f819eff", + "hash_input_tokens": "f24c046b105c5e03", + "hash_cont_tokens": "26b0f808ec46464d" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "hashes": { + "hash_examples": "fd8b0431fbdd75ef", + "hash_full_prompts": "f6e63c9fb9d3bff0", + "hash_input_tokens": "08eee0e3d8e89710", + "hash_cont_tokens": "52fe77d28aefc1b3" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:nutrition|5": { + "hashes": { + "hash_examples": "71e55e2b829b6528", + "hash_full_prompts": "8294d5e3ad435377", + "hash_input_tokens": "5b2c6686c8fc5e83", + "hash_cont_tokens": "25850a01b4a11b53" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:philosophy|5": { + "hashes": { + "hash_examples": "a6d489a8d208fa4b", + "hash_full_prompts": "db68c0f4503e4793", + "hash_input_tokens": "7108ad04b556854f", + "hash_cont_tokens": "8c34ab2fa65c3b6e" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:prehistory|5": { + "hashes": { + "hash_examples": "6cc50f032a19acaa", + "hash_full_prompts": "3972bcfa8c80e964", + "hash_input_tokens": "65cb6b1efc71921b", + "hash_cont_tokens": "89f21e5f9c7d81f2" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_accounting|5": { + "hashes": { + "hash_examples": "50f57ab32f5f6cea", + "hash_full_prompts": "25f0becc2483bd32", + "hash_input_tokens": "c1b1c1e1f1ca4a85", + "hash_cont_tokens": "c7c4930a659ca843" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1120, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_law|5": { + "hashes": { + "hash_examples": "a8fdc85c64f4b215", + "hash_full_prompts": "7a6f6c5706f00c7d", + "hash_input_tokens": "e7517115da0204cd", + "hash_cont_tokens": "6f36bd560ae36f02" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_medicine|5": { + "hashes": { + "hash_examples": "c373a28a3050a73a", + "hash_full_prompts": "a74b6ac7c5c545d2", + "hash_input_tokens": "da6af6d03e682017", + "hash_cont_tokens": "ca4398b4ad3db5f1" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_psychology|5": { + "hashes": { + "hash_examples": "bf5254fe818356af", + "hash_full_prompts": "c53fa139ec25f502", + "hash_input_tokens": "c6dbaf3c7103ebe9", + "hash_cont_tokens": "ce4bb75e80359fe4" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:public_relations|5": { + "hashes": { + "hash_examples": "b66d52e28e7d14e0", + "hash_full_prompts": "55b5eff05aa6bf13", + "hash_input_tokens": "deea75b6eec5b782", + "hash_cont_tokens": "680235f5ede0b353" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:security_studies|5": { + "hashes": { + "hash_examples": "514c14feaf000ad9", + "hash_full_prompts": "6690ecdc054f7b0c", + "hash_input_tokens": "deef3d39896aca43", + "hash_cont_tokens": "189956efcec12818" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:sociology|5": { + "hashes": { + "hash_examples": "f6c9bc9d18c80870", + "hash_full_prompts": "945fbdd091c72d64", + "hash_input_tokens": "330fffbccabf89e4", + "hash_cont_tokens": "2178ff937c0c1a29" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "hashes": { + "hash_examples": "ed7b78629db6678f", + "hash_full_prompts": "ebba6ea6eca4ae53", + "hash_input_tokens": "0ec87fa768a47632", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 392, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:virology|5": { + "hashes": { + "hash_examples": "bc52ffdc3f9b994a", + "hash_full_prompts": "a2ee4984d6877fe3", + "hash_input_tokens": "cc264818195d14da", + "hash_cont_tokens": "ec5c187546c7c842" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 660, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:world_religions|5": { + "hashes": { + "hash_examples": "ecdb4a4f94f62930", + "hash_full_prompts": "a89c8dddd1d8ced0", + "hash_input_tokens": "e7e781ba363743eb", + "hash_cont_tokens": "e52b573046cdfc5c" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "36a6d90e75d92d4a", + "hash_full_prompts": "8d9ca0a8bd458a1c", + "hash_input_tokens": "4aad1a3bfe70acfc", + "hash_cont_tokens": "b0f64f6659d8c230" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|winogrande|5": { + "hashes": { + "hash_examples": "087d5d1a1afd4c7b", + "hash_full_prompts": "35da55e47222e0e1", + "hash_input_tokens": "881c630a9e0034f7", + "hash_cont_tokens": "c466f4c92e3879cb" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|gsm8k|5": { + "hashes": { + "hash_examples": "0ed016e24e7512fd", + "hash_full_prompts": "f7ab209f6467841e", + "hash_input_tokens": "deccfe61ad5cb3d5", + "hash_cont_tokens": "b7442470c79a7028" + }, + "truncated": 1319, + "non_truncated": 0, + "padded": 1074, + "non_padded": 245, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "670666fa3a90ce5d", + "hash_full_prompts": "56c005e427046302", + "hash_input_tokens": "2a51da62c271a1a0", + "hash_cont_tokens": "aba89b730029cc1f" + }, + "truncated": 1319, + "non_truncated": 27340, + "padded": 114540, + "non_padded": 332, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file