Locutusque committed
Commit e6ea13e
1 Parent(s): ca8e694
Update README.md

README.md CHANGED
@@ -81,6 +81,81 @@ Zero-shot AGIEval
| - agieval_sat_math | 1|none |None |acc |0.3091|± |0.0312|
| | |none |None |acc_norm|0.2364|± |0.0287|

+5-shot CoT MMLU
+
+| Tasks |Version| Filter |n-shot| Metric |Value | |Stderr|
+|-------------------------------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
+|mmlu_flan_cot_fewshot |N/A |get-answer| 0|exact_match|0.5924|± |0.0118|
+| - mmlu_flan_cot_fewshot_humanities |N/A |get-answer| 0|exact_match|0.5077|± |0.0206|
+| - mmlu_flan_cot_fewshot_formal_logic | 0|get-answer| 0|exact_match|0.2143|± |0.1138|
+| - mmlu_flan_cot_fewshot_high_school_european_history | 0|get-answer| 0|exact_match|0.6111|± |0.1182|
+| - mmlu_flan_cot_fewshot_high_school_us_history | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
+| - mmlu_flan_cot_fewshot_high_school_world_history | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
+| - mmlu_flan_cot_fewshot_international_law | 0|get-answer| 0|exact_match|0.9231|± |0.0769|
+| - mmlu_flan_cot_fewshot_jurisprudence | 0|get-answer| 0|exact_match|0.3636|± |0.1521|
+| - mmlu_flan_cot_fewshot_logical_fallacies | 0|get-answer| 0|exact_match|0.7222|± |0.1086|
+| - mmlu_flan_cot_fewshot_moral_disputes | 0|get-answer| 0|exact_match|0.5526|± |0.0817|
+| - mmlu_flan_cot_fewshot_moral_scenarios | 0|get-answer| 0|exact_match|0.3900|± |0.0490|
+| - mmlu_flan_cot_fewshot_philosophy | 0|get-answer| 0|exact_match|0.7647|± |0.0738|
+| - mmlu_flan_cot_fewshot_prehistory | 0|get-answer| 0|exact_match|0.7143|± |0.0775|
+| - mmlu_flan_cot_fewshot_professional_law | 0|get-answer| 0|exact_match|0.3471|± |0.0366|
+| - mmlu_flan_cot_fewshot_world_religions | 0|get-answer| 0|exact_match|0.8947|± |0.0723|
+| - mmlu_flan_cot_fewshot_other |N/A |get-answer| 0|exact_match|0.6921|± |0.0240|
+| - mmlu_flan_cot_fewshot_business_ethics | 0|get-answer| 0|exact_match|0.9091|± |0.0909|
+| - mmlu_flan_cot_fewshot_clinical_knowledge | 0|get-answer| 0|exact_match|0.5517|± |0.0940|
+| - mmlu_flan_cot_fewshot_college_medicine | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
+| - mmlu_flan_cot_fewshot_global_facts | 0|get-answer| 0|exact_match|0.6000|± |0.1633|
+| - mmlu_flan_cot_fewshot_human_aging | 0|get-answer| 0|exact_match|0.6522|± |0.1015|
+| - mmlu_flan_cot_fewshot_management | 0|get-answer| 0|exact_match|0.9091|± |0.0909|
+| - mmlu_flan_cot_fewshot_marketing | 0|get-answer| 0|exact_match|0.8400|± |0.0748|
+| - mmlu_flan_cot_fewshot_medical_genetics | 0|get-answer| 0|exact_match|1.0000|± |0.0000|
+| - mmlu_flan_cot_fewshot_miscellaneous | 0|get-answer| 0|exact_match|0.7791|± |0.0450|
+| - mmlu_flan_cot_fewshot_nutrition | 0|get-answer| 0|exact_match|0.6667|± |0.0833|
+| - mmlu_flan_cot_fewshot_professional_accounting | 0|get-answer| 0|exact_match|0.4194|± |0.0901|
+| - mmlu_flan_cot_fewshot_professional_medicine | 0|get-answer| 0|exact_match|0.6774|± |0.0853|
+| - mmlu_flan_cot_fewshot_virology | 0|get-answer| 0|exact_match|0.3889|± |0.1182|
+| - mmlu_flan_cot_fewshot_social_sciences |N/A |get-answer| 0|exact_match|0.6973|± |0.0239|
+| - mmlu_flan_cot_fewshot_econometrics | 0|get-answer| 0|exact_match|0.3333|± |0.1421|
+| - mmlu_flan_cot_fewshot_high_school_geography | 0|get-answer| 0|exact_match|0.9091|± |0.0627|
+| - mmlu_flan_cot_fewshot_high_school_government_and_politics| 0|get-answer| 0|exact_match|0.8095|± |0.0878|
+| - mmlu_flan_cot_fewshot_high_school_macroeconomics | 0|get-answer| 0|exact_match|0.6279|± |0.0746|
+| - mmlu_flan_cot_fewshot_high_school_microeconomics | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
+| - mmlu_flan_cot_fewshot_high_school_psychology | 0|get-answer| 0|exact_match|0.9167|± |0.0360|
+| - mmlu_flan_cot_fewshot_human_sexuality | 0|get-answer| 0|exact_match|0.5000|± |0.1508|
+| - mmlu_flan_cot_fewshot_professional_psychology | 0|get-answer| 0|exact_match|0.6667|± |0.0572|
+| - mmlu_flan_cot_fewshot_public_relations | 0|get-answer| 0|exact_match|0.5833|± |0.1486|
+| - mmlu_flan_cot_fewshot_security_studies | 0|get-answer| 0|exact_match|0.4444|± |0.0975|
+| - mmlu_flan_cot_fewshot_sociology | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
+| - mmlu_flan_cot_fewshot_us_foreign_policy | 0|get-answer| 0|exact_match|0.7273|± |0.1408|
+| - mmlu_flan_cot_fewshot_stem |N/A |get-answer| 0|exact_match|0.5164|± |0.0265|
+| - mmlu_flan_cot_fewshot_abstract_algebra | 0|get-answer| 0|exact_match|0.4545|± |0.1575|
+| - mmlu_flan_cot_fewshot_anatomy | 0|get-answer| 0|exact_match|0.3571|± |0.1329|
+| - mmlu_flan_cot_fewshot_astronomy | 0|get-answer| 0|exact_match|0.5000|± |0.1291|
+| - mmlu_flan_cot_fewshot_college_biology | 0|get-answer| 0|exact_match|0.5625|± |0.1281|
+| - mmlu_flan_cot_fewshot_college_chemistry | 0|get-answer| 0|exact_match|0.3750|± |0.1830|
+| - mmlu_flan_cot_fewshot_college_computer_science | 0|get-answer| 0|exact_match|0.2727|± |0.1408|
+| - mmlu_flan_cot_fewshot_college_mathematics | 0|get-answer| 0|exact_match|0.2727|± |0.1408|
+| - mmlu_flan_cot_fewshot_college_physics | 0|get-answer| 0|exact_match|0.4545|± |0.1575|
+| - mmlu_flan_cot_fewshot_computer_security | 0|get-answer| 0|exact_match|0.7273|± |0.1408|
+| - mmlu_flan_cot_fewshot_conceptual_physics | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
+| - mmlu_flan_cot_fewshot_electrical_engineering | 0|get-answer| 0|exact_match|0.6875|± |0.1197|
+| - mmlu_flan_cot_fewshot_elementary_mathematics | 0|get-answer| 0|exact_match|0.7317|± |0.0701|
+| - mmlu_flan_cot_fewshot_high_school_biology | 0|get-answer| 0|exact_match|0.7188|± |0.0808|
+| - mmlu_flan_cot_fewshot_high_school_chemistry | 0|get-answer| 0|exact_match|0.3636|± |0.1050|
+| - mmlu_flan_cot_fewshot_high_school_computer_science | 0|get-answer| 0|exact_match|0.6667|± |0.1667|
+| - mmlu_flan_cot_fewshot_high_school_mathematics | 0|get-answer| 0|exact_match|0.4138|± |0.0931|
+| - mmlu_flan_cot_fewshot_high_school_physics | 0|get-answer| 0|exact_match|0.2353|± |0.1060|
+| - mmlu_flan_cot_fewshot_high_school_statistics | 0|get-answer| 0|exact_match|0.4348|± |0.1057|
+| - mmlu_flan_cot_fewshot_machine_learning | 0|get-answer| 0|exact_match|0.3636|± |0.1521|
+
+| Groups |Version| Filter |n-shot| Metric |Value | |Stderr|
+|----------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
+|mmlu_flan_cot_fewshot |N/A |get-answer| 0|exact_match|0.5924|± |0.0118|
+| - mmlu_flan_cot_fewshot_humanities |N/A |get-answer| 0|exact_match|0.5077|± |0.0206|
+| - mmlu_flan_cot_fewshot_other |N/A |get-answer| 0|exact_match|0.6921|± |0.0240|
+| - mmlu_flan_cot_fewshot_social_sciences|N/A |get-answer| 0|exact_match|0.6973|± |0.0239|
+| - mmlu_flan_cot_fewshot_stem |N/A |get-answer| 0|exact_match|0.5164|± |0.0265|
+
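The tables above follow the output layout of EleutherAI's lm-evaluation-harness. Assuming that harness produced them, the sketch below shows how a comparable `mmlu_flan_cot_fewshot` run could be launched; the repo id placeholder, dtype, and batch size are illustrative assumptions, not values taken from this commit.

```python
# Hedged sketch: running the mmlu_flan_cot_fewshot group with
# EleutherAI's lm-evaluation-harness (pip install lm-eval).
import lm_eval
from lm_eval.utils import make_table

results = lm_eval.simple_evaluate(
    model="hf",                                              # Hugging Face transformers backend
    model_args="pretrained=<this-repo-id>,dtype=bfloat16",   # placeholder repo id and dtype
    tasks=["mmlu_flan_cot_fewshot"],                         # group whose subtasks appear above
    batch_size=8,                                            # placeholder batch size
)

# Prints per-task and per-group tables in the same layout as the results above.
print(make_table(results))
```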
## How to Use
```python
from transformers import AutoModelForCausalLM, AutoTokenizer