Locutusque committed
Commit e6ea13e
1 Parent(s): ca8e694

Update README.md

Files changed (1)
  1. README.md +75 -0
README.md CHANGED
@@ -81,6 +81,81 @@ Zero-shot AGIEval
 | - agieval_sat_math | 1|none |None |acc |0.3091|± |0.0312|
 | | |none |None |acc_norm|0.2364|± |0.0287|

+ 5-shot CoT MMLU
+
+ | Tasks |Version| Filter |n-shot| Metric |Value | |Stderr|
+ |-------------------------------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
+ |mmlu_flan_cot_fewshot |N/A |get-answer| 0|exact_match|0.5924|± |0.0118|
+ | - mmlu_flan_cot_fewshot_humanities |N/A |get-answer| 0|exact_match|0.5077|± |0.0206|
+ | - mmlu_flan_cot_fewshot_formal_logic | 0|get-answer| 0|exact_match|0.2143|± |0.1138|
+ | - mmlu_flan_cot_fewshot_high_school_european_history | 0|get-answer| 0|exact_match|0.6111|± |0.1182|
+ | - mmlu_flan_cot_fewshot_high_school_us_history | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
+ | - mmlu_flan_cot_fewshot_high_school_world_history | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
+ | - mmlu_flan_cot_fewshot_international_law | 0|get-answer| 0|exact_match|0.9231|± |0.0769|
+ | - mmlu_flan_cot_fewshot_jurisprudence | 0|get-answer| 0|exact_match|0.3636|± |0.1521|
+ | - mmlu_flan_cot_fewshot_logical_fallacies | 0|get-answer| 0|exact_match|0.7222|± |0.1086|
+ | - mmlu_flan_cot_fewshot_moral_disputes | 0|get-answer| 0|exact_match|0.5526|± |0.0817|
+ | - mmlu_flan_cot_fewshot_moral_scenarios | 0|get-answer| 0|exact_match|0.3900|± |0.0490|
+ | - mmlu_flan_cot_fewshot_philosophy | 0|get-answer| 0|exact_match|0.7647|± |0.0738|
+ | - mmlu_flan_cot_fewshot_prehistory | 0|get-answer| 0|exact_match|0.7143|± |0.0775|
+ | - mmlu_flan_cot_fewshot_professional_law | 0|get-answer| 0|exact_match|0.3471|± |0.0366|
+ | - mmlu_flan_cot_fewshot_world_religions | 0|get-answer| 0|exact_match|0.8947|± |0.0723|
+ | - mmlu_flan_cot_fewshot_other |N/A |get-answer| 0|exact_match|0.6921|± |0.0240|
+ | - mmlu_flan_cot_fewshot_business_ethics | 0|get-answer| 0|exact_match|0.9091|± |0.0909|
+ | - mmlu_flan_cot_fewshot_clinical_knowledge | 0|get-answer| 0|exact_match|0.5517|± |0.0940|
+ | - mmlu_flan_cot_fewshot_college_medicine | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
+ | - mmlu_flan_cot_fewshot_global_facts | 0|get-answer| 0|exact_match|0.6000|± |0.1633|
+ | - mmlu_flan_cot_fewshot_human_aging | 0|get-answer| 0|exact_match|0.6522|± |0.1015|
+ | - mmlu_flan_cot_fewshot_management | 0|get-answer| 0|exact_match|0.9091|± |0.0909|
+ | - mmlu_flan_cot_fewshot_marketing | 0|get-answer| 0|exact_match|0.8400|± |0.0748|
+ | - mmlu_flan_cot_fewshot_medical_genetics | 0|get-answer| 0|exact_match|1.0000|± |0.0000|
+ | - mmlu_flan_cot_fewshot_miscellaneous | 0|get-answer| 0|exact_match|0.7791|± |0.0450|
+ | - mmlu_flan_cot_fewshot_nutrition | 0|get-answer| 0|exact_match|0.6667|± |0.0833|
+ | - mmlu_flan_cot_fewshot_professional_accounting | 0|get-answer| 0|exact_match|0.4194|± |0.0901|
+ | - mmlu_flan_cot_fewshot_professional_medicine | 0|get-answer| 0|exact_match|0.6774|± |0.0853|
+ | - mmlu_flan_cot_fewshot_virology | 0|get-answer| 0|exact_match|0.3889|± |0.1182|
+ | - mmlu_flan_cot_fewshot_social_sciences |N/A |get-answer| 0|exact_match|0.6973|± |0.0239|
+ | - mmlu_flan_cot_fewshot_econometrics | 0|get-answer| 0|exact_match|0.3333|± |0.1421|
+ | - mmlu_flan_cot_fewshot_high_school_geography | 0|get-answer| 0|exact_match|0.9091|± |0.0627|
+ | - mmlu_flan_cot_fewshot_high_school_government_and_politics| 0|get-answer| 0|exact_match|0.8095|± |0.0878|
+ | - mmlu_flan_cot_fewshot_high_school_macroeconomics | 0|get-answer| 0|exact_match|0.6279|± |0.0746|
+ | - mmlu_flan_cot_fewshot_high_school_microeconomics | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
+ | - mmlu_flan_cot_fewshot_high_school_psychology | 0|get-answer| 0|exact_match|0.9167|± |0.0360|
+ | - mmlu_flan_cot_fewshot_human_sexuality | 0|get-answer| 0|exact_match|0.5000|± |0.1508|
+ | - mmlu_flan_cot_fewshot_professional_psychology | 0|get-answer| 0|exact_match|0.6667|± |0.0572|
+ | - mmlu_flan_cot_fewshot_public_relations | 0|get-answer| 0|exact_match|0.5833|± |0.1486|
+ | - mmlu_flan_cot_fewshot_security_studies | 0|get-answer| 0|exact_match|0.4444|± |0.0975|
+ | - mmlu_flan_cot_fewshot_sociology | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
+ | - mmlu_flan_cot_fewshot_us_foreign_policy | 0|get-answer| 0|exact_match|0.7273|± |0.1408|
+ | - mmlu_flan_cot_fewshot_stem |N/A |get-answer| 0|exact_match|0.5164|± |0.0265|
+ | - mmlu_flan_cot_fewshot_abstract_algebra | 0|get-answer| 0|exact_match|0.4545|± |0.1575|
+ | - mmlu_flan_cot_fewshot_anatomy | 0|get-answer| 0|exact_match|0.3571|± |0.1329|
+ | - mmlu_flan_cot_fewshot_astronomy | 0|get-answer| 0|exact_match|0.5000|± |0.1291|
+ | - mmlu_flan_cot_fewshot_college_biology | 0|get-answer| 0|exact_match|0.5625|± |0.1281|
+ | - mmlu_flan_cot_fewshot_college_chemistry | 0|get-answer| 0|exact_match|0.3750|± |0.1830|
+ | - mmlu_flan_cot_fewshot_college_computer_science | 0|get-answer| 0|exact_match|0.2727|± |0.1408|
+ | - mmlu_flan_cot_fewshot_college_mathematics | 0|get-answer| 0|exact_match|0.2727|± |0.1408|
+ | - mmlu_flan_cot_fewshot_college_physics | 0|get-answer| 0|exact_match|0.4545|± |0.1575|
+ | - mmlu_flan_cot_fewshot_computer_security | 0|get-answer| 0|exact_match|0.7273|± |0.1408|
+ | - mmlu_flan_cot_fewshot_conceptual_physics | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
+ | - mmlu_flan_cot_fewshot_electrical_engineering | 0|get-answer| 0|exact_match|0.6875|± |0.1197|
+ | - mmlu_flan_cot_fewshot_elementary_mathematics | 0|get-answer| 0|exact_match|0.7317|± |0.0701|
+ | - mmlu_flan_cot_fewshot_high_school_biology | 0|get-answer| 0|exact_match|0.7188|± |0.0808|
+ | - mmlu_flan_cot_fewshot_high_school_chemistry | 0|get-answer| 0|exact_match|0.3636|± |0.1050|
+ | - mmlu_flan_cot_fewshot_high_school_computer_science | 0|get-answer| 0|exact_match|0.6667|± |0.1667|
+ | - mmlu_flan_cot_fewshot_high_school_mathematics | 0|get-answer| 0|exact_match|0.4138|± |0.0931|
+ | - mmlu_flan_cot_fewshot_high_school_physics | 0|get-answer| 0|exact_match|0.2353|± |0.1060|
+ | - mmlu_flan_cot_fewshot_high_school_statistics | 0|get-answer| 0|exact_match|0.4348|± |0.1057|
+ | - mmlu_flan_cot_fewshot_machine_learning | 0|get-answer| 0|exact_match|0.3636|± |0.1521|
+
+ | Groups |Version| Filter |n-shot| Metric |Value | |Stderr|
+ |----------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
+ |mmlu_flan_cot_fewshot |N/A |get-answer| 0|exact_match|0.5924|± |0.0118|
+ | - mmlu_flan_cot_fewshot_humanities |N/A |get-answer| 0|exact_match|0.5077|± |0.0206|
+ | - mmlu_flan_cot_fewshot_other |N/A |get-answer| 0|exact_match|0.6921|± |0.0240|
+ | - mmlu_flan_cot_fewshot_social_sciences|N/A |get-answer| 0|exact_match|0.6973|± |0.0239|
+ | - mmlu_flan_cot_fewshot_stem |N/A |get-answer| 0|exact_match|0.5164|± |0.0265|
+
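The tables above follow the report layout of EleutherAI's lm-evaluation-harness (task group `mmlu_flan_cot_fewshot`, `get-answer` filter). The commit itself does not say how the scores were generated, so the snippet below is only a minimal reproduction sketch under that assumption; `Locutusque/<model-id>` is a placeholder, not the actual repository name.

```python
# Hypothetical reproduction sketch -- assumes lm-evaluation-harness was used
# (not stated in this commit); the model id below is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                                      # Hugging Face transformers backend
    model_args="pretrained=Locutusque/<model-id>",   # placeholder repository name
    tasks=["mmlu_flan_cot_fewshot"],                 # CoT MMLU task group reported above
    batch_size=8,
)

# Print each task's exact_match score under the get-answer filter,
# matching the Metric/Filter columns in the tables above.
for task, metrics in results["results"].items():
    print(task, metrics.get("exact_match,get-answer"))
```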
 ## How to Use
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer