Update README.md
README.md CHANGED
@@ -82,3 +82,74 @@ messages = [
This seems to be sufficient to guarantee ethical behavior.

Hope it helps enterprises not to make more lobotomized models.
# Benchmark

| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|---------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|agieval | 0|none | |acc |↑ |0.2510|± |0.0045|
| - agieval_aqua_rat | 1|none | 0|acc |↑ |0.1772|± |0.0240|
| | |none | 0|acc_norm |↑ |0.1654|± |0.0234|
| - agieval_gaokao_biology | 1|none | 0|acc |↑ |0.1857|± |0.0269|
| | |none | 0|acc_norm |↑ |0.2333|± |0.0293|
| - agieval_gaokao_chemistry | 1|none | 0|acc |↑ |0.2415|± |0.0298|
| | |none | 0|acc_norm |↑ |0.2367|± |0.0296|
| - agieval_gaokao_chinese | 1|none | 0|acc |↑ |0.1829|± |0.0247|
| | |none | 0|acc_norm |↑ |0.1992|± |0.0255|
| - agieval_gaokao_english | 1|none | 0|acc |↑ |0.2810|± |0.0257|
| | |none | 0|acc_norm |↑ |0.2810|± |0.0257|
| - agieval_gaokao_geography | 1|none | 0|acc |↑ |0.2965|± |0.0325|
| | |none | 0|acc_norm |↑ |0.3518|± |0.0339|
| - agieval_gaokao_history | 1|none | 0|acc |↑ |0.2766|± |0.0292|
| | |none | 0|acc_norm |↑ |0.3021|± |0.0300|
| - agieval_gaokao_mathcloze | 1|none | 0|acc |↑ |0.0085|± |0.0085|
| - agieval_gaokao_mathqa | 1|none | 0|acc |↑ |0.2507|± |0.0232|
| | |none | 0|acc_norm |↑ |0.2821|± |0.0241|
| - agieval_gaokao_physics | 1|none | 0|acc |↑ |0.2300|± |0.0298|
| | |none | 0|acc_norm |↑ |0.2750|± |0.0317|
| - agieval_jec_qa_ca | 1|none | 0|acc |↑ |0.4675|± |0.0158|
| | |none | 0|acc_norm |↑ |0.4595|± |0.0158|
| - agieval_jec_qa_kd | 1|none | 0|acc |↑ |0.4720|± |0.0158|
| | |none | 0|acc_norm |↑ |0.4960|± |0.0158|
| - agieval_logiqa_en | 1|none | 0|acc |↑ |0.1859|± |0.0153|
| | |none | 0|acc_norm |↑ |0.2504|± |0.0170|
| - agieval_logiqa_zh | 1|none | 0|acc |↑ |0.2120|± |0.0160|
| | |none | 0|acc_norm |↑ |0.2504|± |0.0170|
| - agieval_lsat_ar | 1|none | 0|acc |↑ |0.1913|± |0.0260|
| | |none | 0|acc_norm |↑ |0.1696|± |0.0248|
| - agieval_lsat_lr | 1|none | 0|acc |↑ |0.1333|± |0.0151|
| | |none | 0|acc_norm |↑ |0.2078|± |0.0180|
| - agieval_lsat_rc | 1|none | 0|acc |↑ |0.2268|± |0.0256|
| | |none | 0|acc_norm |↑ |0.2119|± |0.0250|
| - agieval_math | 1|none | 0|acc |↑ |0.0130|± |0.0036|
| - agieval_sat_en | 1|none | 0|acc |↑ |0.3107|± |0.0323|
| | |none | 0|acc_norm |↑ |0.3010|± |0.0320|
| - agieval_sat_en_without_passage| 1|none | 0|acc |↑ |0.2621|± |0.0307|
| | |none | 0|acc_norm |↑ |0.2476|± |0.0301|
| - agieval_sat_math | 1|none | 0|acc |↑ |0.2227|± |0.0281|
| | |none | 0|acc_norm |↑ |0.2227|± |0.0281|
|global_mmlu_pt | 0|none | |acc |↑ |0.2425|± |0.0214|
| - global_mmlu_pt_business | 0|none | 0|acc |↑ |0.3103|± |0.0613|
| - global_mmlu_pt_humanities | 0|none | 0|acc |↑ |0.2549|± |0.0434|
| - global_mmlu_pt_medical | 0|none | 0|acc |↑ |0.3333|± |0.0797|
| - global_mmlu_pt_other | 0|none | 0|acc |↑ |0.1607|± |0.0495|
| - global_mmlu_pt_social_sciences| 0|none | 0|acc |↑ |0.2059|± |0.0402|
| - global_mmlu_pt_stem | 0|none | 0|acc |↑ |0.2391|± |0.0636|
|persona_conscientiousness | 0|none | 0|acc |↑ |0.5170|± |0.0158|
|piqa | 1|none | 0|acc |↑ |0.5294|± |0.0116|
| | |none | 0|acc_norm |↑ |0.5397|± |0.0116|
|truthfulqa_mc1 | 2|none | 0|acc |↑ |0.2411|± |0.0150|
|truthfulqa_mc2 | 3|none | 0|acc |↑ |0.5051|± |0.0169|
|truthfulqa_pt_mc1 | 1|none | 0|acc |↑ |0.2437|± |0.0153|
|truthfulqa_pt_mc2 | 2|none | 0|acc |↑ |0.5081|± |0.0174|

| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
|--------------|------:|------|------|------|---|-----:|---|-----:|
|agieval | 0|none | |acc |↑ |0.2510|± |0.0045|
|global_mmlu_pt| 0|none | |acc |↑ |0.2425|± |0.0214|

| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|---------|------:|------|-----:|--------|---|-----:|---|-----:|
|hellaswag| 1|none | 0|acc |↑ |0.2650|± |0.0044|
| | |none | 0|acc_norm|↑ |0.2785|± |0.0045|
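
The tables above use the markdown layout emitted by EleutherAI's lm-evaluation-harness (`acc_norm` is its length-normalized accuracy variant). Assuming that harness produced them, here is a minimal reproduction sketch for a subset of the tasks; the model id below is a placeholder, not this repository's actual checkpoint:

```python
# Minimal reproduction sketch, assuming the tables above came from
# EleutherAI's lm-evaluation-harness (pip install lm-eval).
# "your-org/your-model" is a placeholder, not this repo's checkpoint.
import lm_eval
from lm_eval.utils import make_table

results = lm_eval.simple_evaluate(
    model="hf",                                   # Hugging Face backend
    model_args="pretrained=your-org/your-model",  # placeholder model id
    tasks=["piqa", "hellaswag", "truthfulqa_mc1", "truthfulqa_mc2"],
    num_fewshot=0,                                # tables report 0-shot
)

# Renders the same Tasks/Version/Filter/n-shot/Metric markdown layout as above.
print(make_table(results))
```

The equivalent CLI call would be `lm_eval --model hf --model_args pretrained=your-org/your-model --tasks piqa,hellaswag,truthfulqa_mc1,truthfulqa_mc2 --num_fewshot 0`.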