Update README.md
Browse files
README.md
CHANGED
@@ -43,4 +43,95 @@ All evaluations are conducted in a zero-shot setting.
|
|
43 |
| **[MT-Bench](https://huggingface.co/spaces/lmsys/mt-bench)** | **8.81** | 8.53 |
|
44 |
| **[MT-Bench-TW](https://huggingface.co/datasets/MediaTek-Research/TCEval-v2)** | **8.36** | 7.80 |
|
45 |
| **[Chatbot-Arena-Hard](https://github.com/lmarena/arena-hard-auto)** | **43.90** | 33.60 |
|
46 |
-
| **[AlignBench](https://github.com/THUDM/AlignBench)** | **7.25** | 6.88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
| **[MT-Bench](https://huggingface.co/spaces/lmsys/mt-bench)** | **8.81** | 8.53 |
|
44 |
| **[MT-Bench-TW](https://huggingface.co/datasets/MediaTek-Research/TCEval-v2)** | **8.36** | 7.80 |
|
45 |
| **[Chatbot-Arena-Hard](https://github.com/lmarena/arena-hard-auto)** | **43.90** | 33.60 |
|
46 |
+
| **[AlignBench](https://github.com/THUDM/AlignBench)** | **7.25** | 6.88 |
|
47 |
+
|
48 |
+
### Details of TMMLU+
|
49 |
+
|
50 |
+
#### STEM
|
51 |
+
|
52 |
+
| sub_category | score |
|
53 |
+
|----------------------------------|---------|
|
54 |
+
| advance_chemistry | 0.650407 |
|
55 |
+
| basic_medical_science | 0.681342 |
|
56 |
+
| computer_science | 0.839080 |
|
57 |
+
| engineering_math | 0.611650 |
|
58 |
+
| junior_chemistry | 0.708134 |
|
59 |
+
| junior_math_exam | 0.720000 |
|
60 |
+
| junior_science_exam | 0.755869 |
|
61 |
+
| organic_chemistry | 0.678899 |
|
62 |
+
| pharmacy | 0.452685 |
|
63 |
+
| physics | 0.742268 |
|
64 |
+
| secondary_physics | 0.660714 |
|
65 |
+
| statistics_and_machine_learning | 0.794643 |
|
66 |
+
| tve_mathematics | 0.766667 |
|
67 |
+
| tve_natural_sciences | 0.674528 |
|
68 |
+
|
69 |
+
#### Humanities
|
70 |
+
|
71 |
+
| sub_category | score |
|
72 |
+
|------------------------------|--------|
|
73 |
+
| administrative_law | 0.454762 |
|
74 |
+
| anti_money_laundering | 0.738806 |
|
75 |
+
| general_principles_of_law | 0.509434 |
|
76 |
+
| introduction_to_law | 0.523207 |
|
77 |
+
| jce_humanities | 0.577778 |
|
78 |
+
| taxation | 0.322667 |
|
79 |
+
| trust_practice | 0.558603 |
|
80 |
+
|
81 |
+
#### Social Science
|
82 |
+
|
83 |
+
| sub_category | score |
|
84 |
+
|--------------------------------------------------|---------|
|
85 |
+
| chinese_language_and_literature | 0.457286 |
|
86 |
+
| clinical_psychology | 0.664000 |
|
87 |
+
| economics | 0.702290 |
|
88 |
+
| education | 0.653226 |
|
89 |
+
| education_(profession_level) | 0.458848 |
|
90 |
+
| educational_psychology | 0.670455 |
|
91 |
+
| geography_of_taiwan | 0.618490 |
|
92 |
+
| human_behavior | 0.711974 |
|
93 |
+
| junior_chinese_exam | 0.765714 |
|
94 |
+
| macroeconomics | 0.649635 |
|
95 |
+
| national_protection | 0.687204 |
|
96 |
+
| occupational_therapy_for_psychological_disorders | 0.699816 |
|
97 |
+
| physical_education | 0.569832 |
|
98 |
+
| politic_science | 0.658291 |
|
99 |
+
| taiwanese_hokkien | 0.294574 |
|
100 |
+
| three_principles_of_people | 0.697842 |
|
101 |
+
| ttqav2 | 0.761062 |
|
102 |
+
| tve_chinese_language | 0.745342 |
|
103 |
+
|
104 |
+
|
105 |
+
#### Others
|
106 |
+
|
107 |
+
| sub_category | score |
|
108 |
+
|-----------------------------------------------------|--------:|
|
109 |
+
| accounting | 0.350785 |
|
110 |
+
| agriculture | 0.476821 |
|
111 |
+
| auditing | 0.516364 |
|
112 |
+
| business_management | 0.661871 |
|
113 |
+
| culinary_skills | 0.636986 |
|
114 |
+
| dentistry | 0.581454 |
|
115 |
+
| finance_banking | 0.592593 |
|
116 |
+
| financial_analysis | 0.722513 |
|
117 |
+
| fire_science | 0.483871 |
|
118 |
+
| insurance_studies | 0.497368 |
|
119 |
+
| junior_social_studies | 0.785714 |
|
120 |
+
| logic_reasoning | 0.589928 |
|
121 |
+
| management_accounting | 0.530233 |
|
122 |
+
| marketing_management | 0.784946 |
|
123 |
+
| mechanical | 0.711864 |
|
124 |
+
| music | 0.521583 |
|
125 |
+
| nautical_science | 0.441016 |
|
126 |
+
| official_document_management | 0.513514 |
|
127 |
+
| optometry | 0.441304 |
|
128 |
+
| pharmacology | 0.639515 |
|
129 |
+
| real_estate | 0.500000 |
|
130 |
+
| technical | 0.604478 |
|
131 |
+
| trade | 0.410359 |
|
132 |
+
| traditional_chinese_medicine_clinical_medicine | 0.456835 |
|
133 |
+
| tve_design | 0.735417 |
|
134 |
+
| veterinary_pathology | 0.519435 |
|
135 |
+
| veterinary_pharmacology | 0.711111 |
|
136 |
+
|
137 |
+
|