[
  {
    "model": "OPT (1.3B)",
    "Average": 7.84,
    "MMLU": 7.4,
    "WinoGrande": 12.47,
    "PiQA": 4.45,
    "CommonsenseQA": 7.61,
    "Race": 13.61,
    "MedMCQA": 1.25,
    "OpenbookQA": 4.48
  },
  {
    "model": "SlimPajama",
    "Average": 9.54,
    "MMLU": 9.22,
    "WinoGrande": 14.76,
    "PiQA": 5.32,
    "CommonsenseQA": 9.01,
    "Race": 16.19,
    "MedMCQA": 1.68,
    "OpenbookQA": 5.7
  },
  {
    "model": "OLMo (1B)",
    "Average": 8.8,
    "MMLU": 8.54,
    "WinoGrande": 6.16,
    "PiQA": 8.05,
    "CommonsenseQA": 13.1,
    "Race": 13.61,
    "MedMCQA": 2.1,
    "OpenbookQA": 6.11
  },
  {
    "model": "GPT-Neo (1.3B)",
    "Average": 7.38,
    "MMLU": 6.94,
    "WinoGrande": 10.81,
    "PiQA": 4.31,
    "CommonsenseQA": 6.34,
    "Race": 13.75,
    "MedMCQA": 2.63,
    "OpenbookQA": 4.89
  },
  {
    "model": "Cerebras-GPT (1.3B)",
    "Average": 4.84,
    "MMLU": 5.37,
    "WinoGrande": 9.31,
    "PiQA": 2.16,
    "CommonsenseQA": 6.2,
    "Race": 6.9,
    "MedMCQA": 1.04,
    "OpenbookQA": 3.46
  },
  {
    "model": "RedPajama (1B)",
    "Average": 9.01,
    "MMLU": 9.21,
    "WinoGrande": 16.97,
    "PiQA": 1.39,
    "CommonsenseQA": 11.41,
    "Race": 14.35,
    "MedMCQA": 1.86,
    "OpenbookQA": 3.87
  },
  {
    "model": "Pythia (1.4B)",
    "Average": 8.73,
    "MMLU": 9.66,
    "WinoGrande": 11.52,
    "PiQA": 4.17,
    "CommonsenseQA": 9.01,
    "Race": 12.76,
    "MedMCQA": 3.19,
    "OpenbookQA": 5.3
  },
  {
    "model": "TinyLLama (1.1B)",
    "Average": 8.39,
    "MMLU": 8.94,
    "WinoGrande": 12.23,
    "PiQA": 3.59,
    "CommonsenseQA": 6.06,
    "Race": 16.7,
    "MedMCQA": 2.07,
    "OpenbookQA": 4.68
  },
  {
    "model": "OELM (1B)",
    "Average": 8.99,
    "MMLU": 9.03,
    "WinoGrande": 10.18,
    "PiQA": 9.05,
    "CommonsenseQA": 7.75,
    "Race": 12.78,
    "MedMCQA": 2.5,
    "OpenbookQA": 6.31
  },
  {
    "model": "Phi-3-mini-128k-instruct (3.8B)",
    "Average": 39.73,
    "MMLU": 36.97,
    "WinoGrande": 46.88,
    "PiQA": 32.04,
    "CommonsenseQA": 49.15,
    "Race": 37.81,
    "MedMCQA": 22.61,
    "OpenbookQA": 33.6
  },
  {
    "model": "Gemma (2B)",
    "Average": 17.37,
    "MMLU": 17.52,
    "WinoGrande": 22.68,
    "PiQA": 15.09,
    "CommonsenseQA": 27.46,
    "Race": 14.32,
    "MedMCQA": 4.57,
    "OpenbookQA": 14.26
  },
  {
    "model": "Qwen (1.8B)",
    "Average": 21.61,
    "MMLU": 10.0,
    "WinoGrande": 40.97,
    "PiQA": 15.52,
    "CommonsenseQA": 31.13,
    "Race": 34.91,
    "MedMCQA": 4.7,
    "OpenbookQA": 20.37
  }
]