Spaces:

mideind
/

maeliprof-demo

Running

App Files Community

gardari committed on Sep 24

Commit

7b934d3

verified ·

1 Parent(s): 47df215

Update scores for newer models

Browse files

Files changed (1) hide show

score.py +24 -12

score.py CHANGED Viewed

@@ -6,64 +6,76 @@ import re
 # Model scores
 BENCHMARK_SCORES = {
     "icelandic-winogrande": {
         "o1-preview": 92.8,
         "Claude 3.5 Sonnet": 91.3,
         "GPT-4o": 85.4,
         # "GPT-4-turbo": 85.8,
-        "Hermes 3 Llama 3.1 405B fp8": 70.6,
         "Claude 2.1": 55.1,
         "GPT-3.5-turbo": 52.0,
-        "Deepseek V3": 75.9,
     },
     "grammatical-error-detection": {
         "o1-preview": 74.5,
         "Claude 3.5 Sonnet": 72.5,
         "GPT-4o": 68.0,
         # "GPT-4-turbo": 60.5,
-        "Hermes 3 Llama 3.1 405B fp8": 53.5,
         "Claude 2.1": 52.5,
         "GPT-3.5-turbo": 52.0,
-        "Deepseek V3": 57.0,
     },
     "icelandic-inflection-all": {
         "o1-preview": 84.4,
         "Claude 3.5 Sonnet": 88.8,
         "GPT-4o": 87.8,
         # "GPT-4-turbo": 76.6,
-        "Hermes 3 Llama 3.1 405B fp8": 61.8,
         "Claude 2.1": 55.2,
         "GPT-3.5-turbo": 39.1,
-        "Deepseek V3": 77.3,
     },
     "icelandic-belebele": {
         "o1-preview": 92.2,
         "Claude 3.5 Sonnet": 92.2,
         "GPT-4o": 90.4,
         # "GPT-4-turbo": 89.3,
-        "Hermes 3 Llama 3.1 405B fp8": 86.1,
         "Claude 2.1": 42.1,
         "GPT-3.5-turbo": 59.2,
-        "Deepseek V3": 87.9,
     },
     "icelandic-arc-challenge": {
         "o1-preview": 93.4,
         "Claude 3.5 Sonnet": 91.3,
         "GPT-4o": 90.4,
         # "GPT-4-turbo": 88.7,
-        "Hermes 3 Llama 3.1 405B fp8": 72.0,
         "Claude 2.1": 59.9,
         "GPT-3.5-turbo": 49.5,
-        "Deepseek V3": 79.7,
     },
     "icelandic-wiki-qa": {
         "o1-preview": 44.5,
         "Claude 3.5 Sonnet": 45.2,
         "GPT-4o": 38.0,
         # "GPT-4-turbo": 31.0,
-        "Hermes 3 Llama 3.1 405B fp8": 33.8,
         "Claude 2.1": 21.1,
         "GPT-3.5-turbo": 15.0,
-        "Deepseek V3": 27.2,
     },
 }

 # Model scores
 BENCHMARK_SCORES = {
     "icelandic-winogrande": {
+        "GPT-5": 94.39,
+        "Gemini Pro 2.5": 92.83,
         "o1-preview": 92.8,
         "Claude 3.5 Sonnet": 91.3,
         "GPT-4o": 85.4,
         # "GPT-4-turbo": 85.8,
+        "Hermes 3 Llama 3.1 405B": 70.6,
         "Claude 2.1": 55.1,
         "GPT-3.5-turbo": 52.0,
+        "Deepseek R1": 81.43,
     },
     "grammatical-error-detection": {
+        "GPT-5": 71.50,
+        "Gemini Pro 2.5": 71.50,
         "o1-preview": 74.5,
         "Claude 3.5 Sonnet": 72.5,
         "GPT-4o": 68.0,
         # "GPT-4-turbo": 60.5,
+        "Hermes 3 Llama 3.1 405B": 53.5,
         "Claude 2.1": 52.5,
         "GPT-3.5-turbo": 52.0,
+        "Deepseek R1": 59.0,
     },
     "icelandic-inflection-all": {
+        "Gemini Pro 2.5": 90.79,
+        "GPT-5": 95.21,
         "o1-preview": 84.4,
         "Claude 3.5 Sonnet": 88.8,
         "GPT-4o": 87.8,
         # "GPT-4-turbo": 76.6,
+        "Hermes 3 Llama 3.1 405B": 61.8,
         "Claude 2.1": 55.2,
         "GPT-3.5-turbo": 39.1,
+        "Deepseek R1": 68.50,
     },
     "icelandic-belebele": {
+        "Gemini Pro 2.5": 93.89,
+        "GPT-5": 93.11,
         "o1-preview": 92.2,
         "Claude 3.5 Sonnet": 92.2,
         "GPT-4o": 90.4,
         # "GPT-4-turbo": 89.3,
+        "Hermes 3 Llama 3.1 405B": 86.1,
         "Claude 2.1": 42.1,
         "GPT-3.5-turbo": 59.2,
+        "Deepseek R1": 90.33,
     },
     "icelandic-arc-challenge": {
+        "Gemini Pro 2.5": 93.94,
+        "GPT-5": 94.37,
         "o1-preview": 93.4,
         "Claude 3.5 Sonnet": 91.3,
         "GPT-4o": 90.4,
         # "GPT-4-turbo": 88.7,
+        "Hermes 3 Llama 3.1 405B": 72.0,
         "Claude 2.1": 59.9,
         "GPT-3.5-turbo": 49.5,
+        "Deepseek R1": 90.61,
     },
     "icelandic-wiki-qa": {
+        "Gemini Pro 2.5": 52.66,
+        "GPT-5": 45.79,
         "o1-preview": 44.5,
         "Claude 3.5 Sonnet": 45.2,
         "GPT-4o": 38.0,
         # "GPT-4-turbo": 31.0,
+        "Hermes 3 Llama 3.1 405B": 33.8,
         "Claude 2.1": 21.1,
         "GPT-3.5-turbo": 15.0,
+        "Deepseek R1": 29.24,
     },
 }