gardarjuto commited on
Commit
d4577f4
·
1 Parent(s): 34da805

refactor(backend): Consolidate all source files into a single 'app' package

Browse files

- Moves all logic from 'original_src' into the 'app' directory to create a unified and more maintainable backend structure.
- Updates all imports to be absolute from the 'app' root, removing the need for sys.path manipulation.
- Fixes data processing bugs that arose during the refactor by correcting benchmark and metric names in the configuration.

.gitignore CHANGED
@@ -5,3 +5,8 @@
5
 
6
  # Ignore __pycache__ directories
7
  __pycache__/
 
 
 
 
 
 
5
 
6
  # Ignore __pycache__ directories
7
  __pycache__/
8
+
9
+ # Node dependencies and build output
10
+ frontend/node_modules
11
+ frontend/build
12
+
backend/app/about.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+
5
+ @dataclass
6
+ class Task:
7
+ benchmark: str
8
+ metric: str
9
+ col_name: str
10
+
11
+
12
+ class Tasks(Enum):
13
+ task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match,get-answer", col_name="WinoGrande-IS (3-shot)")
14
+ task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match,get-answer", col_name="GED")
15
+ task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match,get-answer", col_name="Inflection (1-shot)")
16
+ task5 = Task(benchmark="icelandic_belebele", metric="exact_match,get-answer", col_name="Belebele (IS)")
17
+ task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match,get-answer", col_name="ARC-Challenge-IS")
18
+ task7 = Task(benchmark="icelandic_wiki_qa", metric="lm_judge_score,get-answer", col_name="WikiQA-IS")
backend/{original_src → app}/display/__init__.py RENAMED
File without changes
backend/{original_src → app}/display/formatting.py RENAMED
File without changes
backend/{original_src → app}/display/utils.py RENAMED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
3
 
4
  import pandas as pd
5
 
6
+ from app.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
backend/app/leaderboard/__init__.py ADDED
File without changes
backend/{original_src → app}/leaderboard/read_evals.py RENAMED
@@ -7,9 +7,9 @@ from dataclasses import dataclass
7
  import dateutil
8
  import numpy as np
9
 
10
- from display.formatting import make_clickable_model
11
- from display.utils import AutoEvalColumn, ModelType, Tasks, Precision
12
- from submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
 
7
  import dateutil
8
  import numpy as np
9
 
10
+ from app.display.formatting import make_clickable_model
11
+ from app.display.utils import AutoEvalColumn, ModelType, Tasks, Precision
12
+ from app.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
backend/{original_src → app}/populate.py RENAMED
@@ -1,7 +1,7 @@
1
  import pandas as pd
2
 
3
- from display.utils import AutoEvalColumn
4
- from leaderboard.read_evals import get_raw_eval_results, EvalResult
5
 
6
 
7
  def get_leaderboard_df(
 
1
  import pandas as pd
2
 
3
+ from app.display.utils import AutoEvalColumn
4
+ from app.leaderboard.read_evals import get_raw_eval_results, EvalResult
5
 
6
 
7
  def get_leaderboard_df(
backend/app/services/leaderboard.py CHANGED
@@ -17,24 +17,10 @@ from app.core.cache import cache_config
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- # Import original processing logic
21
- import sys
22
- import os
23
-
24
- # Add the original Icelandic leaderboard source to Python path
25
- original_src_path = os.path.join(os.path.dirname(__file__), '..', '..', 'original_src')
26
- if original_src_path not in sys.path:
27
- sys.path.insert(0, original_src_path)
28
-
29
- # Also add the parent directory so imports like 'src.display.utils' work
30
- backend_path = os.path.join(os.path.dirname(__file__), '..', '..')
31
- if backend_path not in sys.path:
32
- sys.path.insert(0, backend_path)
33
-
34
  try:
35
- from leaderboard.read_evals import get_raw_eval_results
36
- from populate import get_leaderboard_df
37
- from display.utils import COLS, BENCHMARK_COLS, Tasks
38
  except ImportError as e:
39
  # Fallback for development without mounted volume
40
  logger.warning(f"Could not import original modules: {e}")
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
+ from app.leaderboard.read_evals import get_raw_eval_results
22
+ from app.populate import get_leaderboard_df
23
+ from app.display.utils import COLS, BENCHMARK_COLS, Tasks
24
  except ImportError as e:
25
  # Fallback for development without mounted volume
26
  logger.warning(f"Could not import original modules: {e}")
backend/app/submission/__init__.py ADDED
File without changes
backend/app/submission/check_validity.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from huggingface_hub import HfApi, hf_hub_url, HfFolder
3
+ from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
4
+ from requests import HTTPError
5
+
6
+ from app.config import HF_TOKEN
7
+
8
+
9
+ def is_model_on_hub(
10
+ model_id: str, revision: str, token: str = HF_TOKEN, trust_remote_code: bool = False, test_tokenizer=True
11
+ ) -> (bool, str, dict):
12
+ """Checks if a model is on the hub.
13
+ Returns:
14
+ (bool, str, dict): a tuple with a boolean indicating if the model is on the hub, a string with the error message, and the model config
15
+ """
16
+ if not token:
17
+ return (
18
+ False,
19
+ "No Hugging Face token provided. Please create a read token on the Hugging Face website and add it as a secret with the name `HF_TOKEN`.",
20
+ None,
21
+ )
22
+
23
+ api = HfApi(token=token)
24
+ try:
25
+ model_info = api.model_info(model_id, revision=revision)
26
+ model_config = None
27
+ if hasattr(model_info, "config"):
28
+ model_config = model_info.config
29
+ except RepositoryNotFoundError:
30
+ return False, f"Model {model_id} not found on hub", None
31
+ except (HTTPError, GatedRepoError) as e:
32
+ return False, f"Model {model_id} is gated, you need to accept the license agreement first.", None
33
+
34
+ if trust_remote_code and test_tokenizer:
35
+ from transformers import AutoTokenizer
36
+
37
+ try:
38
+ AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True, token=token)
39
+ except Exception as e:
40
+ return False, f"Could not load tokenizer for {model_id}. Error: {e}", None
41
+
42
+ return True, "", model_config
backend/original_src/about.py DELETED
@@ -1,75 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Select your tasks here
12
- # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "WinoGrande-IS (3-shot)")
16
- task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
17
- task2 = Task("icelandic_inflection_all", "exact_match,get-answer", "Inflection (1-shot)")
18
- task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
19
- task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
20
- task7 = Task("icelandic_wiki_qa", "lm_judge_score,get-answer", "WikiQA-IS")
21
-
22
- # ---------------------------------------------------
23
-
24
-
25
-
26
- # Your leaderboard name
27
- TITLE = """<h1 align="center" id="space-title">Icelandic LLM leaderboard</h1>"""
28
-
29
- # What does your leaderboard evaluate?
30
- INTRODUCTION_TEXT = """
31
- """
32
-
33
- # Which evaluations are you running? how can people reproduce what you have?
34
- LLM_BENCHMARKS_TEXT = f"""
35
- ## New submissions
36
- Do you want your model to be included on the leaderboard? Open a discussion on this repository with the details of your model and we will get back to you.
37
-
38
- ## Benchmark tasks
39
- The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks, where the model's output is compared to the expected output.
40
- This means that models that have not been instruction fine-tuned might perform poorly on these tasks.
41
-
42
- The following tasks are evaluated:
43
-
44
- ### WinoGrande-IS
45
- The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English.
46
- Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution.
47
- The benchmark is designed to test the model's ability to use knowledge and common sense reasoning in Icelandic. For this benchmark, we use 3-shot evaluation.
48
- The Icelandic WinoGrande dataset is described in more detail in the IceBERT paper (https://aclanthology.org/2022.lrec-1.464.pdf).
49
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-winogrande
50
-
51
- ### GED
52
- This is a benchmark for binary sentence-level Icelandic grammatical error detection, adapted from the Icelandic Error Corpus (IEC) and contains 200 examples.
53
- Each example consists of a sentence that may contain one or more grammatical errors, and the task is to predict whether the sentence contains an error.
54
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-sentences-gec
55
-
56
- ### Inflection benchmark
57
- The inflection benchmark tests models' ability to generate inflected forms of 300 Icelandic adjective-noun pairs for all four cases, singular and plural.
58
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat
59
-
60
- ### Belebele (IS)
61
- This is the Icelandic subset (900 examples) of the Belebele benchmark, a multiple-choice reading comprehension task. The task is to answer questions about a given passage.
62
- - Link to dataset: https://huggingface.co/datasets/facebook/belebele
63
-
64
- ### ARC-Challenge-IS
65
- A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
66
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
67
-
68
- ### WikiQA-IS
69
- The Icelandic WikiQA dataset is a collection of 1.9k question-answer pairs from the Icelandic Wikipedia, meant to evaluate models' knowledge of Icelandic culture and history.
70
- They were collected by making GPT-4o generate questions and answers
71
- given Icelandic Wikipedia articles as context. All examples were then manually verified and corrected where necessary. For evaluation, we prompt GPT-4o to
72
- compare the generated answer to the original answer for semantic similarity and rate the answer on the following scale: (0, "poor"), (1, "fair"), (2, "excellent").
73
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic_wiki_qa
74
- """
75
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/original_src/display/css_html_js.py DELETED
@@ -1,105 +0,0 @@
1
- custom_css = """
2
-
3
- .markdown-text {
4
- font-size: 16px !important;
5
- }
6
-
7
- #models-to-add-text {
8
- font-size: 18px !important;
9
- }
10
-
11
- #citation-button span {
12
- font-size: 16px !important;
13
- }
14
-
15
- #citation-button textarea {
16
- font-size: 16px !important;
17
- }
18
-
19
- #citation-button > label > button {
20
- margin: 6px;
21
- transform: scale(1.3);
22
- }
23
-
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
- #search-bar-table-box > div:first-child {
33
- background: none;
34
- border: none;
35
- }
36
-
37
- #search-bar {
38
- padding: 0px;
39
- }
40
-
41
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
- table td:first-child,
43
- table th:first-child {
44
- max-width: 400px;
45
- overflow: auto;
46
- white-space: nowrap;
47
- }
48
-
49
- .tab-buttons button {
50
- font-size: 20px;
51
- }
52
-
53
- #scale-logo {
54
- border-style: none !important;
55
- box-shadow: none;
56
- display: block;
57
- margin-left: auto;
58
- margin-right: auto;
59
- max-width: 600px;
60
- }
61
-
62
- #scale-logo .download {
63
- display: none;
64
- }
65
- #filter_type{
66
- border: 0;
67
- padding-left: 0;
68
- padding-top: 0;
69
- }
70
- #filter_type label {
71
- display: flex;
72
- }
73
- #filter_type label > span{
74
- margin-top: var(--spacing-lg);
75
- margin-right: 0.5em;
76
- }
77
- #filter_type label > .wrap{
78
- width: 103px;
79
- }
80
- #filter_type label > .wrap .wrap-inner{
81
- padding: 2px;
82
- }
83
- #filter_type label > .wrap .wrap-inner input{
84
- width: 1px
85
- }
86
- #filter-columns-type{
87
- border:0;
88
- padding:0.5;
89
- }
90
- #filter-columns-size{
91
- border:0;
92
- padding:0.5;
93
- }
94
- #box-filter > .form{
95
- border: 0
96
- }
97
- """
98
-
99
- get_window_url_params = """
100
- function(url_params) {
101
- const params = new URLSearchParams(window.location.search);
102
- url_params = Object.fromEntries(params);
103
- return url_params;
104
- }
105
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/original_src/envs.py DELETED
@@ -1,25 +0,0 @@
1
- import os
2
-
3
- from huggingface_hub import HfApi
4
-
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "mideind" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/icelandic-llm-leaderboard"
13
- QUEUE_REPO = f"{OWNER}/icelandic-llm-leaderboard-requests"
14
- RESULTS_REPO = f"{OWNER}/icelandic-llm-leaderboard-results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
- API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/original_src/submission/check_validity.py DELETED
@@ -1,36 +0,0 @@
1
- from transformers import AutoConfig
2
- from transformers.models.auto.tokenization_auto import AutoTokenizer
3
-
4
-
5
- def is_model_on_hub(
6
- model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
7
- ) -> tuple[bool, str]:
8
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
9
- try:
10
- config = AutoConfig.from_pretrained(
11
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
12
- )
13
- if test_tokenizer:
14
- try:
15
- tk = AutoTokenizer.from_pretrained(
16
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
17
- )
18
- except ValueError as e:
19
- return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
20
- except Exception as e:
21
- return (
22
- False,
23
- "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
24
- None,
25
- )
26
- return True, None, config
27
-
28
- except ValueError:
29
- return (
30
- False,
31
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
32
- None,
33
- )
34
-
35
- except Exception as e:
36
- return False, "was not found on hub!", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/pyproject.toml CHANGED
@@ -5,7 +5,6 @@ description = "Backend for the Icelandic LLM Leaderboard"
5
  authors = ["Mideind <[email protected]>"]
6
  packages = [
7
  {include = "app"},
8
- {include = "original_src"},
9
  ]
10
 
11
  [tool.poetry.dependencies]
 
5
  authors = ["Mideind <[email protected]>"]
6
  packages = [
7
  {include = "app"},
 
8
  ]
9
 
10
  [tool.poetry.dependencies]