gardarjuto commited on
Commit
d4577f4
·
1 Parent(s): 34da805

refactor(backend): Consolidate all source files into a single 'app' package

Browse files

- Moves all logic from 'original_src' into the 'app' directory to create a unified and more maintainable backend structure.
- Updates all imports to be absolute from the 'app' root, removing the need for sys.path manipulation.
- Fixes data processing bugs that arose during the refactor by correcting benchmark and metric names in the configuration.

.gitignore CHANGED
@@ -5,3 +5,8 @@
5
 
6
  # Ignore __pycache__ directories
7
  __pycache__/
 
 
 
 
 
 
5
 
6
  # Ignore __pycache__ directories
7
  __pycache__/
8
+
9
+ # Node dependencies and build output
10
+ frontend/node_modules
11
+ frontend/build
12
+
backend/app/about.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+
5
+ @dataclass
6
+ class Task:
7
+ benchmark: str
8
+ metric: str
9
+ col_name: str
10
+
11
+
12
+ class Tasks(Enum):
13
+ task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match,get-answer", col_name="WinoGrande-IS (3-shot)")
14
+ task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match,get-answer", col_name="GED")
15
+ task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match,get-answer", col_name="Inflection (1-shot)")
16
+ task5 = Task(benchmark="icelandic_belebele", metric="exact_match,get-answer", col_name="Belebele (IS)")
17
+ task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match,get-answer", col_name="ARC-Challenge-IS")
18
+ task7 = Task(benchmark="icelandic_wiki_qa", metric="lm_judge_score,get-answer", col_name="WikiQA-IS")
backend/{original_src → app}/display/__init__.py RENAMED
File without changes
backend/{original_src → app}/display/formatting.py RENAMED
File without changes
backend/{original_src → app}/display/utils.py RENAMED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
3
 
4
  import pandas as pd
5
 
6
+ from app.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
backend/app/leaderboard/__init__.py ADDED
File without changes
backend/{original_src → app}/leaderboard/read_evals.py RENAMED
@@ -7,9 +7,9 @@ from dataclasses import dataclass
7
  import dateutil
8
  import numpy as np
9
 
10
- from display.formatting import make_clickable_model
11
- from display.utils import AutoEvalColumn, ModelType, Tasks, Precision
12
- from submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
 
7
  import dateutil
8
  import numpy as np
9
 
10
+ from app.display.formatting import make_clickable_model
11
+ from app.display.utils import AutoEvalColumn, ModelType, Tasks, Precision
12
+ from app.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
backend/{original_src → app}/populate.py RENAMED
@@ -1,7 +1,7 @@
1
  import pandas as pd
2
 
3
- from display.utils import AutoEvalColumn
4
- from leaderboard.read_evals import get_raw_eval_results, EvalResult
5
 
6
 
7
  def get_leaderboard_df(
 
1
  import pandas as pd
2
 
3
+ from app.display.utils import AutoEvalColumn
4
+ from app.leaderboard.read_evals import get_raw_eval_results, EvalResult
5
 
6
 
7
  def get_leaderboard_df(
backend/app/services/leaderboard.py CHANGED
@@ -17,24 +17,10 @@ from app.core.cache import cache_config
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- # Import original processing logic
21
- import sys
22
- import os
23
-
24
- # Add the original Icelandic leaderboard source to Python path
25
- original_src_path = os.path.join(os.path.dirname(__file__), '..', '..', 'original_src')
26
- if original_src_path not in sys.path:
27
- sys.path.insert(0, original_src_path)
28
-
29
- # Also add the parent directory so imports like 'src.display.utils' work
30
- backend_path = os.path.join(os.path.dirname(__file__), '..', '..')
31
- if backend_path not in sys.path:
32
- sys.path.insert(0, backend_path)
33
-
34
  try:
35
- from leaderboard.read_evals import get_raw_eval_results
36
- from populate import get_leaderboard_df
37
- from display.utils import COLS, BENCHMARK_COLS, Tasks
38
  except ImportError as e:
39
  # Fallback for development without mounted volume
40
  logger.warning(f"Could not import original modules: {e}")
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
+ from app.leaderboard.read_evals import get_raw_eval_results
22
+ from app.populate import get_leaderboard_df
23
+ from app.display.utils import COLS, BENCHMARK_COLS, Tasks
24
  except ImportError as e:
25
  # Fallback for development without mounted volume
26
  logger.warning(f"Could not import original modules: {e}")
backend/app/submission/__init__.py ADDED
File without changes
backend/app/submission/check_validity.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from huggingface_hub import HfApi, hf_hub_url, HfFolder
3
+ from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
4
+ from requests import HTTPError
5
+
6
+ from app.config import HF_TOKEN
7
+
8
+
9
+ def is_model_on_hub(
10
+ model_id: str, revision: str, token: str = HF_TOKEN, trust_remote_code: bool = False, test_tokenizer=True
11
+ ) -> (bool, str, dict):
12
+ """Checks if a model is on the hub.
13
+ Returns:
14
+ (bool, str, dict): a tuple with a boolean indicating if the model is on the hub, a string with the error message, and the model config
15
+ """
16
+ if not token:
17
+ return (
18
+ False,
19
+ "No Hugging Face token provided. Please create a read token on the Hugging Face website and add it as a secret with the name `HF_TOKEN`.",
20
+ None,
21
+ )
22
+
23
+ api = HfApi(token=token)
24
+ try:
25
+ model_info = api.model_info(model_id, revision=revision)
26
+ model_config = None
27
+ if hasattr(model_info, "config"):
28
+ model_config = model_info.config
29
+ except RepositoryNotFoundError:
30
+ return False, f"Model {model_id} not found on hub", None
31
+ except (HTTPError, GatedRepoError) as e:
32
+ return False, f"Model {model_id} is gated, you need to accept the license agreement first.", None
33
+
34
+ if trust_remote_code and test_tokenizer:
35
+ from transformers import AutoTokenizer
36
+
37
+ try:
38
+ AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True, token=token)
39
+ except Exception as e:
40
+ return False, f"Could not load tokenizer for {model_id}. Error: {e}", None
41
+
42
+ return True, "", model_config
backend/original_src/about.py DELETED
@@ -1,75 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Select your tasks here
12
- # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("icelandic_winogrande_stringmatch", "exact_match,get-answer", "WinoGrande-IS (3-shot)")
16
- task1 = Task("icelandic_sentences_ged_stringmatch", "exact_match,get-answer", "GED")
17
- task2 = Task("icelandic_inflection_all", "exact_match,get-answer", "Inflection (1-shot)")
18
- task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
19
- task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
20
- task7 = Task("icelandic_wiki_qa", "lm_judge_score,get-answer", "WikiQA-IS")
21
-
22
- # ---------------------------------------------------
23
-
24
-
25
-
26
- # Your leaderboard name
27
- TITLE = """<h1 align="center" id="space-title">Icelandic LLM leaderboard</h1>"""
28
-
29
- # What does your leaderboard evaluate?
30
- INTRODUCTION_TEXT = """
31
- """
32
-
33
- # Which evaluations are you running? how can people reproduce what you have?
34
- LLM_BENCHMARKS_TEXT = f"""
35
- ## New submissions
36
- Do you want your model to be included on the leaderboard? Open a discussion on this repository with the details of your model and we will get back to you.
37
-
38
- ## Benchmark tasks
39
- The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks, where the model's output is compared to the expected output.
40
- This means that models that have not been instruction fine-tuned might perform poorly on these tasks.
41
-
42
- The following tasks are evaluated:
43
-
44
- ### WinoGrande-IS
45
- The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English.
46
- Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution.
47
- The benchmark is designed to test the model's ability to use knowledge and common sense reasoning in Icelandic. For this benchmark, we use 3-shot evaluation.
48
- The Icelandic WinoGrande dataset is described in more detail in the IceBERT paper (https://aclanthology.org/2022.lrec-1.464.pdf).
49
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-winogrande
50
-
51
- ### GED
52
- This is a benchmark for binary sentence-level Icelandic grammatical error detection, adapted from the Icelandic Error Corpus (IEC) and contains 200 examples.
53
- Each example consists of a sentence that may contain one or more grammatical errors, and the task is to predict whether the sentence contains an error.
54
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-sentences-gec
55
-
56
- ### Inflection benchmark
57
- The inflection benchmark tests models' ability to generate inflected forms of 300 Icelandic adjective-noun pairs for all four cases, singular and plural.
58
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat
59
-
60
- ### Belebele (IS)
61
- This is the Icelandic subset (900 examples) of the Belebele benchmark, a multiple-choice reading comprehension task. The task is to answer questions about a given passage.
62
- - Link to dataset: https://huggingface.co/datasets/facebook/belebele
63
-
64
- ### ARC-Challenge-IS
65
- A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
66
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
67
-
68
- ### WikiQA-IS
69
- The Icelandic WikiQA dataset is a collection of 1.9k question-answer pairs from the Icelandic Wikipedia, meant to evaluate models' knowledge of Icelandic culture and history.
70
- They were collected by making GPT-4o generate questions and answers
71
- given Icelandic Wikipedia articles as context. All examples were then manually verified and corrected where necessary. For evaluation, we prompt GPT-4o to
72
- compare the generated answer to the original answer for semantic similarity and rate the answer on the following scale: (0, "poor"), (1, "fair"), (2, "excellent").
73
- - Link to dataset: https://huggingface.co/datasets/mideind/icelandic_wiki_qa
74
- """
75
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/original_src/display/css_html_js.py DELETED
@@ -1,105 +0,0 @@
1
- custom_css = """
2
-
3
- .markdown-text {
4
- font-size: 16px !important;
5
- }
6
-
7
- #models-to-add-text {
8
- font-size: 18px !important;
9
- }
10
-
11
- #citation-button span {
12
- font-size: 16px !important;
13
- }
14
-
15
- #citation-button textarea {
16
- font-size: 16px !important;
17
- }
18
-
19
- #citation-button > label > button {
20
- margin: 6px;
21
- transform: scale(1.3);
22
- }
23
-
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
- #search-bar-table-box > div:first-child {
33
- background: none;
34
- border: none;
35
- }
36
-
37
- #search-bar {
38
- padding: 0px;
39
- }
40
-
41
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
- table td:first-child,
43
- table th:first-child {
44
- max-width: 400px;
45
- overflow: auto;
46
- white-space: nowrap;
47
- }
48
-
49
- .tab-buttons button {
50
- font-size: 20px;
51
- }
52
-
53
- #scale-logo {
54
- border-style: none !important;
55
- box-shadow: none;
56
- display: block;
57
- margin-left: auto;
58
- margin-right: auto;
59
- max-width: 600px;
60
- }
61
-
62
- #scale-logo .download {
63
- display: none;
64
- }
65
- #filter_type{
66
- border: 0;
67
- padding-left: 0;
68
- padding-top: 0;
69
- }
70
- #filter_type label {
71
- display: flex;
72
- }
73
- #filter_type label > span{
74
- margin-top: var(--spacing-lg);
75
- margin-right: 0.5em;
76
- }
77
- #filter_type label > .wrap{
78
- width: 103px;
79
- }
80
- #filter_type label > .wrap .wrap-inner{
81
- padding: 2px;
82
- }
83
- #filter_type label > .wrap .wrap-inner input{
84
- width: 1px
85
- }
86
- #filter-columns-type{
87
- border:0;
88
- padding:0.5;
89
- }
90
- #filter-columns-size{
91
- border:0;
92
- padding:0.5;
93
- }
94
- #box-filter > .form{
95
- border: 0
96
- }
97
- """
98
-
99
- get_window_url_params = """
100
- function(url_params) {
101
- const params = new URLSearchParams(window.location.search);
102
- url_params = Object.fromEntries(params);
103
- return url_params;
104
- }
105
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/original_src/envs.py DELETED
@@ -1,25 +0,0 @@
1
- import os
2
-
3
- from huggingface_hub import HfApi
4
-
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "mideind" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/icelandic-llm-leaderboard"
13
- QUEUE_REPO = f"{OWNER}/icelandic-llm-leaderboard-requests"
14
- RESULTS_REPO = f"{OWNER}/icelandic-llm-leaderboard-results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
- API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/original_src/submission/check_validity.py DELETED
@@ -1,36 +0,0 @@
1
- from transformers import AutoConfig
2
- from transformers.models.auto.tokenization_auto import AutoTokenizer
3
-
4
-
5
- def is_model_on_hub(
6
- model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
7
- ) -> tuple[bool, str]:
8
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
9
- try:
10
- config = AutoConfig.from_pretrained(
11
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
12
- )
13
- if test_tokenizer:
14
- try:
15
- tk = AutoTokenizer.from_pretrained(
16
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
17
- )
18
- except ValueError as e:
19
- return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
20
- except Exception as e:
21
- return (
22
- False,
23
- "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
24
- None,
25
- )
26
- return True, None, config
27
-
28
- except ValueError:
29
- return (
30
- False,
31
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
32
- None,
33
- )
34
-
35
- except Exception as e:
36
- return False, "was not found on hub!", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/pyproject.toml CHANGED
@@ -5,7 +5,6 @@ description = "Backend for the Icelandic LLM Leaderboard"
5
  authors = ["Mideind <[email protected]>"]
6
  packages = [
7
  {include = "app"},
8
- {include = "original_src"},
9
  ]
10
 
11
  [tool.poetry.dependencies]
 
5
  authors = ["Mideind <[email protected]>"]
6
  packages = [
7
  {include = "app"},
 
8
  ]
9
 
10
  [tool.poetry.dependencies]