Apply pre-commit

Files changed:
- README.md +1 -1
- src/about.py +12 -4
- src/display/utils.py +13 -4
- src/envs.py +3 -3
- src/leaderboard/read_evals.py +24 -36
- src/submission/check_validity.py +20 -12
- src/submission/submit.py +3 -7
    	
README.md
CHANGED

@@ -39,7 +39,7 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 
 # Code logic for more complex edits
 
-You'll find 
+You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
 - teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
    	
src/about.py
CHANGED

@@ -41,8 +41,12 @@ class Tasks(Enum):
     jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM")
     jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK")
     jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad")
-    jsts_pearson = Task("scores", "jsts_pearson", "JSTS (Pearson) - 意味的類似度")  # Semantic Textual Similarity - 意味的類似度
-    jsts_spearman = Task("scores", "jsts_spearman", "JSTS (Spearman) - 意味的類似度")  # Semantic Textual Similarity - 意味的類似度
+    jsts_pearson = Task(
+        "scores", "jsts_pearson", "JSTS (Pearson) - 意味的類似度"
+    )  # Semantic Textual Similarity - 意味的類似度
+    jsts_spearman = Task(
+        "scores", "jsts_spearman", "JSTS (Spearman) - 意味的類似度"
+    )  # Semantic Textual Similarity - 意味的類似度
     kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI")
     mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS")
     mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU")
@@ -52,10 +56,14 @@ class Tasks(Enum):
     wiki_ner_set_f1 = Task("scores", "wiki_ner_set_f1", "Wiki NER")
     wiki_pas_set_f1 = Task("scores", "wiki_pas_set_f1", "Wiki PAS")
     wiki_reading_char_f1 = Task("scores", "wiki_reading_char_f1", "Wiki Reading")
-    wikicorpus_e_to_j_bert_score_ja_f1 = Task("scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score")
+    wikicorpus_e_to_j_bert_score_ja_f1 = Task(
+        "scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score"
+    )
     wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU")
     wikicorpus_e_to_j_comet_wmt22 = Task("scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22")
-    wikicorpus_j_to_e_bert_score_en_f1 = Task("scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score")
+    wikicorpus_j_to_e_bert_score_en_f1 = Task(
+        "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score"
+    )
     wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU")
     wikicorpus_j_to_e_comet_wmt22 = Task("scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22")
     xlsum_ja_bert_score_ja_f1 = Task("scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score")
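For context, a minimal sketch of how a `Tasks` entry is consumed downstream: each member wraps a small `Task` record whose fields name the results block ("scores"), the metric key, and the display column. The `Task` dataclass itself is not shown in this diff, so its field layout here is assumed from the standard leaderboard template:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass(frozen=True)
class Task:  # assumed field layout, not taken from this commit
    benchmark: str  # key of the block in the results JSON ("scores")
    metric: str     # key of the individual score inside that block
    col_name: str   # column header shown in the leaderboard table


class Tasks(Enum):
    jsts_pearson = Task("scores", "jsts_pearson", "JSTS (Pearson) - 意味的類似度")
    jsts_spearman = Task("scores", "jsts_spearman", "JSTS (Spearman) - 意味的類似度")


# read_evals.py-style lookup: pull each metric out of a parsed results file.
scores = {"jsts_pearson": 0.85}  # toy data, not real leaderboard results
results = {task.value.metric: scores.get(task.value.metric) for task in Tasks}
print(results)  # {'jsts_pearson': 0.85, 'jsts_spearman': None}
```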
    	
src/display/utils.py
CHANGED

@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -21,12 +22,13 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
+# Scores
 # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
@@ -47,6 +49,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -57,12 +60,13 @@ class EvalQueueColumn:  # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -87,11 +91,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -104,23 +110,26 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 class AddSpecialTokens(Enum):
     true = ModelDetails("True")
     false = ModelDetails("False")
     Unknown = ModelDetails("?")
 
+
 class NumFewShots(Enum):
     shots_0 = ModelDetails("0")
     shots_4 = ModelDetails("4")
     Unknown = ModelDetails("?")
 
     def from_str(shots):
-        if shots=="0":
+        if shots == "0":
             return NumFewShots.shots_0
-        if shots=="4":
+        if shots == "4":
             return NumFewShots.shots_4
         return NumFewShots.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn)]
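For context, the pattern this file relies on: each `auto_eval_column_dict` entry is `[name, annotation, default]`, `make_dataclass` turns the list into the `AutoEvalColumn` class, and `fields()` later walks the resulting class attributes to build `COLS`/`TYPES`. A minimal sketch with field names assumed from the template (the real `ColumnContent` may differ slightly):

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)  # frozen so instances are hashable and can serve as class-level defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


# Each entry is [attribute_name, annotation, default]; make_dataclass keeps the
# ColumnContent defaults as class attributes on the generated class.
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["jsts_pearson", ColumnContent, ColumnContent("JSTS (Pearson)", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)


def fields(raw_class):
    # Same helper as in the diff: collect the non-dunder class attributes.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)  # ['Model', 'JSTS (Pearson)']
```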
    	
src/envs.py
CHANGED

@@ -4,9 +4,9 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "llm-jp" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "llm-jp"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"
@@ -14,7 +14,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
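A small usage sketch (assumption, not part of the commit): everything in `src/envs.py` resolves from environment variables, so a local run only needs `HF_TOKEN` and optionally `HF_HOME` exported before the module is imported. The token value below is a placeholder:

```python
import os

# Export (or default) the variables the space expects before importing src.envs.
os.environ.setdefault("HF_TOKEN", "hf_xxx")        # placeholder, not a real token
os.environ.setdefault("HF_HOME", "/tmp/hf-cache")

TOKEN = os.environ.get("HF_TOKEN")
CACHE_PATH = os.getenv("HF_HOME", ".")             # same fallback as in the diff
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
print(EVAL_REQUESTS_PATH)  # /tmp/hf-cache/eval-queue
```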
    	
src/leaderboard/read_evals.py
CHANGED

@@ -1,37 +1,36 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
-import dateutil
-import numpy as np
 from decimal import Decimal
 
+import dateutil
+
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks,
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     # precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     precision: str = "Unknown"
     # model_type: str = "Unknown"
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
     num_few_shots: str = "0"
     add_special_tokens: str = ""
@@ -47,7 +46,7 @@ class EvalResult:
         model_config = config.get("model", {})
 
         # Get model type from metainfo
-        # model_type_str = metainfo.get("model_type", "")
+        # model_type_str = metainfo.get("model_type", "")
         # model_type = ModelType.from_str(model_type_str)
         # model_type = metainfo.get("model_type", "Unknown")
 
@@ -59,13 +58,15 @@ class EvalResult:
         precision = model_config.get("dtype", "Unknown")
 
         # Add Special Tokens
-        add_special_tokens = str(config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens"))
+        add_special_tokens = str(
+            config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens")
+        )
 
         # Get model and org
         # org_and_model = config.get("model_name", config.get("offline_inference").get("model_name", None))
         org_and_model = config.get("model_name", config.get("offline_inference", {}).get("model_name", "Unknown"))
         org_and_model = org_and_model.split("/", 1)
-
+
         # org_and_modelがリストの場合、"/"で結合
         if isinstance(org_and_model, list):
             full_model = "/".join(org_and_model)
@@ -92,7 +93,7 @@ class EvalResult:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
-
+
         if "scores" not in data:
             raise KeyError(f"'scores' key not found in JSON file: {json_filepath}")
 
@@ -103,7 +104,6 @@ class EvalResult:
             score = scores.get(task_value.metric)
             results[task_value.metric] = score
 
-
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -121,12 +121,6 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
-        if request_file:
-            with open(request_file, "r") as f:
-                request_data = json.load(f)
-        else:
-            print("No request file found.")
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -186,17 +180,15 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
+
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
-
+
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
@@ -210,7 +202,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
-
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
@@ -225,17 +216,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         else:
             eval_results[eval_name] = eval_result
 
-    data_dict = eval_result.to_dict()
-
     results = []
    for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
     # print(f"Processing file: {model_result_filepath}")
     # print(f"Eval result: {eval_result.to_dict()}")
 
-
-    return results
+    return results
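The defensive lookups this file settles on are worth spelling out: chaining `.get()` with a dict default never raises, but the default only applies when the outer key is missing entirely. A short sketch with toy data (the config values below are placeholders, not real leaderboard output):

```python
config = {"model_name": "org/model-name", "pipeline_kwargs": {}}  # toy run config

# Falls back to "Unknown" only when "pipeline_kwargs" itself is absent; if the
# section exists but lacks the key, .get() returns None and str() yields "None".
add_special_tokens = str(
    config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens")
)

# Prefer the top-level model_name, otherwise look inside offline_inference,
# and never raise even when both are missing.
org_and_model = config.get("model_name", config.get("offline_inference", {}).get("model_name", "Unknown"))

print(add_special_tokens)  # None
print(org_and_model)       # org/model-name
```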
    	
src/submission/check_validity.py
CHANGED

@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
 from huggingface_hub import ModelCard
@@ -10,6 +8,7 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -31,31 +30,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
                 return (
                     False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )
 
-    except Exception as e:
+    except Exception:
         return False, "was not found on hub!", None
 
 
@@ -70,10 +76,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
@@ -88,7 +96,7 @@ already_submitted_models(requested_models_dir: str) -> set[str]:
                     continue
                 with open(os.path.join(root, file), "r") as f:
                     info = json.load(f)
-                    if info['status'] == 'FAILED':
+                    if info["status"] == "FAILED":
                         continue
                     file_names.append(f"{info['model']}_{info['precision']}_{info['add_special_tokens']}")
 
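For context, a hypothetical caller sketch (the actual call sites are not in this diff): despite the `tuple[bool, str]` annotation, `is_model_on_hub` returns three values — a flag, an error fragment meant to be appended after the model name, and the loaded config. The model id below is only an example, and running this needs the repository plus `transformers` installed:

```python
from src.submission.check_validity import is_model_on_hub

model = "org/model-name"  # example id only
ok, error, config = is_model_on_hub(
    model_name=model,
    revision="main",
    token=None,
    trust_remote_code=False,
    test_tokenizer=True,
)
if not ok:
    # Error fragments are written to follow the model name,
    # e.g. "org/model-name was not found on hub!".
    print(f"{model} {error}")
```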
    	
src/submission/submit.py
CHANGED

@@ -3,17 +3,13 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     revision: str,
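Finally, a sketch of the request-file fields that the validity checks above read (the concrete values are placeholders, and real request files may carry additional keys): `check_validity.py` keys submissions by model, precision, and add_special_tokens and skips FAILED entries, while `read_evals.py` only picks up requests whose status is FINISHED.

```python
import json

# Hypothetical request record with only the fields the diffs above rely on.
request = {
    "model": "org/model-name",       # example values only
    "precision": "bfloat16",
    "add_special_tokens": "True",
    "status": "FINISHED",            # read_evals.py ignores anything not FINISHED
}
print(json.dumps(request, indent=2))
# Duplicate-check key used by already_submitted_models():
print(f"{request['model']}_{request['precision']}_{request['add_special_tokens']}")
```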

