# This module handles interfacing with the huggingface api
from typing import Literal, Optional
from datetime import datetime

from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
from datasets import load_dataset, concatenate_datasets, Dataset, Features, Value
from datasets.exceptions import DatasetNotFoundError

api = HfApi()

# Private dataset repo that stores the leaderboard rows.
LEADERBOARD_ID = "KoelLabs/_IPA-TRANSCRIPTION-EN-SCORES"

# Schema for one leaderboard row. Timestamps are stored second-granular in UTC.
LEADERBOARD_FEATURES = Features(
    {
        "display_name": Value("string"),
        "repo_id": Value("string"),
        "repo_hash": Value("string"),
        "repo_last_modified": Value("timestamp[s, tz=UTC]"),
        "submission_timestamp": Value("timestamp[s, tz=UTC]"),
        "average_per": Value("float32"),
        "average_fer": Value("float32"),
        "url": Value("string"),
        "fer_TIMIT": Value("float32"),
        "fer_EpaDB": Value("float32"),
        "fer_PSST": Value("float32"),
        "fer_SpeechOcean": Value("float32"),
        "fer_ISLE": Value("float32"),
    }
)

# Fill values used when an older leaderboard on the Hub is missing a column.
# Columns absent from this dict (the original required fields) default to None.
LEADERBOARD_DEFAULTS = {
    "url": "",
    "fer_TIMIT": None,
    "fer_EpaDB": None,
    "fer_PSST": None,
    "fer_SpeechOcean": None,
    "fer_ISLE": None,
}


def get_repo_info(
    repo_id, type: Literal["model", "dataset", "space"] = "model"
) -> tuple[str, datetime]:
    """Return ``(sha, last_modified)`` for a Hub repo.

    Falls back to ``("", 1970-01-01)`` when the repo does not exist, so
    callers never have to handle ``RepositoryNotFoundError`` themselves.
    """
    try:
        repo_info = api.repo_info(repo_id=repo_id, repo_type=type)
        return repo_info.sha, repo_info.last_modified  # type: ignore
    except RepositoryNotFoundError:
        return "", datetime(year=1970, month=1, day=1)


def _empty_leaderboard() -> Dataset:
    """Build a zero-row Dataset with the full leaderboard schema."""
    empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
    return Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)


def get_or_create_leaderboard() -> Dataset:
    """Load the leaderboard dataset from the Hub, creating/migrating it if needed.

    - Missing repo (``DatasetNotFoundError``): start empty and push it.
    - Existing but unloadable repo (``ValueError``, e.g. an empty repo):
      start empty but do not push unless a migration adds columns.
    - Any columns added to the schema since the data was pushed are
      backfilled with ``LEADERBOARD_DEFAULTS`` and pushed back.
    """
    modified = False
    try:
        dataset: Dataset = load_dataset(LEADERBOARD_ID)["train"]  # type: ignore
    except DatasetNotFoundError:
        dataset = _empty_leaderboard()
        modified = True
    except ValueError:
        # Repo exists but holds no loadable data; rebuild locally without
        # forcing a push (preserves original behavior).
        dataset = _empty_leaderboard()

    # Schema migration: backfill any columns added after the data was pushed.
    for col in LEADERBOARD_FEATURES.keys():
        if col not in dataset.column_names:
            modified = True
            dataset = dataset.add_column(col, [LEADERBOARD_DEFAULTS.get(col)] * len(dataset))  # type: ignore
            dataset = dataset.cast_column(col, feature=LEADERBOARD_FEATURES[col])

    if modified:
        dataset.push_to_hub(LEADERBOARD_ID, private=True)
    return dataset


def add_leaderboard_entry(
    display_name: str,
    repo_id: str,
    repo_hash: str,
    repo_last_modified: datetime,
    submission_timestamp: datetime,
    average_per: float,
    average_fer: float,
    url: str,
    per_dataset_fers: Optional[dict] = None,
):
    """Append one scored submission to the leaderboard and push it to the Hub.

    Timestamps are truncated to whole seconds to match the ``timestamp[s]``
    schema. ``per_dataset_fers`` maps dataset names (e.g. ``"TIMIT"``) to FER
    scores; missing entries are stored as None.
    """
    # NOTE: a mutable default ({}) was replaced with None to avoid the
    # shared-mutable-default pitfall; behavior for callers is unchanged.
    fers = per_dataset_fers or {}
    existing_dataset = get_or_create_leaderboard()
    new_row = Dataset.from_dict(
        dict(
            display_name=[display_name],
            repo_id=[repo_id],
            repo_hash=[repo_hash],
            repo_last_modified=[repo_last_modified.replace(microsecond=0)],
            submission_timestamp=[submission_timestamp.replace(microsecond=0)],
            average_per=[average_per],
            average_fer=[average_fer],
            url=[url],
            fer_TIMIT=[fers.get("TIMIT")],
            fer_EpaDB=[fers.get("EpaDB")],
            fer_PSST=[fers.get("PSST")],
            fer_SpeechOcean=[fers.get("SpeechOcean")],
            fer_ISLE=[fers.get("ISLE")],
        ),
        features=LEADERBOARD_FEATURES,
    )
    combined_dataset = concatenate_datasets([existing_dataset, new_row])
    combined_dataset.push_to_hub(LEADERBOARD_ID, private=True)


if __name__ == "__main__":
    print(get_repo_info(LEADERBOARD_ID, type="dataset"))
    print(get_or_create_leaderboard().to_pandas().head(5))  # type: ignore