d0rj committed
Commit 1719436 · 1 Parent(s): 101a598

feat: Initial commit

.gitignore ADDED
@@ -0,0 +1,208 @@
+ notebooks/
+ tmp*
+ *.jsonl
+ *.json
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
LICENSE ADDED
@@ -0,0 +1,21 @@
+ # MIT License
+
+ Copyright (c) 2025 d0rj
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,25 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
+
+ from src.common.paths import DOCS_PATH, DATASET_NAME
+
+
+ with gr.Blocks(
+     title="ROMB Leaderboard v1.0",
+     theme=gr.themes.Ocean(
+         primary_hue=gr.themes.colors.green,
+     ),
+ ) as application:
+     gr.Markdown("# 🥇 ROMB - Russian Olympiad Math Benchmark")
+     gr.Markdown(f"See the ROMB-1.0 dataset here: [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}).")
+     with gr.Tabs():
+         with gr.Tab("Leaderboard"):
+             gr.Markdown("In progress...")
+         with gr.Tab("Evaluate"):
+             gr.Markdown((DOCS_PATH / "evaluate.md").read_text())
+         with gr.Tab("Submit"):
+             gr.Markdown("In progress...")
+
+
+ if __name__ == "__main__":
+     application.launch()
cli.py ADDED
@@ -0,0 +1,19 @@
+ import click
+
+ from src.eval.cli import evaluate, metrics
+ from src.generate.cli import generate, type_sanitycheck
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ cli.add_command(metrics)
+ cli.add_command(evaluate)
+ cli.add_command(generate)
+ cli.add_command(type_sanitycheck)
+
+
+ if __name__ == "__main__":
+     cli()
configs/gemma-3-1b.yaml ADDED
@@ -0,0 +1,22 @@
+ build_function: singleturn
+ llm_class: ollama
+ kwargs:
+   llm_args:
+     model: gemma3:1b
+     top_k: 1
+     top_p: 1
+     temperature: 0.0
+ # build_function: thinking
+ # llm_class: ollama
+ # kwargs:
+ #   think_llm_args:
+ #     model: gemma3:1b
+ #     top_k: 1
+ #     top_p: 1
+ #     temperature: 0.0
+ #     max_tokens: 1024
+ #   answer_llm_args:
+ #     model: gemma3:1b
+ #     top_k: 1
+ #     top_p: 1
+ #     temperature: 0.0
configs/ollama.yaml ADDED
@@ -0,0 +1,13 @@
+ build_function: thinking
+ llm_class: ollama
+ kwargs:
+   think_llm_args:
+     model: gemma3:1b
+     top_k: 1
+     top_p: 1
+     temperature: 0.0
+   answer_llm_args:
+     model: gemma3:1b
+     top_k: 1
+     top_p: 1
+     temperature: 0.0
configs/openrouter.yaml ADDED
@@ -0,0 +1,9 @@
+ build_function: singleturn
+ llm_class: openai
+ kwargs:
+   llm_args:
+     model_name: google/gemini-2.0-flash-lite-001
+     temperature: 0.0
+     top_p: 1.0
+     base_url: https://openrouter.ai/api/v1
+     api_key: sk-or-v1-...
docs/evaluate.md ADDED
@@ -0,0 +1,90 @@
+ ## Evaluation process
+
+ ### 1. Generate responses
+
+ The first and main step is to generate the answers. You can do this in any way that is convenient for you, including with the scripts in this repository. What matters is that you end up with a file of answers in JSONL format, where each object contains the fields `id` (question id, int) and `generated_answer` (model response, JSON object). Example:
+
+ ```json
+ {"id":0,"generated_answer":{"answer":"А","context":{}}}
+ {"id":1,"generated_answer":{"answer":"А","context":{}}}
+ {"id":2,"generated_answer":{"answer":36,"context":{}}}
+ {"id":3,"generated_answer":{"answer":10,"context":{}}}
+ {"id":4,"generated_answer":{"answer":3000000000000000,"context":{}}}
+ {"id":5,"generated_answer":{"answer":"А","context":{}}}
+ {"id":6,"generated_answer":{"answer":10,"context":{}}}
+ {"id":7,"generated_answer":{"answer":"А","context":{}}}
+ {"id":8,"generated_answer":{"answer":{"Удав":4,"Слоненок":1,"Мартышка":3},"context":{}}}
+ ...
+ ```
+
+ #### Generation utils
+
+ Two types of prompts are currently supported (answering immediately, or answering after a preliminary reasoning turn) and two types of model providers (Ollama and OpenAI-API-compatible).
+
+ ```bash
+ python3 cli.py generate --help
+ ```
+
+ An example of generating responses with the Gemma 3 1B model:
+
+ ```bash
+ ollama run gemma3:1b
+ ```
+
+ ```bash
+ python3 cli.py generate --config-path configs/gemma-3-1b.yaml --output-path ./gemma-3-1b_nothink.jsonl --temp-path ./tmp_gemma-3-1b/
+ ```
+
+ ### 2. Validate responses
+
+ The generated responses can be checked for the correct answer types using the utility:
+
+ ```bash
+ python3 cli.py type-sanitycheck --help
+ ```
+
+ ```bash
+ python3 cli.py type-sanitycheck --file ./gemma-3-1b_nothink.jsonl
+ ```
+
+ ### 3. Evaluate responses
+
+ Once you have the answers file, you can run the solved/unsolved assessment using the utility:
+
+ ```bash
+ python3 cli.py evaluate --help
+ ```
+
+ ```bash
+ python3 cli.py evaluate --file ./gemma-3-1b_nothink.jsonl
+ ```
+
+ As a result, you will receive the file `gemma-3-1b_nothink.eval.jsonl` with a new field `is_correct` (bool) - the result of checking each response.
+
+ ### 4. Calculate overall metrics
+
+ ```bash
+ python3 cli.py metrics --help
+ ```
+
+ ```bash
+ python3 cli.py metrics --model-name gemma-3-1b --file ./gemma-3-1b_nothink.eval.jsonl --model-size 1.0 --model-url https://huggingface.co/google/gemma-3-1b-it --model-config "{'build_function': 'singleturn', 'top_k': 1, 'top_p': 1, 'temperature': 0.0}"
+ ```
+
+ As a result, you will receive the file `gemma-3-1b_nothink.eval.metrics.json` with the overall metrics for the model:
+
+ ```json
+ [
+     {
+         "model_name": "gemma-3-1b",
+         "model_size": 1.0,
+         "model_url": "https://huggingface.co/google/gemma-3-1b-it",
+         "pass1": 0.10148902821316615,
+         "weighted_pass1": 0.10207932648691802,
+         "arith_pass1": 0.08566433566433566,
+         "geometry_pass1": 0.125,
+         "logic_pass1": 0.13664596273291926,
+         "config": "{'build_function': 'singleturn', 'top_k': 1, 'top_p': 1, 'temperature': 0.0}"
+     }
+ ]
+ ```
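The answers file from step 1 can also be produced without this repository's tooling. Below is a minimal sketch of writing a compatible JSONL file; `my_model_answer` and `tasks` are hypothetical placeholders for your own inference code and loaded tasks, not part of this repo:

```python
import json


def my_model_answer(task_text: str) -> str:
    # Hypothetical stand-in for your own model call.
    return "А"  # placeholder answer


# `tasks` stands for the loaded ROMB tasks (at minimum `id` and `task_text`).
tasks = [{"id": 0, "task_text": "..."}, {"id": 1, "task_text": "..."}]

with open("answers.jsonl", "w", encoding="utf-8") as f:
    for task in tasks:
        record = {
            "id": task["id"],
            "generated_answer": {"answer": my_model_answer(task["task_text"]), "context": {}},
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```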
prompts/simple_think_end.yaml ADDED
@@ -0,0 +1,13 @@
+ _type: prompt
+ input_variables:
+ - answer_type
+ - task_note
+ metadata: null
+ name: null
+ optional_variables: []
+ output_parser: null
+ partial_variables: {}
+ tags: null
+ template: "Напиши свой ответ в этом формате: {answer_type}\nПояснение к формату: {task_note}"
+ template_format: f-string
+ validate_template: false
prompts/simple_think_system.yaml ADDED
@@ -0,0 +1,11 @@
+ _type: prompt
+ input_variables: []
+ metadata: null
+ name: null
+ optional_variables: []
+ output_parser: null
+ partial_variables: {}
+ tags: null
+ template: "Реши следующую математическую задачу эффективно и ясно. Думай шаг за шагом перед ответом."
+ template_format: f-string
+ validate_template: false
prompts/singleturn.yaml ADDED
@@ -0,0 +1,14 @@
+ _type: prompt
+ input_variables:
+ - answer_type
+ - task_note
+ - task_text
+ metadata: null
+ name: null
+ optional_variables: []
+ output_parser: null
+ partial_variables: {}
+ tags: null
+ template: "{task_text}\n\nНапиши свой ответ в этом формате: {answer_type}\nПояснение к формату: {task_note}"
+ template_format: f-string
+ validate_template: false
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ pandas
+ pandera
+ datasets
+ langchain
+ langchain-community
+ langchain-openai
+ langchain-ollama
+ gradio
+ gradio-leaderboard
+ pydantic>=2
+ pydantic-yaml
+ click
src/common/data.py ADDED
@@ -0,0 +1,17 @@
+ import json
+
+ import datasets
+ import pandas as pd
+ import pandera.pandas as pa
+
+ from src.common.paths import DATASET_NAME
+ from src.common.schema import DatasetSchema
+
+
+ @pa.check_output(DatasetSchema)
+ def load_dataset() -> pd.DataFrame:
+     ds = datasets.load_dataset(DATASET_NAME, split="test")
+     df = pd.DataFrame(ds)
+
+     df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(json.loads)
+     return df
src/common/env.py ADDED
@@ -0,0 +1,25 @@
+ from typing import Any
+
+
+ def build_default_namespace() -> dict[str, Any]:
+     """Creates a dictionary with types from the typing module and built-in types."""
+     import typing
+     from numbers import Number
+     from fractions import Fraction
+
+     namespace = {
+         name: getattr(typing, name) for name in dir(typing) if not name.startswith("_")
+     }
+     namespace.update(
+         {
+             "int": int,
+             "str": str,
+             "float": float,
+             "bool": bool,
+             "dict": dict,
+             "list": list,
+         }
+     )
+     namespace.update({"Fraction": Fraction, "Number": Number})
+
+     return namespace
src/common/paths.py ADDED
@@ -0,0 +1,11 @@
+ import pathlib
+
+
+ _FILE_PATH = pathlib.Path(__file__).parent.resolve()
+
+ PROJECT_ROOT = _FILE_PATH.parent.parent
+ PROMPTS_PATH = PROJECT_ROOT / "prompts"
+ DATA_PATH = PROJECT_ROOT / "data"
+ DOCS_PATH = PROJECT_ROOT / "docs"
+
+ DATASET_NAME = "d0rj/ROMB-1.0"
src/common/schema.py ADDED
@@ -0,0 +1,32 @@
+ from typing import Any
+
+ import pandera.pandas as pa
+
+
+ class DatasetSchema(pa.DataFrameModel):
+     id_: pa.typing.Series[int] = pa.Field(alias="id")
+     task_text: pa.typing.Series[str]
+     answer_text: pa.typing.Series[str]
+     correct_answer: pa.typing.Series[Any]
+     date: pa.typing.Series[str]
+     olymp_name: pa.typing.Series[str]
+     grade: pa.typing.Series[str]
+     description: pa.typing.Series[str]
+     source: pa.typing.Series[str]
+     answer_type: pa.typing.Series[str]
+     check_type: pa.typing.Series[str]
+     check_function: pa.typing.Series[str] = pa.Field(nullable=True)
+     task_type: pa.typing.Series[str]
+     task_note: pa.typing.Series[str]
+
+
+ class LeaderBoardSchema(pa.DataFrameModel):
+     model_name: pa.typing.Series[str]
+     model_size: pa.typing.Series[float] = pa.Field(nullable=True)
+     model_url: pa.typing.Series[str] = pa.Field(nullable=True)
+     pass1: pa.typing.Series[float]
+     weighted_pass1: pa.typing.Series[float]
+     arith_pass1: pa.typing.Series[float]
+     geometry_pass1: pa.typing.Series[float]
+     logic_pass1: pa.typing.Series[float]
+     config: pa.typing.Series[str] = pa.Field(nullable=True, default={})
src/eval/cli.py ADDED
@@ -0,0 +1,191 @@
+ import json
+ import pathlib
+ from copy import deepcopy
+
+ import click
+ import pandas as pd
+ import pandera.pandas as pa
+ from tqdm.auto import tqdm
+
+ from src.common.data import load_dataset
+ from src.eval.metrics import grade_to_weight
+ from src.eval.schema import DatasetEvalSchema
+ from src.eval.matchers import build_check_function
+ from src.generate.generators import GenerationAnswer
+ from src.generate.schema import GeneratedDatasetSchema
+ from src.common.schema import DatasetSchema, LeaderBoardSchema
+
+
+ def _evaluate_single_answer(
+     row: dict,
+ ) -> bool:
+     if pd.isna(row[GeneratedDatasetSchema.generated_answer]):
+         return False
+     if not type(row[GeneratedDatasetSchema.generated_answer]) is GenerationAnswer:
+         raise ValueError(
+             f"Expected GenerationAnswer, got {type(row[GeneratedDatasetSchema.generated_answer])} for id {row[DatasetSchema.id_]}",
+         )
+     y_pred = row[GeneratedDatasetSchema.generated_answer].answer
+     if not y_pred:
+         return False
+
+     y_true = row[DatasetSchema.correct_answer]
+     check_function = build_check_function(
+         row[DatasetSchema.check_type],
+         row[DatasetSchema.check_function],
+     )
+     try:
+         result = check_function(
+             y_true=deepcopy(y_true),
+             y_pred=deepcopy(y_pred),
+         )
+     except Exception as e:
+         print(e)
+         print(f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}")
+         exit(1)
+     return result
+
+
+ @pa.check_input(GeneratedDatasetSchema)
+ @pa.check_output(DatasetEvalSchema)
+ def _evaluate(
+     generated_df: pd.DataFrame,
+ ) -> pd.DataFrame:
+     tqdm.pandas()
+
+     generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
+     )
+     dataset_df = load_dataset()
+     predictions_df = dataset_df.join(
+         generated_df.set_index(GeneratedDatasetSchema.id_),
+         on=DatasetSchema.id_,
+     )
+
+     predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
+         _evaluate_single_answer,
+         axis=1,
+     )
+
+     predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: x.answer if not pd.isna(x) else None,
+     )
+     predictions_df[DatasetEvalSchema.context] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: x.context if not pd.isna(x) else None,
+     )
+     predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
+
+     return predictions_df
+
+
+ @click.command()
+ @click.option(
+     "--file",
+     type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
+     default=pathlib.Path("./gemma3:4b.jsonl"),
+ )
+ def evaluate(
+     file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
+ ):
+     file = pathlib.Path(file)
+
+     df = pd.read_json(file, lines=True)
+     evaluated_df = _evaluate(df)
+     evaluated_df.to_json(file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False)
+
+
+ @pa.check_input(DatasetEvalSchema)
+ @pa.check_output(LeaderBoardSchema)
+ def _metrics(
+     df: pd.DataFrame,
+     model_name: str,
+     model_size: float,
+     model_url: str,
+     model_config: str
+ ) -> pd.DataFrame:
+     pass1 = df[DatasetEvalSchema.is_correct].mean()
+
+     w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
+     weighted_accuracy = (df[DatasetEvalSchema.is_correct].astype(int) * w).sum() / w.sum()
+
+     arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][DatasetEvalSchema.is_correct].mean()
+     geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][DatasetEvalSchema.is_correct].mean()
+     logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][DatasetEvalSchema.is_correct].mean()
+
+     result = {
+         LeaderBoardSchema.model_name: model_name,
+         LeaderBoardSchema.model_size: model_size,
+         LeaderBoardSchema.model_url: model_url,
+         LeaderBoardSchema.config: str(model_config),
+         LeaderBoardSchema.pass1: pass1,
+         LeaderBoardSchema.weighted_pass1: weighted_accuracy,
+         LeaderBoardSchema.arith_pass1: arith_pass1,
+         LeaderBoardSchema.geometry_pass1: geometry_pass1,
+         LeaderBoardSchema.logic_pass1: logic_pass1,
+     }
+
+     result_df = pd.DataFrame([result])
+     result_df = result_df[list(LeaderBoardSchema._collect_fields().keys())]
+     return result_df
+
+
+ @click.command()
+ @click.option(
+     "--model-name",
+     type=str,
+     required=True,
+     help="Name of the model being evaluated.",
+ )
+ @click.option(
+     "--file",
+     type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
+     default=pathlib.Path("./gemma3:4b_eval.jsonl"),
+ )
+ @click.option(
+     "--model-size",
+     type=float,
+     default=None,
+     help="Size of the model in billions of parameters.",
+ )
+ @click.option(
+     "--model-url",
+     type=str,
+     default=None,
+     help="URL where the model can be accessed.",
+ )
+ @click.option(
+     "--model-config",
+     type=str,
+     default=None,
+     help="Model configuration in dict format.",
+ )
+ def metrics(
+     model_name: str,
+     file: pathlib.Path = pathlib.Path("./gemma3:4b_eval.jsonl"),
+     model_size: float = None,
+     model_url: str = None,
+     model_config: str = None,
+ ):
+     file = pathlib.Path(file)
+
+     df = pd.read_json(file, lines=True)
+     metrics_df = _metrics(
+         df,
+         model_name=model_name,
+         model_size=model_size,
+         model_url=model_url,
+         model_config=model_config or '',
+     )
+     metrics = metrics_df.to_dict(orient="records")[0]
+     print(f"Metrics for {model_name}:")
+     for key, value in metrics.items():
+         print(f"{key}: {value}")
+     json.dump(
+         metrics_df.to_dict(orient="records"),
+         open(file.with_suffix(".metrics.json"), "w"),
+         ensure_ascii=False,
+     )
+
+
+ if __name__ == "__main__":
+     evaluate()
src/eval/matchers.py ADDED
@@ -0,0 +1,144 @@
+ import collections
+ from typing import Any, Callable
+
+ from src.common.env import build_default_namespace
+
+
+ def _dict_to_tuple(dict_obj: dict) -> tuple[tuple]:
+     return tuple(sorted(dict_obj.items()))
+
+
+ def Am(y_true: list, y_pred: list) -> bool:
+     """Check if all elements in y_pred are present in y_true and vice versa."""
+     return all(y in y_true for y in y_pred) and all(y in y_pred for y in y_true)
+
+
+ def am(y_true: list, y_pred: list) -> bool:
+     """Check if any elements in y_pred are present in y_true."""
+     return any(y in y_true for y in y_pred)
+
+
+ def em(y_true: Any, y_pred: Any) -> bool:
+     """Check if the true answer and predicted answer are exactly the same."""
+     if type(y_true) is str:
+         y_true = y_true.lower()
+         y_pred = y_pred.lower()
+     return y_true == y_pred
+
+
+ def um(y_true: list, y_pred: list) -> bool:
+     """Check if the true answer and predicted answer are unordered but contain the same elements."""
+     if len(y_true) != len(y_pred):
+         return False
+     if len(y_true) == 0:
+         return True
+     if (len(y_true) > 0 and type(y_true[0]) is dict) or (len(y_true) == 0 and type(y_pred[0]) is dict):
+         y_true = [_dict_to_tuple(item) for item in y_true]
+         y_pred = [_dict_to_tuple(item) for item in y_pred]
+     if type(y_true) != type(y_pred):
+         return False
+     return collections.Counter(y_true) == collections.Counter(y_pred)
+
+
+ def om(y_true: list, y_pred: list) -> bool:
+     """Check if the true answer and predicted answer are in the same order."""
+     return list(y_true) == list(y_pred)
+
+
+ def um_om(y_true: list[list], y_pred: list[list]) -> bool:
+     """Check if the true answer and predicted answer are unordered lists of ordered sublists."""
+     true_bags = collections.Counter(tuple(sub) for sub in y_true)
+     pred_bags = collections.Counter(tuple(sub) for sub in y_pred)
+     return true_bags == pred_bags
+
+
+ def um_um(y_true: list[list], y_pred: list[list]) -> bool:
+     """Check if the true answer and predicted answer are unordered lists of unordered sublists."""
+     true_sets = [tuple(sorted(sub)) for sub in y_true]
+     pred_sets = [tuple(sorted(sub)) for sub in y_pred]
+     return collections.Counter(true_sets) == collections.Counter(pred_sets)
+
+
+ def _build_custom(check_code: str) -> Callable[[Any, Any], bool]:
+     """
+     Builds a custom function based on the provided check code.
+     The check code should be a string representing a Python expression.
+     """
+     code = "\n".join([f"    {line}" for line in check_code.splitlines()])
+     code = f"def check(y_true: Any, y_pred: Any) -> bool:\n{code}"
+     namespace = build_default_namespace()
+     exec(code, namespace)
+     return namespace["check"]
+
+
+ def _build_dict(type_dict: dict[Any, str]) -> Callable[[Any, Any], bool]:
+     """
+     Builds a function that checks if the predicted answer matches the true answer
+     for each field in the type dictionary.
+     """
+
+     def check(y_true, y_pred) -> bool:
+         assert set(type_dict.keys()) == set(y_true.keys())
+         try:
+             for key, value in y_true.items():
+                 key_check = build_check_function(type_dict[key])
+                 if not key_check(y_true=value, y_pred=y_pred[key]):
+                     return False
+             return True
+         except KeyError:
+             return False
+
+     return check
+
+
+ def build_check_function(
+     check_type: str, check_code: str | None = None
+ ) -> Callable[[Any, Any], bool]:
+     """
+     Returns a function that checks if the predicted answer matches the true answer.
+
+     Args:
+         check_type (str): The type of check to perform. Can be one of:
+             - "Am": All match
+             - "am": Any match
+             - "em": Exact match
+             - "um": Unordered match
+             - "om": Ordered match
+             - "um[om]": Unordered match with ordered sublists
+             - "um[um]": Unordered match with unordered sublists
+             - "custom": Custom check defined by `check_code`
+             - A dictionary where keys are field names and values are check types for each field.
+         check_code (str, optional): Custom check code to be executed if `check_type` is "custom".
+             It should define a function body without the function definition line.
+     Returns:
+         Callable[[Any, Any], bool]: A function that takes two arguments (true answer and predicted answer)
+             and returns True if they match according to the specified check type, otherwise False.
+     """
+     check_functions = {
+         "Am": Am,
+         "am": am,
+         "em": em,
+         "um": um,
+         "um_f": um,  # TODO: fraction of matched answers
+         "om": om,
+         "um[om]": um_om,
+         "um[um]": um_um,
+     }
+
+     try:
+         check_type_dict = eval(check_type)
+         if not type(check_type_dict) is dict:
+             check_type_dict = None
+     except Exception:
+         check_type_dict = None
+
+     if check_type in check_functions:
+         return check_functions[check_type]
+     elif check_type == "custom" and check_code is not None:
+         return _build_custom(check_code)
+     elif check_type_dict:
+         return _build_dict(check_type_dict)
+     else:
+         raise ValueError(
+             f"Unknown check type: {check_type}. Available types: {list(check_functions.keys()) + ['custom']}."
+         )
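A brief usage sketch of `build_check_function`; the values below are illustrative, not taken from the dataset:

```python
from src.eval.matchers import build_check_function

# Exact match ("em") lower-cases strings before comparing.
assert build_check_function("em")(y_true="А", y_pred="а")

# Unordered match ("um") compares multisets of elements.
assert build_check_function("um")(y_true=[1, 2, 3], y_pred=[3, 1, 2])

# A dict-valued check type applies a per-key check to a dict answer.
per_field = build_check_function("{'Удав': 'em', 'Мартышка': 'em'}")
assert per_field(y_true={"Удав": 4, "Мартышка": 3}, y_pred={"Мартышка": 3, "Удав": 4})
```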
src/eval/metrics.py ADDED
@@ -0,0 +1,7 @@
+ import numpy as np
+
+
+ def grade_to_weight(g: str) -> float:
+     """Convert a grade string to a weight value."""
+     parts = list(map(int, g.split('-')))
+     return np.mean(parts)
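For example, assuming grades are encoded either as a single number or as a hyphenated range (which is what this function expects), a range is weighted by its midpoint:

```python
from src.eval.metrics import grade_to_weight

assert grade_to_weight("5-6") == 5.5  # midpoint of the range
assert grade_to_weight("7") == 7.0    # a single grade maps to itself
```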
src/eval/schema.py ADDED
@@ -0,0 +1,16 @@
+ from typing import Any
+
+ import pandera.pandas as pa
+
+
+ class DatasetEvalSchema(pa.DataFrameModel):
+     id_: pa.typing.Series[int] = pa.Field(alias="id")
+     is_correct: pa.typing.Series[bool]
+     task_text: pa.typing.Series[str]
+     answer_text: pa.typing.Series[str]
+     correct_answer: pa.typing.Series[Any]
+     predicted_answer: pa.typing.Series[Any] = pa.Field(nullable=True)
+     olymp_name: pa.typing.Series[str]
+     grade: pa.typing.Series[str]
+     task_type: pa.typing.Series[str]
+     context: pa.typing.Series[Any] = pa.Field(default=None, nullable=True)
src/generate/answer.py ADDED
@@ -0,0 +1,156 @@
+ from typing_extensions import get_args, get_origin, TypedDict
+ from typing import Any, Union, Literal, List, Tuple, Dict, Set, Annotated
+
+ from pydantic import create_model, BaseModel, RootModel
+
+ from src.common.env import build_default_namespace
+
+
+ def string_to_type(type_str: str) -> Union[type, Tuple[type, ...]]:
+     """Converts a string representation of a type to an actual type."""
+     namespace = build_default_namespace()
+     return eval(type_str, namespace, {})
+
+
+ def matches_type(value: Any, type_hint: Union[type, Tuple[type, ...]]) -> bool:
+     """Checks if a value matches a given type hint."""
+     origin = get_origin(type_hint)
+     args = get_args(type_hint)
+
+     if origin is Union:
+         return any(matches_type(value, arg) for arg in args)
+
+     if origin is Literal:
+         return value in args
+
+     if origin is Annotated:
+         return matches_type(value, args[0])
+
+     if origin is list or origin is List:
+         if not isinstance(value, list):
+             return False
+         if not args:
+             return True
+         return all(matches_type(item, args[0]) for item in value)
+
+     if origin is tuple or origin is Tuple:
+         if not isinstance(value, tuple):
+             return False
+         if not args:
+             return True
+         if len(args) == 2 and args[1] is Ellipsis:
+             return all(matches_type(item, args[0]) for item in value)
+         if len(args) != len(value):
+             return False
+         return all(matches_type(item, sub_type) for item, sub_type in zip(value, args))
+
+     if origin is dict or origin is Dict:
+         if not isinstance(value, dict):
+             return False
+         if not args:
+             return True
+         key_type, val_type = args
+         return all(
+             matches_type(k, key_type) and matches_type(v, val_type)
+             for k, v in value.items()
+         )
+
+     if origin is set or origin is Set:
+         if not isinstance(value, set):
+             return False
+         if not args:
+             return True
+         return all(matches_type(item, args[0]) for item in value)
+
+     if type_hint is type(None):
+         return value is None
+
+     if type_hint is Any:
+         return True
+
+     try:
+         return isinstance(value, type_hint)
+     except TypeError:
+         return False
+
+
+ def make_answer_model(
+     type_str: str,
+     field_name: str = "answer",
+     model_name: str = "AnswerModel",
+     add_thinking_field: bool = False,
+ ) -> type[BaseModel]:
+     """
+     Creates a Pydantic model with one required field `field_name`,
+     whose type is taken from the string `type_str`.
+     If `add_thinking_field` is True, then a `thinking` field of type str is added.
+
+     The resulting class will have the name `model_name`.
+     """
+
+     type_hint = string_to_type(type_str)
+
+     model = create_model(
+         model_name,
+         **(
+             (
+                 {
+                     "thinking": (str, ...),
+                 }
+                 if add_thinking_field
+                 else {}
+             )
+             | {
+                 field_name: (type_hint, ...),
+             }
+         ),
+     )
+     return model
+
+
+ def _build_typed_dict(name: str, keys: tuple, value_type: Any):
+     annotations = {k: value_type for k in keys}
+     return TypedDict(name, annotations, total=True)
+
+
+ def _transform_required_dicts(tp: Any, name_base: str = "TD") -> Any:
+     origin = get_origin(tp)
+
+     if origin in (dict, Dict):
+         k_type, v_type = get_args(tp)
+         if get_origin(k_type) is Literal:
+             literal_keys = get_args(k_type)
+             v_type_t = _transform_required_dicts(v_type, name_base + "V")
+             return _build_typed_dict(f"{name_base}Required", literal_keys, v_type_t)
+         k_type_t = _transform_required_dicts(k_type, name_base + "K")
+         v_type_t = _transform_required_dicts(v_type, name_base + "V")
+         return Dict[k_type_t, v_type_t]
+
+     if origin in (list, List):
+         (inner,) = get_args(tp)
+         inner_t = _transform_required_dicts(inner, name_base + "Item")
+         return List[inner_t]
+
+     if origin is Union:
+         return Union[
+             tuple(_transform_required_dicts(a, name_base + "U") for a in get_args(tp))
+         ]
+
+     return tp
+
+
+ def make_root_model(
+     type_str: str, model_name: str = "Answer", make_required: bool = True
+ ) -> type[BaseModel]:
+     """
+     Creates a Pydantic root model equivalent to any type hint from string.
+     The resulting class will have a root field `root` with the needed type,
+     and you can parse an object of this type directly.
+     """
+
+     type_hint = string_to_type(type_str)
+     if make_required:
+         type_hint = _transform_required_dicts(type_hint, name_base=model_name + "Dict")
+
+     model = type(model_name, (RootModel[type_hint],), {})
+     return model
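A short sketch of how these helpers fit together; the type string here is an illustrative example, not one of the dataset's `answer_type` values:

```python
from src.generate.answer import make_root_model, matches_type, string_to_type

type_str = "List[int]"
Answer = make_root_model(type_str, model_name="ListAnswer")

parsed = Answer.model_validate([1, 2, 3])  # a RootModel parses the raw value directly
assert parsed.root == [1, 2, 3]
assert matches_type(parsed.root, string_to_type(type_str))
```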
src/generate/cli.py ADDED
@@ -0,0 +1,210 @@
+ import json
+ import pathlib
+ from copy import deepcopy
+ from typing import Callable
+ from functools import partial
+
+ import click
+ import pandas as pd
+ import pandera.pandas as pa
+ from tqdm.auto import tqdm
+ from langchain_core.runnables import Runnable
+
+ from src.common.data import load_dataset
+ from src.common.schema import DatasetSchema
+ from src.generate.config import GenerationConfig
+ from src.generate.schema import GeneratedDatasetSchema
+ from src.generate.answer import make_root_model, matches_type, string_to_type
+ from src.generate.generators import GenerationAnswer, GENERATORS_NAME_TO_FACTORY
+
+
+ def _save_temp_file(
+     row: dict,
+     result: GenerationAnswer,
+     temp_path: pathlib.Path,
+ ) -> None:
+     temp_file = temp_path / f"{row[DatasetSchema.id_]}.json"
+     json.dump(
+         {
+             DatasetSchema.id_: row[DatasetSchema.id_],
+             GeneratedDatasetSchema.generated_answer: result.model_dump(),
+         },
+         open(temp_file, "w"),
+         ensure_ascii=False,
+     )
+
+
+ def _generate_single_answer(
+     row: dict,
+     build_chain: Callable[[type], Runnable],
+     temp_path: pathlib.Path = None,
+ ) -> GenerationAnswer:
+     if temp_path and (temp_path / f"{row[DatasetSchema.id_]}.json").exists():
+         return GenerationAnswer.model_validate(
+             json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[GeneratedDatasetSchema.generated_answer]
+         )
+     answer_type = make_root_model(row[DatasetSchema.answer_type])
+     chain = build_chain(answer_type)
+
+     row = dict(row)
+     row.pop(DatasetSchema.correct_answer, None)
+
+     result: GenerationAnswer = chain.invoke(row)
+     if temp_path:
+         _save_temp_file(row, result, temp_path)
+     return result
+
+
+ @pa.check_input(DatasetSchema)
+ @pa.check_output(GeneratedDatasetSchema)
+ def _generate_answers(
+     df: pd.DataFrame,
+     build_chain: Callable[[type], Runnable],
+     use_tqdm: bool = True,
+     temp_path: pathlib.Path = None,
+ ) -> pd.DataFrame:
+     if use_tqdm:
+         tqdm.pandas()
+         df[GeneratedDatasetSchema.generated_answer] = df.progress_apply(
+             partial(
+                 _generate_single_answer,
+                 build_chain=build_chain,
+                 temp_path=temp_path,
+             ),
+             axis=1,
+         )
+     else:
+         df[GeneratedDatasetSchema.generated_answer] = df.apply(
+             partial(
+                 _generate_single_answer,
+                 build_chain=build_chain,
+                 temp_path=temp_path,
+             ),
+             axis=1,
+         )
+     df = df[list(GeneratedDatasetSchema._collect_fields().keys())]
+     return df
+
+
+ @click.command()
+ @click.option(
+     "--config-path",
+     type=click.Path(exists=True, dir_okay=False),
+     default=pathlib.Path("configs/ollama.yaml"),
+     help="Path to the configuration file.",
+ )
+ @click.option(
+     "--output-path",
+     type=click.Path(dir_okay=False),
+     default=pathlib.Path("./gemma3:4b.jsonl"),
+     help="Path to the output file.",
+ )
+ @click.option(
+     "--temp-path",
+     type=click.Path(dir_okay=True, file_okay=False),
+     default=pathlib.Path("./tmp_gemma3:4b/"),
+     help="Path to the temp files directory.",
+ )
+ @click.option(
+     "--use-tqdm/--no-use-tqdm",
+     is_flag=True,
+     default=True,
+     help="Whether to use tqdm for progress bar.",
+ )
+ def generate(
+     config_path: pathlib.Path = pathlib.Path("configs/ollama.yaml"),
+     output_path: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
+     temp_path: pathlib.Path = pathlib.Path("./tmp_gemma3:4b/"),
+     use_tqdm: bool = True,
+ ):
+     output_path = pathlib.Path(output_path)
+     temp_path = pathlib.Path(temp_path)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     temp_path.mkdir(parents=True, exist_ok=True)
+
+     config = GenerationConfig.from_file(config_path)
+     df = load_dataset()
+     # df = df.head(3)
+
+     build_chain_function = GENERATORS_NAME_TO_FACTORY[config.build_function]
+     build_chain_function = partial(
+         build_chain_function,
+         llm_class=config.llm_class,
+         structured_output_method=config.structured_output_method,
+         **config.kwargs
+     )
+
+     df = _generate_answers(df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path)
+
+     df[GeneratedDatasetSchema.generated_answer] = df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: x.model_dump()
+     )
+     df.to_json(
+         output_path,
+         lines=True,
+         orient="records",
+         force_ascii=False,
+     )
+
+
+ @pa.check_input(GeneratedDatasetSchema)
+ def _type_sanitycheck(
+     generated_df: pd.DataFrame,
+ ) -> tuple[bool, str]:
+     generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if not isinstance(x, GenerationAnswer) else x
+     )
+
+     dataset_df = load_dataset()
+     predicted_df = dataset_df.join(
+         generated_df.set_index(GeneratedDatasetSchema.id_),
+         on=DatasetSchema.id_,
+         rsuffix='_generated',
+     ).dropna(subset=[GeneratedDatasetSchema.generated_answer])
+
+     if len(predicted_df) == 0:
+         return False, "No valid predictions found."
+
+     TYPE_MATCH = "type_match"
+     predicted_df[TYPE_MATCH] = predicted_df.apply(
+         lambda row: matches_type(
+             row[GeneratedDatasetSchema.generated_answer].answer,
+             string_to_type(row[DatasetSchema.answer_type]),
+         ), axis=1
+     )
+
+     if not predicted_df[TYPE_MATCH].all():
+         return False, f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}."
+
+     return True, f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}"
+
+
+ @click.command()
+ @click.option(
+     "--file",
+     type=click.Path(exists=True, dir_okay=False),
+     default=pathlib.Path("./gemma3:4b.jsonl"),
+     help="Path to the generated dataset file.",
+ )
+ def type_sanitycheck(
+     file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
+ ):
+     df = pd.read_json(file, lines=True)
+     types_correct, message = _type_sanitycheck(df)
+     if not types_correct:
+         click.echo(f"❌ Type sanity check failed: {message}")
+         exit(1)
+     click.echo(f"✅ Type sanity check passed: {message}")
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ cli.add_command(generate)
+ cli.add_command(type_sanitycheck)
+
+
+ if __name__ == "__main__":
+     cli()
src/generate/config.py ADDED
@@ -0,0 +1,27 @@
+ import pathlib
+ from typing import Literal, Any, get_args
+
+ from pydantic import BaseModel
+ from pydantic_yaml import parse_yaml_raw_as
+
+ from src.generate.llms import LLMName
+ from src.generate.generators import GeneratorName
+
+
+ class GenerationConfig(BaseModel):
+     build_function: GeneratorName = get_args(GeneratorName)[0]
+     llm_class: LLMName = get_args(LLMName)[0]
+     structured_output_method: Literal[
+         "function_calling", "json_mode", "json_schema"
+     ] = "json_schema"
+     kwargs: dict[str, Any] = {}
+
+     @classmethod
+     def from_yaml(cls, yaml_str: str) -> "GenerationConfig":
+         return parse_yaml_raw_as(cls, yaml_str)
+
+     @classmethod
+     def from_file(cls, file_path: str | pathlib.Path) -> "GenerationConfig":
+         with open(file_path, "r") as file:
+             yaml_str = file.read()
+         return cls.from_yaml(yaml_str)
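For instance, the `configs/ollama.yaml` file above parses into a `GenerationConfig` roughly like this (a sketch, assuming the dependencies from `requirements.txt` are installed and the script is run from the repository root):

```python
from src.generate.config import GenerationConfig

config = GenerationConfig.from_file("configs/ollama.yaml")
assert config.build_function == "thinking"
assert config.llm_class == "ollama"
assert config.kwargs["think_llm_args"]["model"] == "gemma3:1b"
```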
src/generate/generators.py ADDED
@@ -0,0 +1,139 @@
+ from typing import Any, Literal, Callable
+
+ import openai
+ from pydantic import BaseModel
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import Runnable, RunnableLambda
+ from langchain_core.prompts import (
+     load_prompt,
+     ChatPromptTemplate,
+     AIMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+     SystemMessagePromptTemplate,
+ )
+
+ from src.common.paths import PROMPTS_PATH
+ from src.common.schema import DatasetSchema
+ from src.generate.llms import LLM_NAME_TO_CLASS, LLMName
+
+
+ class GenerationAnswer(BaseModel):
+     answer: Any
+     context: dict[str, Any] = {}
+
+
+ def build_singleturn_chain(
+     answer_class: type[BaseModel],
+     llm_class: LLMName = "ollama",
+     llm_args: dict[str, Any] = {
+         "model": "gemma3:4b",
+         "top_k": 1,
+         "top_p": 1,
+         "temperature": 0.0,
+     },
+     structured_output_method: Literal[
+         "function_calling", "json_mode", "json_schema"
+     ] = "json_schema",
+ ) -> Runnable:
+     llm = LLM_NAME_TO_CLASS[llm_class](
+         **llm_args,
+     )
+     llm = llm.with_structured_output(
+         answer_class,
+         method=structured_output_method,
+     )
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             HumanMessagePromptTemplate(
+                 prompt=load_prompt(PROMPTS_PATH / "singleturn.yaml")
+             )
+         ]
+     )
+     chain = RunnablePassthrough.assign(answer=prompt | llm) | RunnableLambda(
+         lambda x: GenerationAnswer(
+             answer=x["answer"],
+             context={},
+         )
+     )
+     chain = chain.with_retry(
+         retry_if_exception_type=(openai.PermissionDeniedError, )
+     )
+     return chain
+
+
+ def build_thinking_chain(
+     answer_class: type[BaseModel],
+     llm_class: LLMName = "ollama",
+     think_llm_args: dict[str, Any] = {
+         "model": "gemma3:4b",
+         "top_k": 1,
+         "top_p": 1,
+         "temperature": 0.0,
+     },
+     answer_llm_args: dict[str, Any] = {
+         "model": "gemma3:4b",
+         "top_k": 1,
+         "top_p": 1,
+         "temperature": 0.0,
+     },
+     structured_output_method: Literal[
+         "function_calling", "json_mode", "json_schema"
+     ] = "json_schema",
+ ) -> Runnable:
+     think_llm = LLM_NAME_TO_CLASS[llm_class](
+         **think_llm_args,
+     )
+     think_prompt = ChatPromptTemplate.from_messages(
+         [
+             SystemMessagePromptTemplate(
+                 prompt=load_prompt(PROMPTS_PATH / "simple_think_system.yaml")
+             ),
+             HumanMessagePromptTemplate.from_template(f"{{{DatasetSchema.task_text}}}"),
+         ]
+     )
+     think_chain = think_prompt | think_llm | StrOutputParser()
+
+     answer_prompt = ChatPromptTemplate.from_messages(
+         think_prompt.messages
+         + [
+             AIMessagePromptTemplate.from_template("{think_answer}"),
+             HumanMessagePromptTemplate(
+                 prompt=load_prompt(PROMPTS_PATH / "simple_think_end.yaml")
+             ),
+         ]
+     )
+     answer_llm = LLM_NAME_TO_CLASS[llm_class](
+         **answer_llm_args,
+     )
+     answer_llm = answer_llm.with_structured_output(
+         answer_class,
+         method=structured_output_method,
+     )
+
+     chain = (
+         RunnablePassthrough.assign(
+             think_answer=think_chain,
+         )
+         | RunnablePassthrough.assign(answer=answer_prompt | answer_llm)
+         | RunnableLambda(
+             lambda x: GenerationAnswer(
+                 answer=x["answer"],
+                 context={
+                     "think_answer": x["think_answer"],
+                 },
+             )
+         )
+     )
+     chain = chain.with_retry(
+         retry_if_exception_type=(openai.PermissionDeniedError, )
+     )
+     return chain
+
+
+ GeneratorName = Literal["singleturn", "thinking"]
+ GENERATORS_NAME_TO_FACTORY: dict[str, Callable[[type[BaseModel]], Runnable]] = {
+     "singleturn": build_singleturn_chain,
+     "thinking": build_thinking_chain,
+ }
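Both chain factories ultimately emit `GenerationAnswer` objects, which serialize to the JSONL records shown in `docs/evaluate.md`; for example:

```python
from src.generate.generators import GenerationAnswer

ans = GenerationAnswer.model_validate({"answer": "А", "context": {}})
print(ans.model_dump())  # {'answer': 'А', 'context': {}}
```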
src/generate/llms.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Literal
+
+ from langchain_ollama import ChatOllama
+ from langchain_openai.chat_models import ChatOpenAI
+ from langchain_community.chat_models import GigaChat
+ from langchain_core.language_models.chat_models import BaseChatModel
+
+
+ LLMName = Literal["ollama", "openai", "gigachat"]
+ LLM_NAME_TO_CLASS: dict[LLMName, type[BaseChatModel]] = {
+     "ollama": ChatOllama,
+     "openai": ChatOpenAI,
+     "gigachat": GigaChat,
+ }
src/generate/schema.py ADDED
@@ -0,0 +1,8 @@
+ from typing import Any
+
+ import pandera.pandas as pa
+
+
+ class GeneratedDatasetSchema(pa.DataFrameModel):
+     id_: pa.typing.Series[int] = pa.Field(alias="id")
+     generated_answer: pa.typing.Series[Any]
src/space/utils.py ADDED
@@ -0,0 +1,11 @@
+ import os
+
+ from huggingface_hub import HfApi
+
+
+ REPO_ID = os.getenv("SPACE_ID", "d0rj/romb-leaderboard")
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable is not set.")
+
+ hf_api = HfApi(token=HF_TOKEN)