d0rj committed
Commit 1719436 · 1 Parent(s): 101a598

feat: Initial commit

.gitignore ADDED
@@ -0,0 +1,208 @@
+ notebooks/
+ tmp*
+ *.jsonl
+ *.json
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
LICENSE ADDED
@@ -0,0 +1,21 @@
+ # MIT License
+
+ Copyright (c) 2025 d0rj
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,25 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
+
+ from src.common.paths import DOCS_PATH, DATASET_NAME
+
+
+ with gr.Blocks(
+     title="ROMB Leaderboard v1.0",
+     theme=gr.themes.Ocean(
+         primary_hue=gr.themes.colors.green,
+     ),
+ ) as application:
+     gr.Markdown("# 🥇 ROMB - Russian Olympiad Math Benchmark")
+     gr.Markdown(f"See the ROMB-1.0 dataset here: [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}).")
+     with gr.Tabs():
+         with gr.Tab("Leaderboard"):
+             gr.Markdown("In progress...")
+         with gr.Tab("Evaluate"):
+             gr.Markdown((DOCS_PATH / "evaluate.md").read_text())
+         with gr.Tab("Submit"):
+             gr.Markdown("In progress...")
+
+
+ if __name__ == "__main__":
+     application.launch()
cli.py ADDED
@@ -0,0 +1,19 @@
+ import click
+
+ from src.eval.cli import evaluate, metrics
+ from src.generate.cli import generate, type_sanitycheck
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ cli.add_command(metrics)
+ cli.add_command(evaluate)
+ cli.add_command(generate)
+ cli.add_command(type_sanitycheck)
+
+
+ if __name__ == "__main__":
+     cli()
configs/gemma-3-1b.yaml ADDED
@@ -0,0 +1,22 @@
+ build_function: singleturn
+ llm_class: ollama
+ kwargs:
+   llm_args:
+     model: gemma3:1b
+     top_k: 1
+     top_p: 1
+     temperature: 0.0
+ # build_function: thinking
+ # llm_class: ollama
+ # kwargs:
+ #   think_llm_args:
+ #     model: gemma3:1b
+ #     top_k: 1
+ #     top_p: 1
+ #     temperature: 0.0
+ #     max_tokens: 1024
+ #   answer_llm_args:
+ #     model: gemma3:1b
+ #     top_k: 1
+ #     top_p: 1
+ #     temperature: 0.0
configs/ollama.yaml ADDED
@@ -0,0 +1,13 @@
+ build_function: thinking
+ llm_class: ollama
+ kwargs:
+   think_llm_args:
+     model: gemma3:1b
+     top_k: 1
+     top_p: 1
+     temperature: 0.0
+   answer_llm_args:
+     model: gemma3:1b
+     top_k: 1
+     top_p: 1
+     temperature: 0.0
configs/openrouter.yaml ADDED
@@ -0,0 +1,9 @@
+ build_function: singleturn
+ llm_class: openai
+ kwargs:
+   llm_args:
+     model_name: google/gemini-2.0-flash-lite-001
+     temperature: 0.0
+     top_p: 1.0
+     base_url: https://openrouter.ai/api/v1
+     api_key: sk-or-v1-...
docs/evaluate.md ADDED
@@ -0,0 +1,90 @@
+ ## Evaluation process
+
+ ### 1. Generate responses
+
+ The first and main step is to generate the answers. You can do this in any way that is convenient for you, including with the scripts in this repository. What matters is that you end up with a file of answers in JSONL format, where each object contains the fields `id` (question id, int) and `generated_answer` (model response, JSON object). Example:
+
+ ```json
+ {"id":0,"generated_answer":{"answer":"А","context":{}}}
+ {"id":1,"generated_answer":{"answer":"А","context":{}}}
+ {"id":2,"generated_answer":{"answer":36,"context":{}}}
+ {"id":3,"generated_answer":{"answer":10,"context":{}}}
+ {"id":4,"generated_answer":{"answer":3000000000000000,"context":{}}}
+ {"id":5,"generated_answer":{"answer":"А","context":{}}}
+ {"id":6,"generated_answer":{"answer":10,"context":{}}}
+ {"id":7,"generated_answer":{"answer":"А","context":{}}}
+ {"id":8,"generated_answer":{"answer":{"Удав":4,"Слоненок":1,"Мартышка":3},"context":{}}}
+ ...
+ ```
+
+ #### Generation utils
+
+ Two types of prompts are currently supported (answering immediately, or answering after a preliminary reasoning turn) and two types of model providers (Ollama and OpenAI-API-compatible).
+
+ ```bash
+ python3 cli.py generate --help
+ ```
+
+ An example of generating responses with the Gemma 3 1B model:
+
+ ```bash
+ ollama run gemma3:1b
+ ```
+
+ ```bash
+ python3 cli.py generate --config-path configs/gemma-3-1b.yaml --output-path ./gemma-3-1b_nothink.jsonl --temp-path ./tmp_gemma-3-1b/
+ ```
+
+ ### 2. Validate responses
+
+ The generated responses can be checked for the correct answer types using the utility:
+
+ ```bash
+ python3 cli.py type-sanitycheck --help
+ ```
+
+ ```bash
+ python3 cli.py type-sanitycheck --file ./gemma-3-1b_nothink.jsonl
+ ```
+
+ ### 3. Evaluate responses
+
+ Once you have the answers file, you can run the solved/unsolved assessment using the utility:
+
+ ```bash
+ python3 cli.py evaluate --help
+ ```
+
+ ```bash
+ python3 cli.py evaluate --file ./gemma-3-1b_nothink.jsonl
+ ```
+
+ As a result, you will receive the file `gemma-3-1b_nothink.eval.jsonl` with a new field `is_correct` (bool) - the result of checking each response.
+
+ ### 4. Calculate overall metrics
+
+ ```bash
+ python3 cli.py metrics --help
+ ```
+
+ ```bash
+ python3 cli.py metrics --model-name gemma-3-1b --file ./gemma-3-1b_nothink.eval.jsonl --model-size 1.0 --model-url https://huggingface.co/google/gemma-3-1b-it --model-config "{'build_function': 'singleturn', 'top_k': 1, 'top_p': 1, 'temperature': 0.0}"
+ ```
+
+ As a result, you will receive the file `gemma-3-1b_nothink.eval.metrics.json` with the overall metrics for the model:
+
+ ```json
+ [
+     {
+         "model_name": "gemma-3-1b",
+         "model_size": 1.0,
+         "model_url": "https://huggingface.co/google/gemma-3-1b-it",
+         "pass1": 0.10148902821316615,
+         "weighted_pass1": 0.10207932648691802,
+         "arith_pass1": 0.08566433566433566,
+         "geometry_pass1": 0.125,
+         "logic_pass1": 0.13664596273291926,
+         "config": "{'build_function': 'singleturn', 'top_k': 1, 'top_p': 1, 'temperature': 0.0}"
+     }
+ ]
+ ```
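The answers file from step 1 can also be produced without this repository's tooling. Below is a minimal sketch of writing a compatible JSONL file; `my_model_answer` and `tasks` are hypothetical placeholders for your own inference code and loaded tasks, not part of this repo:

```python
import json


def my_model_answer(task_text: str) -> str:
    # Hypothetical stand-in for your own model call.
    return "А"  # placeholder answer


# `tasks` stands for the loaded ROMB tasks (at minimum `id` and `task_text`).
tasks = [{"id": 0, "task_text": "..."}, {"id": 1, "task_text": "..."}]

with open("answers.jsonl", "w", encoding="utf-8") as f:
    for task in tasks:
        record = {
            "id": task["id"],
            "generated_answer": {"answer": my_model_answer(task["task_text"]), "context": {}},
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```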
prompts/simple_think_end.yaml ADDED
@@ -0,0 +1,13 @@
+ _type: prompt
+ input_variables:
+ - answer_type
+ - task_note
+ metadata: null
+ name: null
+ optional_variables: []
+ output_parser: null
+ partial_variables: {}
+ tags: null
+ template: "Напиши свой ответ в этом формате: {answer_type}\nПояснение к формату: {task_note}"
+ template_format: f-string
+ validate_template: false
prompts/simple_think_system.yaml ADDED
@@ -0,0 +1,11 @@
+ _type: prompt
+ input_variables: []
+ metadata: null
+ name: null
+ optional_variables: []
+ output_parser: null
+ partial_variables: {}
+ tags: null
+ template: "Реши следующую математическую задачу эффективно и ясно. Думай шаг за шагом перед ответом."
+ template_format: f-string
+ validate_template: false
prompts/singleturn.yaml ADDED
@@ -0,0 +1,14 @@
+ _type: prompt
+ input_variables:
+ - answer_type
+ - task_note
+ - task_text
+ metadata: null
+ name: null
+ optional_variables: []
+ output_parser: null
+ partial_variables: {}
+ tags: null
+ template: "{task_text}\n\nНапиши свой ответ в этом формате: {answer_type}\nПояснение к формату: {task_note}"
+ template_format: f-string
+ validate_template: false
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ pandas
+ pandera
+ datasets
+ langchain
+ langchain-community
+ langchain-openai
+ langchain-ollama
+ gradio
+ gradio-leaderboard
+ pydantic>=2
+ pydantic-yaml
+ click
src/common/data.py ADDED
@@ -0,0 +1,17 @@
+ import json
+
+ import datasets
+ import pandas as pd
+ import pandera.pandas as pa
+
+ from src.common.paths import DATASET_NAME
+ from src.common.schema import DatasetSchema
+
+
+ @pa.check_output(DatasetSchema)
+ def load_dataset() -> pd.DataFrame:
+     ds = datasets.load_dataset(DATASET_NAME, split="test")
+     df = pd.DataFrame(ds)
+
+     df[DatasetSchema.correct_answer] = df[DatasetSchema.correct_answer].apply(json.loads)
+     return df
src/common/env.py ADDED
@@ -0,0 +1,25 @@
+ from typing import Any
+
+
+ def build_default_namespace() -> dict[str, Any]:
+     """Creates a dictionary with types from the typing module and built-in types."""
+     import typing
+     from numbers import Number
+     from fractions import Fraction
+
+     namespace = {
+         name: getattr(typing, name) for name in dir(typing) if not name.startswith("_")
+     }
+     namespace.update(
+         {
+             "int": int,
+             "str": str,
+             "float": float,
+             "bool": bool,
+             "dict": dict,
+             "list": list,
+         }
+     )
+     namespace.update({"Fraction": Fraction, "Number": Number})
+
+     return namespace
src/common/paths.py ADDED
@@ -0,0 +1,11 @@
+ import pathlib
+
+
+ _FILE_PATH = pathlib.Path(__file__).parent.resolve()
+
+ PROJECT_ROOT = _FILE_PATH.parent.parent
+ PROMPTS_PATH = PROJECT_ROOT / "prompts"
+ DATA_PATH = PROJECT_ROOT / "data"
+ DOCS_PATH = PROJECT_ROOT / "docs"
+
+ DATASET_NAME = "d0rj/ROMB-1.0"
src/common/schema.py ADDED
@@ -0,0 +1,32 @@
+ from typing import Any
+
+ import pandera.pandas as pa
+
+
+ class DatasetSchema(pa.DataFrameModel):
+     id_: pa.typing.Series[int] = pa.Field(alias="id")
+     task_text: pa.typing.Series[str]
+     answer_text: pa.typing.Series[str]
+     correct_answer: pa.typing.Series[Any]
+     date: pa.typing.Series[str]
+     olymp_name: pa.typing.Series[str]
+     grade: pa.typing.Series[str]
+     description: pa.typing.Series[str]
+     source: pa.typing.Series[str]
+     answer_type: pa.typing.Series[str]
+     check_type: pa.typing.Series[str]
+     check_function: pa.typing.Series[str] = pa.Field(nullable=True)
+     task_type: pa.typing.Series[str]
+     task_note: pa.typing.Series[str]
+
+
+ class LeaderBoardSchema(pa.DataFrameModel):
+     model_name: pa.typing.Series[str]
+     model_size: pa.typing.Series[float] = pa.Field(nullable=True)
+     model_url: pa.typing.Series[str] = pa.Field(nullable=True)
+     pass1: pa.typing.Series[float]
+     weighted_pass1: pa.typing.Series[float]
+     arith_pass1: pa.typing.Series[float]
+     geometry_pass1: pa.typing.Series[float]
+     logic_pass1: pa.typing.Series[float]
+     config: pa.typing.Series[str] = pa.Field(nullable=True, default={})
src/eval/cli.py ADDED
@@ -0,0 +1,191 @@
+ import json
+ import pathlib
+ from copy import deepcopy
+
+ import click
+ import pandas as pd
+ import pandera.pandas as pa
+ from tqdm.auto import tqdm
+
+ from src.common.data import load_dataset
+ from src.eval.metrics import grade_to_weight
+ from src.eval.schema import DatasetEvalSchema
+ from src.eval.matchers import build_check_function
+ from src.generate.generators import GenerationAnswer
+ from src.generate.schema import GeneratedDatasetSchema
+ from src.common.schema import DatasetSchema, LeaderBoardSchema
+
+
+ def _evaluate_single_answer(
+     row: dict,
+ ) -> bool:
+     if pd.isna(row[GeneratedDatasetSchema.generated_answer]):
+         return False
+     if not type(row[GeneratedDatasetSchema.generated_answer]) is GenerationAnswer:
+         raise ValueError(
+             f"Expected GenerationAnswer, got {type(row[GeneratedDatasetSchema.generated_answer])} for id {row[DatasetSchema.id_]}",
+         )
+     y_pred = row[GeneratedDatasetSchema.generated_answer].answer
+     if not y_pred:
+         return False
+
+     y_true = row[DatasetSchema.correct_answer]
+     check_function = build_check_function(
+         row[DatasetSchema.check_type],
+         row[DatasetSchema.check_function],
+     )
+     try:
+         result = check_function(
+             y_true=deepcopy(y_true),
+             y_pred=deepcopy(y_pred),
+         )
+     except Exception as e:
+         print(e)
+         print(f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}")
+         exit(1)
+     return result
+
+
+ @pa.check_input(GeneratedDatasetSchema)
+ @pa.check_output(DatasetEvalSchema)
+ def _evaluate(
+     generated_df: pd.DataFrame,
+ ) -> pd.DataFrame:
+     tqdm.pandas()
+
+     generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
+     )
+     dataset_df = load_dataset()
+     predictions_df = dataset_df.join(
+         generated_df.set_index(GeneratedDatasetSchema.id_),
+         on=DatasetSchema.id_,
+     )
+
+     predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
+         _evaluate_single_answer,
+         axis=1,
+     )
+
+     predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: x.answer if not pd.isna(x) else None,
+     )
+     predictions_df[DatasetEvalSchema.context] = predictions_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: x.context if not pd.isna(x) else None,
+     )
+     predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
+
+     return predictions_df
+
+
+ @click.command()
+ @click.option(
+     "--file",
+     type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
+     default=pathlib.Path("./gemma3:4b.jsonl"),
+ )
+ def evaluate(
+     file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
+ ):
+     file = pathlib.Path(file)
+
+     df = pd.read_json(file, lines=True)
+     evaluated_df = _evaluate(df)
+     evaluated_df.to_json(file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False)
+
+
+ @pa.check_input(DatasetEvalSchema)
+ @pa.check_output(LeaderBoardSchema)
+ def _metrics(
+     df: pd.DataFrame,
+     model_name: str,
+     model_size: float,
+     model_url: str,
+     model_config: str
+ ) -> pd.DataFrame:
+     pass1 = df[DatasetEvalSchema.is_correct].mean()
+
+     w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
+     weighted_accuracy = (df[DatasetEvalSchema.is_correct].astype(int) * w).sum() / w.sum()
+
+     arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][DatasetEvalSchema.is_correct].mean()
+     geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][DatasetEvalSchema.is_correct].mean()
+     logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][DatasetEvalSchema.is_correct].mean()
+
+     result = {
+         LeaderBoardSchema.model_name: model_name,
+         LeaderBoardSchema.model_size: model_size,
+         LeaderBoardSchema.model_url: model_url,
+         LeaderBoardSchema.config: str(model_config),
+         LeaderBoardSchema.pass1: pass1,
+         LeaderBoardSchema.weighted_pass1: weighted_accuracy,
+         LeaderBoardSchema.arith_pass1: arith_pass1,
+         LeaderBoardSchema.geometry_pass1: geometry_pass1,
+         LeaderBoardSchema.logic_pass1: logic_pass1,
+     }
+
+     result_df = pd.DataFrame([result])
+     result_df = result_df[list(LeaderBoardSchema._collect_fields().keys())]
+     return result_df
+
+
+ @click.command()
+ @click.option(
+     "--model-name",
+     type=str,
+     required=True,
+     help="Name of the model being evaluated.",
+ )
+ @click.option(
+     "--file",
+     type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
+     default=pathlib.Path("./gemma3:4b_eval.jsonl"),
+ )
+ @click.option(
+     "--model-size",
+     type=float,
+     default=None,
+     help="Size of the model in billions of parameters.",
+ )
+ @click.option(
+     "--model-url",
+     type=str,
+     default=None,
+     help="URL where the model can be accessed.",
+ )
+ @click.option(
+     "--model-config",
+     type=str,
+     default=None,
+     help="Model configuration in dict format.",
+ )
+ def metrics(
+     model_name: str,
+     file: pathlib.Path = pathlib.Path("./gemma3:4b_eval.jsonl"),
+     model_size: float = None,
+     model_url: str = None,
+     model_config: str = None,
+ ):
+     file = pathlib.Path(file)
+
+     df = pd.read_json(file, lines=True)
+     metrics_df = _metrics(
+         df,
+         model_name=model_name,
+         model_size=model_size,
+         model_url=model_url,
+         model_config=model_config or '',
+     )
+     metrics = metrics_df.to_dict(orient="records")[0]
+     print(f"Metrics for {model_name}:")
+     for key, value in metrics.items():
+         print(f"{key}: {value}")
+     json.dump(
+         metrics_df.to_dict(orient="records"),
+         open(file.with_suffix(".metrics.json"), "w"),
+         ensure_ascii=False,
+     )
+
+
+ if __name__ == "__main__":
+     evaluate()
src/eval/matchers.py ADDED
@@ -0,0 +1,144 @@
+ import collections
+ from typing import Any, Callable
+
+ from src.common.env import build_default_namespace
+
+
+ def _dict_to_tuple(dict_obj: dict) -> tuple[tuple]:
+     return tuple(sorted(dict_obj.items()))
+
+
+ def Am(y_true: list, y_pred: list) -> bool:
+     """Check if all elements in y_pred are present in y_true and vice versa."""
+     return all(y in y_true for y in y_pred) and all(y in y_pred for y in y_true)
+
+
+ def am(y_true: list, y_pred: list) -> bool:
+     """Check if any elements in y_pred are present in y_true."""
+     return any(y in y_true for y in y_pred)
+
+
+ def em(y_true: Any, y_pred: Any) -> bool:
+     """Check if the true answer and predicted answer are exactly the same."""
+     if type(y_true) is str:
+         y_true = y_true.lower()
+         y_pred = y_pred.lower()
+     return y_true == y_pred
+
+
+ def um(y_true: list, y_pred: list) -> bool:
+     """Check if the true answer and predicted answer are unordered but contain the same elements."""
+     if len(y_true) != len(y_pred):
+         return False
+     if len(y_true) == 0:
+         return True
+     if (len(y_true) > 0 and type(y_true[0]) is dict) or (len(y_true) == 0 and type(y_pred[0]) is dict):
+         y_true = [_dict_to_tuple(item) for item in y_true]
+         y_pred = [_dict_to_tuple(item) for item in y_pred]
+     if type(y_true) != type(y_pred):
+         return False
+     return collections.Counter(y_true) == collections.Counter(y_pred)
+
+
+ def om(y_true: list, y_pred: list) -> bool:
+     """Check if the true answer and predicted answer are in the same order."""
+     return list(y_true) == list(y_pred)
+
+
+ def um_om(y_true: list[list], y_pred: list[list]) -> bool:
+     """Check if the true answer and predicted answer are unordered lists of ordered sublists."""
+     true_bags = collections.Counter(tuple(sub) for sub in y_true)
+     pred_bags = collections.Counter(tuple(sub) for sub in y_pred)
+     return true_bags == pred_bags
+
+
+ def um_um(y_true: list[list], y_pred: list[list]) -> bool:
+     """Check if the true answer and predicted answer are unordered lists of unordered sublists."""
+     true_sets = [tuple(sorted(sub)) for sub in y_true]
+     pred_sets = [tuple(sorted(sub)) for sub in y_pred]
+     return collections.Counter(true_sets) == collections.Counter(pred_sets)
+
+
+ def _build_custom(check_code: str) -> Callable[[Any, Any], bool]:
+     """
+     Builds a custom function based on the provided check code.
+     The check code should be a string representing a Python expression.
+     """
+     code = "\n".join([f"    {line}" for line in check_code.splitlines()])
+     code = f"def check(y_true: Any, y_pred: Any) -> bool:\n{code}"
+     namespace = build_default_namespace()
+     exec(code, namespace)
+     return namespace["check"]
+
+
+ def _build_dict(type_dict: dict[Any, str]) -> Callable[[Any, Any], bool]:
+     """
+     Builds a function that checks if the predicted answer matches the true answer
+     for each field in the type dictionary.
+     """
+
+     def check(y_true, y_pred) -> bool:
+         assert set(type_dict.keys()) == set(y_true.keys())
+         try:
+             for key, value in y_true.items():
+                 key_check = build_check_function(type_dict[key])
+                 if not key_check(y_true=value, y_pred=y_pred[key]):
+                     return False
+             return True
+         except KeyError:
+             return False
+
+     return check
+
+
+ def build_check_function(
+     check_type: str, check_code: str | None = None
+ ) -> Callable[[Any, Any], bool]:
+     """
+     Returns a function that checks if the predicted answer matches the true answer.
+
+     Args:
+         check_type (str): The type of check to perform. Can be one of:
+             - "Am": All match
+             - "am": Any match
+             - "em": Exact match
+             - "um": Unordered match
+             - "om": Ordered match
+             - "um[om]": Unordered match with ordered sublists
+             - "um[um]": Unordered match with unordered sublists
+             - "custom": Custom check defined by `check_code`
+             - A dictionary where keys are field names and values are check types for each field.
+         check_code (str, optional): Custom check code to be executed if `check_type` is "custom".
+             It should define a function body without the function definition line.
+     Returns:
+         Callable[[Any, Any], bool]: A function that takes two arguments (true answer and predicted answer)
+             and returns True if they match according to the specified check type, otherwise False.
+     """
+     check_functions = {
+         "Am": Am,
+         "am": am,
+         "em": em,
+         "um": um,
+         "um_f": um,  # TODO: fraction of matched answers
+         "om": om,
+         "um[om]": um_om,
+         "um[um]": um_um,
+     }
+
+     try:
+         check_type_dict = eval(check_type)
+         if not type(check_type_dict) is dict:
+             check_type_dict = None
+     except Exception:
+         check_type_dict = None
+
+     if check_type in check_functions:
+         return check_functions[check_type]
+     elif check_type == "custom" and check_code is not None:
+         return _build_custom(check_code)
+     elif check_type_dict:
+         return _build_dict(check_type_dict)
+     else:
+         raise ValueError(
+             f"Unknown check type: {check_type}. Available types: {list(check_functions.keys()) + ['custom']}."
+         )
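A brief usage sketch of `build_check_function`; the values below are illustrative, not taken from the dataset:

```python
from src.eval.matchers import build_check_function

# Exact match ("em") lower-cases strings before comparing.
assert build_check_function("em")(y_true="А", y_pred="а")

# Unordered match ("um") compares multisets of elements.
assert build_check_function("um")(y_true=[1, 2, 3], y_pred=[3, 1, 2])

# A dict-valued check type applies a per-key check to a dict answer.
per_field = build_check_function("{'Удав': 'em', 'Мартышка': 'em'}")
assert per_field(y_true={"Удав": 4, "Мартышка": 3}, y_pred={"Мартышка": 3, "Удав": 4})
```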
src/eval/metrics.py ADDED
@@ -0,0 +1,7 @@
+ import numpy as np
+
+
+ def grade_to_weight(g: str) -> float:
+     """Convert a grade string to a weight value."""
+     parts = list(map(int, g.split('-')))
+     return np.mean(parts)
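For example, assuming grades are encoded either as a single number or as a hyphenated range (which is what this function expects), a range is weighted by its midpoint:

```python
from src.eval.metrics import grade_to_weight

assert grade_to_weight("5-6") == 5.5  # midpoint of the range
assert grade_to_weight("7") == 7.0    # a single grade maps to itself
```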
src/eval/schema.py ADDED
@@ -0,0 +1,16 @@
+ from typing import Any
+
+ import pandera.pandas as pa
+
+
+ class DatasetEvalSchema(pa.DataFrameModel):
+     id_: pa.typing.Series[int] = pa.Field(alias="id")
+     is_correct: pa.typing.Series[bool]
+     task_text: pa.typing.Series[str]
+     answer_text: pa.typing.Series[str]
+     correct_answer: pa.typing.Series[Any]
+     predicted_answer: pa.typing.Series[Any] = pa.Field(nullable=True)
+     olymp_name: pa.typing.Series[str]
+     grade: pa.typing.Series[str]
+     task_type: pa.typing.Series[str]
+     context: pa.typing.Series[Any] = pa.Field(default=None, nullable=True)
src/generate/answer.py ADDED
@@ -0,0 +1,156 @@
+ from typing_extensions import get_args, get_origin, TypedDict
+ from typing import Any, Union, Literal, List, Tuple, Dict, Set, Annotated
+
+ from pydantic import create_model, BaseModel, RootModel
+
+ from src.common.env import build_default_namespace
+
+
+ def string_to_type(type_str: str) -> Union[type, Tuple[type, ...]]:
+     """Converts a string representation of a type to an actual type."""
+     namespace = build_default_namespace()
+     return eval(type_str, namespace, {})
+
+
+ def matches_type(value: Any, type_hint: Union[type, Tuple[type, ...]]) -> bool:
+     """Checks if a value matches a given type hint."""
+     origin = get_origin(type_hint)
+     args = get_args(type_hint)
+
+     if origin is Union:
+         return any(matches_type(value, arg) for arg in args)
+
+     if origin is Literal:
+         return value in args
+
+     if origin is Annotated:
+         return matches_type(value, args[0])
+
+     if origin is list or origin is List:
+         if not isinstance(value, list):
+             return False
+         if not args:
+             return True
+         return all(matches_type(item, args[0]) for item in value)
+
+     if origin is tuple or origin is Tuple:
+         if not isinstance(value, tuple):
+             return False
+         if not args:
+             return True
+         if len(args) == 2 and args[1] is Ellipsis:
+             return all(matches_type(item, args[0]) for item in value)
+         if len(args) != len(value):
+             return False
+         return all(matches_type(item, sub_type) for item, sub_type in zip(value, args))
+
+     if origin is dict or origin is Dict:
+         if not isinstance(value, dict):
+             return False
+         if not args:
+             return True
+         key_type, val_type = args
+         return all(
+             matches_type(k, key_type) and matches_type(v, val_type)
+             for k, v in value.items()
+         )
+
+     if origin is set or origin is Set:
+         if not isinstance(value, set):
+             return False
+         if not args:
+             return True
+         return all(matches_type(item, args[0]) for item in value)
+
+     if type_hint is type(None):
+         return value is None
+
+     if type_hint is Any:
+         return True
+
+     try:
+         return isinstance(value, type_hint)
+     except TypeError:
+         return False
+
+
+ def make_answer_model(
+     type_str: str,
+     field_name: str = "answer",
+     model_name: str = "AnswerModel",
+     add_thinking_field: bool = False,
+ ) -> type[BaseModel]:
+     """
+     Creates a Pydantic model with one required field `field_name`,
+     whose type is taken from the string `type_str`.
+     If `add_thinking_field` is True, then a `thinking` field of type str is added.
+
+     The resulting class will have the name `model_name`.
+     """
+
+     type_hint = string_to_type(type_str)
+
+     model = create_model(
+         model_name,
+         **(
+             (
+                 {
+                     "thinking": (str, ...),
+                 }
+                 if add_thinking_field
+                 else {}
+             )
+             | {
+                 field_name: (type_hint, ...),
+             }
+         ),
+     )
+     return model
+
+
+ def _build_typed_dict(name: str, keys: tuple, value_type: Any):
+     annotations = {k: value_type for k in keys}
+     return TypedDict(name, annotations, total=True)
+
+
+ def _transform_required_dicts(tp: Any, name_base: str = "TD") -> Any:
+     origin = get_origin(tp)
+
+     if origin in (dict, Dict):
+         k_type, v_type = get_args(tp)
+         if get_origin(k_type) is Literal:
+             literal_keys = get_args(k_type)
+             v_type_t = _transform_required_dicts(v_type, name_base + "V")
+             return _build_typed_dict(f"{name_base}Required", literal_keys, v_type_t)
+         k_type_t = _transform_required_dicts(k_type, name_base + "K")
+         v_type_t = _transform_required_dicts(v_type, name_base + "V")
+         return Dict[k_type_t, v_type_t]
+
+     if origin in (list, List):
+         (inner,) = get_args(tp)
+         inner_t = _transform_required_dicts(inner, name_base + "Item")
+         return List[inner_t]
+
+     if origin is Union:
+         return Union[
+             tuple(_transform_required_dicts(a, name_base + "U") for a in get_args(tp))
+         ]
+
+     return tp
+
+
+ def make_root_model(
+     type_str: str, model_name: str = "Answer", make_required: bool = True
+ ) -> type[BaseModel]:
+     """
+     Creates a Pydantic root model equivalent to any type hint from string.
+     The resulting class will have a root field `root` with the needed type,
+     and you can parse an object of this type directly.
+     """
+
+     type_hint = string_to_type(type_str)
+     if make_required:
+         type_hint = _transform_required_dicts(type_hint, name_base=model_name + "Dict")
+
+     model = type(model_name, (RootModel[type_hint],), {})
+     return model
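A short sketch of how these helpers fit together; the type string here is an illustrative example, not one of the dataset's `answer_type` values:

```python
from src.generate.answer import make_root_model, matches_type, string_to_type

type_str = "List[int]"
Answer = make_root_model(type_str, model_name="ListAnswer")

parsed = Answer.model_validate([1, 2, 3])  # a RootModel parses the raw value directly
assert parsed.root == [1, 2, 3]
assert matches_type(parsed.root, string_to_type(type_str))
```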
src/generate/cli.py ADDED
@@ -0,0 +1,210 @@
+ import json
+ import pathlib
+ from copy import deepcopy
+ from typing import Callable
+ from functools import partial
+
+ import click
+ import pandas as pd
+ import pandera.pandas as pa
+ from tqdm.auto import tqdm
+ from langchain_core.runnables import Runnable
+
+ from src.common.data import load_dataset
+ from src.common.schema import DatasetSchema
+ from src.generate.config import GenerationConfig
+ from src.generate.schema import GeneratedDatasetSchema
+ from src.generate.answer import make_root_model, matches_type, string_to_type
+ from src.generate.generators import GenerationAnswer, GENERATORS_NAME_TO_FACTORY
+
+
+ def _save_temp_file(
+     row: dict,
+     result: GenerationAnswer,
+     temp_path: pathlib.Path,
+ ) -> None:
+     temp_file = temp_path / f"{row[DatasetSchema.id_]}.json"
+     json.dump(
+         {
+             DatasetSchema.id_: row[DatasetSchema.id_],
+             GeneratedDatasetSchema.generated_answer: result.model_dump(),
+         },
+         open(temp_file, "w"),
+         ensure_ascii=False,
+     )
+
+
+ def _generate_single_answer(
+     row: dict,
+     build_chain: Callable[[type], Runnable],
+     temp_path: pathlib.Path = None,
+ ) -> GenerationAnswer:
+     if temp_path and (temp_path / f"{row[DatasetSchema.id_]}.json").exists():
+         return GenerationAnswer.model_validate(
+             json.load(open(temp_path / f"{row[DatasetSchema.id_]}.json", "r"))[GeneratedDatasetSchema.generated_answer]
+         )
+     answer_type = make_root_model(row[DatasetSchema.answer_type])
+     chain = build_chain(answer_type)
+
+     row = dict(row)
+     row.pop(DatasetSchema.correct_answer, None)
+
+     result: GenerationAnswer = chain.invoke(row)
+     if temp_path:
+         _save_temp_file(row, result, temp_path)
+     return result
+
+
+ @pa.check_input(DatasetSchema)
+ @pa.check_output(GeneratedDatasetSchema)
+ def _generate_answers(
+     df: pd.DataFrame,
+     build_chain: Callable[[type], Runnable],
+     use_tqdm: bool = True,
+     temp_path: pathlib.Path = None,
+ ) -> pd.DataFrame:
+     if use_tqdm:
+         tqdm.pandas()
+         df[GeneratedDatasetSchema.generated_answer] = df.progress_apply(
+             partial(
+                 _generate_single_answer,
+                 build_chain=build_chain,
+                 temp_path=temp_path,
+             ),
+             axis=1,
+         )
+     else:
+         df[GeneratedDatasetSchema.generated_answer] = df.apply(
+             partial(
+                 _generate_single_answer,
+                 build_chain=build_chain,
+                 temp_path=temp_path,
+             ),
+             axis=1,
+         )
+     df = df[list(GeneratedDatasetSchema._collect_fields().keys())]
+     return df
+
+
+ @click.command()
+ @click.option(
+     "--config-path",
+     type=click.Path(exists=True, dir_okay=False),
+     default=pathlib.Path("configs/ollama.yaml"),
+     help="Path to the configuration file.",
+ )
+ @click.option(
+     "--output-path",
+     type=click.Path(dir_okay=False),
+     default=pathlib.Path("./gemma3:4b.jsonl"),
+     help="Path to the output file.",
+ )
+ @click.option(
+     "--temp-path",
+     type=click.Path(dir_okay=True, file_okay=False),
+     default=pathlib.Path("./tmp_gemma3:4b/"),
+     help="Path to the temp files directory.",
+ )
+ @click.option(
+     "--use-tqdm/--no-use-tqdm",
+     is_flag=True,
+     default=True,
+     help="Whether to use tqdm for progress bar.",
+ )
+ def generate(
+     config_path: pathlib.Path = pathlib.Path("configs/ollama.yaml"),
+     output_path: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
+     temp_path: pathlib.Path = pathlib.Path("./tmp_gemma3:4b/"),
+     use_tqdm: bool = True,
+ ):
+     output_path = pathlib.Path(output_path)
+     temp_path = pathlib.Path(temp_path)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     temp_path.mkdir(parents=True, exist_ok=True)
+
+     config = GenerationConfig.from_file(config_path)
+     df = load_dataset()
+     # df = df.head(3)
+
+     build_chain_function = GENERATORS_NAME_TO_FACTORY[config.build_function]
+     build_chain_function = partial(
+         build_chain_function,
+         llm_class=config.llm_class,
+         structured_output_method=config.structured_output_method,
+         **config.kwargs
+     )
+
+     df = _generate_answers(df, build_chain_function, use_tqdm=use_tqdm, temp_path=temp_path)
+
+     df[GeneratedDatasetSchema.generated_answer] = df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: x.model_dump()
+     )
+     df.to_json(
+         output_path,
+         lines=True,
+         orient="records",
+         force_ascii=False,
+     )
+
+
+ @pa.check_input(GeneratedDatasetSchema)
+ def _type_sanitycheck(
+     generated_df: pd.DataFrame,
+ ) -> tuple[bool, str]:
+     generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[GeneratedDatasetSchema.generated_answer].apply(
+         lambda x: GenerationAnswer.model_validate(deepcopy(x)) if not isinstance(x, GenerationAnswer) else x
+     )
+
+     dataset_df = load_dataset()
+     predicted_df = dataset_df.join(
+         generated_df.set_index(GeneratedDatasetSchema.id_),
+         on=DatasetSchema.id_,
+         rsuffix='_generated',
+     ).dropna(subset=[GeneratedDatasetSchema.generated_answer])
+
+     if len(predicted_df) == 0:
+         return False, "No valid predictions found."
+
+     TYPE_MATCH = "type_match"
+     predicted_df[TYPE_MATCH] = predicted_df.apply(
+         lambda row: matches_type(
+             row[GeneratedDatasetSchema.generated_answer].answer,
+             string_to_type(row[DatasetSchema.answer_type]),
+         ), axis=1
+     )
+
+     if not predicted_df[TYPE_MATCH].all():
+         return False, f"Type mismatch found for {predicted_df[~predicted_df[TYPE_MATCH]][DatasetSchema.id_].tolist()}."
+
+     return True, f"All matched. Predicted count: {len(predicted_df)} of {len(dataset_df)}"
+
+
+ @click.command()
+ @click.option(
+     "--file",
+     type=click.Path(exists=True, dir_okay=False),
+     default=pathlib.Path("./gemma3:4b.jsonl"),
+     help="Path to the generated dataset file.",
+ )
+ def type_sanitycheck(
+     file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
+ ):
+     df = pd.read_json(file, lines=True)
+     types_correct, message = _type_sanitycheck(df)
+     if not types_correct:
+         click.echo(f"❌ Type sanity check failed: {message}")
+         exit(1)
+     click.echo(f"✅ Type sanity check passed: {message}")
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ cli.add_command(generate)
+ cli.add_command(type_sanitycheck)
+
+
+ if __name__ == "__main__":
+     cli()
src/generate/config.py ADDED
@@ -0,0 +1,27 @@
+ import pathlib
+ from typing import Literal, Any, get_args
+
+ from pydantic import BaseModel
+ from pydantic_yaml import parse_yaml_raw_as
+
+ from src.generate.llms import LLMName
+ from src.generate.generators import GeneratorName
+
+
+ class GenerationConfig(BaseModel):
+     build_function: GeneratorName = get_args(GeneratorName)[0]
+     llm_class: LLMName = get_args(LLMName)[0]
+     structured_output_method: Literal[
+         "function_calling", "json_mode", "json_schema"
+     ] = "json_schema"
+     kwargs: dict[str, Any] = {}
+
+     @classmethod
+     def from_yaml(cls, yaml_str: str) -> "GenerationConfig":
+         return parse_yaml_raw_as(cls, yaml_str)
+
+     @classmethod
+     def from_file(cls, file_path: str | pathlib.Path) -> "GenerationConfig":
+         with open(file_path, "r") as file:
+             yaml_str = file.read()
+         return cls.from_yaml(yaml_str)
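For instance, the `configs/ollama.yaml` file above parses into a `GenerationConfig` roughly like this (a sketch, assuming the dependencies from `requirements.txt` are installed and the script is run from the repository root):

```python
from src.generate.config import GenerationConfig

config = GenerationConfig.from_file("configs/ollama.yaml")
assert config.build_function == "thinking"
assert config.llm_class == "ollama"
assert config.kwargs["think_llm_args"]["model"] == "gemma3:1b"
```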
src/generate/generators.py ADDED
@@ -0,0 +1,139 @@
+ from typing import Any, Literal, Callable
+
+ import openai
+ from pydantic import BaseModel
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import Runnable, RunnableLambda
+ from langchain_core.prompts import (
+     load_prompt,
+     ChatPromptTemplate,
+     AIMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+     SystemMessagePromptTemplate,
+ )
+
+ from src.common.paths import PROMPTS_PATH
+ from src.common.schema import DatasetSchema
+ from src.generate.llms import LLM_NAME_TO_CLASS, LLMName
+
+
+ class GenerationAnswer(BaseModel):
+     answer: Any
+     context: dict[str, Any] = {}
+
+
+ def build_singleturn_chain(
+     answer_class: type[BaseModel],
+     llm_class: LLMName = "ollama",
+     llm_args: dict[str, Any] = {
+         "model": "gemma3:4b",
+         "top_k": 1,
+         "top_p": 1,
+         "temperature": 0.0,
+     },
+     structured_output_method: Literal[
+         "function_calling", "json_mode", "json_schema"
+     ] = "json_schema",
+ ) -> Runnable:
+     llm = LLM_NAME_TO_CLASS[llm_class](
+         **llm_args,
+     )
+     llm = llm.with_structured_output(
+         answer_class,
+         method=structured_output_method,
+     )
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             HumanMessagePromptTemplate(
+                 prompt=load_prompt(PROMPTS_PATH / "singleturn.yaml")
+             )
+         ]
+     )
+     chain = RunnablePassthrough.assign(answer=prompt | llm) | RunnableLambda(
+         lambda x: GenerationAnswer(
+             answer=x["answer"],
+             context={},
+         )
+     )
+     chain = chain.with_retry(
+         retry_if_exception_type=(openai.PermissionDeniedError, )
+     )
+     return chain
+
+
+ def build_thinking_chain(
+     answer_class: type[BaseModel],
+     llm_class: LLMName = "ollama",
+     think_llm_args: dict[str, Any] = {
+         "model": "gemma3:4b",
+         "top_k": 1,
+         "top_p": 1,
+         "temperature": 0.0,
+     },
+     answer_llm_args: dict[str, Any] = {
+         "model": "gemma3:4b",
+         "top_k": 1,
+         "top_p": 1,
+         "temperature": 0.0,
+     },
+     structured_output_method: Literal[
+         "function_calling", "json_mode", "json_schema"
+     ] = "json_schema",
+ ) -> Runnable:
+     think_llm = LLM_NAME_TO_CLASS[llm_class](
+         **think_llm_args,
+     )
+     think_prompt = ChatPromptTemplate.from_messages(
+         [
+             SystemMessagePromptTemplate(
+                 prompt=load_prompt(PROMPTS_PATH / "simple_think_system.yaml")
+             ),
+             HumanMessagePromptTemplate.from_template(f"{{{DatasetSchema.task_text}}}"),
+         ]
+     )
+     think_chain = think_prompt | think_llm | StrOutputParser()
+
+     answer_prompt = ChatPromptTemplate.from_messages(
+         think_prompt.messages
+         + [
+             AIMessagePromptTemplate.from_template("{think_answer}"),
+             HumanMessagePromptTemplate(
+                 prompt=load_prompt(PROMPTS_PATH / "simple_think_end.yaml")
+             ),
+         ]
+     )
+     answer_llm = LLM_NAME_TO_CLASS[llm_class](
+         **answer_llm_args,
+     )
+     answer_llm = answer_llm.with_structured_output(
+         answer_class,
+         method=structured_output_method,
+     )
+
+     chain = (
+         RunnablePassthrough.assign(
+             think_answer=think_chain,
+         )
+         | RunnablePassthrough.assign(answer=answer_prompt | answer_llm)
+         | RunnableLambda(
+             lambda x: GenerationAnswer(
+                 answer=x["answer"],
+                 context={
+                     "think_answer": x["think_answer"],
+                 },
+             )
+         )
+     )
+     chain = chain.with_retry(
+         retry_if_exception_type=(openai.PermissionDeniedError, )
+     )
+     return chain
+
+
+ GeneratorName = Literal["singleturn", "thinking"]
+ GENERATORS_NAME_TO_FACTORY: dict[str, Callable[[type[BaseModel]], Runnable]] = {
+     "singleturn": build_singleturn_chain,
+     "thinking": build_thinking_chain,
+ }
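Both chain factories ultimately emit `GenerationAnswer` objects, which serialize to the JSONL records shown in `docs/evaluate.md`; for example:

```python
from src.generate.generators import GenerationAnswer

ans = GenerationAnswer.model_validate({"answer": "А", "context": {}})
print(ans.model_dump())  # {'answer': 'А', 'context': {}}
```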
src/generate/llms.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Literal
+
+ from langchain_ollama import ChatOllama
+ from langchain_openai.chat_models import ChatOpenAI
+ from langchain_community.chat_models import GigaChat
+ from langchain_core.language_models.chat_models import BaseChatModel
+
+
+ LLMName = Literal["ollama", "openai", "gigachat"]
+ LLM_NAME_TO_CLASS: dict[LLMName, type[BaseChatModel]] = {
+     "ollama": ChatOllama,
+     "openai": ChatOpenAI,
+     "gigachat": GigaChat,
+ }
src/generate/schema.py ADDED
@@ -0,0 +1,8 @@
+ from typing import Any
+
+ import pandera.pandas as pa
+
+
+ class GeneratedDatasetSchema(pa.DataFrameModel):
+     id_: pa.typing.Series[int] = pa.Field(alias="id")
+     generated_answer: pa.typing.Series[Any]
src/space/utils.py ADDED
@@ -0,0 +1,11 @@
+ import os
+
+ from huggingface_hub import HfApi
+
+
+ REPO_ID = os.getenv("SPACE_ID", "d0rj/romb-leaderboard")
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable is not set.")
+
+ hf_api = HfApi(token=HF_TOKEN)