add auto eval
Browse files- app.py +10 -5
- eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +12 -12
- eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- src/about.py +5 -5
- src/envs.py +3 -2
- src/leaderboard/read_evals.py +1 -1
app.py
CHANGED
|
@@ -24,7 +24,7 @@ from src.display.utils import (
|
|
| 24 |
WeightType,
|
| 25 |
Precision
|
| 26 |
)
|
| 27 |
-
from src.envs import API, EVAL_REQUESTS_PATH,
|
| 28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
from src.submission.submit import add_new_eval
|
| 30 |
|
|
@@ -41,7 +41,8 @@ try:
|
|
| 41 |
except Exception:
|
| 42 |
restart_space()
|
| 43 |
try:
|
| 44 |
-
print(
|
|
|
|
| 45 |
# snapshot_download(
|
| 46 |
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 47 |
# )
|
|
@@ -49,7 +50,8 @@ except Exception:
|
|
| 49 |
restart_space()
|
| 50 |
|
| 51 |
|
| 52 |
-
|
|
|
|
| 53 |
|
| 54 |
# (
|
| 55 |
# finished_eval_queue_df,
|
|
@@ -97,8 +99,11 @@ with demo:
|
|
| 97 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 98 |
|
| 99 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 100 |
-
with gr.TabItem("
|
| 101 |
-
leaderboard = init_leaderboard(
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 104 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 24 |
WeightType,
|
| 25 |
Precision
|
| 26 |
)
|
| 27 |
+
from src.envs import API, EVAL_REQUESTS_PATH, AUTO_RESULTS_PATH, HUMAN_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
from src.submission.submit import add_new_eval
|
| 30 |
|
|
|
|
| 41 |
except Exception:
|
| 42 |
restart_space()
|
| 43 |
try:
|
| 44 |
+
print(AUTO_RESULTS_PATH)
|
| 45 |
+
print(HUMAN_RESULTS_PATH)
|
| 46 |
# snapshot_download(
|
| 47 |
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 48 |
# )
|
|
|
|
| 50 |
restart_space()
|
| 51 |
|
| 52 |
|
| 53 |
+
AUTO_LEADERBOARD_DF = get_leaderboard_df(AUTO_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 54 |
+
HUMAN_LEADERBOARD_DF = get_leaderboard_df(HUMAN_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 55 |
|
| 56 |
# (
|
| 57 |
# finished_eval_queue_df,
|
|
|
|
| 99 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 100 |
|
| 101 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 102 |
+
with gr.TabItem("🏅OmniEval-Human", elem_id="llm-benchmark-tab-table", id=0):
|
| 103 |
+
leaderboard = init_leaderboard(HUMAN_LEADERBOARD_DF)
|
| 104 |
+
|
| 105 |
+
with gr.TabItem("🤖OmniEval-Auto", elem_id="llm-benchmark-tab-table", id=1):
|
| 106 |
+
leaderboard = init_leaderboard(AUTO_LEADERBOARD_DF)
|
| 107 |
|
| 108 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 109 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.0,
|
| 5 |
+
"map": 0.0
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0011680767773708802,
|
| 9 |
+
"f1": 0.3709233008524321,
|
| 10 |
+
"rouge1": 0.2570830224992733,
|
| 11 |
+
"rouge2": 0.09085043984411759,
|
| 12 |
+
"rougeL": 0.1860727124152372,
|
| 13 |
+
"accuracy": 0.35869427958075517,
|
| 14 |
+
"completeness": 0.5755086661642803,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.0,
|
| 17 |
+
"numerical_accuracy": 0.11213720316622691
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "CLOSE_deepseek-v2-chat",
|
| 22 |
+
"generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
| 25 |
+
"num_params": 236,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "CLOSE",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"num_params": 0.0,
|
| 31 |
+
"open_source": true
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.0,
|
| 5 |
+
"map": 0.0
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0008839499936860714,
|
| 9 |
+
"f1": 0.39891051266403244,
|
| 10 |
+
"rouge1": 0.2679937299203498,
|
| 11 |
+
"rouge2": 0.09293819886242284,
|
| 12 |
+
"rougeL": 0.19931718897529843,
|
| 13 |
+
"accuracy": 0.3238413941154186,
|
| 14 |
+
"completeness": 0.52843637454982,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.0,
|
| 17 |
+
"numerical_accuracy": 0.06765619606489472
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "CLOSE_llama3-70b-instruct",
|
| 22 |
+
"generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
| 25 |
+
"num_params": 70.6,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "CLOSE",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"num_params": 0.0,
|
| 31 |
+
"open_source": true
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.0,
|
| 5 |
+
"map": 0.0
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0002525571410531633,
|
| 9 |
+
"f1": 0.32215271896313463,
|
| 10 |
+
"rouge1": 0.2352109086389165,
|
| 11 |
+
"rouge2": 0.08060449522198783,
|
| 12 |
+
"rougeL": 0.16073680618083347,
|
| 13 |
+
"accuracy": 0.37883571157974494,
|
| 14 |
+
"completeness": 0.6016923768159353,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.0,
|
| 17 |
+
"numerical_accuracy": 0.1255931667193926
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "CLOSE_qwen2-72b",
|
| 22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"num_params": 72.7,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "CLOSE",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"num_params": 0.0,
|
| 31 |
+
"open_source": true
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.0,
|
| 5 |
+
"map": 0.0
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0,
|
| 9 |
+
"f1": 0.06725057117657031,
|
| 10 |
+
"rouge1": 0.1277764944666756,
|
| 11 |
+
"rouge2": 0.03211441875898112,
|
| 12 |
+
"rougeL": 0.03257144660565082,
|
| 13 |
+
"accuracy": 0.15734309887612072,
|
| 14 |
+
"completeness": 0.5063249001331558,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.0,
|
| 17 |
+
"numerical_accuracy": 0.06932865291794647
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "CLOSE_yi15-34b",
|
| 22 |
+
"generative_model": "01ai/Yi-1.5-34B-Chat-16K",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "01ai/Yi-1.5-34B-Chat-16K",
|
| 25 |
+
"num_params": 34.4,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "CLOSE",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"num_params": 0.0,
|
| 31 |
+
"open_source": true
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.3097634381445468,
|
| 5 |
+
"map": 0.30402197247127166
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0026518499810582142,
|
| 9 |
+
"f1": 0.2480828824153542,
|
| 10 |
+
"rouge1": 0.2493538725800514,
|
| 11 |
+
"rouge2": 0.1235656068292625,
|
| 12 |
+
"rougeL": 0.16098924930699862,
|
| 13 |
+
"accuracy": 0.3906427579239803,
|
| 14 |
+
"completeness": 0.5930474914396308,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.5045650189122212,
|
| 17 |
+
"numerical_accuracy": 0.28149656401119877
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "bge-large-zh_qwen2-72b",
|
| 22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"num_params": 72.7,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "BAAI/bge-large-zh",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "BAAI/bge-large-zh",
|
| 31 |
+
"num_params": 0.326,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.33076566906595944,
|
| 5 |
+
"map": 0.32402765500694536
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.002525571410531633,
|
| 9 |
+
"f1": 0.2524796046548042,
|
| 10 |
+
"rouge1": 0.2542055585319881,
|
| 11 |
+
"rouge2": 0.12967013110722864,
|
| 12 |
+
"rougeL": 0.16623387811734364,
|
| 13 |
+
"accuracy": 0.0,
|
| 14 |
+
"completeness": 0.0,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.0,
|
| 17 |
+
"numerical_accuracy": 0.0
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "bge-m3_qwen2-72b",
|
| 22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"num_params": 72.7,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "BAAI/bge-m3",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "BAAI/bge-m3",
|
| 31 |
+
"num_params": 0.5,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.26059266742433806,
|
| 5 |
+
"map": 0.25533526960474806
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.002146735698951888,
|
| 9 |
+
"f1": 0.24207930410773865,
|
| 10 |
+
"rouge1": 0.24073805243800728,
|
| 11 |
+
"rouge2": 0.1162276261848681,
|
| 12 |
+
"rougeL": 0.1534679545927458,
|
| 13 |
+
"accuracy": 0.37713095087763604,
|
| 14 |
+
"completeness": 0.5855007473841555,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.49136152656008253,
|
| 17 |
+
"numerical_accuracy": 0.2582123758594347
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "e5-mistral-7b_qwen2-72b",
|
| 22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"num_params": 72.7,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "intfloat/e5-mistral-7b-instruct",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "intfloat/e5-mistral-7b-instruct",
|
| 31 |
+
"num_params": 7.11,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.3406848507808225,
|
| 5 |
+
"map": 0.3337426863661236
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0035568464031653824,
|
| 9 |
+
"f1": 0.3226028700822056,
|
| 10 |
+
"rouge1": 0.29804464952499493,
|
| 11 |
+
"rouge2": 0.1619392409911174,
|
| 12 |
+
"rougeL": 0.21536150159516076,
|
| 13 |
+
"accuracy": 0.3783377209477247,
|
| 14 |
+
"completeness": 0.5935541629364369,
|
| 15 |
+
"hallucination": 0.06668379802132854,
|
| 16 |
+
"utilization": 0.48314821907315203,
|
| 17 |
+
"numerical_accuracy": 0.2761605035405193
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
|
| 22 |
+
"generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
| 25 |
+
"num_params": 236,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
| 31 |
+
"num_params": 1.78,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.3406848507808225,
|
| 5 |
+
"map": 0.3337426863661236
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.030906680136380857,
|
| 9 |
+
"f1": 0.4704248712273675,
|
| 10 |
+
"rouge1": 0.3844331865430577,
|
| 11 |
+
"rouge2": 0.21544656691735142,
|
| 12 |
+
"rougeL": 0.3082188596657867,
|
| 13 |
+
"accuracy": 0.4181714862987751,
|
| 14 |
+
"completeness": 0.586105675146771,
|
| 15 |
+
"hallucination": 0.0880543450397334,
|
| 16 |
+
"utilization": 0.45601078859491395,
|
| 17 |
+
"numerical_accuracy": 0.2751721876024926
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
|
| 22 |
+
"generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
| 25 |
+
"num_params": 70.6,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
| 31 |
+
"num_params": 1.78,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
RENAMED
|
@@ -1,20 +1,20 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"retrieval": {
|
| 4 |
-
"mrr": 0.
|
| 5 |
-
"map": 0.
|
| 6 |
},
|
| 7 |
"generation": {
|
| 8 |
-
"em": 0.
|
| 9 |
-
"f1": 0.
|
| 10 |
-
"rouge1": 0.
|
| 11 |
-
"rouge2": 0.
|
| 12 |
-
"rougeL": 0.
|
| 13 |
-
"accuracy": 0.
|
| 14 |
-
"completeness": 0.
|
| 15 |
-
"hallucination": 0.
|
| 16 |
-
"utilization":
|
| 17 |
-
"numerical_accuracy": 0.
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"retrieval": {
|
| 4 |
+
"mrr": 0.3406848507808225,
|
| 5 |
+
"map": 0.3337426863661236
|
| 6 |
},
|
| 7 |
"generation": {
|
| 8 |
+
"em": 0.0028412678368480867,
|
| 9 |
+
"f1": 0.2477112059712835,
|
| 10 |
+
"rouge1": 0.25666135328401396,
|
| 11 |
+
"rouge2": 0.13256084364546591,
|
| 12 |
+
"rougeL": 0.1669344569228441,
|
| 13 |
+
"accuracy": 0.40573304710190683,
|
| 14 |
+
"completeness": 0.6131668895824045,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.5346272891410885,
|
| 17 |
+
"numerical_accuracy": 0.2971301335972291
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.3406848507808225,
|
| 5 |
+
"map": 0.3337426863661236
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0,
|
| 9 |
+
"f1": 0.09732568803130702,
|
| 10 |
+
"rouge1": 0.1642342072893325,
|
| 11 |
+
"rouge2": 0.06542075931397044,
|
| 12 |
+
"rougeL": 0.059256539829821125,
|
| 13 |
+
"accuracy": 0.3304375804375804,
|
| 14 |
+
"completeness": 0.5735068912710567,
|
| 15 |
+
"hallucination": 0.06555017663221248,
|
| 16 |
+
"utilization": 0.4132755170113409,
|
| 17 |
+
"numerical_accuracy": 0.175
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "gte-qwen2-1.5b_yi15-34b",
|
| 22 |
+
"generative_model": "01ai/Yi-1.5-34B-Chat-16K",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "01ai/Yi-1.5-34B-Chat-16K",
|
| 25 |
+
"num_params": 34.4,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
| 31 |
+
"num_params": 1.78,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"retrieval": {
|
| 4 |
+
"mrr": 0.25315906890600665,
|
| 5 |
+
"map": 0.24830681483352277
|
| 6 |
+
},
|
| 7 |
+
"generation": {
|
| 8 |
+
"em": 0.0026518499810582142,
|
| 9 |
+
"f1": 0.24837825152624493,
|
| 10 |
+
"rouge1": 0.24111819423215256,
|
| 11 |
+
"rouge2": 0.11665848753826197,
|
| 12 |
+
"rougeL": 0.1558018779014647,
|
| 13 |
+
"accuracy": 0.3705644652102538,
|
| 14 |
+
"completeness": 0.5820335932813437,
|
| 15 |
+
"hallucination": 0.0,
|
| 16 |
+
"utilization": 0.4738984364905027,
|
| 17 |
+
"numerical_accuracy": 0.24648820567187915
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"config": {
|
| 21 |
+
"eval_name": "jina-zh_qwen2-72b",
|
| 22 |
+
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
+
"generative_model_args": {
|
| 24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"num_params": 72.7,
|
| 26 |
+
"open_source": true
|
| 27 |
+
},
|
| 28 |
+
"retrieval_model": "jinaai/jina-embeddings-v2-base-zh",
|
| 29 |
+
"retrieval_model_args": {
|
| 30 |
+
"name": "jinaai/jina-embeddings-v2-base-zh",
|
| 31 |
+
"num_params": 0.161,
|
| 32 |
+
"open_source": true
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
-
"eval_name": "
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
+
"eval_name": "bge-large-zh_qwen2-72b",
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
-
"eval_name": "
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
+
"eval_name": "bge-m3_qwen2-72b",
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
-
"eval_name": "
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
+
"eval_name": "e5-mistral-7b_qwen2-72b",
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
-
"eval_name": "
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
+
"eval_name": "gte-qwen2-1.5b_qwen2-72b",
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
RENAMED
|
File without changes
|
eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
RENAMED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
-
"eval_name": "
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
|
|
|
| 18 |
}
|
| 19 |
},
|
| 20 |
"config": {
|
| 21 |
+
"eval_name": "jina-zh_qwen2-72b",
|
| 22 |
"generative_model": "Qwen/Qwen2.5-72B-Instruct",
|
| 23 |
"generative_model_args": {
|
| 24 |
"name": "Qwen/Qwen2.5-72B-Instruct",
|
src/about.py
CHANGED
|
@@ -83,12 +83,12 @@ LLM_BENCHMARKS_TEXT = f"""
|
|
| 83 |
With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
|
| 84 |
|
| 85 |
|
| 86 |
-
##
|
| 87 |
`conda env create -f environment.yml && conda activate finrag`
|
| 88 |
|
| 89 |
-
<!-- ##
|
| 90 |
1. -->
|
| 91 |
-
##
|
| 92 |
Notion:
|
| 93 |
1. The code run path is `./OpenFinBench`
|
| 94 |
2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
|
|
@@ -136,11 +136,11 @@ Then conduct the model-based evaluate using the following codes, (change the par
|
|
| 136 |
sh evaluator/judgement/judger.sh
|
| 137 |
```
|
| 138 |
|
| 139 |
-
##
|
| 140 |
|
| 141 |
OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
|
| 142 |
|
| 143 |
-
##
|
| 144 |
The paper is waiting to be released!
|
| 145 |
|
| 146 |
<!-- # Check Infos
|
|
|
|
| 83 |
With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
|
| 84 |
|
| 85 |
|
| 86 |
+
## 🔧 Installation
|
| 87 |
`conda env create -f environment.yml && conda activate finrag`
|
| 88 |
|
| 89 |
+
<!-- ## ✨ Features
|
| 90 |
1. -->
|
| 91 |
+
## 🚀 Quick-Start
|
| 92 |
Notion:
|
| 93 |
1. The code run path is `./OpenFinBench`
|
| 94 |
2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
|
|
|
|
| 136 |
sh evaluator/judgement/judger.sh
|
| 137 |
```
|
| 138 |
|
| 139 |
+
## 🔖 License
|
| 140 |
|
| 141 |
OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
|
| 142 |
|
| 143 |
+
## 🌟 Citation
|
| 144 |
The paper is waiting to be released!
|
| 145 |
|
| 146 |
<!-- # Check Infos
|
src/envs.py
CHANGED
|
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
|
|
| 6 |
# ----------------------------------
|
| 7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
-
OWNER = "
|
| 10 |
# ----------------------------------
|
| 11 |
|
| 12 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
@@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
| 18 |
|
| 19 |
# Local caches
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
-
|
|
|
|
| 22 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 23 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 24 |
|
|
|
|
| 6 |
# ----------------------------------
|
| 7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
+
OWNER = "RUC-NLPIR" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
| 10 |
# ----------------------------------
|
| 11 |
|
| 12 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
|
|
| 18 |
|
| 19 |
# Local caches
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
+
HUMAN_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-human")
|
| 22 |
+
AUTO_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-auto")
|
| 23 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 24 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 25 |
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -183,7 +183,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
| 183 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 184 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 185 |
model_result_filepaths = []
|
| 186 |
-
|
| 187 |
for root, _, files in os.walk(results_path):
|
| 188 |
# We should only have json files in model results
|
| 189 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
|
|
|
| 183 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 184 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 185 |
model_result_filepaths = []
|
| 186 |
+
print(f"Reading results from {results_path}")
|
| 187 |
for root, _, files in os.walk(results_path):
|
| 188 |
# We should only have json files in model results
|
| 189 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|