chore: clean up the requests related codes
src/display/utils.py
CHANGED
@@ -19,18 +19,22 @@ class ColumnContent:
     never_hidden: bool = False
 
 
+COL_NAME_AVG = "Average ⬆️"
+COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
+COL_NAME_RERANKING_MODEL = "Reranking Model"
+
 def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     ## Leaderboard columns
     auto_eval_column_dict = []
     # Init
     auto_eval_column_dict.append(
-        ["retrieval_model", ColumnContent, ColumnContent(
+        ["retrieval_model", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, never_hidden=True)]
     )
     auto_eval_column_dict.append(
-        ["reranking_model", ColumnContent, ColumnContent(
+        ["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True)]
     )
     auto_eval_column_dict.append(
-        ["average", ColumnContent, ColumnContent(
+        ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
     )
     for benchmark in benchmarks:
         auto_eval_column_dict.append(
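The three COL_NAME_* constants introduced above give the UI and the result loader a single source of truth for column titles. A minimal sketch of a consumer, assuming the DataFrame later produced by get_leaderboard_df (the helper name top_entries is illustrative and not part of the codebase):

from src.display.utils import COL_NAME_AVG, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL

def top_entries(leaderboard_df, n=5):
    # sort and project by the shared constants instead of repeating literal strings
    cols = [COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_AVG]
    return leaderboard_df.sort_values(by=COL_NAME_AVG, ascending=False)[cols].head(n)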
src/leaderboard/read_evals.py
CHANGED
@@ -1,24 +1,28 @@
-import glob
-from collections import defaultdict
 import json
 import os.path
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import List
 
 import dateutil.parser._parser
+import pandas as pd
 
-from src.display.utils import AutoEvalColumnQA
 from src.benchmarks import get_safe_name
+from src.display.formatting import has_no_nan_values
+from src.display.utils import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COLS_QA, QA_BENCHMARK_COLS, \
+    COLS_LONG_DOC, LONG_DOC_BENCHMARK_COLS, COL_NAME_AVG
 
 
 @dataclass
 class EvalResult:
-    """
+    """
+    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
+    domains, languages, and datasets
     """
     eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
     retrieval_model: str
     reranking_model: str
-    results: list  # results on all the benchmarks
+    results: list  # results on all the benchmarks stored as dict
     task: str
     metric: str
     timestamp: str = ""  # submission timestamp
@@ -26,6 +30,9 @@ class EvalResult:
 
 @dataclass
 class FullEvalResult:
+    """
+    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
+    """
     eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
     retrieval_model: str
     reranking_model: str
@@ -34,7 +41,8 @@ class FullEvalResult:
 
     @classmethod
     def init_from_json_file(cls, json_filepath):
-        """
+        """
+        Initiate from the result json file for a single model.
         The json file will be written only when the status is FINISHED.
         """
         with open(json_filepath) as fp:
@@ -63,19 +71,18 @@ class FullEvalResult:
         )
 
     def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
-        """
+        """
+        Convert the results in all the EvalResults over different tasks and metrics. The output is a list of dict compatible with the dataframe UI
         """
         results = defaultdict(dict)
         for eval_result in self.results:
             if eval_result.metric != metric:
-                # print(f'result skipped: {metric} != {eval_result.metric}')
                 continue
             if eval_result.task != task:
-                # print(f'result skipped: {task} != {eval_result.task}')
                 continue
             results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
-            results[eval_result.eval_name][
-            results[eval_result.eval_name][
+            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
+            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
 
             print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
@@ -92,43 +99,20 @@ class FullEvalResult:
         return [v for v in results.values()]
 
 
-def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
-    """
-    Load the request status from a json file
-    """
-    request_files = os.path.join(
-        requests_path,
-        f"{retrieval_model_name}",
-        f"{reranking_model_name}",
-        "eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"]:
-                request_file = tmp_request_file
-                break
-    return request_file
-
-
 def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     """
     Load the evaluation results from a json file
     """
     model_result_filepaths = []
     for root, dirs, files in os.walk(results_path):
-        if len(files) == 0
+        if len(files) == 0:
            continue
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
-        # select the latest
+        # select the latest results
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
@@ -136,7 +120,6 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     for model_result_filepath in model_result_filepaths:
         # create evaluation results
        eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
-        model_result_date_str = model_result_filepath.split('/')[-1].removeprefix("results_").removesuffix(".json")
         print(f'file loaded: {model_result_filepath}')
         eval_name = eval_result.eval_name
         eval_results[eval_name] = eval_result
@@ -150,3 +133,35 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
             print(f"loading failed: {k}")
             continue
     return results
+
+
+def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
+    """
+    Creates a dataframe from all the individual experiment results
+    """
+    if task == "qa":
+        cols = COLS_QA
+        benchmark_cols = QA_BENCHMARK_COLS
+    elif task == "long_doc":
+        cols = COLS_LONG_DOC
+        benchmark_cols = LONG_DOC_BENCHMARK_COLS
+    else:
+        raise NotImplemented
+    all_data_json = []
+    for v in raw_data:
+        all_data_json += v.to_dict(task=task, metric=metric)
+    df = pd.DataFrame.from_records(all_data_json)
+    print(f'dataframe created: {df.shape}')
+
+    # calculate the average score for selected benchmarks
+    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
+    df[COL_NAME_AVG] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
+    df = df.sort_values(by=[COL_NAME_AVG], ascending=False)
+    df.reset_index(inplace=True)
+
+    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
+    df = df[_cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, _benchmark_cols)]
+    return df
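With get_leaderboard_df relocated into read_evals.py and the request-file plumbing removed, building a leaderboard table is a two-step call. A minimal usage sketch, assuming a local results directory laid out like tests/toydata/test_results (the "./results" path is an assumption):

from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df

# assumed layout: ./results/<retrieval_model>/<reranking_model>/results_<timestamp>.json
raw_data = get_raw_eval_results("./results")
qa_df = get_leaderboard_df(raw_data, task="qa", metric="ndcg_at_3")
long_doc_df = get_leaderboard_df(raw_data, task="long_doc", metric="ndcg_at_3")
print(qa_df.head())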
src/populate.py
DELETED
@@ -1,94 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, FullEvalResult
-from typing import Tuple, List
-
-
-def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_cols: list, task: str, metric: str) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    all_data_json = []
-    for v in raw_data:
-        all_data_json += v.to_dict(task=task, metric=metric)
-    df = pd.DataFrame.from_records(all_data_json)
-    print(f'dataframe created: {df.shape}')
-
-    # calculate the average score for selected benchmarks
-    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
-    if task == 'qa':
-        df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
-        df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
-    elif task == "long_doc":
-        df[AutoEvalColumnLongDoc.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
-        df = df.sort_values(by=[AutoEvalColumnLongDoc.average.name], ascending=False)
-
-    df.reset_index(inplace=True)
-
-    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
-    df = df[_cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, _benchmark_cols)]
-    return df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requests"""
-    # entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    # all_evals = []
-    #
-    # for entry in entries:
-    #     if ".json" in entry:
-    #         file_path = os.path.join(save_path, entry)
-    #         with open(file_path) as fp:
-    #             data = json.load(fp)
-    #
-    #         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-    #         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-    #
-    #         all_evals.append(data)
-    #     elif ".md" not in entry:
-    #         # this is a folder
-    #         sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-    #         for sub_entry in sub_entries:
-    #             file_path = os.path.join(save_path, entry, sub_entry)
-    #             with open(file_path) as fp:
-    #                 data = json.load(fp)
-    #
-    #             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-    #             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-    #             all_evals.append(data)
-    #
-    # pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    # running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    # finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    # df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    # df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    # df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    cols = ["Retrieval Model", "Submitted Time", "Status"]
-    df_finished = pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
-            "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
-            "Status": ["FINISHED", "FINISHED"]
-        }
-    )
-    df_running = pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
-            "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
-            "Status": ["RUNNING", "RUNNING"]
-        }
-    )
-    df_pending = pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
-            "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
-            "Status": ["PENDING", "PENDING"]
-        }
-    )
-    return df_finished, df_running, df_pending
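For any call site that still imported the deleted module, the migration is mechanical: the column lists are no longer passed in, because the new get_leaderboard_df derives them from the task. A hedged before/after sketch (the "./results" path is an assumption; cols and benchmark_cols appear only to illustrate the old signature):

from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df

raw_data = get_raw_eval_results("./results")  # assumed results directory

# before: src.populate (deleted in this commit)
#   df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'qa', 'ndcg_at_1')

# after: src.leaderboard.read_evals
df = get_leaderboard_df(raw_data, task='qa', metric='ndcg_at_1')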
tests/src/leaderboard/test_read_evals.py
CHANGED
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results
+from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results, get_leaderboard_df
 
 cur_fp = Path(__file__)
 
@@ -8,7 +8,11 @@ cur_fp = Path(__file__)
 def test_init_from_json_file():
     json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-
+    num_different_task_domain_lang_metric_dataset_combination = 6
+    assert len(full_eval_result.results) == \
+        num_different_task_domain_lang_metric_dataset_combination
+    assert full_eval_result.retrieval_model == "bge-m3"
+    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"
 
 
 def test_to_dict():
@@ -32,3 +36,32 @@ def test_get_raw_eval_results():
     assert len(results[0].results) == 6
     assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
     assert len(results[1].results) == 6
+
+def test_get_leaderboard_df():
+    results_path = cur_fp.parents[2] / "toydata" / "test_results"
+    raw_data = get_raw_eval_results(results_path)
+    df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_1')
+    assert df.shape[0] == 2
+    # the results contain only one embedding model
+    for i in range(2):
+        assert df["Retrieval Model"][i] == "bge-m3"
+    # the results contain only two reranking model
+    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
+    assert df["Reranking Model"][1] == "NoReranker"
+    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
+    assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
+
+
+def test_get_leaderboard_df_long_doc():
+    results_path = cur_fp.parents[2] / "toydata" / "test_results"
+    raw_data = get_raw_eval_results(results_path)
+    df = get_leaderboard_df(raw_data, 'long_doc', 'ndcg_at_1')
+    assert df.shape[0] == 2
+    # the results contain only one embedding model
+    for i in range(2):
+        assert df["Retrieval Model"][i] == "bge-m3"
+    # the results contains only two reranking model
+    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
+    assert df["Reranking Model"][1] == "NoReranker"
+    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
+    assert not df[['Average ⬆️', 'law_en_lex_files_500k_600k',]].isnull().values.any()
tests/src/test_populate.py
DELETED
@@ -1,41 +0,0 @@
-from src.populate import get_leaderboard_df
-from src.leaderboard.read_evals import get_raw_eval_results
-from pathlib import Path
-
-cur_fp = Path(__file__)
-
-
-def test_get_leaderboard_df():
-    requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
-    results_path = cur_fp.parents[1] / "toydata" / "test_results"
-    cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
-    benchmark_cols = ['wiki_en', 'wiki_zh',]
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'qa', 'ndcg_at_1')
-    assert df.shape[0] == 2
-    # the results contain only one embedding model
-    for i in range(2):
-        assert df["Retrieval Model"][i] == "bge-m3"
-    # the results contains only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
-
-
-def test_get_leaderboard_df_long_doc():
-    requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
-    results_path = cur_fp.parents[1] / "toydata" / "test_results"
-    cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'law_en_lex_files_500k_600k',]
-    benchmark_cols = ['law_en_lex_files_500k_600k',]
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'long_doc', 'ndcg_at_1')
-    assert df.shape[0] == 2
-    # the results contain only one embedding model
-    for i in range(2):
-        assert df["Retrieval Model"][i] == "bge-m3"
-    # the results contains only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert not df[['Average ⬆️', 'law_en_lex_files_500k_600k',]].isnull().values.any()
tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json
CHANGED
@@ -11,7 +11,7 @@
           "domain": "law",
           "lang": "en",
           "dataset": "lex_files_500K-600K",
-          "value": 0.
+          "value": 0.45723
         }
       ]
     },
@@ -27,7 +27,7 @@
           "domain": "law",
           "lang": "en",
           "dataset": "lex_files_500K-600K",
-          "value": 0.
+          "value": 0.49909
         }
       ]
     },
@@ -43,7 +43,7 @@
           "domain": "wiki",
           "lang": "en",
           "dataset": "unknown",
-          "value": 0.
+          "value": 0.49083
         }
       ]
     },
@@ -59,7 +59,7 @@
           "domain": "wiki",
           "lang": "en",
           "dataset": "unknown",
-          "value": 0.
+          "value": 0.43359
         }
       ]
     },
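Each leaf entry in this fixture carries a domain, lang, dataset, and value, and the benchmark columns asserted in the tests (wiki_en, law_en_lex_files_500k_600k) look like safe-named combinations of those fields. A sketch of that assumed mapping; the naming rule is inferred from the test columns and is not confirmed by this diff:

entry = {"domain": "law", "lang": "en", "dataset": "lex_files_500K-600K", "value": 0.45723}

# assumed rule: <domain>_<lang>, plus the dataset when it is meaningful, lower-cased and made "safe"
col = f"{entry['domain']}_{entry['lang']}"
if entry["dataset"] not in ("", "unknown"):
    col += "_" + entry["dataset"].lower().replace("-", "_")
print(col, entry["value"])  # law_en_lex_files_500k_600k 0.45723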
utils.py
CHANGED
@@ -7,9 +7,8 @@ from huggingface_hub import HfApi
 
 from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
 from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
-from src.leaderboard.read_evals import FullEvalResult
+from src.leaderboard.read_evals import FullEvalResult, get_leaderboard_df
 from typing import List
-from src.populate import get_leaderboard_df
 
 
 def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame: