"""Unit tests for the utility helpers in src.utils."""

import pandas as pd
import pytest

from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
from src.models import TaskType, model_hyperlink
from src.utils import (
    _update_df_elem,
    calculate_mean,
    filter_models,
    filter_queries,
    get_default_cols,
    get_selected_cols,
    remove_html,
    select_columns,
)

# Expected benchmark counts per AIR-Bench version and task type.
NUM_QA_BENCHMARKS_24_05 = 53
NUM_DOC_BENCHMARKS_24_05 = 11
NUM_QA_BENCHMARKS_24_04 = 13
NUM_DOC_BENCHMARKS_24_04 = 15


@pytest.fixture
def toy_df():
    # Four submissions: two retrieval methods, each with and without a
    # reranker; the last row is marked as an anonymous submission.
    return pd.DataFrame(
        {
            "Retrieval Method": [
                "bge-m3",
                "bge-m3",
                "jina-embeddings-v2-base",
                "jina-embeddings-v2-base",
            ],
            "Reranking Model": [
                "bge-reranker-v2-m3",
                "NoReranker",
                "bge-reranker-v2-m3",
                "NoReranker",
            ],
            "Rank 🏆": [1, 2, 3, 4],
            "Revision": ["123", "234", "345", "456"],
            "Submission Date": ["", "", "", ""],
            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
            "wiki_en": [0.8, 0.7, 0.2, 0.1],
            "wiki_zh": [0.4, 0.1, 0.4, 0.3],
            "news_en": [0.8, 0.7, 0.2, 0.1],
            "news_zh": [0.4, 0.1, 0.2, 0.3],
            "Anonymous Submission": [False, False, False, True],
        }
    )


def test_remove_html():
    # remove_html should strip the hyperlink markup and keep the model name.
    model_name = "jina-embeddings-v3"
    html_str = model_hyperlink("https://jina.ai", model_name)
    output_str = remove_html(html_str)
    assert output_str == model_name


def test_calculate_mean():
    valid_row = [1, 3]
    invalid_row = [2, pd.NA]
    df = pd.DataFrame([valid_row, invalid_row], columns=["a", "b"])
    result = list(df.apply(calculate_mean, axis=1))
    assert result[0] == sum(valid_row) / 2
    # Rows containing NA values fall back to the sentinel -1.
    assert result[1] == -1


@pytest.mark.parametrize(
    "models, expected",
    [
        (["model1", "model3"], 2),
        (["model1", "model_missing"], 1),
        (["model1", "model2", "model3"], 3),
        (["model1"], 1),
        ([], 3),
    ],
)
def test_filter_models(models, expected):
    df = pd.DataFrame(
        {
            COL_NAME_RERANKING_MODEL: ["model1", "model2", "model3"],
            "col2": [1, 2, 3],
        }
    )
    output_df = filter_models(df, models)
    assert len(output_df) == expected


@pytest.mark.parametrize(
    "query, expected",
    [
        ("model1;model3", 2),
        ("model1;model4", 1),
        ("model1;model2;model3", 3),
        ("model1", 1),
        ("", 3),
    ],
)
def test_filter_queries(query, expected):
    # Queries are semicolon-separated; a row matches if either its retrieval
    # or its reranking model name matches.
    df = pd.DataFrame(
        {
            COL_NAME_RETRIEVAL_MODEL: ["model1", "model2", "model3"],
            COL_NAME_RERANKING_MODEL: ["model4", "model5", "model6"],
        }
    )
    output_df = filter_queries(query, df)
    assert len(output_df) == expected


@pytest.mark.parametrize(
    "task_type, slug, add_fix_cols, expected",
    [
        (TaskType.qa, "air_bench_2404", True, NUM_QA_BENCHMARKS_24_04),
        (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
        (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
        (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
    ],
)
def test_get_default_cols(task_type, slug, add_fix_cols, expected):
    # Fixed attribute columns; everything else is a benchmark column.
    attr_cols = [
        "Rank 🏆",
        "Retrieval Method",
        "Reranking Model",
        "Revision",
        "Submission Date",
        "Average ⬆️",
    ]
    cols, types = get_default_cols(task_type, slug)
    cols_set = frozenset(cols)
    attrs_set = frozenset(attr_cols)
    if add_fix_cols:
        assert attrs_set.issubset(cols_set)
    benchmark_cols = list(cols_set.difference(attrs_set))
    assert len(benchmark_cols) == expected


@pytest.mark.parametrize(
    "task_type, domains, languages, expected",
    [
        (TaskType.qa, ["wiki", "news"], ["zh"], ["wiki_zh", "news_zh"]),
        (TaskType.qa, ["law"], ["zh", "en"], ["law_en"]),
        (
            TaskType.long_doc,
            ["healthcare"],
            ["zh", "en"],
            [
                "healthcare_en_pubmed_100k_200k_1",
                "healthcare_en_pubmed_100k_200k_2",
                "healthcare_en_pubmed_100k_200k_3",
                "healthcare_en_pubmed_40k_50k_5_merged",
                "healthcare_en_pubmed_30k_40k_10_merged",
            ],
        ),
    ],
)
def test_get_selected_cols(task_type, domains, languages, expected):
    slug = "air_bench_2404"
    cols = get_selected_cols(task_type, slug, domains, languages)
    assert sorted(cols) == sorted(expected)


@pytest.mark.parametrize("reset_rank", [False])
def test_select_columns(toy_df, reset_rank):
    expected = [
        "Rank 🏆",
        "Retrieval Method",
        "Reranking Model",
        "Revision",
        "Submission Date",
        "Average ⬆️",
        "news_zh",
    ]
    df_result = select_columns(
        toy_df,
        ["news"],
        ["zh"],
        version_slug="air_bench_2404",
        reset_ranking=reset_rank,
    )
    assert len(df_result.columns) == len(expected)
    if reset_rank:
        # With a single benchmark column selected, the recomputed average
        # equals that column.
        assert df_result["Average ⬆️"].equals(df_result["news_zh"])
    else:
        assert df_result["Average ⬆️"].equals(toy_df["Average ⬆️"])


@pytest.mark.parametrize(
    "reset_rank, show_anony",
    [
        (False, True),
        (True, True),
        (True, False),
    ],
)
def test__update_df_elem(toy_df, reset_rank, show_anony):
    df = _update_df_elem(
        TaskType.qa,
        "AIR-Bench_24.04",
        toy_df,
        ["news"],
        ["zh"],
        [],
        "",
        show_anony,
        reset_rank,
    )
    if show_anony:
        assert df.shape[0] == 4
        if reset_rank:
            assert df["Average ⬆️"].equals(df["news_zh"])
        else:
            assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
    else:
        # Hiding anonymous submissions drops the single anonymous row.
        assert df.shape[0] == 3