# vidore-leaderboard / data / dataset_handler.py
# Page header residue: "QuentinJG's picture"
# Commit: map_new_names (#10)
# Revision: 4ad8722 (verified)
# Task names listed under the ViDoRe v1 group. Each entry is one of the exact
# names accepted by `get_datasets_nickname`.
VIDORE_V1_MTEB_NAMES = [
    "VidoreArxivQARetrieval",
    "VidoreDocVQARetrieval",
    "VidoreInfoVQARetrieval",
    "VidoreShiftProjectRetrieval",
    "VidoreSyntheticDocQAAIRetrieval",
    "VidoreSyntheticDocQAEnergyRetrieval",
    "VidoreSyntheticDocQAGovernmentReportsRetrieval",
    "VidoreSyntheticDocQAHealthcareIndustryRetrieval",
    "VidoreTabfquadRetrieval",
    "VidoreTatdqaRetrieval",
]
# Task names listed under the ViDoRe v2 group. Each entry is one of the exact
# names accepted by `get_datasets_nickname`.
VIDORE_V2_MTEB_NAMES = [
    "Vidore2BioMedicalLecturesRetrieval",
    "Vidore2EconomicsReportsRetrieval",
    "Vidore2ESGReportsHLRetrieval",
    "Vidore2ESGReportsRetrieval",
]
# Substrings that identify the legacy (deprecated) ViDoRe dataset names; the
# same keywords are matched by `deprecated_get_datasets_nickname`.
DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
    "arxivqa",
    "docvqa",
    "infovqa",
    "tabfquad",
    "tatdqa",
    "shift",
    "artificial_intelligence",
    "energy",
    "government_reports",
    "healthcare_industry",
]
# Substrings that identify the legacy (deprecated) ViDoRe 2 dataset names; the
# same keywords are matched by `deprecated_get_datasets_nickname`.
DEPRECATED_VIDORE_2_DATASETS_KEYWORDS = [
    "restaurant_esg",
    "rse_restaurant",
    "mit_biomedical",
    "economics_macro",
    "biomedical_lectures",
    "esg_reports",
    "economics_reports",
]
# Exact task name -> short nickname. ViDoRe v1 entries first, then ViDoRe v2.
_NICKNAME_BY_MTEB_NAME = {
    "VidoreArxivQARetrieval": "ArxivQA",
    "VidoreDocVQARetrieval": "DocVQA",
    "VidoreInfoVQARetrieval": "InfoVQA",
    "VidoreTabfquadRetrieval": "TabFQuad",
    "VidoreTatdqaRetrieval": "TAT-DQA",
    "VidoreShiftProjectRetrieval": "Shift Project",
    "VidoreSyntheticDocQAAIRetrieval": "Artificial Intelligence",
    "VidoreSyntheticDocQAEnergyRetrieval": "Energy",
    "VidoreSyntheticDocQAGovernmentReportsRetrieval": "Government Reports",
    "VidoreSyntheticDocQAHealthcareIndustryRetrieval": "Healthcare Industry",
    "Vidore2ESGReportsHLRetrieval": "ESG Restaurant Human English",
    "Vidore2ESGReportsRetrieval": "ESG Restaurant Synthetic Multilingual",
    "Vidore2BioMedicalLecturesRetrieval": "MIT Biomedical Multilingual",
    "Vidore2EconomicsReportsRetrieval": "Economics Macro Multilingual",
}


def get_datasets_nickname(dataset_name: str) -> str:
    """Return the short nickname for an exact ViDoRe task name.

    The match is an exact, case-sensitive comparison against the full task
    name (e.g. ``"VidoreArxivQARetrieval"`` -> ``"ArxivQA"``); substrings and
    legacy dataset paths are not recognised here.

    Args:
        dataset_name: Full task name to look up.

    Returns:
        The nickname associated with ``dataset_name``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the known task names.
    """
    try:
        return _NICKNAME_BY_MTEB_NAME[dataset_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs (e.g. a list): an equality-based
        # comparison chain would simply not match them, so they are reported
        # as unknown names rather than leaking a lookup-specific error.
        raise ValueError(f"Dataset {dataset_name} not found in ViDoRe") from None
# Legacy ViDoRe keywords, checked in this order; the first substring hit wins.
_DEPRECATED_KEYWORD_NICKNAMES = (
    ("arxivqa", "ArxivQA"),
    ("docvqa", "DocVQA"),
    ("infovqa", "InfoVQA"),
    ("tabfquad", "TabFQuad"),
    ("tatdqa", "TAT-DQA"),
    ("shift", "Shift Project"),
    ("artificial_intelligence", "Artificial Intelligence"),
    ("energy", "Energy"),
    ("government_reports", "Government Reports"),
    ("healthcare_industry", "Healthcare Industry"),
)

# Legacy ViDoRe 2 families, checked in this order:
# (older keyword, newer keyword, nickname of the non-multilingual variant).
_DEPRECATED_V2_FAMILIES = (
    ("rse_restaurant", "esg_reports", "ESG Restaurant Synthetic"),
    ("mit_biomedical", "biomedical_lectures", "MIT Biomedical"),
    ("economics_macro", "economics_reports", "Economics Macro"),
)


def deprecated_get_datasets_nickname(dataset_name) -> str:
    """Return the nickname for a legacy ViDoRe dataset name via substring match.

    Checks run in a fixed order and the first match wins:

    1. The simple keywords (``"arxivqa"``, ``"docvqa"``, ... ,
       ``"healthcare_industry"``).
    2. ``"restaurant_esg"`` or ``"esg_reports_human"`` ->
       ``"ESG Restaurant Human"``. This must precede the ``"esg_reports"``
       family below, whose keyword is a prefix of ``"esg_reports_human"``.
    3. The ESG / biomedical / economics families. Within a family, a name is
       the "Multilingual" variant when it contains the older keyword together
       with ``"multilingual"``, or the newer keyword without ``"_eng_"``.
       Otherwise it is the plain variant.

    Matching is case-sensitive.

    Args:
        dataset_name: Legacy dataset name (anything supporting ``in``
            substring tests, normally a ``str``).

    Returns:
        The nickname of the first matching rule.

    Raises:
        ValueError: If no rule matches ``dataset_name``.
    """
    for keyword, nickname in _DEPRECATED_KEYWORD_NICKNAMES:
        if keyword in dataset_name:
            return nickname

    if "restaurant_esg" in dataset_name or "esg_reports_human" in dataset_name:
        return "ESG Restaurant Human"

    tagged_multilingual = "multilingual" in dataset_name
    tagged_english = "_eng_" in dataset_name
    for old_keyword, new_keyword, nickname in _DEPRECATED_V2_FAMILIES:
        has_old = old_keyword in dataset_name
        has_new = new_keyword in dataset_name
        # Older names opt in to multilingual explicitly; newer names are
        # multilingual unless they carry the "_eng_" marker.
        if (has_old and tagged_multilingual) or (has_new and not tagged_english):
            return f"{nickname} Multilingual"
        if has_old or (has_new and tagged_english):
            return nickname

    raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")