# Dataset names of the first ViDoRe benchmark. Every entry has a nickname in
# ``get_datasets_nickname``.
VIDORE_V1_MTEB_NAMES = [
    "VidoreArxivQARetrieval",
    "VidoreDocVQARetrieval",
    "VidoreInfoVQARetrieval",
    "VidoreShiftProjectRetrieval",
    "VidoreSyntheticDocQAAIRetrieval",
    "VidoreSyntheticDocQAEnergyRetrieval",
    "VidoreSyntheticDocQAGovernmentReportsRetrieval",
    "VidoreSyntheticDocQAHealthcareIndustryRetrieval",
    "VidoreTabfquadRetrieval",
    "VidoreTatdqaRetrieval",
]

# Dataset names of the second ViDoRe benchmark. Every entry has a nickname in
# ``get_datasets_nickname``.
VIDORE_V2_MTEB_NAMES = [
    "Vidore2BioMedicalLecturesRetrieval",
    "Vidore2EconomicsReportsRetrieval",
    "Vidore2ESGReportsHLRetrieval",
    "Vidore2ESGReportsRetrieval",
]

# Substrings that ``deprecated_get_datasets_nickname`` looks for when
# recognising a first-benchmark dataset name.
DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
    "arxivqa",
    "docvqa",
    "infovqa",
    "tabfquad",
    "tatdqa",
    "shift",
    "artificial_intelligence",
    "energy",
    "government_reports",
    "healthcare_industry",
]

# Substrings that ``deprecated_get_datasets_nickname`` looks for when
# recognising a second-benchmark dataset name.
DEPRECATED_VIDORE_2_DATASETS_KEYWORDS = [
    "restaurant_esg",
    "rse_restaurant",
    "mit_biomedical",
    "economics_macro",
    "biomedical_lectures",
    "esg_reports",
    "economics_reports",
]

# Exact dataset name -> display nickname.
_NICKNAMES_BY_DATASET_NAME = {
    "VidoreArxivQARetrieval": "ArxivQA",
    "VidoreDocVQARetrieval": "DocVQA",
    "VidoreInfoVQARetrieval": "InfoVQA",
    "VidoreTabfquadRetrieval": "TabFQuad",
    "VidoreTatdqaRetrieval": "TAT-DQA",
    "VidoreShiftProjectRetrieval": "Shift Project",
    "VidoreSyntheticDocQAAIRetrieval": "Artificial Intelligence",
    "VidoreSyntheticDocQAEnergyRetrieval": "Energy",
    "VidoreSyntheticDocQAGovernmentReportsRetrieval": "Government Reports",
    "VidoreSyntheticDocQAHealthcareIndustryRetrieval": "Healthcare Industry",
    "Vidore2ESGReportsHLRetrieval": "ESG Restaurant Human English",
    "Vidore2ESGReportsRetrieval": "ESG Restaurant Synthetic Multilingual",
    "Vidore2BioMedicalLecturesRetrieval": "MIT Biomedical Multilingual",
    "Vidore2EconomicsReportsRetrieval": "Economics Macro Multilingual",
}

# (substring, nickname) pairs tried in this order; the first hit wins.
_DEPRECATED_V1_KEYWORD_NICKNAMES = (
    ("arxivqa", "ArxivQA"),
    ("docvqa", "DocVQA"),
    ("infovqa", "InfoVQA"),
    ("tabfquad", "TabFQuad"),
    ("tatdqa", "TAT-DQA"),
    ("shift", "Shift Project"),
    ("artificial_intelligence", "Artificial Intelligence"),
    ("energy", "Energy"),
    ("government_reports", "Government Reports"),
    ("healthcare_industry", "Healthcare Industry"),
)

# (first keyword, second keyword, base nickname) triples tried in this order.
# A name matched through the first keyword is multilingual when it also
# contains "multilingual"; a name matched through the second keyword is
# multilingual unless it contains "_eng_".
_DEPRECATED_V2_FAMILIES = (
    ("rse_restaurant", "esg_reports", "ESG Restaurant Synthetic"),
    ("mit_biomedical", "biomedical_lectures", "MIT Biomedical"),
    ("economics_macro", "economics_reports", "Economics Macro"),
)


def get_datasets_nickname(dataset_name: str) -> str:
    """Return the display nickname for an exact ViDoRe dataset name.

    Args:
        dataset_name: One of the names listed in ``VIDORE_V1_MTEB_NAMES`` or
            ``VIDORE_V2_MTEB_NAMES``. The match is exact and case-sensitive.

    Returns:
        The nickname associated with ``dataset_name``.

    Raises:
        ValueError: If ``dataset_name`` is not a known dataset name.
    """
    try:
        return _NICKNAMES_BY_DATASET_NAME[dataset_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are reported as
        # "not found" just like any other unknown value.
        raise ValueError(f"Dataset {dataset_name} not found in ViDoRe") from None


def deprecated_get_datasets_nickname(dataset_name: str) -> str:
    """Return the display nickname for a dataset name matched by substring.

    Rules are evaluated in a fixed order and the first match wins:

    1. the first-benchmark keywords (``"arxivqa"``, ``"docvqa"``, ...);
    2. the human-labelled ESG names (``"restaurant_esg"`` or
       ``"esg_reports_human"``);
    3. the ESG, biomedical and economics families, each with a
       ``"... Multilingual"`` variant and a plain variant.

    Matching is case-sensitive.

    Args:
        dataset_name: Dataset name to classify.

    Returns:
        The nickname of the first rule that matches ``dataset_name``.

    Raises:
        ValueError: If no rule matches ``dataset_name``.
    """
    for keyword, nickname in _DEPRECATED_V1_KEYWORD_NICKNAMES:
        if keyword in dataset_name:
            return nickname

    # Must be tested before the synthetic ESG family: "esg_reports_human"
    # also contains the "esg_reports" keyword of that family.
    if "restaurant_esg" in dataset_name or "esg_reports_human" in dataset_name:
        return "ESG Restaurant Human"

    is_multilingual = "multilingual" in dataset_name
    is_english = "_eng_" in dataset_name
    for first_keyword, second_keyword, nickname in _DEPRECATED_V2_FAMILIES:
        has_first = first_keyword in dataset_name
        has_second = second_keyword in dataset_name
        if (has_first and is_multilingual) or (has_second and not is_english):
            return f"{nickname} Multilingual"
        if has_first or (has_second and is_english):
            return nickname

    raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")