IMAGE_PATH = 'clinicbench.png'
RESULTS = 'clinicbench_result.json'

# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{Liu2024ClinicBench,
  title={Large Language Models Are Poor Clinical Decision-Makers: A Comprehensive Benchmark},
  author={Fenglin Liu and Zheng Li and Hongjian Zhou and Qingyu Yin and Jingfeng Yang and Xianfeng Tang and Chen Luo and Ming Zeng and Haoming Jiang and Yifan Gao and Priyanka Nigam and Sreyashi Nag and Bing Yin and Yining Hua and Xuan Zhou and Omid Rohanian and Anshul Thakur and Lei Clifton and David A. Clifton},
  booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year={2024}
}"""
CITATION_BUTTON_LABEL = "Please consider citing 📑 our paper if our repository is helpful to your work, thanks sincerely!"

# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# Medical LLM Leaderboard (Work in progress)
### Welcome to the Medical LLM Leaderboard! On this leaderboard, we evaluate 22 LLMs in the clinic:
### [Large Language Models Are Poor Clinical Decision-Makers: A Comprehensive Benchmark](https://aclanthology.org/2024.emnlp-main.759.pdf) 🏆

### Currently, the Medical LLM Leaderboard covers 11 tasks, 7 metrics, 17 datasets, and over 20,000 test samples. We reveal that LLMs are poor clinical decision-makers in multiple complex clinical tasks.

This leaderboard was last updated: Nov 11, 2024.

The Medical LLM Leaderboard includes 22 LLMs (i.e., 11 general LLMs and 11 medical LLMs), covering both open-source public LLMs and closed-source commercial LLMs, with parameter counts ranging from 7 to 70 billion (B).

To add your own model to the leaderboard, please send your results to us or create a PR in [ClinicBench](https://github.com/AI-in-Health/ClinicBench) to support your LLM; we will then help with the evaluation and update the leaderboard.

For any questions or concerns, please feel free to contact us at fenglin.liu@eng.ox.ac.uk and amzzhe@amazon.com.

Acknowledgements: This repository borrows code from the [Shopping MMLU Leaderboard](https://huggingface.co/spaces/KL4805/shopping_mmlu_leaderboard).
"""

# CONSTANTS-FIELDS
META_FIELDS = [
    'Method', 'Param (B)', 'OpenSource', 'Verified',
    'Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA'
]
MAIN_FIELDS = [
    'MedQA', 'MedMCQA', 'MMLU-Medicine', 'PubMedQA',
    'Referral QA', 'Treat Recom.',
    'MIMIC', 'IU-Xray', 'Hospitaliz. Summari.', 'Patient Education',
    'BC5', 'NCBI', 'DDI', 'GAD', 'HoC',
    'Pharma. QA', 'Drug Inter.'
]
# All benchmarks are selected by default.
DEFAULT_BENCH = list(MAIN_FIELDS)
MODEL_SIZE = ['7B', '13B', '70B', 'Unknown']
# MODEL_TYPE = ['Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']
MODEL_TYPE = ['OpenSource', 'Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']

# The Markdown description shown for each benchmark tab
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = """
## Included Evaluation Scenarios and Tasks:
- Clinical Language Reasoning: Exam-style QA, Referral QA, and Treatment Recommendation.
- Clinical Language Generation: Radiology Report Summarization, Hospitalization Summarization, and Patient Education.
- Clinical Language Understanding: Named Entity Recognition, Relation Extraction, Document Classification, Pharmacology QA for Emerging Drugs, and Drug Interaction for Emerging Drugs.
"""

LEADERBOARD_MD['RESULT'] = """
## Main Evaluation Results
"""
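

# Hypothetical usage sketch (an assumption, not part of the original app): one way
# the constants above might be consumed to turn `clinicbench_result.json` into the
# table the leaderboard displays. The JSON layout (a list of per-model records keyed
# by the field names defined above) and the helper name are assumptions.
def load_leaderboard_table(results_path=RESULTS):
    import json

    import pandas as pd  # assumed available, as is typical for leaderboard Spaces

    with open(results_path, 'r') as f:
        # Assumed layout: [{'Method': ..., 'Param (B)': ..., 'MedQA': ..., ...}, ...]
        records = json.load(f)
    df = pd.DataFrame(records)
    # Keep only the columns the leaderboard displays: meta fields first, then the
    # default benchmark columns; silently skip any column missing from the results.
    cols = [c for c in META_FIELDS + DEFAULT_BENCH if c in df.columns]
    return df[cols]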