IMAGE_PATH = 'clinicbench.png'
RESULTS = 'clinicbench_result.json'
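# Illustrative sketch only (not part of the original constants): it assumes
# clinicbench_result.json maps model names to per-dataset scores, and shows
# one plausible way the file could be loaded for display on the leaderboard.
def load_results(path=RESULTS):
    """Load the result JSON into a pandas DataFrame (one row per model, assumed format)."""
    import json
    import pandas as pd
    with open(path, 'r') as f:
        results = json.load(f)  # assumed: {model_name: {dataset: score, ...}}
    return pd.DataFrame.from_dict(results, orient='index')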
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{Liu2024ClinicBench,
title={Large Language Models Are Poor Clinical Decision-Makers: A Comprehensive Benchmark},
author={Fenglin Liu and Zheng Li and Hongjian Zhou and Qingyu Yin and Jingfeng Yang and Xianfeng Tang and Chen Luo and Ming Zeng and Haoming Jiang and Yifan Gao and Priyanka Nigam and Sreyashi Nag and Bing Yin and Yining Hua and Xuan Zhou and Omid Rohanian and Anshul Thakur and Lei Clifton and David A. Clifton},
booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2024}
}"""
CITATION_BUTTON_LABEL = "Please consider citing 📑 our paper if this repository is helpful to your work. Thank you sincerely!"
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# Medical LLM Leaderboard (Work in progress)
### Welcome to the Medical LLM Leaderboard! On this leaderboard, we evaluate 22 LLMs in clinical scenarios:
### [Large Language Models Are Poor Clinical Decision-Makers: A Comprehensive Benchmark](https://aclanthology.org/2024.emnlp-main.759.pdf) 🏆
### Currently, the Medical LLM Leaderboard covers 11 tasks, 7 metrics, 17 datasets, and over 20,000 test samples.
We reveal that LLMs are poor clinical decision-makers on multiple complex clinical tasks.
This leaderboard was last updated: Nov 11, 2024.
The Medical LLM Leaderboard includes 22 LLMs (i.e., 11 general LLMs and 11 medical LLMs), covering both open-source public LLMs and closed-source commercial LLMs,
with parameter counts ranging from 7 to 70 billion (B).
To add your own model to the leaderboard, please send us your results or create a PR in [ClinicBench](https://github.com/AI-in-Health/ClinicBench) adding support for your LLM;
we will then help with the evaluation and update the leaderboard.
For any questions or concerns, please feel free to contact us at [email protected] and [email protected].
Acknowledgements:
This repository borrows code from [Shopping MMLU Leaderboard](https://huggingface.co/spaces/KL4805/shopping_mmlu_leaderboard).
"""
# CONSTANTS-FIELDS
META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified', 'Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']
MAIN_FIELDS = [
'MedQA', 'MedMCQA', 'MMLU-Medicine', 'PubMedQA', 'Referral QA', 'Treat Recom.', 'MIMIC', 'IU-Xray',
'Hospitaliz. Summari.', 'Patient Education', 'BC5', 'NCBI', 'DDI', 'GAD', 'HoC', 'Pharma. QA', 'Drug Inter.'
]
DEFAULT_BENCH = [
'MedQA', 'MedMCQA', 'MMLU-Medicine', 'PubMedQA', 'Referral QA', 'Treat Recom.', 'MIMIC', 'IU-Xray',
'Hospitaliz. Summari.', 'Patient Education', 'BC5', 'NCBI', 'DDI', 'GAD', 'HoC', 'Pharma. QA', 'Drug Inter.'
]
MODEL_SIZE = ['7B', '13B', '70B', 'Unknown']
# MODEL_TYPE = ['Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']
MODEL_TYPE = ['OpenSource', 'Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']
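# Illustrative sketch only -- a hypothetical helper, not part of the original
# constants. It assumes the leaderboard DataFrame has a 'Model Size' column
# holding values from MODEL_SIZE and one boolean column per entry of MODEL_TYPE,
# and shows how the field lists above might drive filtering in the UI.
def filter_leaderboard(df, sizes=None, types=None, fields=None):
    """Filter rows by size/type and keep only the selected benchmark columns."""
    out = df
    if sizes:
        out = out[out['Model Size'].isin(sizes)]   # e.g. sizes=['7B', '70B']
    if types:
        out = out[out[types].any(axis=1)]          # each type column assumed boolean
    cols = META_FIELDS + (fields or DEFAULT_BENCH)
    return out[[c for c in cols if c in out.columns]]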
# The README file for each benchmark
LEADERBOARD_MD = {}
LEADERBOARD_MD['MAIN'] = f"""
## Included Evaluation Scenarios and Tasks:
- Clinical Language Reasoning: Exam-style QA, Referral QA, and Treatment Recommendation.
- Clinical Language Generation: Radiology Report Summarization, Hospitalization Summarization, and Patient Education.
- Clinical Language Understanding: Named Entity Recognition, Relation Extraction, Document Classification, Pharmacology QA for Emerging Drugs, and Drug Interaction for Emerging Drugs.
"""
LEADERBOARD_MD['RESULT'] = f"""
## Main Evaluation Results
""" |