IMAGE_PATH = 'clinicbench.png'
RESULTS = 'clinicbench_result.json'
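# A minimal sketch of how these paths might be consumed (assumption: the JSON
# file maps model names to per-dataset scores; this helper is illustrative and
# not part of the original leaderboard code).
import json

def load_results(path: str = RESULTS) -> dict:
    """Hypothetical helper: load the cached benchmark results."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)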
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{Liu2024ClinicBench,
  title={Large Language Models Are Poor Clinical Decision-Makers: A Comprehensive Benchmark},
  author={Fenglin Liu and Zheng Li and Hongjian Zhou and Qingyu Yin and Jingfeng Yang and Xianfeng Tang and Chen Luo and Ming Zeng and Haoming Jiang and Yifan Gao and Priyanka Nigam and Sreyashi Nag and Bing Yin and Yining Hua and Xuan Zhou and Omid Rohanian and Anshul Thakur and Lei Clifton and David A. Clifton},
  booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year={2024}
}"""
CITATION_BUTTON_LABEL = "Please consider citing 📑 our paper if this repository is helpful to your work. Thank you!"
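# A minimal sketch of how the citation constants might be rendered (assumption:
# the Space uses Gradio; gr.Accordion and gr.Textbox are standard Gradio
# components, but this wiring is illustrative only).
def render_citation_button():
    """Hypothetical helper: show the BibTeX entry inside a gr.Blocks layout."""
    import gradio as gr  # assumption: gradio is installed in this Space
    with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
        gr.Textbox(value=CITATION_BUTTON_TEXT, lines=7, label="BibTeX",
                   show_copy_button=True)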
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# Medical LLM Leaderboard (Work in Progress)
### Welcome to the Medical LLM Leaderboard! Here we evaluate 22 LLMs in the clinic:
### [Large Language Models Are Poor Clinical Decision-Makers: A Comprehensive Benchmark](https://aclanthology.org/2024.emnlp-main.759.pdf) 🏆
### Currently, the Medical LLM Leaderboard covers 11 tasks, 7 metrics, 17 datasets, and over 20,000 test samples.
We reveal that LLMs are poor clinical decision-makers on multiple complex clinical tasks.

This leaderboard was last updated on Nov 11, 2024.

The Medical LLM Leaderboard includes 22 LLMs (11 general LLMs and 11 medical LLMs), covering both open-source public LLMs and closed-source commercial LLMs,
with parameter counts ranging from 7 to 70 billion (B).

To add your own model to the leaderboard, please send us your results or create a PR in [ClinicBench](https://github.com/AI-in-Health/ClinicBench) to support your LLM;
we will then help with the evaluation and update the leaderboard.
For any questions or concerns, please feel free to contact us at [email protected] and [email protected].

Acknowledgements:
This repository borrows code from [Shopping MMLU Leaderboard](https://huggingface.co/spaces/KL4805/shopping_mmlu_leaderboard).
"""
# CONSTANTS-FIELDS
META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified', 'Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']
MAIN_FIELDS = [
    'MedQA', 'MedMCQA', 'MMLU-Medicine', 'PubMedQA', 'Referral QA', 'Treat Recom.', 'MIMIC', 'IU-Xray', 
    'Hospitaliz. Summari.', 'Patient Education', 'BC5', 'NCBI', 'DDI', 'GAD', 'HoC', 'Pharma. QA', 'Drug Inter.'
]
DEFAULT_BENCH = list(MAIN_FIELDS)  # by default, show all benchmarks
MODEL_SIZE = ['7B', '13B', '70B', 'Unknown']
MODEL_TYPE = ['OpenSource', 'Commercial LLMs', 'General LLMs', 'Medical LLMs', 'SOTA']
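# A minimal sketch of how the size/type filters above might be applied
# (assumption: the leaderboard table is a pandas DataFrame with a 'Param (B)'
# column holding values from MODEL_SIZE and one boolean column per MODEL_TYPE
# entry; this helper is illustrative only).
def filter_models(df, sizes=None, types=None):
    """Hypothetical helper: subset the table by the selected checkboxes."""
    sizes = sizes if sizes is not None else MODEL_SIZE
    types = types if types is not None else MODEL_TYPE
    size_mask = df['Param (B)'].isin(sizes)
    type_mask = df[types].any(axis=1)
    return df[size_mask & type_mask]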

# Markdown description for each benchmark tab
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = """
## Included Evaluation Scenarios and Tasks: 

- Clinical Language Reasoning: Exam-style QA, Referral QA, and Treatment Recommendation. 
- Clinical Language Generation: Radiology Report Summarization, Hospitalization Summarization, and Patient Education. 
- Clinical Language Understanding: Named Entity Recognition, Relation Extraction, Document Classification, Pharmacology QA for Emerging Drugs, and Drug Interaction for Emerging Drugs.  
"""

LEADERBOARD_MD['RESULT'] = """
## Main Evaluation Results
"""