zhuohan-7 committed on
Commit 5bd2aed · 1 Parent(s): e4e7f0a

Upload folder using huggingface_hub

Files changed (6)
  1. __init__.py +0 -0
  2. content.py +169 -0
  3. draw_diagram.py +223 -0
  4. pages.py +626 -0
  5. show_examples.py +193 -0
  6. summarization.py +127 -0
__init__.py ADDED
File without changes
content.py ADDED
@@ -0,0 +1,169 @@
1
+
2
+ displayname2datasetname = {
3
+ 'LibriSpeech-Clean' : 'librispeech_test_clean',
4
+ 'LibriSpeech-Other' : 'librispeech_test_other',
5
+ 'CommonVoice-15-EN' : 'common_voice_15_en_test',
6
+ 'Peoples-Speech' : 'peoples_speech_test',
7
+ 'GigaSpeech-1' : 'gigaspeech_test',
8
+ 'Earnings-21' : 'earnings21_test',
9
+ 'Earnings-22' : 'earnings22_test',
10
+ 'TED-LIUM-3' : 'tedlium3_test',
11
+ 'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
12
+ 'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
13
+ 'CoVoST2-EN-ID' : 'covost2_en_id_test',
14
+ 'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
15
+ 'CoVoST2-EN-TA' : 'covost2_en_ta_test',
16
+ 'CoVoST2-ID-EN' : 'covost2_id_en_test',
17
+ 'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
18
+ 'CoVoST2-TA-EN' : 'covost2_ta_en_test',
19
+ 'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
20
+ 'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
21
+ 'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
22
+ 'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
23
+ 'Spoken-SQuAD' : 'spoken_squad_test',
24
+ 'OpenHermes-Audio' : 'openhermes_audio_test',
25
+ 'ALPACA-Audio' : 'alpaca_audio_test',
26
+ 'WavCaps' : 'wavcaps_test',
27
+ 'AudioCaps' : 'audiocaps_test',
28
+ 'Clotho-AQA' : 'clotho_aqa_test',
29
+ 'WavCaps-QA' : 'wavcaps_qa_test',
30
+ 'AudioCaps-QA' : 'audiocaps_qa_test',
31
+ 'VoxCeleb-Accent' : 'voxceleb_accent_test',
32
+ 'MNSC-AR-Sentence' : 'imda_ar_sentence',
33
+ 'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
34
+ 'VoxCeleb-Gender' : 'voxceleb_gender_test',
35
+ 'IEMOCAP-Gender' : 'iemocap_gender_test',
36
+ 'IEMOCAP-Emotion' : 'iemocap_emotion_test',
37
+ 'MELD-Sentiment' : 'meld_sentiment_test',
38
+ 'MELD-Emotion' : 'meld_emotion_test',
39
+ 'MuChoMusic' : 'muchomusic_test',
40
+ 'MNSC-PART1-ASR' : 'imda_part1_asr_test',
41
+ 'MNSC-PART2-ASR' : 'imda_part2_asr_test',
42
+ 'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
43
+ 'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
44
+ 'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
45
+ 'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
46
+ 'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
47
+ 'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
48
+ 'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
49
+ 'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
50
+ 'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
51
+ 'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
52
+ 'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
53
+ 'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
54
+
55
+ 'CNA' : 'cna_test',
56
+ 'IDPC' : 'idpc_test',
57
+ 'Parliament' : 'parliament_test',
58
+ 'UKUS-News' : 'ukusnews_test',
59
+ 'Mediacorp' : 'mediacorp_test',
60
+ 'IDPC-Short' : 'idpc_short_test',
61
+ 'Parliament-Short': 'parliament_short_test',
62
+ 'UKUS-News-Short' : 'ukusnews_short_test',
63
+ 'Mediacorp-Short' : 'mediacorp_short_test',
64
+ 'YouTube ASR: English with Singapore Content': 'ytb_asr_batch1',
65
+ 'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
66
+ 'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_ms',
67
+ 'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
68
+
69
+ 'SEAME-Dev-Mandarin' : 'seame_dev_man',
70
+ 'SEAME-Dev-Singlish' : 'seame_dev_sge',
71
+
72
+ 'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
73
+ 'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
74
+ 'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
75
+
76
+ }
77
+
78
+ datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
79
+
80
+
81
+ dataset_diaplay_information = {
82
+ 'LibriSpeech-Clean' : 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
83
+ 'LibriSpeech-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
84
+ 'CommonVoice-15-EN' : 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',
85
+ 'Peoples-Speech' : 'A large-scale, open-source speech recognition dataset, with diverse accents and domains.',
86
+ 'GigaSpeech-1' : 'A large-scale ASR dataset with diverse audio sources like podcasts, interviews, etc.',
87
+ 'Earnings-21' : 'ASR test dataset focused on earnings calls from 2021, with professional speech and financial jargon.',
88
+ 'Earnings-22' : 'Similar to Earnings21, but covering earnings calls from 2022.',
89
+ 'TED-LIUM-3' : 'A test set derived from TED talks, covering diverse speakers and topics.',
90
+ 'TED-LIUM-3-LongForm' : 'A long-form version of the TED-LIUM dataset, containing extended audio samples. This poses challenges for existing fusion methods in handling long audio, but it provides a benchmark for future development.',
91
+ 'AISHELL-ASR-ZH' : 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.',
92
+ 'CoVoST2-EN-ID' : 'CoVoST 2 dataset for speech translation from English to Indonesian.',
93
+ 'CoVoST2-EN-ZH' : 'CoVoST 2 dataset for speech translation from English to Chinese.',
94
+ 'CoVoST2-EN-TA' : 'CoVoST 2 dataset for speech translation from English to Tamil.',
95
+ 'CoVoST2-ID-EN' : 'CoVoST 2 dataset for speech translation from Indonesian to English.',
96
+ 'CoVoST2-ZH-EN' : 'CoVoST 2 dataset for speech translation from Chinese to English.',
97
+ 'CoVoST2-TA-EN' : 'CoVoST 2 dataset for speech translation from Tamil to English.',
98
+ 'CN-College-Listen-MCQ': 'Chinese College English Listening Test, with multiple-choice questions.',
99
+ 'DREAM-TTS-MCQ' : 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
100
+ 'SLUE-P2-SQA5' : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
101
+ 'Public-SG-Speech-QA' : 'Public dataset for speech-based question answering, gathered from Singapore.',
102
+ 'Spoken-SQuAD' : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.',
103
+ 'OpenHermes-Audio' : 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
104
+ 'ALPACA-Audio' : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.',
105
+ 'WavCaps' : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
106
+ 'AudioCaps' : 'AudioCaps dataset, used for generating captions from general audio events.',
107
+ 'Clotho-AQA' : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
108
+ 'WavCaps-QA' : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
109
+ 'AudioCaps-QA' : 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.',
110
+ 'VoxCeleb-Accent' : 'Test dataset for accent recognition, based on VoxCeleb, a large speaker identification dataset.',
111
+ 'MNSC-AR-Sentence' : 'Accent recognition based on the IMDA NSC dataset, focusing on sentence-level accents.',
112
+ 'MNSC-AR-Dialogue' : 'Accent recognition based on the IMDA NSC dataset, focusing on dialogue-level accents.',
113
+
114
+ 'VoxCeleb-Gender': 'Test dataset for gender classification, also derived from VoxCeleb.',
115
+ 'IEMOCAP-Gender' : 'Gender classification based on the IEMOCAP dataset.',
116
+ 'IEMOCAP-Emotion': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
117
+ 'MELD-Sentiment' : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
118
+ 'MELD-Emotion' : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.',
119
+ 'MuChoMusic' : 'Test dataset for music understanding, from paper: MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models.',
120
+ 'MNSC-PART1-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 1.',
121
+ 'MNSC-PART2-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 2.',
122
+ 'MNSC-PART3-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 3.',
123
+ 'MNSC-PART4-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 4.',
124
+ 'MNSC-PART5-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 5.',
125
+ 'MNSC-PART6-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 6.',
126
+ 'MNSC-PART3-SQA' : 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 3.',
127
+ 'MNSC-PART4-SQA' : 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 4.',
128
+ 'MNSC-PART5-SQA' : 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 5.',
129
+ 'MNSC-PART6-SQA' : 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 6.',
130
+ 'MNSC-PART3-SDS' : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 3.',
131
+ 'MNSC-PART4-SDS' : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 4.',
132
+ 'MNSC-PART5-SDS' : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 5.',
133
+ 'MNSC-PART6-SDS' : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 6.',
134
+
135
+ 'CNA' : 'Under Development',
136
+ 'IDPC' : 'Under Development',
137
+ 'Parliament' : 'Under Development',
138
+ 'UKUS-News' : 'Under Development',
139
+ 'Mediacorp' : 'Under Development',
140
+ 'IDPC-Short' : 'Under Development',
141
+ 'Parliament-Short': 'Under Development',
142
+ 'UKUS-News-Short' : 'Under Development',
143
+ 'Mediacorp-Short' : 'Under Development',
144
+ 'YouTube ASR: English with Singapore Content' : 'YouTube evaluation dataset for the ASR task. This dataset includes English and Singlish with Singapore content.',
145
+ 'YouTube ASR: English with Strong Emotion' : 'YouTube evaluation dataset for the ASR task. English with strong emotions.',
146
+ 'YouTube ASR: Malay with English Prompt': 'YouTube ASR dataset. Malay and Malay-English code-switching, with English prompts.',
147
+ 'YouTube ASR: Malay with Malay Prompt': 'YouTube ASR dataset. Malay and Malay-English code-switching, with Malay prompts.',
148
+
149
+ 'SEAME-Dev-Mandarin' : 'Under Development',
150
+ 'SEAME-Dev-Singlish' : 'Under Development',
151
+
152
+ 'YouTube SQA: English with Singapore Content': 'Under Development',
153
+ 'YouTube SDS: English with Singapore Content': 'Under Development',
154
+ 'YouTube PQA: English with Singapore Content': 'Under Development',
155
+
156
+
157
+ }
158
+
159
+
160
+
161
+
162
+ metrics_info = {
163
+ 'wer' : 'Word Error Rate (WER) - The lower, the better.',
164
+ 'llama3_70b_judge_binary': 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
165
+ 'llama3_70b_judge' : 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
166
+ 'meteor' : 'METEOR Score. The higher, the better.',
167
+ 'bleu' : 'BLEU Score. The higher, the better.',
168
+ }
169
+
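For reference, a minimal sketch of how the lookup tables defined in content.py are typically consumed by the rest of the app. The keys shown come from the mappings above; the snippet itself is only illustrative and is not part of this commit:

```python
from app.content import (
    displayname2datasetname,
    datasetname2diaplayname,
    dataset_diaplay_information,
    metrics_info,
)

# Display name -> internal dataset key used to locate result files.
dataset_key = displayname2datasetname['LibriSpeech-Clean']   # 'librispeech_test_clean'

# Reverse lookup restores the display name for table headers.
assert datasetname2diaplayname[dataset_key] == 'LibriSpeech-Clean'

# Human-readable blurbs rendered on each task page.
print(dataset_diaplay_information['LibriSpeech-Clean'])
print(metrics_info['wer'])
```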
draw_diagram.py ADDED
@@ -0,0 +1,223 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from streamlit_echarts import st_echarts
5
+ from app.show_examples import *
6
+ from app.content import *
7
+
8
+ import pandas as pd
9
+
10
+ from model_information import get_dataframe
11
+ info_df = get_dataframe()
12
+
13
+
14
+ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
15
+
16
+ folder = f"./results_organized/{metrics}/"
17
+
18
+ # Load the results from CSV
19
+ data_path = f'{folder}/{category_name.lower()}.csv'
20
+ chart_data = pd.read_csv(data_path).round(3)
21
+
22
+ dataset_name = displayname2datasetname[displayname]
23
+ chart_data = chart_data[['Model', dataset_name]]
24
+
25
+ # Rename to proper display name
26
+ chart_data = chart_data.rename(columns=datasetname2diaplayname)
27
+
28
+ st.markdown("""
29
+ <style>
30
+ .stMultiSelect [data-baseweb=select] span {
31
+ max-width: 800px;
32
+ font-size: 0.9rem;
33
+ background-color: #3C6478 !important; /* Background color for selected items */
34
+ color: white; /* Change text color */
35
+
36
+ }
37
+ </style>
38
+ """, unsafe_allow_html=True)
39
+
40
+ # remap model names
41
+ display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
42
+ chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
43
+
44
+
45
+ models = st.multiselect("Please choose the model",
46
+ sorted(chart_data['model_show'].tolist()),
47
+ default = sorted(chart_data['model_show'].tolist()),
48
+ )
49
+
50
+ chart_data = chart_data[chart_data['model_show'].isin(models)]
51
+ chart_data = chart_data.sort_values(by=[displayname], ascending=cus_sort).dropna(axis=0)
52
+
53
+ if len(chart_data) == 0: return
54
+
55
+
56
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
57
+ '''
58
+ Show Table
59
+ '''
60
+ with st.container():
61
+ st.markdown('##### TABLE')
62
+
63
+
64
+ model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
65
+
66
+ chart_data['model_link'] = chart_data['model_show'].map(model_link)
67
+
68
+ chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
69
+
70
+ # Format numeric columns to 2 decimal places
71
+ #chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
72
+ cur_dataset_name = chart_data_table.columns[1]
73
+
74
+
75
+ def highlight_first_element(x):
76
+ # Create a DataFrame with the same shape as the input
77
+ df_style = pd.DataFrame('', index=x.index, columns=x.columns)
78
+ # Apply background color to the first element in row 0 (df[0][0])
79
+ # df_style.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
80
+ df_style.iloc[0, 1] = 'background-color: #b0c1d7'
81
+
82
+ return df_style
83
+
84
+ if cur_dataset_name in [
85
+ 'LibriSpeech-Clean',
86
+ 'LibriSpeech-Other',
87
+ 'CommonVoice-15-EN',
88
+ 'Peoples-Speech',
89
+ 'GigaSpeech-1',
90
+ 'Earnings-21',
91
+ 'Earnings-22',
92
+ 'TED-LIUM-3',
93
+ 'TED-LIUM-3-LongForm',
94
+ 'AISHELL-ASR-ZH',
95
+ 'MNSC-PART1-ASR',
96
+ 'MNSC-PART2-ASR',
97
+ 'MNSC-PART3-ASR',
98
+ 'MNSC-PART4-ASR',
99
+ 'MNSC-PART5-ASR',
100
+ 'MNSC-PART6-ASR',
101
+ 'CNA',
102
+ 'IDPC',
103
+ 'Parliament',
104
+ 'UKUS-News',
105
+ 'Mediacorp',
106
+ 'IDPC-Short',
107
+ 'Parliament-Short',
108
+ 'UKUS-News-Short',
109
+ 'Mediacorp-Short',
110
+ 'YTB-ASR-Batch1',
111
+ 'YTB-ASR-Batch2',
112
+ 'SEAME-Dev-Man',
113
+ 'SEAME-Dev-Sge',
114
+ ]:
115
+
116
+ chart_data_table = chart_data_table.sort_values(
117
+ by=chart_data_table.columns[1],
118
+ ascending=True
119
+ ).reset_index(drop=True)
120
+ else:
121
+ chart_data_table = chart_data_table.sort_values(
122
+ by=chart_data_table.columns[1],
123
+ ascending=False
124
+ ).reset_index(drop=True)
125
+
126
+
127
+ styled_df = chart_data_table.style.format(
128
+ {chart_data_table.columns[1]: "{:.3f}"}
129
+ ).apply(
130
+ highlight_first_element, axis=None
131
+ )
132
+
133
+
134
+ st.dataframe(
135
+ styled_df,
136
+ column_config={
137
+ 'model_show': 'Model',
138
+ chart_data_table.columns[1]: {'alignment': 'left'},
139
+ "model_link": st.column_config.LinkColumn(
140
+ "Model Link",
141
+ ),
142
+ },
143
+ hide_index=True,
144
+ use_container_width=True
145
+ )
146
+
147
+
148
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
149
+ '''
150
+ Show Chart
151
+ '''
152
+
153
+ # Initialize a session state variable for toggling the chart visibility
154
+ if "show_chart" not in st.session_state:
155
+ st.session_state.show_chart = False
156
+
157
+ # Create a button to toggle visibility
158
+ if st.button("Show Chart"):
159
+ st.session_state.show_chart = not st.session_state.show_chart
160
+
161
+ if st.session_state.show_chart:
162
+
163
+ with st.container():
164
+ st.markdown('##### CHART')
165
+
166
+ # Get Values
167
+ data_values = chart_data.iloc[:, 1]
168
+
169
+ # Calculate Q1 and Q3
170
+ q1 = data_values.quantile(0.25)
171
+ q3 = data_values.quantile(0.75)
172
+
173
+ # Calculate IQR
174
+ iqr = q3 - q1
175
+
176
+ # Define lower and upper bounds (1.5*IQR is a common threshold)
177
+ lower_bound = q1 - 1.5 * iqr
178
+ upper_bound = q3 + 1.5 * iqr
179
+
180
+ # Filter data within the bounds
181
+ filtered_data = data_values[(data_values >= lower_bound) & (data_values <= upper_bound)]
182
+
183
+ # Calculate min and max values after outlier handling
184
+ min_value = round(filtered_data.min() - 0.1 * filtered_data.min(), 3)
185
+ max_value = round(filtered_data.max() + 0.1 * filtered_data.max(), 3)
186
+
187
+ options = {
188
+ # "title": {"text": f"{dataset_name}"},
189
+ "tooltip": {
190
+ "trigger": "axis",
191
+ "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
192
+ "triggerOn": 'mousemove',
193
+ },
194
+ "legend": {"data": ['Overall Accuracy']},
195
+ "toolbox": {"feature": {"saveAsImage": {}}},
196
+ "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
197
+ "xAxis": [
198
+ {
199
+ "type": "category",
200
+ "boundaryGap": True,
201
+ "triggerEvent": True,
202
+ "data": chart_data['model_show'].tolist(),
203
+ }
204
+ ],
205
+ "yAxis": [{"type": "value",
206
+ "min": min_value,
207
+ "max": max_value,
208
+ "boundaryGap": True
209
+ # "splitNumber": 10
210
+ }],
211
+ "series": [{
212
+ "name": f"{dataset_name}",
213
+ "type": "bar",
214
+ "data": chart_data[f'{displayname}'].tolist(),
215
+ }],
216
+ }
217
+
218
+ events = {
219
+ "click": "function(params) { return params.value }"
220
+ }
221
+
222
+ value = st_echarts(options=options, events=events, height="500px")
223
+
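The y-axis range for the chart in `draw()` is derived with a standard 1.5 × IQR outlier filter followed by a 10% padding on both ends. A self-contained sketch of that calculation, using made-up scores rather than real leaderboard results:

```python
import pandas as pd

scores = pd.Series([0.021, 0.034, 0.045, 0.052, 0.060, 0.880])  # hypothetical WER values

q1, q3 = scores.quantile(0.25), scores.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Drop outliers (0.880 falls above the upper bound), then pad both ends by 10%.
kept = scores[(scores >= lower) & (scores <= upper)]
y_min = round(kept.min() - 0.1 * kept.min(), 3)   # 0.019
y_max = round(kept.max() + 0.1 * kept.max(), 3)   # 0.066
print(y_min, y_max)
```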
pages.py ADDED
@@ -0,0 +1,626 @@
1
+ import streamlit as st
2
+ from app.draw_diagram import *
3
+ from app.content import *
4
+ from app.summarization import *
5
+ from app.show_examples import *
6
+
7
+ def dataset_contents(dataset, metrics):
8
+
9
+ custom_css = """
10
+ <style>
11
+ .my-dataset-info {
12
+ # background-color: #F9EBEA;
13
+ # padding: 10px;
14
+ color: #050505;
15
+ font-style: normal;
16
+ font-size: 8px;
17
+ height: auto;
18
+ }
19
+ </style>
20
+ """
21
+ st.markdown(custom_css, unsafe_allow_html=True)
22
+ st.markdown(f"""<div class="my-dataset-info">
23
+ <p><b>About this dataset</b>: {dataset}</p>
24
+ </div>""", unsafe_allow_html=True)
25
+ st.markdown(f"""<div class="my-dataset-info">
26
+ <p><b>About this metric</b>: {metrics}</p>
27
+ </div>""", unsafe_allow_html=True)
28
+
29
+
30
+ def dashboard():
31
+
32
+ with st.container():
33
+ st.title("Leaderboard for AudioBench")
34
+
35
+ st.markdown("""
36
+ [gh1]: https://github.com/AudioLLMs/AudioBench
37
+ [gh2]: https://github.com/AudioLLMs/AudioBench
38
+ **Toolkit:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh1] |
39
+ [**Paper @ NAACL 2025**](https://arxiv.org/abs/2406.16020) |
40
+ **Resource for AudioLLMs:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/Awesome-Audio-LLM?style=social)][gh2]
41
+ """)
42
+
43
+
44
+ st.markdown("""
45
+ #### Recent updates
46
+ - **Jan. 2025**: AudioBench is officially accepted to NAACL 2025!
47
+ - **Jan. 2025**: Updated the layout.
48
+ - **Dec. 2024**: Added MuChoMusic dataset for Music Understanding - MCQ Questions. From Paper: https://arxiv.org/abs/2408.01337.
49
+ - **Dec. 2024**: Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC).
50
+ - **Dec. 2024**: Updated layout and added support for comparison between models with similar sizes. 1) Reorganized layout for a better user experience. 2) Added performance summary for each task.
51
+ - **Aug. 2024**: Initial leaderboard is now online.
52
+ """)
53
+
54
+ st.divider()
55
+
56
+ st.markdown("""
57
+ #### Evaluating Audio-based Large Language Models
58
+
59
+ - AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
60
+ - AudioBench is an evaluation benchmark that we continually improve and maintain.
61
+
62
+ Below are the initial 26 datasets included in AudioBench. We have since extended the benchmark to over 40 datasets and will continue to add more in the future.
63
+ """
64
+ )
65
+
66
+
67
+ with st.container():
68
+
69
+ st.markdown('''
70
+ ''')
71
+
72
+ st.markdown("###### :dart: Our Benchmark includes: ")
73
+ cols = st.columns(8)
74
+ cols[0].metric(label="Tasks", value=">8")
75
+ cols[1].metric(label="Datasets", value=">40")
76
+ cols[2].metric(label="Evaluated Models", value=">5")
77
+
78
+ st.divider()
79
+ with st.container():
80
+ left_co, right_co = st.columns([1, 0.1])
81
+
82
+ with left_co:
83
+ st.markdown("""
84
+ ##### Citations :round_pushpin:
85
+ ```
86
+ @article{wang2024audiobench,
87
+ title={AudioBench: A Universal Benchmark for Audio Large Language Models},
88
+ author={Wang, Bin and Zou, Xunlong and Lin, Geyu and Sun, Shuo and Liu, Zhuohan and Zhang, Wenyu and Liu, Zhengyuan and Aw, AiTi and Chen, Nancy F},
89
+ journal={NAACL},
90
+ year={2025}
91
+ }
92
+ ```
93
+ ```
94
+ @article{zhang2024mowe,
95
+ title={MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders},
96
+ author={Zhang, Wenyu and Sun, Shuo and Wang, Bin and Zou, Xunlong and Liu, Zhuohan and He, Yingxu and Lin, Geyu and Chen, Nancy F and Aw, Ai Ti},
97
+ journal={ICASSP},
98
+ year={2025}
99
+ }
100
+ ```
101
+ ```
102
+ @article{wang2025advancing,
103
+ title={Advancing Singlish Understanding: Bridging the Gap with Datasets and Multimodal Models},
104
+ author={Wang, Bin and Zou, Xunlong and Sun, Shuo and Zhang, Wenyu and He, Yingxu and Liu, Zhuohan and Wei, Chengwei and Chen, Nancy F and Aw, AiTi},
105
+ journal={arXiv preprint arXiv:2501.01034},
106
+ year={2025}
107
+ }
108
+ ```
109
+ ```
110
+ @article{he2024meralion,
111
+ title={MERaLiON-AudioLLM: Technical Report},
112
+ author={He, Yingxu and Liu, Zhuohan and Sun, Shuo and Wang, Bin and Zhang, Wenyu and Zou, Xunlong and Chen, Nancy F and Aw, Ai Ti},
113
+ journal={arXiv preprint arXiv:2412.09818},
114
+ year={2024}
115
+ }
116
+ ```
117
+
118
+ """)
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+ def asr_english():
127
+ st.title("Task: Automatic Speech Recognition - English")
128
+
129
+ sum = ['Overall']
130
+ dataset_lists = [
131
+ 'LibriSpeech-Clean',
132
+ 'LibriSpeech-Other',
133
+ 'CommonVoice-15-EN',
134
+ 'Peoples-Speech',
135
+ 'GigaSpeech-1',
136
+ 'Earnings-21',
137
+ 'Earnings-22',
138
+ 'TED-LIUM-3',
139
+ 'TED-LIUM-3-LongForm',
140
+ ]
141
+
142
+ filters_levelone = sum + dataset_lists
143
+
144
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
145
+
146
+ with left:
147
+ filter_1 = st.selectbox('Dataset', filters_levelone)
148
+
149
+ if filter_1:
150
+ if filter_1 in sum:
151
+ sum_table_mulit_metrix('asr_english', ['wer'])
152
+ else:
153
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
154
+ draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
155
+
156
+
157
+
158
+
159
+
160
+ def asr_singlish():
161
+ st.title("Task: Automatic Speech Recognition - Singlish")
162
+
163
+ sum = ['Overall']
164
+ dataset_lists = [
165
+ 'MNSC-PART1-ASR',
166
+ 'MNSC-PART2-ASR',
167
+ 'MNSC-PART3-ASR',
168
+ 'MNSC-PART4-ASR',
169
+ 'MNSC-PART5-ASR',
170
+ 'MNSC-PART6-ASR',
171
+ ]
172
+
173
+ filters_levelone = sum + dataset_lists
174
+
175
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
176
+
177
+ with left:
178
+ filter_1 = st.selectbox('Dataset', filters_levelone)
179
+
180
+ if filter_1:
181
+ if filter_1 in sum:
182
+ sum_table_mulit_metrix('asr_singlish', ['wer'])
183
+ else:
184
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
185
+ draw('su', 'asr_singlish', filter_1, 'wer')
186
+
187
+
188
+
189
+
190
+ def asr_mandarin():
191
+ st.title("Task: Automatic Speech Recognition - Mandarin")
192
+
193
+ sum = ['Overall']
194
+ dataset_lists = [
195
+ 'AISHELL-ASR-ZH',
196
+ ]
197
+
198
+ filters_levelone = sum + dataset_lists
199
+
200
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
201
+
202
+ with left:
203
+ filter_1 = st.selectbox('Dataset', filters_levelone)
204
+
205
+ if filter_1:
206
+ if filter_1 in sum:
207
+ sum_table_mulit_metrix('asr_mandarin', ['wer'])
208
+ else:
209
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
210
+ draw('su', 'asr_mandarin', filter_1, 'wer')
211
+
212
+
213
+
214
+
215
+ def speech_translation():
216
+ st.title("Task: Speech Translation")
217
+
218
+ sum = ['Overall']
219
+ dataset_lists = [
220
+ 'CoVoST2-EN-ID',
221
+ 'CoVoST2-EN-ZH',
222
+ 'CoVoST2-EN-TA',
223
+ 'CoVoST2-ID-EN',
224
+ 'CoVoST2-ZH-EN',
225
+ 'CoVoST2-TA-EN']
226
+
227
+ filters_levelone = sum + dataset_lists
228
+
229
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
230
+
231
+ with left:
232
+ filter_1 = st.selectbox('Dataset', filters_levelone)
233
+
234
+ if filter_1:
235
+ if filter_1 in sum:
236
+ sum_table_mulit_metrix('st', ['bleu'])
237
+ else:
238
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['bleu'])
239
+ draw('su', 'ST', filter_1, 'bleu')
240
+
241
+
242
+
243
+
244
+ def speech_question_answering_english():
245
+ st.title("Task: Spoken Question Answering - English")
246
+
247
+ sum = ['Overall']
248
+
249
+ dataset_lists = [
250
+ 'CN-College-Listen-MCQ',
251
+ 'DREAM-TTS-MCQ',
252
+ 'SLUE-P2-SQA5',
253
+ 'Public-SG-Speech-QA',
254
+ 'Spoken-SQuAD',
255
+ ]
256
+
257
+ filters_levelone = sum + dataset_lists
258
+
259
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
260
+
261
+ with left:
262
+ filter_1 = st.selectbox('Dataset', filters_levelone)
263
+
264
+ if filter_1:
265
+ if filter_1 in sum:
266
+ sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])
267
+
268
+ #elif filter_1 in dataset_lists:
269
+ # dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
270
+ # draw('su', 'SQA', filter_1, 'llama3_70b_judge')
271
+
272
+ else:
273
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
274
+ draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
275
+
276
+
277
+
278
+
279
+ def speech_question_answering_singlish():
280
+ st.title("Task: Spoken Question Answering - Singlish")
281
+
282
+ sum = ['Overall']
283
+
284
+ dataset_lists = [
285
+ 'MNSC-PART3-SQA',
286
+ 'MNSC-PART4-SQA',
287
+ 'MNSC-PART5-SQA',
288
+ 'MNSC-PART6-SQA',
289
+ ]
290
+
291
+
292
+ filters_levelone = sum + dataset_lists
293
+
294
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
295
+
296
+ with left:
297
+ filter_1 = st.selectbox('Dataset', filters_levelone)
298
+
299
+ if filter_1:
300
+ if filter_1 in sum:
301
+ sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
302
+
303
+ else:
304
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
305
+ draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
306
+
307
+
308
+ def spoken_dialogue_summarization_singlish():
309
+ st.title("Task: Spoken Dialogue Summarization - Singlish")
310
+
311
+ sum = ['Overall']
312
+
313
+ dataset_lists = [
314
+ 'MNSC-PART3-SDS',
315
+ 'MNSC-PART4-SDS',
316
+ 'MNSC-PART5-SDS',
317
+ 'MNSC-PART6-SDS',
318
+ ]
319
+
320
+
321
+ filters_levelone = sum + dataset_lists
322
+
323
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
324
+
325
+ with left:
326
+ filter_1 = st.selectbox('Dataset', filters_levelone)
327
+
328
+ if filter_1:
329
+ if filter_1 in sum:
330
+ sum_table_mulit_metrix('sds_singlish', ['llama3_70b_judge'])
331
+
332
+ else:
333
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
334
+ draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
335
+
336
+
337
+
338
+
339
+ def speech_instruction():
340
+ st.title("Task: Speech Instruction")
341
+
342
+ sum = ['Overall']
343
+
344
+ dataset_lists = ['OpenHermes-Audio',
345
+ 'ALPACA-Audio',
346
+ ]
347
+
348
+ filters_levelone = sum + dataset_lists
349
+
350
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
351
+
352
+ with left:
353
+ filter_1 = st.selectbox('Dataset', filters_levelone)
354
+
355
+ if filter_1:
356
+ if filter_1 in sum:
357
+ sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
358
+ else:
359
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
360
+ draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
361
+
362
+
363
+
364
+
365
+ def audio_captioning():
366
+ st.title("Task: Audio Captioning")
367
+
368
+ filters_levelone = ['WavCaps',
369
+ 'AudioCaps',
370
+ ]
371
+ filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
372
+
373
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
374
+
375
+ with left:
376
+ filter_1 = st.selectbox('Dataset', filters_levelone)
377
+ with middle:
378
+ metric = st.selectbox('Metric', filters_leveltwo)
379
+
380
+ if filter_1 or metric:
381
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info[metric.lower().replace('-', '_')])
382
+ draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
383
+
384
+
385
+
386
+
387
+ def audio_scene_question_answering():
388
+ st.title("Task: Audio Scene Question Answering")
389
+
390
+ sum = ['Overall']
391
+
392
+ dataset_lists = ['Clotho-AQA',
393
+ 'WavCaps-QA',
394
+ 'AudioCaps-QA']
395
+
396
+ filters_levelone = sum + dataset_lists
397
+
398
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
399
+
400
+ with left:
401
+ filter_1 = st.selectbox('Dataset', filters_levelone)
402
+
403
+ if filter_1:
404
+ if filter_1 in sum:
405
+ sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
406
+ else:
407
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
408
+ draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
409
+
410
+
411
+
412
+
413
+ def emotion_recognition():
414
+ st.title("Task: Emotion Recognition")
415
+
416
+ sum = ['Overall']
417
+
418
+ dataset_lists = [
419
+ 'IEMOCAP-Emotion',
420
+ 'MELD-Sentiment',
421
+ 'MELD-Emotion',
422
+ ]
423
+
424
+ filters_levelone = sum + dataset_lists
425
+
426
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
427
+
428
+ with left:
429
+ filter_1 = st.selectbox('Dataset', filters_levelone)
430
+
431
+ if filter_1:
432
+ if filter_1 in sum:
433
+ sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
434
+ else:
435
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
436
+ draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
437
+
438
+
439
+
440
+
441
+ def accent_recognition():
442
+ st.title("Task: Accent Recognition")
443
+
444
+ sum = ['Overall']
445
+ dataset_lists = [
446
+ 'VoxCeleb-Accent',
447
+ 'MNSC-AR-Sentence',
448
+ 'MNSC-AR-Dialogue',
449
+ ]
450
+
451
+
452
+ filters_levelone = sum + dataset_lists
453
+
454
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
455
+
456
+ with left:
457
+ filter_1 = st.selectbox('Dataset', filters_levelone)
458
+
459
+
460
+ if filter_1:
461
+ if filter_1 in sum:
462
+ sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
463
+ else:
464
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
465
+ draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
466
+
467
+
468
+
469
+
470
+ def gender_recognition():
471
+ st.title("Task: Gender Recognition")
472
+
473
+ sum = ['Overall']
474
+
475
+ dataset_lists = [
476
+ 'VoxCeleb-Gender',
477
+ 'IEMOCAP-Gender'
478
+ ]
479
+
480
+ filters_levelone = sum + dataset_lists
481
+
482
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
483
+
484
+ with left:
485
+ filter_1 = st.selectbox('Dataset', filters_levelone)
486
+
487
+ if filter_1:
488
+ if filter_1 in sum:
489
+ sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
490
+ else:
491
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
492
+ draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
493
+
494
+
495
+
496
+
497
+ def music_understanding():
498
+ st.title("Task: Music Understanding - MCQ Questions")
499
+
500
+ sum = ['Overall']
501
+
502
+ dataset_lists = ['MuChoMusic',
503
+ ]
504
+
505
+ filters_levelone = sum + dataset_lists
506
+
507
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
508
+
509
+ with left:
510
+ filter_1 = st.selectbox('Dataset', filters_levelone)
511
+
512
+ if filter_1:
513
+ if filter_1 in sum:
514
+ sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
515
+ else:
516
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
517
+ draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+ def under_development():
529
+ st.title("Task: Under Development")
530
+
531
+
532
+ dataset_lists = [
533
+ 'CNA',
534
+ 'IDPC',
535
+ 'Parliament',
536
+ 'UKUS-News',
537
+ 'Mediacorp',
538
+ 'IDPC-Short',
539
+ 'Parliament-Short',
540
+ 'UKUS-News-Short',
541
+ 'Mediacorp-Short',
542
+
543
+ 'YouTube ASR: English with Singapore Content',
544
+ 'YouTube ASR: English with Strong Emotion',
545
+ 'YouTube ASR: Malay with English Prompt',
546
+ 'YouTube ASR: Malay with Malay Prompt',
547
+
548
+ 'SEAME-Dev-Mandarin',
549
+ 'SEAME-Dev-Singlish',
550
+
551
+ 'YouTube SQA: English with Singapore Content',
552
+ 'YouTube SDS: English with Singapore Content',
553
+ 'YouTube PQA: English with Singapore Content',
554
+
555
+ ]
556
+
557
+ filters_levelone = dataset_lists
558
+
559
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
560
+
561
+ with left:
562
+ filter_1 = st.selectbox('Dataset', filters_levelone)
563
+
564
+ dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
565
+
566
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
567
+
568
+ '''
569
+ Show Dataset Examples
570
+ '''
571
+
572
+ # Initialize a session state variable for toggling the chart visibility
573
+ if "show_dataset_examples" not in st.session_state:
574
+ st.session_state.show_dataset_examples = False
575
+
576
+ # Create a button to toggle visibility
577
+ if st.button("Show Dataset Examples"):
578
+ st.session_state.show_dataset_examples = not st.session_state.show_dataset_examples
579
+
580
+ if st.session_state.show_dataset_examples:
581
+
582
+ # st.markdown('To be implemented')
583
+
584
+ # # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
585
+ if filter_1 in []:
586
+ pass
587
+ else:
588
+ try:
589
+ show_dataset_examples(filter_1)
590
+ except:
591
+ st.markdown('To be implemented')
592
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
593
+
594
+ if filter_1 in [
595
+ 'CNA',
596
+ 'IDPC',
597
+ 'Parliament',
598
+ 'UKUS-News',
599
+ 'Mediacorp',
600
+ 'IDPC-Short',
601
+ 'Parliament-Short',
602
+ 'UKUS-News-Short',
603
+ 'Mediacorp-Short',
604
+
605
+ 'YouTube ASR: English with Singapore Content',
606
+ 'YouTube ASR: English with Strong Emotion',
607
+ 'YouTube ASR: Malay with English Prompt',
608
+ 'YouTube ASR: Malay with Malay Prompt',
609
+
610
+ 'SEAME-Dev-Mandarin',
611
+ 'SEAME-Dev-Singlish',
612
+ ]:
613
+
614
+ draw('vu', 'under_development_wer', filter_1, 'wer')
615
+
616
+ elif filter_1 in [
617
+ 'YouTube SQA: English with Singapore Content',
618
+ 'YouTube SDS: English with Singapore Content',
619
+ 'YouTube PQA: English with Singapore Content',
620
+ ]:
621
+ draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
622
+
623
+
624
+
625
+
626
+
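Each task page in pages.py repeats the same selector-then-render pattern. Below is a condensed, hypothetical helper that captures the shape of that pattern, written as if it sat inside pages.py itself, where `sum_table_mulit_metrix`, `dataset_contents`, `draw`, `dataset_diaplay_information`, and `metrics_info` are already in scope via the star imports at the top of the file; it is an illustration, not part of this commit:

```python
def task_page(title, category, dataset_lists, metric, folder='su'):
    """Illustrative generic task page: pick a dataset, then show the summary or per-dataset view."""
    st.title(title)
    options = ['Overall'] + dataset_lists
    left, *_ = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with left:
        choice = st.selectbox('Dataset', options)
    if choice == 'Overall':
        # Averaged summary table across the task's datasets.
        sum_table_mulit_metrix(category, [metric])
    else:
        # Per-dataset description, metric note, table, and chart.
        dataset_contents(dataset_diaplay_information[choice], metrics_info[metric])
        draw(folder, category, choice, metric)

# e.g. task_page("Task: Automatic Speech Recognition - English", 'asr_english',
#                ['LibriSpeech-Clean', 'LibriSpeech-Other'], 'wer')
```

The first argument to `draw()` varies across the existing pages ('su', 'asu', or 'vu'), which is why it is exposed as a parameter here.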
show_examples.py ADDED
@@ -0,0 +1,193 @@
1
+ import streamlit as st
2
+ import datasets
3
+ import numpy as np
4
+
5
+ import html
6
+
7
+ from app.content import displayname2datasetname
8
+
9
+ def show_dataset_examples(display_name):
10
+ st.divider()
11
+ dataset_name = displayname2datasetname[display_name]
12
+ sample_folder = f"./examples/{dataset_name}"
13
+
14
+ # load dataset
15
+ dataset = datasets.load_from_disk(sample_folder)
16
+
17
+ for index in range(len(dataset)):
18
+ with st.container():
19
+ st.markdown(f'##### Example-{index+1}')
20
+ col1, col2 = st.columns([0.3, 0.7], vertical_alignment="center")
21
+
22
+ # with col1:
23
+ st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")
24
+
25
+ if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
26
+
27
+ choices = dataset[index]['other_attributes']['choices']
28
+ if isinstance(choices, str):
29
+ choices_text = choices
30
+ elif isinstance(choices, list):
31
+ choices_text = ' '.join(i for i in choices)
32
+
33
+ question_text = f"""{dataset[index]['instruction']['text']} {choices_text}"""
34
+ else:
35
+ question_text = f"""{dataset[index]['instruction']['text']}"""
36
+
37
+ question_text = html.escape(question_text)
38
+
39
+ with st.container():
40
+ custom_css = """
41
+ <style>
42
+ .my-container-table, p.my-container-text {
43
+ background-color: #fcf8dc;
44
+ padding: 10px;
45
+ border-radius: 5px;
46
+ font-size: 13px;
47
+ # height: 50px;
48
+ word-wrap: break-word
49
+ }
50
+ </style>
51
+ """
52
+ st.markdown(custom_css, unsafe_allow_html=True)
53
+
54
+ s = f"""<tr>
55
+ <td><b>{question_text.replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')}
56
+ </td>
57
+ <td><b>{html.escape(dataset[index]['answer']['text'])}
58
+ </td>
59
+ </tr>
60
+ """
61
+
62
+ body_details = f"""<table style="table-layout: fixed; width:100%">
63
+ <thead>
64
+ <tr style="text-align: center;">
65
+ <th style="width:50%">PROMPT</th>
66
+ <th style="width:50%">ANSWER</th>
67
+ </tr>
68
+ {s}
69
+ </thead>
70
+ </table>"""
71
+
72
+ st.markdown(f"""<div class="my-container-table">
73
+ {body_details}
74
+ </div>""", unsafe_allow_html=True)
75
+
76
+ st.text("")
77
+
78
+ st.divider()
79
+
80
+
81
+ def show_examples(category_name, dataset_name, model_lists, display_model_names):
82
+ st.divider()
83
+ sample_folder = f"./examples/{category_name}/{dataset_name}"
84
+
85
+ dataset = datasets.load_from_disk(sample_folder)
86
+
87
+ for index in range(len(dataset)):
88
+ with st.container():
89
+ st.markdown(f'##### Example-{index+1}')
90
+ col1, col2 = st.columns([0.3, 0.7], vertical_alignment="center")
91
+
92
+ # with col1:
93
+ st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")
94
+
95
+ if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
96
+
97
+ choices = dataset[index]['other_attributes']['choices']
98
+ if isinstance(choices, str):
99
+ choices_text = choices
100
+ elif isinstance(choices, list):
101
+ choices_text = ' '.join(i for i in choices)
102
+
103
+ question_text = f"""{dataset[index]['instruction']['text']} {choices_text}"""
104
+ else:
105
+ question_text = f"""{dataset[index]['instruction']['text']}"""
106
+
107
+ question_text = html.escape(question_text)
108
+
109
+ # st.divider()
110
+ with st.container():
111
+ custom_css = """
112
+ <style>
113
+ .my-container-table, p.my-container-text {
114
+ background-color: #fcf8dc;
115
+ padding: 10px;
116
+ border-radius: 5px;
117
+ font-size: 13px;
118
+ # height: 50px;
119
+ word-wrap: break-word
120
+ }
121
+ </style>
122
+ """
123
+ st.markdown(custom_css, unsafe_allow_html=True)
124
+
125
+ model_lists.sort()
126
+
127
+ s = f"""<tr>
128
+ <td><b>REFERENCE</td>
129
+ <td><b>{question_text.replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')}
130
+ </td>
131
+ <td><b>{html.escape(dataset[index]['answer']['text'])}
132
+ </td>
133
+ </tr>
134
+ """
135
+ if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
136
+ for model in model_lists:
137
+ try:
138
+
139
+ model_prediction = dataset[index][model]['model_prediction']
140
+ model_prediction = model_prediction.replace('<','').replace('>','').replace('\n','(newline)').replace('*','')
141
+
142
+ s += f"""<tr>
143
+ <td>{display_model_names[model]}</td>
144
+ <td>
145
+ {dataset[index][model]['text'].replace('Choices:', '<br>Choices:').replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')
146
+ }
147
+ </td>
148
+ <td>{html.escape(model_prediction)}</td>
149
+ </tr>"""
150
+ except:
151
+ print(f"{model} is not in {dataset_name}")
152
+ continue
153
+ else:
154
+ for model in model_lists:
155
+
156
+ # print(dataset[index][model]['model_prediction'])
157
+
158
+ try:
159
+
160
+ model_prediction = dataset[index][model]['model_prediction']
161
+ model_prediction = model_prediction.replace('<','').replace('>','').replace('\n','(newline)').replace('*','')
162
+
163
+ s += f"""<tr>
164
+ <td>{display_model_names[model]}</td>
165
+ <td>{html.escape(dataset[index][model]['text'])}</td>
166
+ <td>{html.escape(model_prediction)}</td>
167
+ </tr>"""
168
+ except:
169
+ print(f"{model} is not in {dataset_name}")
170
+ continue
171
+
172
+
173
+ body_details = f"""<table style="table-layout: fixed; width:100%">
174
+ <thead>
175
+ <tr style="text-align: center;">
176
+ <th style="width:20%">MODEL</th>
177
+ <th style="width:30%">QUESTION</th>
178
+ <th style="width:50%">MODEL PREDICTION</th>
179
+ </tr>
180
+ {s}
181
+ </thead>
182
+ </table>"""
183
+
184
+ st.markdown(f"""<div class="my-container-table">
185
+ {body_details}
186
+ </div>""", unsafe_allow_html=True)
187
+
188
+ st.text("")
189
+
190
+ st.divider()
191
+
192
+
193
+
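Both example viewers above assume each `./examples/...` folder holds a dataset saved with `datasets.save_to_disk()` plus one `sample_<i>.wav` file per row. A minimal sketch of reading such a folder directly; the folder name here is illustrative only:

```python
import datasets

sample_folder = "./examples/librispeech_test_clean"   # hypothetical example folder
dataset = datasets.load_from_disk(sample_folder)

for index in range(len(dataset)):
    row = dataset[index]
    audio_path = f"{sample_folder}/sample_{index}.wav"  # audio stored next to the saved dataset
    print(audio_path)
    print(row['instruction']['text'])   # prompt shown in the PROMPT/QUESTION column
    print(row['answer']['text'])        # reference shown in the ANSWER column
```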
summarization.py ADDED
@@ -0,0 +1,127 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from streamlit_echarts import st_echarts
5
+ from streamlit.components.v1 import html
6
+ # from PIL import Image
7
+ from app.show_examples import *
8
+ from app.content import *
9
+
10
+ import pandas as pd
11
+ from typing import List
12
+
13
+ from model_information import get_dataframe
14
+
15
+ info_df = get_dataframe()
16
+
17
+
18
+ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
19
+
20
+ # combine chart data from multiple sources
21
+ chart_data = pd.DataFrame()
22
+ for metrics in metrics_lists:
23
+ folder = f"./results_organized/{metrics}"
24
+ data_path = f'{folder}/{task_name.lower()}.csv'
25
+ one_chart_data = pd.read_csv(data_path).round(3)
26
+ if len(chart_data) == 0:
27
+ chart_data = one_chart_data
28
+ else:
29
+ chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
30
+
31
+
32
+ selected_columns = [i for i in chart_data.columns if i != 'Model']
33
+ chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
34
+
35
+ # Update dataset name in table
36
+ chart_data = chart_data.rename(columns=datasetname2diaplayname)
37
+
38
+ st.markdown("""
39
+ <style>
40
+ .stMultiSelect [data-baseweb=select] span {
41
+ max-width: 800px;
42
+ font-size: 0.9rem;
43
+ background-color: #3C6478 !important; /* Background color for selected items */
44
+ color: white; /* Change text color */
45
+
46
+ }
47
+ </style>
48
+ """, unsafe_allow_html=True)
49
+
50
+ # remap model names
51
+ display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
52
+ chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
53
+
54
+ models = st.multiselect("Please choose the model",
55
+ sorted(chart_data['model_show'].tolist()),
56
+ default = sorted(chart_data['model_show'].tolist()),
57
+ )
58
+
59
+ chart_data = chart_data[chart_data['model_show'].isin(models)].dropna(axis=0)
60
+
61
+ if len(chart_data) == 0: return
62
+
63
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
64
+ '''
65
+ Show Table
66
+ '''
67
+ with st.container():
68
+ st.markdown(f'##### TABLE')
69
+
70
+ model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
71
+
72
+ chart_data['model_link'] = chart_data['model_show'].map(model_link)
73
+
74
+ tabel_columns = [i for i in chart_data.columns if i not in ['Model', 'model_show']]
75
+ column_to_front = 'Average'
76
+ new_order = [column_to_front] + [col for col in tabel_columns if col != column_to_front]
77
+
78
+ chart_data_table = chart_data[['model_show'] + new_order]
79
+
80
+
81
+ # Format numeric columns to 3 decimal places
82
+ chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
83
+
84
+ if metrics in ['wer']:
85
+ ascend = True
86
+ else:
87
+ ascend= False
88
+
89
+ chart_data_table = chart_data_table.sort_values(
90
+ by=['Average'],
91
+ ascending=ascend
92
+ ).reset_index(drop=True)
93
+
94
+ # Highlight the best performing model
95
+ def highlight_first_element(x):
96
+ # Create a DataFrame with the same shape as the input
97
+ df_style = pd.DataFrame('', index=x.index, columns=x.columns)
98
+ # Apply background color to the first element in row 0 (df[0][0])
99
+ # df_style.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
100
+ df_style.iloc[0, 1] = 'background-color: #b0c1d7'
101
+
102
+ return df_style
103
+
104
+
105
+ styled_df = chart_data_table.style.format(
106
+ {
107
+ chart_data_table.columns[i]: "{:.3f}" for i in range(1, len(chart_data_table.columns) - 1)
108
+ }
109
+ ).apply(
110
+ highlight_first_element, axis=None
111
+ )
112
+
113
+ st.dataframe(
114
+ styled_df,
115
+ column_config={
116
+ 'model_show': 'Model',
117
+ chart_data_table.columns[1]: {'alignment': 'left'},
118
+ "model_link": st.column_config.LinkColumn(
119
+ "Model Link",
120
+ ),
121
+ },
122
+ hide_index=True,
123
+ use_container_width=True
124
+ )
125
+
126
+ # Only report the last metrics
127
+ st.markdown(f'###### Metric: {metrics_info[metrics]}')
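`sum_table_mulit_metrix()` merges one CSV per metric on the `Model` column, adds a row-wise `Average`, and sorts ascending only for WER. A small sketch of that aggregation with made-up values, not real leaderboard numbers:

```python
import pandas as pd

# Hypothetical per-metric results, already keyed by 'Model'.
wer = pd.DataFrame({
    'Model': ['model_a', 'model_b'],
    'librispeech_test_clean': [0.05, 0.03],
    'librispeech_test_other': [0.09, 0.07],
})

chart_data = wer.copy()
score_cols = [c for c in chart_data.columns if c != 'Model']
chart_data['Average'] = chart_data[score_cols].mean(axis=1)

# WER: lower is better, so sort ascending; judge/BLEU/METEOR scores would sort descending.
chart_data = chart_data.sort_values(by='Average', ascending=True).reset_index(drop=True)
print(chart_data)
```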