YingxuHe committed on
Commit c205d11 · 1 Parent(s): 627709b

add voice chat function

.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
+ [client]
+ showSidebarNavigation = false
app.py CHANGED
@@ -1,42 +1,3 @@
- import copy
-
- import streamlit as st
-
- from src.tunnel import start_server
- from src.generation import FIXED_GENERATION_CONFIG, load_model
- from src.pages import DEFAULT_DIALOGUE_STATES, sidebar_fragment, specify_audio_fragment, conversation_section
-
-
- st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')
-
- st.markdown('<style>' + open('./style/app_style.css').read() + '</style>', unsafe_allow_html=True)
-
- if "server" not in st.session_state:
-     st.session_state.server = start_server()
-
- if "client" not in st.session_state or 'model_name' not in st.session_state:
-     st.session_state.client, st.session_state.model_name = load_model()
-
- for key, value in FIXED_GENERATION_CONFIG.items():
-     if key not in st.session_state:
-         st.session_state[key]=copy.deepcopy(value)
-
- for key, value in DEFAULT_DIALOGUE_STATES.items():
-     if key not in st.session_state:
-         st.session_state[key]=copy.deepcopy(value)
-
- with st.sidebar:
-     sidebar_fragment()
-
- if st.sidebar.button('Clear History'):
-     st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
-
- st.markdown("<h1 style='text-align: center;'>MERaLiON-AudioLLM Demo 🤖</h1>", unsafe_allow_html=True)
- st.markdown(
-     """This demo is based on [MERaLiON-AudioLLM](https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION),
-     developed by I2R, A*STAR, in collaboration with AISG, Singapore.
-     It is tailored for Singapore’s multilingual and multicultural landscape."""
- )
-
- specify_audio_fragment()
- conversation_section()
+ from src.content.playground import playground_page
+
+ playground_page()
pages/playground.py ADDED
@@ -0,0 +1,4 @@
+ from src.content.playground import playground_page
+
+
+ playground_page()
pages/voice_chat.py ADDED
@@ -0,0 +1,4 @@
+ from src.content.voice_chat import voice_chat_page
+
+
+ voice_chat_page()
src/content/common.py ADDED
@@ -0,0 +1,367 @@
+ import copy
+
+ import numpy as np
+ import streamlit as st
+
+ from src.tunnel import start_server
+ from src.generation import FIXED_GENERATION_CONFIG, load_model
+
+
+ DEFAULT_DIALOGUE_STATES = dict(
+     audio_base64='',
+     audio_array=np.array([]),
+     disprompt = False,
+     new_prompt = "",
+     messages=[],
+     on_select=False,
+     on_upload=False,
+     on_record=False,
+     on_select_quick_action=False
+ )
+
+
+ DEFAULT_VOICE_CHAT_STATES = dict(
+     audio_base64='',
+     audio_array=np.array([]),
+     disprompt = False,
+     new_prompt = "",
+     messages=[],
+     on_select=False,
+     on_upload=False,
+     on_record=False,
+     on_select_quick_action=False
+ )
+
+
+ AUDIO_SAMPLES_W_INSTRUCT = {
+     "7_ASR_IMDA_PART3_30_ASR_v2_2269": {
+         "apperance": "7. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Need this talk written down, please."
+         ]
+     },
+     "11_ASR_IMDA_PART4_30_ASR_v2_3771": {
+         "apperance": "11. Automatic Speech Recognation task: conversation with Singlish code-switch",
+         "instructions": [
+             "Write out the dialogue as text."
+         ]
+     },
+     "12_ASR_IMDA_PART4_30_ASR_v2_103": {
+         "apperance": "12. Automatic Speech Recognation task: conversation with Singlish code-switch",
+         "instructions": [
+             "Write out the dialogue as text."
+         ]
+     },
+     "17_ASR_IMDA_PART6_30_ASR_v2_1413": {
+         "apperance": "17. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Record the spoken word in text form."
+         ]
+     },
+     "32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572": {
+         "apperance": "32. Spoken Question Answering task: general speech",
+         "instructions": [
+             "What does the man think the woman should do at 4:00."
+         ]
+     },
+     "33_SQA_IMDA_PART3_30_SQA_V2_2310": {
+         "apperance": "33. Spoken Question Answering task: conversation in Singapore accent",
+         "instructions": [
+             "Does Speaker2's wife cook for Speaker2 when they are at home."
+         ]
+     },
+     "34_SQA_IMDA_PART3_30_SQA_V2_3621": {
+         "apperance": "34. Spoken Question Answering task: conversation in Singapore accent",
+         "instructions": [
+             "Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."
+         ]
+     },
+     "35_SQA_IMDA_PART3_30_SQA_V2_4062": {
+         "apperance": "35. Spoken Question Answering task: conversation in Singapore accent",
+         "instructions": [
+             "What is the color of the vase mentioned in the dialogue."
+         ]
+     },
+     "36_DS_IMDA_PART4_30_DS_V2_849": {
+         "apperance": "36. Spoken Dialogue Summarization task: conversation with Singlish code-switch",
+         "instructions": [
+             "Condense the dialogue into a concise summary highlighting major topics and conclusions."
+         ]
+     },
+     "39_Paralingual_IEMOCAP_ER_V2_91": {
+         "apperance": "39. Paralinguistics task: general speech",
+         "instructions": [
+             "Based on the speaker's speech patterns, what do you think they are feeling."
+         ]
+     },
+     "40_Paralingual_IEMOCAP_ER_V2_567": {
+         "apperance": "40. Paralinguistics task: general speech",
+         "instructions": [
+             "Based on the speaker's speech patterns, what do you think they are feeling."
+         ]
+     },
+     "42_Paralingual_IEMOCAP_GR_V2_320": {
+         "apperance": "42. Paralinguistics task: general speech",
+         "instructions": [
+             "Is it possible for you to identify whether the speaker in this recording is male or female."
+         ]
+     },
+     "47_Paralingual_IMDA_PART3_30_NR_V2_10479": {
+         "apperance": "47. Paralinguistics task: conversation in Singapore accent",
+         "instructions": [
+             "Can you guess which ethnic group this person is from based on their accent."
+         ]
+     },
+     "49_Paralingual_MELD_ER_V2_676": {
+         "apperance": "49. Paralinguistics task: general speech",
+         "instructions": [
+             "What emotions do you think the speaker is expressing."
+         ]
+     },
+     "50_Paralingual_MELD_ER_V2_692": {
+         "apperance": "50. Paralinguistics task: general speech",
+         "instructions": [
+             "Based on the speaker's speech patterns, what do you think they are feeling."
+         ]
+     },
+     "51_Paralingual_VOXCELEB1_GR_V2_2148": {
+         "apperance": "51. Paralinguistics task: general speech",
+         "instructions": [
+             "May I know the gender of the speaker."
+         ]
+     },
+     "53_Paralingual_VOXCELEB1_NR_V2_2286": {
+         "apperance": "53. Paralinguistics task: general speech",
+         "instructions": [
+             "What's the nationality identity of the speaker."
+         ]
+     },
+     "55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2": {
+         "apperance": "55. Spoken Question Answering task: general speech",
+         "instructions": [
+             "What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth."
+         ]
+     },
+     "56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415": {
+         "apperance": "56. Spoken Question Answering task: general speech",
+         "instructions": [
+             "Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore."
+         ]
+     },
+     "57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460": {
+         "apperance": "57. Spoken Question Answering task: general speech",
+         "instructions": [
+             "How does the author respond to parents' worries about masks in schools."
+         ]
+     },
+     "1_ASR_IMDA_PART1_ASR_v2_141": {
+         "apperance": "1. Automatic Speech Recognation task: phonetically balanced reading",
+         "instructions": [
+             "Turn the spoken language into a text format.",
+             "Please translate the content into Chinese."
+         ]
+     },
+     "2_ASR_IMDA_PART1_ASR_v2_2258": {
+         "apperance": "2. Automatic Speech Recognation task: phonetically balanced reading",
+         "instructions": [
+             "Turn the spoken language into a text format.",
+             "Please translate the content into Chinese."
+         ]
+     },
+     "3_ASR_IMDA_PART1_ASR_v2_2265": {
+         "apperance": "3. Automatic Speech Recognation task: phonetically balanced reading",
+         "instructions": [
+             "Turn the spoken language into a text format."
+         ]
+     },
+     "4_ASR_IMDA_PART2_ASR_v2_999": {
+         "apperance": "4. Automatic Speech Recognation task: reading in Singapore context",
+         "instructions": [
+             "Translate the spoken words into text format."
+         ]
+     },
+     "5_ASR_IMDA_PART2_ASR_v2_2241": {
+         "apperance": "5. Automatic Speech Recognation task: reading in Singapore context",
+         "instructions": [
+             "Translate the spoken words into text format."
+         ]
+     },
+     "6_ASR_IMDA_PART2_ASR_v2_3409": {
+         "apperance": "6. Automatic Speech Recognation task: reading in Singapore context",
+         "instructions": [
+             "Translate the spoken words into text format."
+         ]
+     },
+     "8_ASR_IMDA_PART3_30_ASR_v2_1698": {
+         "apperance": "8. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Need this talk written down, please."
+         ]
+     },
+     "9_ASR_IMDA_PART3_30_ASR_v2_2474": {
+         "apperance": "9. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Need this talk written down, please."
+         ]
+     },
+     "10_ASR_IMDA_PART4_30_ASR_v2_1527": {
+         "apperance": "10. Automatic Speech Recognation task: conversation with Singlish code-switch",
+         "instructions": [
+             "Write out the dialogue as text."
+         ]
+     },
+     "13_ASR_IMDA_PART5_30_ASR_v2_1446": {
+         "apperance": "13. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Translate this vocal recording into a textual format."
+         ]
+     },
+     "14_ASR_IMDA_PART5_30_ASR_v2_2281": {
+         "apperance": "14. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Translate this vocal recording into a textual format."
+         ]
+     },
+     "15_ASR_IMDA_PART5_30_ASR_v2_4388": {
+         "apperance": "15. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Translate this vocal recording into a textual format."
+         ]
+     },
+     "16_ASR_IMDA_PART6_30_ASR_v2_576": {
+         "apperance": "16. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Record the spoken word in text form."
+         ]
+     },
+     "18_ASR_IMDA_PART6_30_ASR_v2_2834": {
+         "apperance": "18. Automatic Speech Recognation task: conversation in Singapore accent",
+         "instructions": [
+             "Record the spoken word in text form."
+         ]
+     },
+     "19_ASR_AIShell_zh_ASR_v2_5044": {
+         "apperance": "19. Automatic Speech Recognation task: speech in Chinese ",
+         "instructions": [
+             "Transform the oral presentation into a text document."
+         ]
+     },
+     "20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833": {
+         "apperance": "20. Automatic Speech Recognation task: general speech",
+         "instructions": [
+             "Please provide a written transcription of the speech."
+         ]
+     },
+     "25_ST_COVOST2_ZH-CN_EN_ST_V2_4567": {
+         "apperance": "25. Speech Translation task: Chinese to English",
+         "instructions": [
+             "Please translate the given speech to English."
+         ]
+     },
+     "26_ST_COVOST2_EN_ZH-CN_ST_V2_5422": {
+         "apperance": "26. Speech Translation task: English to Chinese",
+         "instructions": [
+             "Please translate the given speech to Chinese."
+         ]
+     },
+     "27_ST_COVOST2_EN_ZH-CN_ST_V2_6697": {
+         "apperance": "27. Speech Translation task: English to Chinese",
+         "instructions": [
+             "Please translate the given speech to Chinese."
+         ]
+     },
+     "28_SI_ALPACA-GPT4-AUDIO_SI_V2_299": {
+         "apperance": "28. Speech Instruction task: general speech",
+         "instructions": [
+             "Please follow the instruction in the speech."
+         ]
+     },
+     "29_SI_ALPACA-GPT4-AUDIO_SI_V2_750": {
+         "apperance": "29. Speech Instruction task: general speech",
+         "instructions": [
+             "Please follow the instruction in the speech."
+         ]
+     },
+     "30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454": {
+         "apperance": "30. Speech Instruction task: general speech",
+         "instructions": [
+             "Please follow the instruction in the speech."
+         ]
+     }
+ }
+
+
+ def init_state_section():
+     st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')
+
+     st.markdown(
+         (
+             '<style>' + \
+             open('./style/app_style.css').read() + \
+             open('./style/normal_window.css').read() + \
+             open('./style/small_window.css').read() + \
+             '</style>'
+         ),
+         unsafe_allow_html=True
+     )
+
+     if "server" not in st.session_state:
+         st.session_state.server = start_server()
+
+     if "client" not in st.session_state or 'model_name' not in st.session_state:
+         st.session_state.client, st.session_state.model_name = load_model()
+
+     for key, value in FIXED_GENERATION_CONFIG.items():
+         if key not in st.session_state:
+             st.session_state[key]=copy.deepcopy(value)
+
+     for key, value in DEFAULT_DIALOGUE_STATES.items():
+         if key not in st.session_state:
+             st.session_state[key]=copy.deepcopy(value)
+
+
+ def header_section(component_name="Playground", icon="🤖"):
+     st.markdown(
+         f"<h1 style='text-align: center;'>MERaLiON-AudioLLM {component_name} {icon}</h1>",
+         unsafe_allow_html=True
+     )
+
+     st.markdown(
+         f"""<div class="main-intro-normal-window">
+         <p>This {component_name.lower()} is based on
+         <a href="https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
+         target="_blank" rel="noopener noreferrer"> MERaLiON-AudioLLM</a>,
+         developed by I2R, A*STAR, in collaboration with AISG, Singapore.
+         It is tailored for Singapore’s multilingual and multicultural landscape.
+         MERaLiON-AudioLLM supports <strong>Automatic Speech Recognation</strong>,
+         <strong>Speech Translation</strong>,
+         <strong>Spoken Question Answering</strong>,
+         <strong>Spoken Dialogue Summarization</strong>,
+         <strong>Speech Instruction</strong>, and
+         <strong>Paralinguistics</strong> tasks.</p></div>""",
+         unsafe_allow_html=True
+     )
+
+     st.markdown(
+         f"""<div class="main-intro-small-window">
+         <p>This {component_name.lower()} is based on
+         <a href="https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
+         target="_blank" rel="noopener noreferrer"> MERaLiON-AudioLLM</a>.</p></div>""",
+         unsafe_allow_html=True
+     )
+
+
+ @st.fragment
+ def sidebar_fragment():
+     with st.container(height=300, border=False):
+         st.page_link("pages/playground.py", label="Playground")
+         st.page_link("pages/voice_chat.py", label="Voice Chat (experimental)")
+
+
+     st.divider()
+
+     st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.1, key='temperature')
+
+     st.slider(label='Top P', min_value=0.0, max_value=1.0, value=0.9, key='top_p')
+
+     st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty")
src/content/playground.py ADDED
@@ -0,0 +1,208 @@
+ import copy
+ import base64
+
+ import streamlit as st
+
+ from src.generation import retrive_response
+ from src.utils import bytes_to_array, array_to_bytes
+ from src.content.common import (
+     AUDIO_SAMPLES_W_INSTRUCT,
+     DEFAULT_DIALOGUE_STATES,
+     init_state_section,
+     header_section,
+     sidebar_fragment
+ )
+
+
+ QUICK_ACTIONS = [
+     {
+         "name": "**Summary**",
+         "instruction": "Please summarise this speech.",
+         "width": 10,
+     },
+     {
+         "name": "**Transcript**",
+         "instruction": "Please transcribe this speech.",
+         "width": 9.5,
+     }
+ ]
+
+
+ MAX_AUDIO_LENGTH = 120
+
+
+ def _update_audio(audio_bytes):
+     origin_audio_array = bytes_to_array(audio_bytes)
+     truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
+     truncated_audio_bytes = array_to_bytes(truncated_audio_array)
+
+     st.session_state.audio_array = origin_audio_array
+     st.session_state.audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
+
+
+ @st.fragment
+ def successful_example_section():
+     audio_sample_names = [audio_sample_name for audio_sample_name in AUDIO_SAMPLES_W_INSTRUCT.keys()]
+
+     st.markdown(":fire: **Successful Tasks and Examples**")
+
+     sample_name = st.selectbox(
+         label="**Select Audio:**",
+         label_visibility="collapsed",
+         options=audio_sample_names,
+         format_func=lambda o: AUDIO_SAMPLES_W_INSTRUCT[o]["apperance"],
+         index=None,
+         placeholder="Select an audio sample:",
+         on_change=lambda: st.session_state.update(
+             on_select=True,
+             messages=[],
+             disprompt=True
+         ),
+         key='select')
+
+     if sample_name and st.session_state.on_select:
+         audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read()
+         st.session_state.update(
+             on_select=False,
+             new_prompt=AUDIO_SAMPLES_W_INSTRUCT[sample_name]["instructions"][0]
+         )
+         _update_audio(audio_bytes)
+         st.rerun(scope="app")
+
+
+ @st.dialog("Specify Audio")
+ def audio_attach_dialogue():
+     st.markdown("**Upload**")
+
+     uploaded_file = st.file_uploader(
+         label="**Upload Audio:**",
+         label_visibility="collapsed",
+         type=['wav', 'mp3'],
+         on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
+         key='upload'
+     )
+
+     if uploaded_file and st.session_state.on_upload:
+         audio_bytes = uploaded_file.read()
+         _update_audio(audio_bytes)
+         st.session_state.on_upload = False
+         st.rerun()
+
+     st.markdown("**Record**")
+
+     uploaded_file = st.audio_input(
+         label="**Record Audio:**",
+         label_visibility="collapsed",
+         on_change=lambda: st.session_state.update(on_record=True, messages=[]),
+         key='record'
+     )
+
+     if uploaded_file and st.session_state.on_record:
+         audio_bytes = uploaded_file.read()
+         _update_audio(audio_bytes)
+         st.session_state.on_record = False
+         st.rerun()
+
+
+ def bottom_input_section():
+     bottom_cols = st.columns([0.03, 0.03, 0.94])
+     with bottom_cols[0]:
+         st.button(
+             'Clear',
+             disabled=st.session_state.disprompt,
+             on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
+         )
+
+     with bottom_cols[1]:
+         if st.button("\+ Audio", disabled=st.session_state.disprompt):
+             audio_attach_dialogue()
+
+     with bottom_cols[2]:
+         if chat_input := st.chat_input(
+             placeholder="Instruction...",
+             disabled=st.session_state.disprompt,
+             on_submit=lambda: st.session_state.update(disprompt=True, messages=[])
+         ):
+             st.session_state.new_prompt = chat_input
+
+
+ @st.fragment
+ def quick_actions_fragment():
+     action_cols_spec = [_["width"] for _ in QUICK_ACTIONS]
+     action_cols = st.columns(action_cols_spec)
+
+     for idx, action in enumerate(QUICK_ACTIONS):
+         action_cols[idx].button(
+             action["name"],
+             args=(action["instruction"],),
+             disabled=st.session_state.disprompt,
+             on_click=lambda p: st.session_state.update(
+                 disprompt=True,
+                 messages=[],
+                 new_prompt=p,
+                 on_select_quick_action=True
+             )
+         )
+
+     if st.session_state.on_select_quick_action:
+         st.session_state.on_select_quick_action = False
+         st.rerun(scope="app")
+
+
+ def conversation_section():
+     if st.session_state.audio_array.size:
+         with st.chat_message("user"):
+             st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
+             quick_actions_fragment()
+
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             if message.get("error"):
+                 st.error(message["error"])
+             for warning_msg in message.get("warnings", []):
+                 st.warning(warning_msg)
+             if message.get("content"):
+                 st.write(message["content"])
+
+     with st._bottom:
+         bottom_input_section()
+
+     if one_time_prompt := st.session_state.new_prompt:
+         st.session_state.update(new_prompt="", messages=[])
+
+         with st.chat_message("user"):
+             st.write(one_time_prompt)
+             st.session_state.messages.append({"role": "user", "content": one_time_prompt})
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 error_msg, warnings, stream = retrive_response(
+                     one_time_prompt, st.session_state.audio_base64, stream=True)
+                 response = ""
+
+                 if error_msg:
+                     st.error(error_msg)
+                 for warning_msg in warnings:
+                     st.warning(warning_msg)
+                 if stream:
+                     response = st.write_stream(stream)
+
+             st.session_state.messages.append({
+                 "role": "assistant",
+                 "error": error_msg,
+                 "warnings": warnings,
+                 "content": response
+             })
+
+         st.session_state.disprompt=False
+         st.rerun(scope="app")
+
+ def playground_page():
+     init_state_section()
+     header_section()
+
+     with st.sidebar:
+         sidebar_fragment()
+
+     successful_example_section()
+     conversation_section()
src/content/voice_chat.py ADDED
@@ -0,0 +1,153 @@
+ import copy
+ import base64
+
+ import numpy as np
+ import streamlit as st
+
+ from src.generation import retrive_response
+ from src.utils import bytes_to_array, array_to_bytes
+ from src.content.common import (
+     DEFAULT_DIALOGUE_STATES,
+     init_state_section,
+     header_section,
+     sidebar_fragment
+ )
+
+
+ # TODO: change this.
+ DEFAULT_PROMPT = "Please follow the instruction in the speech."
+
+
+ MAX_AUDIO_LENGTH = 120
+
+
+ def _update_audio(audio_bytes):
+     origin_audio_array = bytes_to_array(audio_bytes)
+     truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
+     truncated_audio_bytes = array_to_bytes(truncated_audio_array)
+
+     st.session_state.audio_array = origin_audio_array
+     st.session_state.audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
+
+
+ @st.dialog("Specify Audio")
+ def audio_attach_dialogue():
+     st.markdown("**Upload**")
+
+     uploaded_file = st.file_uploader(
+         label="**Upload Audio:**",
+         label_visibility="collapsed",
+         type=['wav', 'mp3'],
+         on_change=lambda: st.session_state.update(
+             on_upload=True,
+             messages=[],
+             disprompt=True
+         ),
+         key='upload'
+     )
+
+     if uploaded_file and st.session_state.on_upload:
+         audio_bytes = uploaded_file.read()
+         _update_audio(audio_bytes)
+         st.session_state.update(
+             on_upload=False,
+             new_prompt=DEFAULT_PROMPT
+         )
+         st.rerun()
+
+
+ def bottom_input_section():
+     bottom_cols = st.columns([0.03, 0.03, 0.94])
+     with bottom_cols[0]:
+         st.button(
+             'Clear',
+             disabled=st.session_state.disprompt,
+             on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
+         )
+
+     with bottom_cols[1]:
+         if st.button("\+ Audio", disabled=st.session_state.disprompt):
+             audio_attach_dialogue()
+
+     with bottom_cols[2]:
+         uploaded_file = st.audio_input(
+             label="record audio",
+             label_visibility="collapsed",
+             on_change=lambda: st.session_state.update(
+                 on_record=True,
+                 messages=[],
+                 disprompt=True
+             ),
+             key='record'
+         )
+
+         if uploaded_file and st.session_state.on_record:
+             audio_bytes = uploaded_file.read()
+             _update_audio(audio_bytes)
+             st.session_state.update(
+                 on_record=False,
+                 new_prompt=DEFAULT_PROMPT
+             )
+
+
+ def conversation_section():
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             if message.get("error"):
+                 st.error(message["error"])
+             for warning_msg in message.get("warnings", []):
+                 st.warning(warning_msg)
+             if message.get("audio", np.array([])).shape[0]:
+                 st.audio(message["audio"], format="audio/wav", sample_rate=16000)
+             if message.get("content"):
+                 st.write(message["content"])
+
+     with st._bottom:
+         bottom_input_section()
+
+     if one_time_prompt := st.session_state.new_prompt:
+         one_time_array = st.session_state.audio_array
+         one_time_base64 = st.session_state.audio_base64
+         st.session_state.update(
+             new_prompt="",
+             one_time_array=np.array([]),
+             one_time_base64="",
+             messages=[]
+         )
+
+         with st.chat_message("user"):
+             st.audio(one_time_array, format="audio/wav", sample_rate=16000)
+
+         st.session_state.messages.append({"role": "user", "audio": one_time_array})
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 error_msg, warnings, stream = retrive_response(
+                     one_time_prompt, one_time_base64, stream=True)
+                 response = ""
+
+                 if error_msg:
+                     st.error(error_msg)
+                 for warning_msg in warnings:
+                     st.warning(warning_msg)
+                 if stream:
+                     response = st.write_stream(stream)
+
+             st.session_state.messages.append({
+                 "role": "assistant",
+                 "error": error_msg,
+                 "warnings": warnings,
+                 "content": response
+             })
+
+         st.session_state.disprompt=False
+         st.rerun(scope="app")
+
+ def voice_chat_page():
+     init_state_section()
+     header_section(component_name="Voice Chat")
+
+     with st.sidebar:
+         sidebar_fragment()
+
+     conversation_section()
src/generation.py CHANGED
@@ -19,6 +19,8 @@ FIXED_GENERATION_CONFIG = dict(
      seed=42
  )

+ MAX_AUDIO_LENGTH = 120
+

  def load_model():
      """
@@ -100,7 +102,7 @@ def _retry_retrive_response_throws_exception(text_input, base64_audio_input, str
      return response_object


- def _validate_text_input(text_input) -> List[str]:
+ def _validate_input(text_input) -> List[str]:
      """
      TODO: improve the input validation regex.
      """
@@ -111,11 +113,17 @@ def _validate_text_input(text_input) -> List[str]:
      if re.search(r'[\u4e00-\u9fff]+', text_input):
          warnings.append("NOTE: Please try to prompt in English for the best performance.")

+     if st.session_state.audio_array.shape[0] / 16000 > 30.0:
+         warnings.append((
+             "MERaLiON-AudioLLM is trained to process audio up to **30 seconds**."
+             f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated."
+         ))
+
      return warnings


  def retrive_response(text_input, base64_audio_input, stream=False):
-     warnings = _validate_text_input(text_input)
+     warnings = _validate_input(text_input)

      response_object, error_msg = None, ""
      try:
src/pages.py DELETED
@@ -1,220 +0,0 @@
- import base64
-
- import numpy as np
- import streamlit as st
-
- from src.generation import retrive_response, postprocess_voice_transcription
- from src.utils import (
-     GENERAL_INSTRUCTIONS,
-     AUDIO_SAMPLES_W_INSTRUCT,
-     bytes_to_array,
-     array_to_bytes,
- )
-
-
- DEFAULT_DIALOGUE_STATES = dict(
-     default_instruction=[],
-     audio_base64='',
-     audio_array=np.array([]),
-     disprompt = False,
-     new_prompt = "",
-     messages=[],
-     voice_instruction="",
-     on_select=False,
-     on_upload=False,
-     on_record=False,
-     on_click_button=False,
-     on_record_voice=False
- )
-
-
- MAX_AUDIO_LENGTH = 120
-
-
- def _update_audio(audio_bytes):
-     origin_audio_array = bytes_to_array(audio_bytes)
-     truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
-     truncated_audio_bytes = array_to_bytes(truncated_audio_array)
-
-     st.session_state.audio_array = origin_audio_array
-     st.session_state.audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
-
-
- @st.fragment
- def sidebar_fragment():
-     st.markdown("""<div class="sidebar-intro">
-         <p><strong>📌 Supported Tasks</strong>
-         <p>Automatic Speech Recognation</p>
-         <p>Speech Translation</p>
-         <p>Spoken Question Answering</p>
-         <p>Spoken Dialogue Summarization</p>
-         <p>Speech Instruction</p>
-         <p>Paralinguistics</p>
-         <br>
-         <p><strong>📎 Generation Config</strong>
-         </div>""", unsafe_allow_html=True)
-
-     st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.1, key='temperature')
-
-     st.slider(label='Top P', min_value=0.0, max_value=1.0, value=0.9, key='top_p')
-
-     st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty")
-
- @st.fragment
- def specify_audio_fragment():
-     col1, col2, col3 = st.columns([4, 2, 2])
-
-     with col1:
-         audio_sample_names = [audio_sample_name for audio_sample_name in AUDIO_SAMPLES_W_INSTRUCT.keys()]
-
-         st.markdown("**Select Audio From Examples:**")
-
-         sample_name = st.selectbox(
-             label="**Select Audio:**",
-             label_visibility="collapsed",
-             options=audio_sample_names,
-             index=None,
-             placeholder="Select an audio sample:",
-             on_change=lambda: st.session_state.update(on_select=True),
-             key='select')
-
-         if sample_name and st.session_state.on_select:
-             audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read()
-             st.session_state.default_instruction = AUDIO_SAMPLES_W_INSTRUCT[sample_name] + GENERAL_INSTRUCTIONS
-             _update_audio(audio_bytes)
-
-
-     with col2:
-         st.markdown("or **Upload Audio:**")
-
-         uploaded_file = st.file_uploader(
-             label="**Upload Audio:**",
-             label_visibility="collapsed",
-             type=['wav', 'mp3'],
-             on_change=lambda: st.session_state.update(on_upload=True),
-             key='upload'
-         )
-
-         if uploaded_file and st.session_state.on_upload:
-             audio_bytes = uploaded_file.read()
-             st.session_state.default_instruction = GENERAL_INSTRUCTIONS
-             _update_audio(audio_bytes)
-
-
-     with col3:
-         st.markdown("or **Record Audio:**")
-
-         uploaded_file = st.audio_input(
-             label="**Record Audio:**",
-             label_visibility="collapsed",
-             on_change=lambda: st.session_state.update(on_record=True),
-             key='record'
-         )
-
-         if uploaded_file and st.session_state.on_record:
-             audio_bytes = uploaded_file.read()
-             st.session_state.default_instruction = GENERAL_INSTRUCTIONS
-             _update_audio(audio_bytes)
-
-     st.session_state.update(on_upload=False, on_record=False, on_select=False)
-
-     if st.session_state.audio_array.size:
-         with st.chat_message("user"):
-             if st.session_state.audio_array.shape[0] / 16000 > 30.0:
-                 st.warning((
-                     "MERaLiON-AudioLLM is trained to process audio up to **30 seconds**."
-                     f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated."
-                 ))
-
-             st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
-
-             for i, inst in enumerate(st.session_state.default_instruction):
-                 st.button(
-                     f"**Example Instruction {i+1}**: {inst}",
-                     args=(inst,),
-                     disabled=st.session_state.disprompt,
-                     on_click=lambda p: st.session_state.update(disprompt=True, new_prompt=p, on_click_button=True, messages=[])
-                 )
-
-     if st.session_state.on_click_button:
-         st.session_state.on_click_button = False
-         st.rerun(scope="app")
-
-
- def bottom_input_section():
-     bottom_cols = st.columns([0.02, 0.98])
-
-     uploaded_file = bottom_cols[0].audio_input(
-         label="voice",
-         label_visibility="collapsed",
-         disabled=st.session_state.disprompt,
-         on_change=lambda: st.session_state.update(on_record_voice=True),
-         key='voice'
-     )
-
-     if uploaded_file and st.session_state.on_record_voice:
-         audio_bytes = uploaded_file.read()
-         audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-         error_msg, warnings, completion = retrive_response(
-             "Write out the dialogue as text.", audio_base64, stream=False)
-
-         if error_msg:
-             st.toast(error_msg, icon="🚨")
-         for warning_msg in warnings:
-             st.toast(warning_msg, icon="❗")
-
-         st.session_state.update(
-             new_prompt = postprocess_voice_transcription(
-                 completion.choices[0].message.content),
-             on_record_voice = False
-         )
-
-     if chat_input := bottom_cols[1].chat_input(
-         placeholder="Type Your Instruction Here",
-         disabled=st.session_state.disprompt,
-         on_submit=lambda: st.session_state.update(disprompt=True, messages=[])
-     ):
-         st.session_state.new_prompt = chat_input
-
- def conversation_section():
-     for message in st.session_state.messages:
-         with st.chat_message(message["role"]):
-             if message.get("error"):
-                 st.error(message["error"])
-             for warning_msg in message.get("warnings", []):
-                 st.warning(warning_msg)
-             if message.get("content"):
-                 st.write(message["content"])
-
-     with st._bottom:
-         bottom_input_section()
-
-     if one_time_prompt := st.session_state.new_prompt:
-         st.session_state.update(new_prompt="", messages=[])
-
-         with st.chat_message("user"):
-             st.write(one_time_prompt)
-         st.session_state.messages.append({"role": "user", "content": one_time_prompt})
-
-         with st.chat_message("assistant"):
-             with st.spinner("Thinking..."):
-                 error_msg, warnings, stream = retrive_response(
-                     one_time_prompt, st.session_state.audio_base64, stream=True)
-                 response = ""
-
-                 if error_msg:
-                     st.error(error_msg)
-                 for warning_msg in warnings:
-                     st.warning(warning_msg)
-                 if stream:
-                     response = st.write_stream(stream)
-
-             st.session_state.messages.append({
-                 "role": "assistant",
-                 "error": error_msg,
-                 "warnings": warnings,
-                 "content": response
-             })
-
-         st.session_state.disprompt=False
-         st.rerun(scope="app")
src/utils.py CHANGED
@@ -4,70 +4,6 @@ from scipy.io.wavfile import write
  import librosa


- GENERAL_INSTRUCTIONS = [
-     "Please transcribe this speech.",
-     "Please summarise this speech."
- ]
-
-
- AUDIO_SAMPLES_W_INSTRUCT = {
-     '7_ASR_IMDA_PART3_30_ASR_v2_2269': ["Need this talk written down, please."],
-     '11_ASR_IMDA_PART4_30_ASR_v2_3771': ["Write out the dialogue as text."],
-     '12_ASR_IMDA_PART4_30_ASR_v2_103' : ["Write out the dialogue as text."],
-     '17_ASR_IMDA_PART6_30_ASR_v2_1413': ["Record the spoken word in text form."],
-
-     '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': ["What does the man think the woman should do at 4:00."],
-     '33_SQA_IMDA_PART3_30_SQA_V2_2310': ["Does Speaker2's wife cook for Speaker2 when they are at home."],
-     '34_SQA_IMDA_PART3_30_SQA_V2_3621': ["Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."],
-     '35_SQA_IMDA_PART3_30_SQA_V2_4062': ["What is the color of the vase mentioned in the dialogue."],
-     '36_DS_IMDA_PART4_30_DS_V2_849': ["Condense the dialogue into a concise summary highlighting major topics and conclusions."],
-
-     '39_Paralingual_IEMOCAP_ER_V2_91': ["Based on the speaker's speech patterns, what do you think they are feeling."],
-     '40_Paralingual_IEMOCAP_ER_V2_567': ["Based on the speaker's speech patterns, what do you think they are feeling."],
-     '42_Paralingual_IEMOCAP_GR_V2_320': ["Is it possible for you to identify whether the speaker in this recording is male or female."],
-     '47_Paralingual_IMDA_PART3_30_NR_V2_10479': ["Can you guess which ethnic group this person is from based on their accent."],
-     '49_Paralingual_MELD_ER_V2_676': ["What emotions do you think the speaker is expressing."],
-     '50_Paralingual_MELD_ER_V2_692': ["Based on the speaker's speech patterns, what do you think they are feeling."],
-     '51_Paralingual_VOXCELEB1_GR_V2_2148': ["May I know the gender of the speaker."],
-     '53_Paralingual_VOXCELEB1_NR_V2_2286': ["What's the nationality identity of the speaker."],
-
-     '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': ["What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth."],
-     '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': ["Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore."],
-     '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': ["How does the author respond to parents' worries about masks in schools."],
-
-     '1_ASR_IMDA_PART1_ASR_v2_141' : ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
-     '2_ASR_IMDA_PART1_ASR_v2_2258': ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
-     '3_ASR_IMDA_PART1_ASR_v2_2265': ["Turn the spoken language into a text format."],
-
-     '4_ASR_IMDA_PART2_ASR_v2_999' : ["Translate the spoken words into text format."],
-     '5_ASR_IMDA_PART2_ASR_v2_2241': ["Translate the spoken words into text format."],
-     '6_ASR_IMDA_PART2_ASR_v2_3409': ["Translate the spoken words into text format."],
-
-     '8_ASR_IMDA_PART3_30_ASR_v2_1698': ["Need this talk written down, please."],
-     '9_ASR_IMDA_PART3_30_ASR_v2_2474': ["Need this talk written down, please."],
-
-     '10_ASR_IMDA_PART4_30_ASR_v2_1527': ["Write out the dialogue as text."],
-
-     '13_ASR_IMDA_PART5_30_ASR_v2_1446': ["Translate this vocal recording into a textual format."],
-     '14_ASR_IMDA_PART5_30_ASR_v2_2281': ["Translate this vocal recording into a textual format."],
-     '15_ASR_IMDA_PART5_30_ASR_v2_4388': ["Translate this vocal recording into a textual format."],
-
-     '16_ASR_IMDA_PART6_30_ASR_v2_576': ["Record the spoken word in text form."],
-     '18_ASR_IMDA_PART6_30_ASR_v2_2834': ["Record the spoken word in text form."],
-
-     '19_ASR_AIShell_zh_ASR_v2_5044': ["Transform the oral presentation into a text document."],
-     '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': ["Please provide a written transcription of the speech."],
-
-     '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': ["Please translate the given speech to English."],
-     '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': ["Please translate the given speech to Chinese."],
-
-     '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': ["Please translate the given speech to Chinese."],
-     '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': ["Please follow the instruction in the speech."],
-     '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': ["Please follow the instruction in the speech."],
-     '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': ["Please follow the instruction in the speech."],
- }
-
-
  def bytes_to_array(audio_bytes):
      audio_array, _ = librosa.load(
          io.BytesIO(audio_bytes),
style/app_style.css CHANGED
@@ -1,16 +1,35 @@
+ div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div {
+     max-height: 3rem;
+ }
+
+ div[class="sidebar-intro"] p {
+     margin-bottom: 0.75rem;
+ }
+
+ [class='stAudio'] {
+     max-width: 500px !important;
+     margin: auto !important;
+ }
+
  div[data-testid="stChatMessage"]:has(> div[data-testid="stChatMessageAvatarUser"]) {
      flex-direction: row-reverse;
      text-align: right;
  }

- @media(min-width: 576px) {
-     .stMainBlockContainer {
-         padding: 2rem 5rem 1rem;
-     }
- }
+ div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) {
+     flex-direction: row-reverse;
+ }
+
+ div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]>div[data-testid="stColumn"]:has( div[data-testid="stButton"]) {
+     width: 6rem;
+     min-width: 6rem;
+     flex: 0 0 6rem;
+ }

+ /* File uploader */
+
  section[data-testid='stFileUploaderDropzone'] {
-     padding: 6px 2rem;
+     padding:6px 2rem;
  }

  section[data-testid='stFileUploaderDropzone']>button {
@@ -21,40 +40,45 @@ div[data-testid="stFileUploaderDropzoneInstructions"]>div>span {
      display:none;
  }

- div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div {
-     max-height: 3rem;
- }
-
- div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div {
-     background-color:transparent;
-     /* border:1px solid rgba(49, 51, 63, 0.2); */
-     max-height: 40px;
-     display: block;
-     padding: 0;
-     margin: auto;
- }
-
- div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:last-of-type {
-     display:none;
- }
-
- div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:nth-of-type(2) {
-     margin:auto;
- }
-
- div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:nth-of-type(2)>span:last-of-type {
-     display:none;
- }
-
- div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:nth-of-type(2)>span:only-of-type {
-     display:block;
- }
-
- div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>span {
-     display:none;
- }
-
- [class='stAudio'] {
-     max-width: 500px !important;
-     margin: auto !important;
- }
+ div[data-testid="stBottomBlockContainer"] {
+     padding-bottom: 2rem;
+ }
+
+ /* Chat input component at the bottom */
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) {
+     gap: 4px;
+ }
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):first-of-type {
+     width: 61px;
+     min-width: 61px;
+     flex: 0 0 61px;
+ }
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):nth-of-type(2) {
+     width: 76px;
+     min-width: 76px;
+     flex: 0 0 76px;
+ }
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"] button[data-testid="stBaseButton-secondary"] {
+     background-color: rgb(240, 242, 246);
+     border-color: rgb(240, 242, 246);
+ }
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stChatInput"]) {
+     width: 10rem;
+     min-width: 10rem;
+     flex: 1 1 10rem;
+ }
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stAudioInput"]) {
+     width: 10rem;
+     min-width: 10rem;
+     flex: 1 1 10rem;
+ }
+
+ div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div {
+     max-height: 40px;
+ }
style/normal_window.css ADDED
@@ -0,0 +1,14 @@
+ @media(min-width: 576px) {
+     .stMainBlockContainer {
+         padding: 2rem 5rem 1rem;
+     }
+
+     div[data-testid="stBottomBlockContainer"] {
+         padding-left: 5rem;
+         padding-right: 5rem;
+     }
+
+     div[class="main-intro-small-window"] {
+         display: none;
+     }
+ }
style/small_window.css ADDED
@@ -0,0 +1,9 @@
+ @media(max-width: 576px) {
+     div[data-testid="stMainBlockContainer"] div[data-testid="stVerticalBlock"]>div[data-testid="stElementContainer"]:has( div[data-testid="stHeadingWithActionElements"]) {
+         display: none;
+     }
+
+     div[class="main-intro-normal-window"] {
+         display: none;
+     }
+ }