add voice chat function
Files changed:

- .streamlit/config.toml +2 -0
- app.py +2 -41
- pages/playground.py +4 -0
- pages/voice_chat.py +4 -0
- src/content/common.py +367 -0
- src/content/playground.py +208 -0
- src/content/voice_chat.py +153 -0
- src/generation.py +10 -2
- src/pages.py +0 -220
- src/utils.py +0 -64
- style/app_style.css +52 -28
- style/normal_window.css +14 -0
- style/small_window.css +9 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
+[client]
+showSidebarNavigation = false
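Setting `showSidebarNavigation = false` turns off Streamlit's automatic multipage menu; the app renders its own links instead (see `sidebar_fragment()` in src/content/common.py below). A minimal sketch of that manual-navigation pattern, using the page paths from this repo:

    import streamlit as st

    # With [client] showSidebarNavigation = false in .streamlit/config.toml,
    # the built-in page list is hidden, so navigation links are drawn explicitly.
    with st.sidebar:
        st.page_link("pages/playground.py", label="Playground")
        st.page_link("pages/voice_chat.py", label="Voice Chat (experimental)")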
app.py
CHANGED
@@ -1,42 +1,3 @@
-import copy
-
-import streamlit as st
-
-from src.tunnel import start_server
-from src.generation import FIXED_GENERATION_CONFIG, load_model
-from src.pages import DEFAULT_DIALOGUE_STATES, sidebar_fragment, specify_audio_fragment, conversation_section
-
-
-st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')
-
-st.markdown('<style>' + open('./style/app_style.css').read() + '</style>', unsafe_allow_html=True)
-
-if "server" not in st.session_state:
-    st.session_state.server = start_server()
-
-if "client" not in st.session_state or 'model_name' not in st.session_state:
-    st.session_state.client, st.session_state.model_name = load_model()
-
-for key, value in FIXED_GENERATION_CONFIG.items():
-    if key not in st.session_state:
-        st.session_state[key]=copy.deepcopy(value)
-
-for key, value in DEFAULT_DIALOGUE_STATES.items():
-    if key not in st.session_state:
-        st.session_state[key]=copy.deepcopy(value)
-
-with st.sidebar:
-    sidebar_fragment()
-
-if st.sidebar.button('Clear History'):
-    st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
-
-st.markdown("<h1 style='text-align: center;'>MERaLiON-AudioLLM Demo 🤖</h1>", unsafe_allow_html=True)
-st.markdown(
-    """This demo is based on [MERaLiON-AudioLLM](https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION),
-    developed by I2R, A*STAR, in collaboration with AISG, Singapore.
-    It is tailored for Singapore’s multilingual and multicultural landscape."""
-)
-
-specify_audio_fragment()
-conversation_section()
+from src.content.playground import playground_page
+
+playground_page()
pages/playground.py
ADDED
@@ -0,0 +1,4 @@
+from src.content.playground import playground_page
+
+
+playground_page()
pages/voice_chat.py
ADDED
@@ -0,0 +1,4 @@
+from src.content.voice_chat import voice_chat_page
+
+
+voice_chat_page()
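Both files under pages/ are deliberately thin: routing stays in two-line stubs while all page logic lives under src/content/. A hypothetical third page would follow the same shape (the `about` module below does not exist in this commit; it only illustrates the pattern):

    # pages/about.py (hypothetical, for illustration only)
    from src.content.about import about_page

    about_page()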
src/content/common.py
ADDED
@@ -0,0 +1,367 @@
+import copy
+
+import numpy as np
+import streamlit as st
+
+from src.tunnel import start_server
+from src.generation import FIXED_GENERATION_CONFIG, load_model
+
+
+DEFAULT_DIALOGUE_STATES = dict(
+    audio_base64='',
+    audio_array=np.array([]),
+    disprompt = False,
+    new_prompt = "",
+    messages=[],
+    on_select=False,
+    on_upload=False,
+    on_record=False,
+    on_select_quick_action=False
+)
+
+
+DEFAULT_VOICE_CHAT_STATES = dict(
+    audio_base64='',
+    audio_array=np.array([]),
+    disprompt = False,
+    new_prompt = "",
+    messages=[],
+    on_select=False,
+    on_upload=False,
+    on_record=False,
+    on_select_quick_action=False
+)
+
+
+AUDIO_SAMPLES_W_INSTRUCT = {
+    "7_ASR_IMDA_PART3_30_ASR_v2_2269": {
+        "apperance": "7. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Need this talk written down, please."
+        ]
+    },
+    "11_ASR_IMDA_PART4_30_ASR_v2_3771": {
+        "apperance": "11. Automatic Speech Recognation task: conversation with Singlish code-switch",
+        "instructions": [
+            "Write out the dialogue as text."
+        ]
+    },
+    "12_ASR_IMDA_PART4_30_ASR_v2_103": {
+        "apperance": "12. Automatic Speech Recognation task: conversation with Singlish code-switch",
+        "instructions": [
+            "Write out the dialogue as text."
+        ]
+    },
+    "17_ASR_IMDA_PART6_30_ASR_v2_1413": {
+        "apperance": "17. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Record the spoken word in text form."
+        ]
+    },
+    "32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572": {
+        "apperance": "32. Spoken Question Answering task: general speech",
+        "instructions": [
+            "What does the man think the woman should do at 4:00."
+        ]
+    },
+    "33_SQA_IMDA_PART3_30_SQA_V2_2310": {
+        "apperance": "33. Spoken Question Answering task: conversation in Singapore accent",
+        "instructions": [
+            "Does Speaker2's wife cook for Speaker2 when they are at home."
+        ]
+    },
+    "34_SQA_IMDA_PART3_30_SQA_V2_3621": {
+        "apperance": "34. Spoken Question Answering task: conversation in Singapore accent",
+        "instructions": [
+            "Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."
+        ]
+    },
+    "35_SQA_IMDA_PART3_30_SQA_V2_4062": {
+        "apperance": "35. Spoken Question Answering task: conversation in Singapore accent",
+        "instructions": [
+            "What is the color of the vase mentioned in the dialogue."
+        ]
+    },
+    "36_DS_IMDA_PART4_30_DS_V2_849": {
+        "apperance": "36. Spoken Dialogue Summarization task: conversation with Singlish code-switch",
+        "instructions": [
+            "Condense the dialogue into a concise summary highlighting major topics and conclusions."
+        ]
+    },
+    "39_Paralingual_IEMOCAP_ER_V2_91": {
+        "apperance": "39. Paralinguistics task: general speech",
+        "instructions": [
+            "Based on the speaker's speech patterns, what do you think they are feeling."
+        ]
+    },
+    "40_Paralingual_IEMOCAP_ER_V2_567": {
+        "apperance": "40. Paralinguistics task: general speech",
+        "instructions": [
+            "Based on the speaker's speech patterns, what do you think they are feeling."
+        ]
+    },
+    "42_Paralingual_IEMOCAP_GR_V2_320": {
+        "apperance": "42. Paralinguistics task: general speech",
+        "instructions": [
+            "Is it possible for you to identify whether the speaker in this recording is male or female."
+        ]
+    },
+    "47_Paralingual_IMDA_PART3_30_NR_V2_10479": {
+        "apperance": "47. Paralinguistics task: conversation in Singapore accent",
+        "instructions": [
+            "Can you guess which ethnic group this person is from based on their accent."
+        ]
+    },
+    "49_Paralingual_MELD_ER_V2_676": {
+        "apperance": "49. Paralinguistics task: general speech",
+        "instructions": [
+            "What emotions do you think the speaker is expressing."
+        ]
+    },
+    "50_Paralingual_MELD_ER_V2_692": {
+        "apperance": "50. Paralinguistics task: general speech",
+        "instructions": [
+            "Based on the speaker's speech patterns, what do you think they are feeling."
+        ]
+    },
+    "51_Paralingual_VOXCELEB1_GR_V2_2148": {
+        "apperance": "51. Paralinguistics task: general speech",
+        "instructions": [
+            "May I know the gender of the speaker."
+        ]
+    },
+    "53_Paralingual_VOXCELEB1_NR_V2_2286": {
+        "apperance": "53. Paralinguistics task: general speech",
+        "instructions": [
+            "What's the nationality identity of the speaker."
+        ]
+    },
+    "55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2": {
+        "apperance": "55. Spoken Question Answering task: general speech",
+        "instructions": [
+            "What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth."
+        ]
+    },
+    "56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415": {
+        "apperance": "56. Spoken Question Answering task: general speech",
+        "instructions": [
+            "Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore."
+        ]
+    },
+    "57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460": {
+        "apperance": "57. Spoken Question Answering task: general speech",
+        "instructions": [
+            "How does the author respond to parents' worries about masks in schools."
+        ]
+    },
+    "1_ASR_IMDA_PART1_ASR_v2_141": {
+        "apperance": "1. Automatic Speech Recognation task: phonetically balanced reading",
+        "instructions": [
+            "Turn the spoken language into a text format.",
+            "Please translate the content into Chinese."
+        ]
+    },
+    "2_ASR_IMDA_PART1_ASR_v2_2258": {
+        "apperance": "2. Automatic Speech Recognation task: phonetically balanced reading",
+        "instructions": [
+            "Turn the spoken language into a text format.",
+            "Please translate the content into Chinese."
+        ]
+    },
+    "3_ASR_IMDA_PART1_ASR_v2_2265": {
+        "apperance": "3. Automatic Speech Recognation task: phonetically balanced reading",
+        "instructions": [
+            "Turn the spoken language into a text format."
+        ]
+    },
+    "4_ASR_IMDA_PART2_ASR_v2_999": {
+        "apperance": "4. Automatic Speech Recognation task: reading in Singapore context",
+        "instructions": [
+            "Translate the spoken words into text format."
+        ]
+    },
+    "5_ASR_IMDA_PART2_ASR_v2_2241": {
+        "apperance": "5. Automatic Speech Recognation task: reading in Singapore context",
+        "instructions": [
+            "Translate the spoken words into text format."
+        ]
+    },
+    "6_ASR_IMDA_PART2_ASR_v2_3409": {
+        "apperance": "6. Automatic Speech Recognation task: reading in Singapore context",
+        "instructions": [
+            "Translate the spoken words into text format."
+        ]
+    },
+    "8_ASR_IMDA_PART3_30_ASR_v2_1698": {
+        "apperance": "8. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Need this talk written down, please."
+        ]
+    },
+    "9_ASR_IMDA_PART3_30_ASR_v2_2474": {
+        "apperance": "9. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Need this talk written down, please."
+        ]
+    },
+    "10_ASR_IMDA_PART4_30_ASR_v2_1527": {
+        "apperance": "10. Automatic Speech Recognation task: conversation with Singlish code-switch",
+        "instructions": [
+            "Write out the dialogue as text."
+        ]
+    },
+    "13_ASR_IMDA_PART5_30_ASR_v2_1446": {
+        "apperance": "13. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Translate this vocal recording into a textual format."
+        ]
+    },
+    "14_ASR_IMDA_PART5_30_ASR_v2_2281": {
+        "apperance": "14. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Translate this vocal recording into a textual format."
+        ]
+    },
+    "15_ASR_IMDA_PART5_30_ASR_v2_4388": {
+        "apperance": "15. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Translate this vocal recording into a textual format."
+        ]
+    },
+    "16_ASR_IMDA_PART6_30_ASR_v2_576": {
+        "apperance": "16. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Record the spoken word in text form."
+        ]
+    },
+    "18_ASR_IMDA_PART6_30_ASR_v2_2834": {
+        "apperance": "18. Automatic Speech Recognation task: conversation in Singapore accent",
+        "instructions": [
+            "Record the spoken word in text form."
+        ]
+    },
+    "19_ASR_AIShell_zh_ASR_v2_5044": {
+        "apperance": "19. Automatic Speech Recognation task: speech in Chinese ",
+        "instructions": [
+            "Transform the oral presentation into a text document."
+        ]
+    },
+    "20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833": {
+        "apperance": "20. Automatic Speech Recognation task: general speech",
+        "instructions": [
+            "Please provide a written transcription of the speech."
+        ]
+    },
+    "25_ST_COVOST2_ZH-CN_EN_ST_V2_4567": {
+        "apperance": "25. Speech Translation task: Chinese to English",
+        "instructions": [
+            "Please translate the given speech to English."
+        ]
+    },
+    "26_ST_COVOST2_EN_ZH-CN_ST_V2_5422": {
+        "apperance": "26. Speech Translation task: English to Chinese",
+        "instructions": [
+            "Please translate the given speech to Chinese."
+        ]
+    },
+    "27_ST_COVOST2_EN_ZH-CN_ST_V2_6697": {
+        "apperance": "27. Speech Translation task: English to Chinese",
+        "instructions": [
+            "Please translate the given speech to Chinese."
+        ]
+    },
+    "28_SI_ALPACA-GPT4-AUDIO_SI_V2_299": {
+        "apperance": "28. Speech Instruction task: general speech",
+        "instructions": [
+            "Please follow the instruction in the speech."
+        ]
+    },
+    "29_SI_ALPACA-GPT4-AUDIO_SI_V2_750": {
+        "apperance": "29. Speech Instruction task: general speech",
+        "instructions": [
+            "Please follow the instruction in the speech."
+        ]
+    },
+    "30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454": {
+        "apperance": "30. Speech Instruction task: general speech",
+        "instructions": [
+            "Please follow the instruction in the speech."
+        ]
+    }
+}
+
+
+def init_state_section():
+    st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')
+
+    st.markdown(
+        (
+            '<style>' + \
+            open('./style/app_style.css').read() + \
+            open('./style/normal_window.css').read() + \
+            open('./style/small_window.css').read() + \
+            '</style>'
+        ),
+        unsafe_allow_html=True
+    )
+
+    if "server" not in st.session_state:
+        st.session_state.server = start_server()
+
+    if "client" not in st.session_state or 'model_name' not in st.session_state:
+        st.session_state.client, st.session_state.model_name = load_model()
+
+    for key, value in FIXED_GENERATION_CONFIG.items():
+        if key not in st.session_state:
+            st.session_state[key]=copy.deepcopy(value)
+
+    for key, value in DEFAULT_DIALOGUE_STATES.items():
+        if key not in st.session_state:
+            st.session_state[key]=copy.deepcopy(value)
+
+
+def header_section(component_name="Playground", icon="🤖"):
+    st.markdown(
+        f"<h1 style='text-align: center;'>MERaLiON-AudioLLM {component_name} {icon}</h1>",
+        unsafe_allow_html=True
+    )
+
+    st.markdown(
+        f"""<div class="main-intro-normal-window">
+        <p>This {component_name.lower()} is based on
+        <a href="https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
+        target="_blank" rel="noopener noreferrer"> MERaLiON-AudioLLM</a>,
+        developed by I2R, A*STAR, in collaboration with AISG, Singapore.
+        It is tailored for Singapore’s multilingual and multicultural landscape.
+        MERaLiON-AudioLLM supports <strong>Automatic Speech Recognation</strong>,
+        <strong>Speech Translation</strong>,
+        <strong>Spoken Question Answering</strong>,
+        <strong>Spoken Dialogue Summarization</strong>,
+        <strong>Speech Instruction</strong>, and
+        <strong>Paralinguistics</strong> tasks.</p></div>""",
+        unsafe_allow_html=True
+    )
+
+    st.markdown(
+        f"""<div class="main-intro-small-window">
+        <p>This {component_name.lower()} is based on
+        <a href="https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
+        target="_blank" rel="noopener noreferrer"> MERaLiON-AudioLLM</a>.</p></div>""",
+        unsafe_allow_html=True
+    )
+
+
+@st.fragment
+def sidebar_fragment():
+    with st.container(height=300, border=False):
+        st.page_link("pages/playground.py", label="Playground")
+        st.page_link("pages/voice_chat.py", label="Voice Chat (experimental)")
+
+
+    st.divider()
+
+    st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.1, key='temperature')
+
+    st.slider(label='Top P', min_value=0.0, max_value=1.0, value=0.9, key='top_p')
+
+    st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty")
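Note the `copy.deepcopy` in `init_state_section()`: several defaults (the `messages` list, the NumPy arrays) are mutable, and writing them into `st.session_state` uncopied would let a session mutate the shared template. A small plain-Python sketch of the hazard this avoids:

    import copy

    # Stand-in for DEFAULT_DIALOGUE_STATES: a defaults template with a mutable value.
    DEFAULTS = dict(messages=[], new_prompt="")

    session_state = {}
    for key, value in DEFAULTS.items():
        if key not in session_state:
            session_state[key] = copy.deepcopy(value)  # as init_state_section() does

    session_state["messages"].append({"role": "user", "content": "hi"})
    assert DEFAULTS["messages"] == []  # the template stays pristine across sessions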
src/content/playground.py
ADDED
@@ -0,0 +1,208 @@
+import copy
+import base64
+
+import streamlit as st
+
+from src.generation import retrive_response
+from src.utils import bytes_to_array, array_to_bytes
+from src.content.common import (
+    AUDIO_SAMPLES_W_INSTRUCT,
+    DEFAULT_DIALOGUE_STATES,
+    init_state_section,
+    header_section,
+    sidebar_fragment
+)
+
+
+QUICK_ACTIONS = [
+    {
+        "name": "**Summary**",
+        "instruction": "Please summarise this speech.",
+        "width": 10,
+    },
+    {
+        "name": "**Transcript**",
+        "instruction": "Please transcribe this speech.",
+        "width": 9.5,
+    }
+]
+
+
+MAX_AUDIO_LENGTH = 120
+
+
+def _update_audio(audio_bytes):
+    origin_audio_array = bytes_to_array(audio_bytes)
+    truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
+    truncated_audio_bytes = array_to_bytes(truncated_audio_array)
+
+    st.session_state.audio_array = origin_audio_array
+    st.session_state.audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
+
+
+@st.fragment
+def successful_example_section():
+    audio_sample_names = [audio_sample_name for audio_sample_name in AUDIO_SAMPLES_W_INSTRUCT.keys()]
+
+    st.markdown(":fire: **Successful Tasks and Examples**")
+
+    sample_name = st.selectbox(
+        label="**Select Audio:**",
+        label_visibility="collapsed",
+        options=audio_sample_names,
+        format_func=lambda o: AUDIO_SAMPLES_W_INSTRUCT[o]["apperance"],
+        index=None,
+        placeholder="Select an audio sample:",
+        on_change=lambda: st.session_state.update(
+            on_select=True,
+            messages=[],
+            disprompt=True
+        ),
+        key='select')
+
+    if sample_name and st.session_state.on_select:
+        audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read()
+        st.session_state.update(
+            on_select=False,
+            new_prompt=AUDIO_SAMPLES_W_INSTRUCT[sample_name]["instructions"][0]
+        )
+        _update_audio(audio_bytes)
+        st.rerun(scope="app")
+
+
+@st.dialog("Specify Audio")
+def audio_attach_dialogue():
+    st.markdown("**Upload**")
+
+    uploaded_file = st.file_uploader(
+        label="**Upload Audio:**",
+        label_visibility="collapsed",
+        type=['wav', 'mp3'],
+        on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
+        key='upload'
+    )
+
+    if uploaded_file and st.session_state.on_upload:
+        audio_bytes = uploaded_file.read()
+        _update_audio(audio_bytes)
+        st.session_state.on_upload = False
+        st.rerun()
+
+    st.markdown("**Record**")
+
+    uploaded_file = st.audio_input(
+        label="**Record Audio:**",
+        label_visibility="collapsed",
+        on_change=lambda: st.session_state.update(on_record=True, messages=[]),
+        key='record'
+    )
+
+    if uploaded_file and st.session_state.on_record:
+        audio_bytes = uploaded_file.read()
+        _update_audio(audio_bytes)
+        st.session_state.on_record = False
+        st.rerun()
+
+
+def bottom_input_section():
+    bottom_cols = st.columns([0.03, 0.03, 0.94])
+    with bottom_cols[0]:
+        st.button(
+            'Clear',
+            disabled=st.session_state.disprompt,
+            on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
+        )
+
+    with bottom_cols[1]:
+        if st.button("\+ Audio", disabled=st.session_state.disprompt):
+            audio_attach_dialogue()
+
+    with bottom_cols[2]:
+        if chat_input := st.chat_input(
+            placeholder="Instruction...",
+            disabled=st.session_state.disprompt,
+            on_submit=lambda: st.session_state.update(disprompt=True, messages=[])
+        ):
+            st.session_state.new_prompt = chat_input
+
+
+@st.fragment
+def quick_actions_fragment():
+    action_cols_spec = [_["width"] for _ in QUICK_ACTIONS]
+    action_cols = st.columns(action_cols_spec)
+
+    for idx, action in enumerate(QUICK_ACTIONS):
+        action_cols[idx].button(
+            action["name"],
+            args=(action["instruction"],),
+            disabled=st.session_state.disprompt,
+            on_click=lambda p: st.session_state.update(
+                disprompt=True,
+                messages=[],
+                new_prompt=p,
+                on_select_quick_action=True
+            )
+        )
+
+    if st.session_state.on_select_quick_action:
+        st.session_state.on_select_quick_action = False
+        st.rerun(scope="app")
+
+
+def conversation_section():
+    if st.session_state.audio_array.size:
+        with st.chat_message("user"):
+            st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
+            quick_actions_fragment()
+
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            if message.get("error"):
+                st.error(message["error"])
+            for warning_msg in message.get("warnings", []):
+                st.warning(warning_msg)
+            if message.get("content"):
+                st.write(message["content"])
+
+    with st._bottom:
+        bottom_input_section()
+
+    if one_time_prompt := st.session_state.new_prompt:
+        st.session_state.update(new_prompt="", messages=[])
+
+        with st.chat_message("user"):
+            st.write(one_time_prompt)
+            st.session_state.messages.append({"role": "user", "content": one_time_prompt})
+
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                error_msg, warnings, stream = retrive_response(
+                    one_time_prompt, st.session_state.audio_base64, stream=True)
+            response = ""
+
+            if error_msg:
+                st.error(error_msg)
+            for warning_msg in warnings:
+                st.warning(warning_msg)
+            if stream:
+                response = st.write_stream(stream)
+
+            st.session_state.messages.append({
+                "role": "assistant",
+                "error": error_msg,
+                "warnings": warnings,
+                "content": response
+            })
+
+        st.session_state.disprompt=False
+        st.rerun(scope="app")
+
+def playground_page():
+    init_state_section()
+    header_section()
+
+    with st.sidebar:
+        sidebar_fragment()
+
+    successful_example_section()
+    conversation_section()
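`_update_audio()` keeps the full waveform for playback but truncates what is encoded for the model: at the 16 kHz rate used throughout the app, `MAX_AUDIO_LENGTH * 16000` is 120 * 16000 = 1,920,000 samples. A self-contained check of that arithmetic (dummy signal, not app code):

    import numpy as np

    MAX_AUDIO_LENGTH = 120  # seconds, as in src/content/playground.py
    SAMPLE_RATE = 16000     # Hz

    audio = np.zeros(150 * SAMPLE_RATE, dtype=np.float32)  # dummy 150 s clip
    truncated = audio[: MAX_AUDIO_LENGTH * SAMPLE_RATE]    # same slice as _update_audio

    assert truncated.shape[0] == 1_920_000
    print(truncated.shape[0] / SAMPLE_RATE)  # 120.0 seconds retained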
src/content/voice_chat.py
ADDED
@@ -0,0 +1,153 @@
+import copy
+import base64
+
+import numpy as np
+import streamlit as st
+
+from src.generation import retrive_response
+from src.utils import bytes_to_array, array_to_bytes
+from src.content.common import (
+    DEFAULT_DIALOGUE_STATES,
+    init_state_section,
+    header_section,
+    sidebar_fragment
+)
+
+
+# TODO: change this.
+DEFAULT_PROMPT = "Please follow the instruction in the speech."
+
+
+MAX_AUDIO_LENGTH = 120
+
+
+def _update_audio(audio_bytes):
+    origin_audio_array = bytes_to_array(audio_bytes)
+    truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
+    truncated_audio_bytes = array_to_bytes(truncated_audio_array)
+
+    st.session_state.audio_array = origin_audio_array
+    st.session_state.audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
+
+
+@st.dialog("Specify Audio")
+def audio_attach_dialogue():
+    st.markdown("**Upload**")
+
+    uploaded_file = st.file_uploader(
+        label="**Upload Audio:**",
+        label_visibility="collapsed",
+        type=['wav', 'mp3'],
+        on_change=lambda: st.session_state.update(
+            on_upload=True,
+            messages=[],
+            disprompt=True
+        ),
+        key='upload'
+    )
+
+    if uploaded_file and st.session_state.on_upload:
+        audio_bytes = uploaded_file.read()
+        _update_audio(audio_bytes)
+        st.session_state.update(
+            on_upload=False,
+            new_prompt=DEFAULT_PROMPT
+        )
+        st.rerun()
+
+
+def bottom_input_section():
+    bottom_cols = st.columns([0.03, 0.03, 0.94])
+    with bottom_cols[0]:
+        st.button(
+            'Clear',
+            disabled=st.session_state.disprompt,
+            on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
+        )
+
+    with bottom_cols[1]:
+        if st.button("\+ Audio", disabled=st.session_state.disprompt):
+            audio_attach_dialogue()
+
+    with bottom_cols[2]:
+        uploaded_file = st.audio_input(
+            label="record audio",
+            label_visibility="collapsed",
+            on_change=lambda: st.session_state.update(
+                on_record=True,
+                messages=[],
+                disprompt=True
+            ),
+            key='record'
+        )
+
+        if uploaded_file and st.session_state.on_record:
+            audio_bytes = uploaded_file.read()
+            _update_audio(audio_bytes)
+            st.session_state.update(
+                on_record=False,
+                new_prompt=DEFAULT_PROMPT
+            )
+
+
+def conversation_section():
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            if message.get("error"):
+                st.error(message["error"])
+            for warning_msg in message.get("warnings", []):
+                st.warning(warning_msg)
+            if message.get("audio", np.array([])).shape[0]:
+                st.audio(message["audio"], format="audio/wav", sample_rate=16000)
+            if message.get("content"):
+                st.write(message["content"])
+
+    with st._bottom:
+        bottom_input_section()
+
+    if one_time_prompt := st.session_state.new_prompt:
+        one_time_array = st.session_state.audio_array
+        one_time_base64 = st.session_state.audio_base64
+        st.session_state.update(
+            new_prompt="",
+            one_time_array=np.array([]),
+            one_time_base64="",
+            messages=[]
+        )
+
+        with st.chat_message("user"):
+            st.audio(one_time_array, format="audio/wav", sample_rate=16000)
+
+        st.session_state.messages.append({"role": "user", "audio": one_time_array})
+
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                error_msg, warnings, stream = retrive_response(
+                    one_time_prompt, one_time_base64, stream=True)
+            response = ""
+
+            if error_msg:
+                st.error(error_msg)
+            for warning_msg in warnings:
+                st.warning(warning_msg)
+            if stream:
+                response = st.write_stream(stream)
+
+            st.session_state.messages.append({
+                "role": "assistant",
+                "error": error_msg,
+                "warnings": warnings,
+                "content": response
+            })
+
+        st.session_state.disprompt=False
+        st.rerun(scope="app")
+
+def voice_chat_page():
+    init_state_section()
+    header_section(component_name="Voice Chat")
+
+    with st.sidebar:
+        sidebar_fragment()
+
+    conversation_section()
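`conversation_section()` here uses the same consume-once pattern as the playground page: the pending prompt is read and cleared in one step, so the full-app rerun at the end cannot resubmit the same request. Reduced to plain Python for illustration:

    state = {"new_prompt": "Please follow the instruction in the speech."}

    def handle_pending(state):
        # Read and clear together, mirroring the walrus check plus update().
        if one_time_prompt := state["new_prompt"]:
            state["new_prompt"] = ""
            return f"processed: {one_time_prompt}"
        return None

    print(handle_pending(state))  # processed: Please follow the instruction ...
    print(handle_pending(state))  # None; a second pass does nothing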
src/generation.py
CHANGED
@@ -19,6 +19,8 @@ FIXED_GENERATION_CONFIG = dict(
     seed=42
 )
 
+MAX_AUDIO_LENGTH = 120
+
 
 def load_model():
     """
@@ -100,7 +102,7 @@ def _retry_retrive_response_throws_exception(text_input, base64_audio_input, str
     return response_object
 
 
-def _validate_text_input(text_input) -> List[str]:
+def _validate_input(text_input) -> List[str]:
     """
     TODO: improve the input validation regex.
     """
@@ -111,11 +113,17 @@ def _validate_text_input(text_input) -> List[str]:
     if re.search(r'[\u4e00-\u9fff]+', text_input):
         warnings.append("NOTE: Please try to prompt in English for the best performance.")
 
+    if st.session_state.audio_array.shape[0] / 16000 > 30.0:
+        warnings.append((
+            "MERaLiON-AudioLLM is trained to process audio up to **30 seconds**."
+            f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated."
+        ))
+
     return warnings
 
 
 def retrive_response(text_input, base64_audio_input, stream=False):
-    warnings = _validate_text_input(text_input)
+    warnings = _validate_input(text_input)
 
     response_object, error_msg = None, ""
     try:
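The check added to `_validate_input` keeps two thresholds distinct: 30 seconds is the model's training horizon and only produces a warning, while the hard cut happens at `MAX_AUDIO_LENGTH` (120 seconds) inside `_update_audio`. The duration check in isolation, fed an assumed sample count:

    MAX_AUDIO_LENGTH = 120
    SAMPLE_RATE = 16000

    def length_warnings(num_samples):
        # Mirrors the duration branch added to _validate_input: warn past 30 s;
        # truncation itself only happens at MAX_AUDIO_LENGTH.
        warnings = []
        if num_samples / SAMPLE_RATE > 30.0:
            warnings.append(
                "MERaLiON-AudioLLM is trained to process audio up to **30 seconds**."
                f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated."
            )
        return warnings

    print(length_warnings(45 * SAMPLE_RATE))  # one warning
    print(length_warnings(10 * SAMPLE_RATE))  # []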
src/pages.py
DELETED
@@ -1,220 +0,0 @@
-import base64
-
-import numpy as np
-import streamlit as st
-
-from src.generation import retrive_response, postprocess_voice_transcription
-from src.utils import (
-    GENERAL_INSTRUCTIONS,
-    AUDIO_SAMPLES_W_INSTRUCT,
-    bytes_to_array,
-    array_to_bytes,
-)
-
-
-DEFAULT_DIALOGUE_STATES = dict(
-    default_instruction=[],
-    audio_base64='',
-    audio_array=np.array([]),
-    disprompt = False,
-    new_prompt = "",
-    messages=[],
-    voice_instruction="",
-    on_select=False,
-    on_upload=False,
-    on_record=False,
-    on_click_button=False,
-    on_record_voice=False
-)
-
-
-MAX_AUDIO_LENGTH = 120
-
-
-def _update_audio(audio_bytes):
-    origin_audio_array = bytes_to_array(audio_bytes)
-    truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
-    truncated_audio_bytes = array_to_bytes(truncated_audio_array)
-
-    st.session_state.audio_array = origin_audio_array
-    st.session_state.audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
-
-
-@st.fragment
-def sidebar_fragment():
-    st.markdown("""<div class="sidebar-intro">
-        <p><strong>📌 Supported Tasks</strong>
-        <p>Automatic Speech Recognation</p>
-        <p>Speech Translation</p>
-        <p>Spoken Question Answering</p>
-        <p>Spoken Dialogue Summarization</p>
-        <p>Speech Instruction</p>
-        <p>Paralinguistics</p>
-        <br>
-        <p><strong>📎 Generation Config</strong>
-        </div>""", unsafe_allow_html=True)
-
-    st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.1, key='temperature')
-
-    st.slider(label='Top P', min_value=0.0, max_value=1.0, value=0.9, key='top_p')
-
-    st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty")
-
-@st.fragment
-def specify_audio_fragment():
-    col1, col2, col3 = st.columns([4, 2, 2])
-
-    with col1:
-        audio_sample_names = [audio_sample_name for audio_sample_name in AUDIO_SAMPLES_W_INSTRUCT.keys()]
-
-        st.markdown("**Select Audio From Examples:**")
-
-        sample_name = st.selectbox(
-            label="**Select Audio:**",
-            label_visibility="collapsed",
-            options=audio_sample_names,
-            index=None,
-            placeholder="Select an audio sample:",
-            on_change=lambda: st.session_state.update(on_select=True),
-            key='select')
-
-        if sample_name and st.session_state.on_select:
-            audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read()
-            st.session_state.default_instruction = AUDIO_SAMPLES_W_INSTRUCT[sample_name] + GENERAL_INSTRUCTIONS
-            _update_audio(audio_bytes)
-
-
-    with col2:
-        st.markdown("or **Upload Audio:**")
-
-        uploaded_file = st.file_uploader(
-            label="**Upload Audio:**",
-            label_visibility="collapsed",
-            type=['wav', 'mp3'],
-            on_change=lambda: st.session_state.update(on_upload=True),
-            key='upload'
-        )
-
-        if uploaded_file and st.session_state.on_upload:
-            audio_bytes = uploaded_file.read()
-            st.session_state.default_instruction = GENERAL_INSTRUCTIONS
-            _update_audio(audio_bytes)
-
-
-    with col3:
-        st.markdown("or **Record Audio:**")
-
-        uploaded_file = st.audio_input(
-            label="**Record Audio:**",
-            label_visibility="collapsed",
-            on_change=lambda: st.session_state.update(on_record=True),
-            key='record'
-        )
-
-        if uploaded_file and st.session_state.on_record:
-            audio_bytes = uploaded_file.read()
-            st.session_state.default_instruction = GENERAL_INSTRUCTIONS
-            _update_audio(audio_bytes)
-
-    st.session_state.update(on_upload=False, on_record=False, on_select=False)
-
-    if st.session_state.audio_array.size:
-        with st.chat_message("user"):
-            if st.session_state.audio_array.shape[0] / 16000 > 30.0:
-                st.warning((
-                    "MERaLiON-AudioLLM is trained to process audio up to **30 seconds**."
-                    f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated."
-                ))
-
-            st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
-
-            for i, inst in enumerate(st.session_state.default_instruction):
-                st.button(
-                    f"**Example Instruction {i+1}**: {inst}",
-                    args=(inst,),
-                    disabled=st.session_state.disprompt,
-                    on_click=lambda p: st.session_state.update(disprompt=True, new_prompt=p, on_click_button=True, messages=[])
-                )
-
-    if st.session_state.on_click_button:
-        st.session_state.on_click_button = False
-        st.rerun(scope="app")
-
-
-def bottom_input_section():
-    bottom_cols = st.columns([0.02, 0.98])
-
-    uploaded_file = bottom_cols[0].audio_input(
-        label="voice",
-        label_visibility="collapsed",
-        disabled=st.session_state.disprompt,
-        on_change=lambda: st.session_state.update(on_record_voice=True),
-        key='voice'
-    )
-
-    if uploaded_file and st.session_state.on_record_voice:
-        audio_bytes = uploaded_file.read()
-        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-        error_msg, warnings, completion = retrive_response(
-            "Write out the dialogue as text.", audio_base64, stream=False)
-
-        if error_msg:
-            st.toast(error_msg, icon="🚨")
-        for warning_msg in warnings:
-            st.toast(warning_msg, icon="❗")
-
-        st.session_state.update(
-            new_prompt = postprocess_voice_transcription(
-                completion.choices[0].message.content),
-            on_record_voice = False
-        )
-
-    if chat_input := bottom_cols[1].chat_input(
-        placeholder="Type Your Instruction Here",
-        disabled=st.session_state.disprompt,
-        on_submit=lambda: st.session_state.update(disprompt=True, messages=[])
-    ):
-        st.session_state.new_prompt = chat_input
-
-def conversation_section():
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            if message.get("error"):
-                st.error(message["error"])
-            for warning_msg in message.get("warnings", []):
-                st.warning(warning_msg)
-            if message.get("content"):
-                st.write(message["content"])
-
-    with st._bottom:
-        bottom_input_section()
-
-    if one_time_prompt := st.session_state.new_prompt:
-        st.session_state.update(new_prompt="", messages=[])
-
-        with st.chat_message("user"):
-            st.write(one_time_prompt)
-            st.session_state.messages.append({"role": "user", "content": one_time_prompt})
-
-        with st.chat_message("assistant"):
-            with st.spinner("Thinking..."):
-                error_msg, warnings, stream = retrive_response(
-                    one_time_prompt, st.session_state.audio_base64, stream=True)
-            response = ""
-
-            if error_msg:
-                st.error(error_msg)
-            for warning_msg in warnings:
-                st.warning(warning_msg)
-            if stream:
-                response = st.write_stream(stream)
-
-            st.session_state.messages.append({
-                "role": "assistant",
-                "error": error_msg,
-                "warnings": warnings,
-                "content": response
-            })
-
-        st.session_state.disprompt=False
-        st.rerun(scope="app")
src/utils.py
CHANGED
@@ -4,70 +4,6 @@ from scipy.io.wavfile import write
 import librosa
 
 
-GENERAL_INSTRUCTIONS = [
-    "Please transcribe this speech.",
-    "Please summarise this speech."
-]
-
-
-AUDIO_SAMPLES_W_INSTRUCT = {
-    '7_ASR_IMDA_PART3_30_ASR_v2_2269': ["Need this talk written down, please."],
-    '11_ASR_IMDA_PART4_30_ASR_v2_3771': ["Write out the dialogue as text."],
-    '12_ASR_IMDA_PART4_30_ASR_v2_103' : ["Write out the dialogue as text."],
-    '17_ASR_IMDA_PART6_30_ASR_v2_1413': ["Record the spoken word in text form."],
-
-    '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': ["What does the man think the woman should do at 4:00."],
-    '33_SQA_IMDA_PART3_30_SQA_V2_2310': ["Does Speaker2's wife cook for Speaker2 when they are at home."],
-    '34_SQA_IMDA_PART3_30_SQA_V2_3621': ["Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."],
-    '35_SQA_IMDA_PART3_30_SQA_V2_4062': ["What is the color of the vase mentioned in the dialogue."],
-    '36_DS_IMDA_PART4_30_DS_V2_849': ["Condense the dialogue into a concise summary highlighting major topics and conclusions."],
-
-    '39_Paralingual_IEMOCAP_ER_V2_91': ["Based on the speaker's speech patterns, what do you think they are feeling."],
-    '40_Paralingual_IEMOCAP_ER_V2_567': ["Based on the speaker's speech patterns, what do you think they are feeling."],
-    '42_Paralingual_IEMOCAP_GR_V2_320': ["Is it possible for you to identify whether the speaker in this recording is male or female."],
-    '47_Paralingual_IMDA_PART3_30_NR_V2_10479': ["Can you guess which ethnic group this person is from based on their accent."],
-    '49_Paralingual_MELD_ER_V2_676': ["What emotions do you think the speaker is expressing."],
-    '50_Paralingual_MELD_ER_V2_692': ["Based on the speaker's speech patterns, what do you think they are feeling."],
-    '51_Paralingual_VOXCELEB1_GR_V2_2148': ["May I know the gender of the speaker."],
-    '53_Paralingual_VOXCELEB1_NR_V2_2286': ["What's the nationality identity of the speaker."],
-
-    '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': ["What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth."],
-    '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': ["Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore."],
-    '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': ["How does the author respond to parents' worries about masks in schools."],
-
-    '1_ASR_IMDA_PART1_ASR_v2_141' : ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
-    '2_ASR_IMDA_PART1_ASR_v2_2258': ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
-    '3_ASR_IMDA_PART1_ASR_v2_2265': ["Turn the spoken language into a text format."],
-
-    '4_ASR_IMDA_PART2_ASR_v2_999' : ["Translate the spoken words into text format."],
-    '5_ASR_IMDA_PART2_ASR_v2_2241': ["Translate the spoken words into text format."],
-    '6_ASR_IMDA_PART2_ASR_v2_3409': ["Translate the spoken words into text format."],
-
-    '8_ASR_IMDA_PART3_30_ASR_v2_1698': ["Need this talk written down, please."],
-    '9_ASR_IMDA_PART3_30_ASR_v2_2474': ["Need this talk written down, please."],
-
-    '10_ASR_IMDA_PART4_30_ASR_v2_1527': ["Write out the dialogue as text."],
-
-    '13_ASR_IMDA_PART5_30_ASR_v2_1446': ["Translate this vocal recording into a textual format."],
-    '14_ASR_IMDA_PART5_30_ASR_v2_2281': ["Translate this vocal recording into a textual format."],
-    '15_ASR_IMDA_PART5_30_ASR_v2_4388': ["Translate this vocal recording into a textual format."],
-
-    '16_ASR_IMDA_PART6_30_ASR_v2_576': ["Record the spoken word in text form."],
-    '18_ASR_IMDA_PART6_30_ASR_v2_2834': ["Record the spoken word in text form."],
-
-    '19_ASR_AIShell_zh_ASR_v2_5044': ["Transform the oral presentation into a text document."],
-    '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': ["Please provide a written transcription of the speech."],
-
-    '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': ["Please translate the given speech to English."],
-    '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': ["Please translate the given speech to Chinese."],
-
-    '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': ["Please translate the given speech to Chinese."],
-    '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': ["Please follow the instruction in the speech."],
-    '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': ["Please follow the instruction in the speech."],
-    '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': ["Please follow the instruction in the speech."],
-}
-
-
 def bytes_to_array(audio_bytes):
     audio_array, _ = librosa.load(
         io.BytesIO(audio_bytes),
style/app_style.css
CHANGED
@@ -1,16 +1,35 @@
+div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div {
+    max-height: 3rem;
+}
+
+div[class="sidebar-intro"] p {
+    margin-bottom: 0.75rem;
+}
+
+[class='stAudio'] {
+    max-width: 500px !important;
+    margin: auto !important;
+}
+
 div[data-testid="stChatMessage"]:has(> div[data-testid="stChatMessageAvatarUser"]) {
     flex-direction: row-reverse;
     text-align: right;
 }
 
-@media(min-width: 576px) {
-    .stMainBlockContainer {
-        padding: 2rem 5rem 1rem;
-    }
+div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) {
+    flex-direction: row-reverse;
 }
 
+div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]>div[data-testid="stColumn"]:has( div[data-testid="stButton"]) {
+    width: 6rem;
+    min-width: 6rem;
+    flex: 0 0 6rem;
+}
+
+/* File uploader */
+
 section[data-testid='stFileUploaderDropzone'] {
-    padding:
+    padding:6px 2rem;
 }
 
 section[data-testid='stFileUploaderDropzone']>button {
@@ -21,40 +40,45 @@ div[data-testid="stFileUploaderDropzoneInstructions"]>div>span {
     display:none;
 }
 
-div[data-testid="
-
+div[data-testid="stBottomBlockContainer"] {
+    padding-bottom: 2rem;
 }
 
-
-    background-color:transparent;
-    /* border:1px solid rgba(49, 51, 63, 0.2); */
-    max-height: 40px;
-    display: block;
-    padding: 0;
-    margin: auto;
-}
+/* Chat input component at the bottom */
 
-div[data-testid="stBottomBlockContainer"] div[data-testid="
-
+div[data-testid="stBottomBlockContainer"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) {
+    gap: 4px;
 }
 
-div[data-testid="stBottomBlockContainer"] div[data-testid="
-
+div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):first-of-type {
+    width: 61px;
+    min-width: 61px;
+    flex: 0 0 61px;
 }
 
-div[data-testid="stBottomBlockContainer"] div[data-testid="
-
+div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):nth-of-type(2) {
+    width: 76px;
+    min-width: 76px;
+    flex: 0 0 76px;
 }
 
-div[data-testid="stBottomBlockContainer"] div[data-testid="
-
+div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"] button[data-testid="stBaseButton-secondary"] {
+    background-color: rgb(240, 242, 246);
+    border-color: rgb(240, 242, 246);
 }
 
-div[data-testid="stBottomBlockContainer"] div[data-testid="
-
+div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stChatInput"]) {
+    width: 10rem;
+    min-width: 10rem;
+    flex: 1 1 10rem;
 }
 
-[class='stAudio'] {
-    max-width: 500px !important;
-    margin: auto !important;
+div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stAudioInput"]) {
+    width: 10rem;
+    min-width: 10rem;
+    flex: 1 1 10rem;
 }
+
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div {
+    max-height: 40px;
+}
style/normal_window.css
ADDED
@@ -0,0 +1,14 @@
+@media(min-width: 576px) {
+    .stMainBlockContainer {
+        padding: 2rem 5rem 1rem;
+    }
+
+    div[data-testid="stBottomBlockContainer"] {
+        padding-left: 5rem;
+        padding-right: 5rem;
+    }
+
+    div[class="main-intro-small-window"] {
+        display: none;
+    }
+}
style/small_window.css
ADDED
@@ -0,0 +1,9 @@
+@media(max-width: 576px) {
+    div[data-testid="stMainBlockContainer"] div[data-testid="stVerticalBlock"]>div[data-testid="stElementContainer"]:has( div[data-testid="stHeadingWithActionElements"]) {
+        display: none;
+    }
+
+    div[class="main-intro-normal-window"] {
+        display: none;
+    }
+}