diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..74812cd03e02b4f059f0fdb02781b9d0a16cb546 --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,2 @@ +[client] +showSidebarNavigation = false \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8a00bb12a279eee5165d19eb79bef0c8c3e02ba9 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +--- +title: Interactive-Demo / MERaLiON-AudioLLM +emoji: 🚀 +colorFrom: indigo +colorTo: indigo +sdk: streamlit +sdk_version: 1.41.1 +app_file: app.py +pinned: true +models: +- MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..e497863323cffda8c0dcd856a502419068d5c166 --- /dev/null +++ b/app.py @@ -0,0 +1,3 @@ +from src.content.playground import playground_page + +playground_page() \ No newline at end of file diff --git a/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav b/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav new file mode 100644 index 0000000000000000000000000000000000000000..d1fa45ef18df244757240868794d54063f723d56 Binary files /dev/null and b/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav differ diff --git a/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav b/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav new file mode 100644 index 0000000000000000000000000000000000000000..f8c81dfdd16b9167a273a3964a6a3dd47de80d98 Binary files /dev/null and b/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav differ diff --git a/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav 
b/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav new file mode 100644 index 0000000000000000000000000000000000000000..186c8c090c344c63e0ea828cc12a51207df58aff Binary files /dev/null and b/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav differ diff --git a/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav b/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav new file mode 100644 index 0000000000000000000000000000000000000000..0e606bfd770bb85abab4957d0380fef221c254c4 Binary files /dev/null and b/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav differ diff --git a/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav b/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav new file mode 100644 index 0000000000000000000000000000000000000000..cbf2fad1cc780f4a7d691974033dcef7fd0fcef0 Binary files /dev/null and b/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav differ diff --git a/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav b/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav new file mode 100644 index 0000000000000000000000000000000000000000..8ea95dd7dbab7762154da8588b7ead701317e956 Binary files /dev/null and b/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav differ diff --git a/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav b/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav new file mode 100644 index 0000000000000000000000000000000000000000..df7b46f0a8f4f93b280cd9c91e486956e5d89b11 Binary files /dev/null and b/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav differ diff --git a/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav b/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav new file mode 100644 index 0000000000000000000000000000000000000000..69c2b3ee39465bc58efe769fde69c95c9d5092fc Binary files /dev/null and b/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav differ diff --git a/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav b/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav new file mode 100644 index 0000000000000000000000000000000000000000..1d89e648d87d2bc193f728ac86b54ea7a4e07634 Binary files /dev/null and b/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav differ diff --git a/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav b/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav new file mode 100644 index 0000000000000000000000000000000000000000..b296224725ec5acf74a02304f6beb6a7723d2c89 Binary files /dev/null and b/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav differ diff --git a/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav b/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav new file mode 100644 index 0000000000000000000000000000000000000000..17c5fc99647aaa658eabe035b40f97f8ea7638d1 Binary files /dev/null and b/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav differ diff --git a/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav b/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav new file mode 100644 index 0000000000000000000000000000000000000000..1ec609efd1c3790487c3c0aec77e5e3e5b0c3eda Binary files /dev/null and b/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav differ diff --git a/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav b/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav new file mode 100644 index 0000000000000000000000000000000000000000..55c435f3b228e136e3c1047a4b43e992b9acfc0f Binary files /dev/null and b/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav differ diff --git a/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav b/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav new file mode 100644 index 
0000000000000000000000000000000000000000..f2780b3b7da1d553f59f4f29256b4e848049cf52 Binary files /dev/null and b/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav differ diff --git a/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav b/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav new file mode 100644 index 0000000000000000000000000000000000000000..234f811d4c60ab67659f06bcd1db481a11648ca9 Binary files /dev/null and b/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav differ diff --git a/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav b/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav new file mode 100644 index 0000000000000000000000000000000000000000..239fff4d4cfcf2653e00d97ca842f334bd31ed18 Binary files /dev/null and b/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav differ diff --git a/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav b/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav new file mode 100644 index 0000000000000000000000000000000000000000..35d9dfbdc9ca3169a05c50a548cb5836adc65d52 Binary files /dev/null and b/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav differ diff --git a/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav b/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav new file mode 100644 index 0000000000000000000000000000000000000000..1b3ff08f36d5e02043445bd8c0f37b73cdd9f59c Binary files /dev/null and b/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav differ diff --git a/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav b/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav new file mode 100644 index 0000000000000000000000000000000000000000..d84f6abdca95d5bfa3f292f45b370c243bf79f86 Binary files /dev/null and b/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav differ diff --git a/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav b/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav new file mode 100644 index 0000000000000000000000000000000000000000..e0d9a7f61f0a8b0137bc8c5ddd4d03c02686b49b Binary files /dev/null and b/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav differ diff --git a/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav b/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav new file mode 100644 index 0000000000000000000000000000000000000000..4f0aadf1e9ac1e100c052fa9df0760651e2b2c4f Binary files /dev/null and b/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav differ diff --git a/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav b/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav new file mode 100644 index 0000000000000000000000000000000000000000..c2858560478a1b51a6085e0f54a34d4bbca30b8e Binary files /dev/null and b/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav differ diff --git a/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav b/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav new file mode 100644 index 0000000000000000000000000000000000000000..e4f53b20b6210ef6bba708ea1bccb9ad787caf22 Binary files /dev/null and b/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav differ diff --git a/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav b/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav new file mode 100644 index 0000000000000000000000000000000000000000..8e18d39cdceaa84abc9dff3f002a0c6502c30b69 Binary files /dev/null and b/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav differ diff --git a/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav b/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav new file mode 100644 index 
0000000000000000000000000000000000000000..6b381a7b04f312f0b317bd3b6a0581155aeaf4c1 Binary files /dev/null and b/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav differ diff --git a/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav b/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav new file mode 100644 index 0000000000000000000000000000000000000000..738c14bf9ff890820659be0ad4d27ec5576ea7c4 Binary files /dev/null and b/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav differ diff --git a/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav b/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav new file mode 100644 index 0000000000000000000000000000000000000000..507bca925cbe5c433d1021c89f8f5c2108fc00d6 Binary files /dev/null and b/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav differ diff --git a/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav b/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav new file mode 100644 index 0000000000000000000000000000000000000000..6709d5a7b3509690d89d222e8a75120b0a9c4d35 Binary files /dev/null and b/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav differ diff --git a/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav b/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav new file mode 100644 index 0000000000000000000000000000000000000000..593e18ad1ff04af7877072ba964c323786ba580e Binary files /dev/null and b/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav differ diff --git a/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav b/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav new file mode 100644 index 0000000000000000000000000000000000000000..cd143063c19ca28fb3820ded2f1caa2cda0a8861 Binary files /dev/null and b/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav differ diff --git a/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav b/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav new file mode 100644 index 0000000000000000000000000000000000000000..af4fe23487085a3047ad1f0f56b824a6a75907f4 Binary files /dev/null and b/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav differ diff --git a/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav b/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav new file mode 100644 index 0000000000000000000000000000000000000000..20a685bb51cd1670280e104e1f06987e471657bb Binary files /dev/null and b/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav differ diff --git a/audio_samples/49_Paralingual_MELD_ER_V2_676.wav b/audio_samples/49_Paralingual_MELD_ER_V2_676.wav new file mode 100644 index 0000000000000000000000000000000000000000..a614033adb66d5d8b5a0054530336876c0d61d86 Binary files /dev/null and b/audio_samples/49_Paralingual_MELD_ER_V2_676.wav differ diff --git a/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav b/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav new file mode 100644 index 0000000000000000000000000000000000000000..48bfb135fc3eb12814801c49abd0b8250178ad86 Binary files /dev/null and b/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav differ diff --git a/audio_samples/50_Paralingual_MELD_ER_V2_692.wav b/audio_samples/50_Paralingual_MELD_ER_V2_692.wav new file mode 100644 index 0000000000000000000000000000000000000000..69f435f7308b5090f2668d22c1f324d30dd8857e Binary files /dev/null and b/audio_samples/50_Paralingual_MELD_ER_V2_692.wav differ diff --git a/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav b/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav new file mode 100644 index 0000000000000000000000000000000000000000..42d4d89846cfcd0c6bb0de173f584ad2b6d6d131 Binary files 
/dev/null and b/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav differ diff --git a/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav b/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav new file mode 100644 index 0000000000000000000000000000000000000000..ce05d92f8004d6054d39fae59f4d3a34c3b80e49 Binary files /dev/null and b/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav differ diff --git a/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav b/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..f8513f46825e7b386b1f00f058d249044dac82d2 Binary files /dev/null and b/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav differ diff --git a/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav b/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav new file mode 100644 index 0000000000000000000000000000000000000000..f95f167ebe177b0db82f346f9dbd2c51eb828ec1 Binary files /dev/null and b/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav differ diff --git a/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav b/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav new file mode 100644 index 0000000000000000000000000000000000000000..c2decc6d21300257c3fc74b6718f1898dedbf4e2 Binary files /dev/null and b/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav differ diff --git a/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav b/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav new file mode 100644 index 0000000000000000000000000000000000000000..55063388c14bd69df6a8023e5a65e4c9c3a01fb5 Binary files /dev/null and b/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav differ diff --git a/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav b/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav new file mode 100644 index 0000000000000000000000000000000000000000..daf99a1877bae21a5ab72147a7a6359c8953e242 Binary files /dev/null and b/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav differ diff --git a/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav b/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav new file mode 100644 index 0000000000000000000000000000000000000000..5e439cf43817a436692e132e194bcf2b43332126 Binary files /dev/null and b/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav differ diff --git a/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav b/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav new file mode 100644 index 0000000000000000000000000000000000000000..e0929f09849acb481f80ca007bf257a9d937c035 Binary files /dev/null and b/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav differ diff --git a/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav b/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav new file mode 100644 index 0000000000000000000000000000000000000000..11e66f37907da37aa2d90a492e407bc3a7a20bb1 Binary files /dev/null and b/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav differ diff --git a/pages/agent.py b/pages/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..36a9a35623385096495fc8d00d77b7302d9f4a5c --- /dev/null +++ b/pages/agent.py @@ -0,0 +1,3 @@ +from src.content.agent import agent_page + +agent_page() \ No newline at end of file diff --git a/pages/playground.py b/pages/playground.py new file mode 100644 index 0000000000000000000000000000000000000000..da5d8c7953bac6ca10d1fc69c8e755b0284f616d --- /dev/null +++ b/pages/playground.py @@ -0,0 +1,4 @@ +from src.content.playground import playground_page + + +playground_page() diff --git 
a/pages/voice_chat.py b/pages/voice_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..c0627ef4f40fc78b1fc85ed7b9db34a0ee603a12 --- /dev/null +++ b/pages/voice_chat.py @@ -0,0 +1,4 @@ +from src.content.voice_chat import voice_chat_page + + +voice_chat_page() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..26e328f99799199c667b3f8c88bb84ebd0867bc4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +librosa==0.10.2.post1 +streamlit==1.40.2 +openai==1.57.1 +streamlit_mic_recorder==0.0.8 +sshtunnel +accelerate==1.3.0 +FlagEmbedding==1.3.3 +sentence-transformers==3.4.0 +sentencepiece==0.1.99 \ No newline at end of file diff --git a/src/content/agent.py b/src/content/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..5ec20840a751bb24574c421fda50af0639957d80 --- /dev/null +++ b/src/content/agent.py @@ -0,0 +1,289 @@ +import copy +import base64 + +import streamlit as st + +from src.generation import MAX_AUDIO_LENGTH +from src.retrieval import STANDARD_QUERIES, retrieve_relevant_docs +from src.utils import bytes_to_array, array_to_bytes +from src.content.common import ( + MODEL_NAMES, + AUDIO_SAMPLES_W_INSTRUCT, + DEFAULT_DIALOGUE_STATES, + init_state_section, + header_section, + sidebar_fragment, + retrive_response_with_ui +) + + +LLM_NO_AUDIO_PROMPT_TEMPLATE = """{user_question}""" + + +LLM_PROMPT_TEMPLATE = """The user asked a question about the audio clip. + +## User Question +{user_question} + +{audio_information_prompt}Please reply to the user's question with a friendly, accurate, and helpful answer.""" + + +AUDIO_INFO_TEMPLATE = """Here is some information about this audio clip. + +## Audio Information +{audio_information} + +However, the audio analysis may or may not contain information relevant to the user question; please only reply to the user with the relevant information.
+ +""" + + +AUDIO_ANALYSIS_STATUS = "MERaLiON-AudioLLM Analysis" + + +def _update_audio(audio_bytes): + origin_audio_array = bytes_to_array(audio_bytes) + truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000] + truncated_audio_bytes = array_to_bytes(truncated_audio_array) + + st.session_state.ag_audio_array = origin_audio_array + st.session_state.ag_audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8') + + +@st.fragment +def successful_example_section(): + audio_sample_names = [name for name in AUDIO_SAMPLES_W_INSTRUCT.keys() if "Paral" in name] + + st.markdown(":fire: **Successful Tasks and Examples**") + + sample_name = st.selectbox( + label="**Select Audio:**", + label_visibility="collapsed", + options=audio_sample_names, + format_func=lambda o: AUDIO_SAMPLES_W_INSTRUCT[o]["apperance"], + index=None, + placeholder="Select an audio sample:", + on_change=lambda: st.session_state.update( + on_select=True, + ag_messages=[], + ag_model_messages=[], + ag_visited_query_indices=[], + disprompt=True + ), + key='select') + + if sample_name and st.session_state.on_select: + audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read() + st.session_state.update( + on_select=False, + new_prompt=AUDIO_SAMPLES_W_INSTRUCT[sample_name]["instructions"][0] + ) + _update_audio(audio_bytes) + st.rerun(scope="app") + + +@st.dialog("Specify Audio") +def audio_attach_dialogue(): + st.markdown("**Upload**") + + uploaded_file = st.file_uploader( + label="**Upload Audio:**", + label_visibility="collapsed", + type=['wav', 'mp3'], + on_change=lambda: st.session_state.update( + on_upload=True, + ag_messages=[], + ag_model_messages=[], + ag_visited_query_indices=[] + ), + key='upload' + ) + + if uploaded_file and st.session_state.on_upload: + audio_bytes = uploaded_file.read() + _update_audio(audio_bytes) + st.session_state.on_upload = False + st.rerun() + + st.markdown("**Record**") + + uploaded_file = st.audio_input( + label="**Record Audio:**", + label_visibility="collapsed", + on_change=lambda: st.session_state.update( + on_record=True, + ag_messages=[], + ag_model_messages=[], + ag_visited_query_indices=[] + ), + key='record' + ) + + if uploaded_file and st.session_state.on_record: + audio_bytes = uploaded_file.read() + _update_audio(audio_bytes) + st.session_state.on_record = False + st.rerun() + + +def bottom_input_section(): + bottom_cols = st.columns([0.03, 0.03, 0.94]) + with bottom_cols[0]: + st.button( + 'Clear', + disabled=st.session_state.disprompt, + on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES)) + ) + + with bottom_cols[1]: + if st.button("\+ Audio", disabled=st.session_state.disprompt): + audio_attach_dialogue() + + with bottom_cols[2]: + if chat_input := st.chat_input( + placeholder="Instruction...", + disabled=st.session_state.disprompt, + on_submit=lambda: st.session_state.update(disprompt=True) + ): + st.session_state.new_prompt = chat_input + + +def _prepare_final_prompt_with_ui(one_time_prompt): + if st.session_state.ag_audio_array.shape[0] == 0: + return LLM_NO_AUDIO_PROMPT_TEMPLATE.format(user_question=one_time_prompt) + + with st.spinner("Searching appropriate querys..."): + relevant_query_indices = retrieve_relevant_docs(one_time_prompt) + if len(st.session_state.ag_messages) <= 2: + relevant_query_indices.append(0) + + relevant_query_indices = list( + set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices) + ) + + st.session_state.ag_visited_query_indices.extend(relevant_query_indices) + + if 
not relevant_query_indices: + return LLM_PROMPT_TEMPLATE.format( + user_question=one_time_prompt, + audio_information_prompt="" + ) + + audio_info = [] + with st.status(AUDIO_ANALYSIS_STATUS, expanded=False) as status: + for i, standard_idx in enumerate(relevant_query_indices): + new_label = ( + f"{AUDIO_ANALYSIS_STATUS}: " + f"{STANDARD_QUERIES[standard_idx]['ui_text']} " + f"({i+1}/{len(relevant_query_indices)})" + ) + + status.update(label=new_label, state="running") + error_msg, warnings, response = retrive_response_with_ui( + model_name=MODEL_NAMES["audiollm"]["vllm_name"], + text_input=STANDARD_QUERIES[standard_idx]["query_text"], + array_audio_input=st.session_state.ag_audio_array, + base64_audio_input=st.session_state.ag_audio_base64, + prefix=f"**{STANDARD_QUERIES[standard_idx]['ui_text']}**: ", + stream=True, + show_warning=i==0 + ) + audio_info.append(STANDARD_QUERIES[standard_idx]["response_prefix_text"] + response) + + st.session_state.ag_messages[-1]["process"].append({ + "error": error_msg, + "warnings": warnings, + "content": response + }) + + status.update(label=AUDIO_ANALYSIS_STATUS, state="complete") + + audio_information_prompt = AUDIO_INFO_TEMPLATE.format( + audio_information="\n".join(audio_info) + ) + + return LLM_PROMPT_TEMPLATE.format( + user_question=one_time_prompt, + audio_information_prompt=audio_information_prompt + ) + + +def conversation_section(): + chat_message_container = st.container(height=480) + if st.session_state.ag_audio_array.size: + with chat_message_container.chat_message("user"): + st.audio(st.session_state.ag_audio_array, format="audio/wav", sample_rate=16000) + + for message in st.session_state.ag_messages: + message_name = "assistant" if "assistant" in message["role"] else message["role"] + + with chat_message_container.chat_message(name=message_name): + if message.get("error"): + st.error(message["error"]) + for warning_msg in message.get("warnings", []): + st.warning(warning_msg) + if process := message.get("process", []): + with st.status(AUDIO_ANALYSIS_STATUS, expanded=False, state="complete"): + for proc in process: + if proc.get("error"): + st.error(proc["error"]) + for proc_warning_msg in proc.get("warnings", []): + st.warning(proc_warning_msg) + if proc.get("content"): + st.write(proc["content"]) + if message.get("content"): + st.write(message["content"]) + + with st._bottom: + bottom_input_section() + + if one_time_prompt := st.session_state.new_prompt: + st.session_state.update(new_prompt="") + + with chat_message_container.chat_message("user"): + st.write(one_time_prompt) + st.session_state.ag_messages.append({"role": "user", "content": one_time_prompt}) + + with chat_message_container.chat_message("assistant"): + assistant_message = {"role": "assistant", "process": []} + st.session_state.ag_messages.append(assistant_message) + + final_prompt = _prepare_final_prompt_with_ui(one_time_prompt) + + error_msg, warnings, response = retrive_response_with_ui( + model_name=MODEL_NAMES["llm"]["vllm_name"], + text_input=final_prompt, + array_audio_input=st.session_state.ag_audio_array, + base64_audio_input="", + prefix=f"**{MODEL_NAMES['llm']['ui_name']}**: ", + stream=True, + history=st.session_state.ag_model_messages, + show_warning=False + ) + + assistant_message.update({"error": error_msg, "warnings": warnings, "content": response}) + st.session_state.ag_model_messages.extend([ + {"role": "user", "content": final_prompt}, + {"role": "assistant", "content": response} + ]) + + st.session_state.disprompt=False + st.rerun(scope="app") + + 
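For readers tracing the control flow above: _prepare_final_prompt_with_ui and conversation_section together implement a retrieve-analyze-compose loop. The sketch below restates that logic in plain Python as a rough guide only; names are reused from this file and from src/retrieval.py, user_question stands in for the one_time_prompt argument, and UI status updates, visited-query de-duplication, and error handling are omitted.

# Sketch only: how the agent page turns a user question into the final text-LLM prompt.
audio_info = []
for idx in retrieve_relevant_docs(user_question):          # indices into STANDARD_QUERIES
    _, _, analysis = retrive_response_with_ui(
        model_name=MODEL_NAMES["audiollm"]["vllm_name"],   # AudioLLM answers one standard query
        text_input=STANDARD_QUERIES[idx]["query_text"],
        array_audio_input=st.session_state.ag_audio_array,
        base64_audio_input=st.session_state.ag_audio_base64,
    )
    audio_info.append(STANDARD_QUERIES[idx]["response_prefix_text"] + analysis)

final_prompt = LLM_PROMPT_TEMPLATE.format(                 # prompt for the text-only LLM
    user_question=user_question,
    audio_information_prompt=AUDIO_INFO_TEMPLATE.format(
        audio_information="\n".join(audio_info)
    ),
)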
+def agent_page(): + init_state_section() + header_section( + component_name="Chatbot", + description=""" It is implemented by connecting multiple AI models, + offers more flexibility, and supports multi-round conversation.""", + concise_description=""" It is implemented by connecting multiple AI models and + support multi-round conversation.""", + icon="👥" + ) + + with st.sidebar: + sidebar_fragment() + + successful_example_section() + conversation_section() \ No newline at end of file diff --git a/src/content/common.py b/src/content/common.py new file mode 100644 index 0000000000000000000000000000000000000000..40dc6e132087aa499c5411037cb4392d56c1c5a1 --- /dev/null +++ b/src/content/common.py @@ -0,0 +1,443 @@ +import os +import copy +import itertools +from collections import OrderedDict +from typing import List, Optional + +import numpy as np +import streamlit as st + +from src.tunnel import start_server +from src.generation import FIXED_GENERATION_CONFIG, load_model, retrive_response +from src.retrieval import load_retriever +from src.logger import load_logger + + +DEFAULT_DIALOGUE_STATES = dict( + pg_audio_base64='', + pg_audio_array=np.array([]), + pg_messages=[], + vc_audio_base64='', + vc_audio_array=np.array([]), + vc_messages=[], + ag_audio_base64='', + ag_audio_array=np.array([]), + ag_visited_query_indices=[], + ag_messages=[], + ag_model_messages=[], + disprompt = False, + new_prompt = "", + on_select=False, + on_upload=False, + on_record=False, + on_select_quick_action=False +) + + +MODEL_NAMES = OrderedDict({}) + + +AUDIO_SAMPLES_W_INSTRUCT = { + "7_ASR_IMDA_PART3_30_ASR_v2_2269": { + "apperance": "7. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Need this talk written down, please." + ] + }, + "11_ASR_IMDA_PART4_30_ASR_v2_3771": { + "apperance": "11. Automatic Speech Recognition task: conversation with Singlish code-switch", + "instructions": [ + "Write out the dialogue as text." + ] + }, + "12_ASR_IMDA_PART4_30_ASR_v2_103": { + "apperance": "12. Automatic Speech Recognition task: conversation with Singlish code-switch", + "instructions": [ + "Write out the dialogue as text." + ] + }, + "17_ASR_IMDA_PART6_30_ASR_v2_1413": { + "apperance": "17. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Record the spoken word in text form." + ] + }, + "32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572": { + "apperance": "32. Spoken Question Answering task: general speech", + "instructions": [ + "What does the man think the woman should do at 4:00." + ] + }, + "33_SQA_IMDA_PART3_30_SQA_V2_2310": { + "apperance": "33. Spoken Question Answering task: conversation in Singapore accent", + "instructions": [ + "Does Speaker2's wife cook for Speaker2 when they are at home." + ] + }, + "34_SQA_IMDA_PART3_30_SQA_V2_3621": { + "apperance": "34. Spoken Question Answering task: conversation in Singapore accent", + "instructions": [ + "Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language." + ] + }, + "35_SQA_IMDA_PART3_30_SQA_V2_4062": { + "apperance": "35. Spoken Question Answering task: conversation in Singapore accent", + "instructions": [ + "What is the color of the vase mentioned in the dialogue." + ] + }, + "36_DS_IMDA_PART4_30_DS_V2_849": { + "apperance": "36. Spoken Dialogue Summarization task: conversation with Singlish code-switch", + "instructions": [ + "Condense the dialogue into a concise summary highlighting major topics and conclusions." 
+ ] + }, + "39_Paralingual_IEMOCAP_ER_V2_91": { + "apperance": "39. Paralinguistics task: general speech", + "instructions": [ + "Based on the speaker's speech patterns, what do you think they are feeling." + ] + }, + "40_Paralingual_IEMOCAP_ER_V2_567": { + "apperance": "40. Paralinguistics task: general speech", + "instructions": [ + "Based on the speaker's speech patterns, what do you think they are feeling." + ] + }, + "42_Paralingual_IEMOCAP_GR_V2_320": { + "apperance": "42. Paralinguistics task: general speech", + "instructions": [ + "Is it possible for you to identify whether the speaker in this recording is male or female." + ] + }, + "47_Paralingual_IMDA_PART3_30_NR_V2_10479": { + "apperance": "47. Paralinguistics task: conversation in Singapore accent", + "instructions": [ + "Can you guess which ethnic group this person is from based on their accent." + ] + }, + "49_Paralingual_MELD_ER_V2_676": { + "apperance": "49. Paralinguistics task: general speech", + "instructions": [ + "What emotions do you think the speaker is expressing." + ] + }, + "50_Paralingual_MELD_ER_V2_692": { + "apperance": "50. Paralinguistics task: general speech", + "instructions": [ + "Based on the speaker's speech patterns, what do you think they are feeling." + ] + }, + "51_Paralingual_VOXCELEB1_GR_V2_2148": { + "apperance": "51. Paralinguistics task: general speech", + "instructions": [ + "May I know the gender of the speaker." + ] + }, + "53_Paralingual_VOXCELEB1_NR_V2_2286": { + "apperance": "53. Paralinguistics task: general speech", + "instructions": [ + "What's the nationality identity of the speaker." + ] + }, + "55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2": { + "apperance": "55. Spoken Question Answering task: general speech", + "instructions": [ + "What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth." + ] + }, + "56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415": { + "apperance": "56. Spoken Question Answering task: general speech", + "instructions": [ + "Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore." + ] + }, + "57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460": { + "apperance": "57. Spoken Question Answering task: general speech", + "instructions": [ + "How does the author respond to parents' worries about masks in schools." + ] + }, + "1_ASR_IMDA_PART1_ASR_v2_141": { + "apperance": "1. Automatic Speech Recognition task: phonetically balanced reading", + "instructions": [ + "Turn the spoken language into a text format.", + "Please translate the content into Chinese." + ] + }, + "2_ASR_IMDA_PART1_ASR_v2_2258": { + "apperance": "2. Automatic Speech Recognition task: phonetically balanced reading", + "instructions": [ + "Turn the spoken language into a text format.", + "Please translate the content into Chinese." + ] + }, + "3_ASR_IMDA_PART1_ASR_v2_2265": { + "apperance": "3. Automatic Speech Recognition task: phonetically balanced reading", + "instructions": [ + "Turn the spoken language into a text format." + ] + }, + "4_ASR_IMDA_PART2_ASR_v2_999": { + "apperance": "4. Automatic Speech Recognition task: reading in Singapore context", + "instructions": [ + "Translate the spoken words into text format." + ] + }, + "5_ASR_IMDA_PART2_ASR_v2_2241": { + "apperance": "5. Automatic Speech Recognition task: reading in Singapore context", + "instructions": [ + "Translate the spoken words into text format." + ] + }, + "6_ASR_IMDA_PART2_ASR_v2_3409": { + "apperance": "6. 
Automatic Speech Recognition task: reading in Singapore context", + "instructions": [ + "Translate the spoken words into text format." + ] + }, + "8_ASR_IMDA_PART3_30_ASR_v2_1698": { + "apperance": "8. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Need this talk written down, please." + ] + }, + "9_ASR_IMDA_PART3_30_ASR_v2_2474": { + "apperance": "9. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Need this talk written down, please." + ] + }, + "10_ASR_IMDA_PART4_30_ASR_v2_1527": { + "apperance": "10. Automatic Speech Recognition task: conversation with Singlish code-switch", + "instructions": [ + "Write out the dialogue as text." + ] + }, + "13_ASR_IMDA_PART5_30_ASR_v2_1446": { + "apperance": "13. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Translate this vocal recording into a textual format." + ] + }, + "14_ASR_IMDA_PART5_30_ASR_v2_2281": { + "apperance": "14. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Translate this vocal recording into a textual format." + ] + }, + "15_ASR_IMDA_PART5_30_ASR_v2_4388": { + "apperance": "15. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Translate this vocal recording into a textual format." + ] + }, + "16_ASR_IMDA_PART6_30_ASR_v2_576": { + "apperance": "16. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Record the spoken word in text form." + ] + }, + "18_ASR_IMDA_PART6_30_ASR_v2_2834": { + "apperance": "18. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Record the spoken word in text form." + ] + }, + "19_ASR_AIShell_zh_ASR_v2_5044": { + "apperance": "19. Automatic Speech Recognition task: speech in Chinese ", + "instructions": [ + "Transform the oral presentation into a text document." + ] + }, + "20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833": { + "apperance": "20. Automatic Speech Recognition task: general speech", + "instructions": [ + "Please provide a written transcription of the speech." + ] + }, + "25_ST_COVOST2_ZH-CN_EN_ST_V2_4567": { + "apperance": "25. Speech Translation task: Chinese to English", + "instructions": [ + "Please translate the given speech to English." + ] + }, + "26_ST_COVOST2_EN_ZH-CN_ST_V2_5422": { + "apperance": "26. Speech Translation task: English to Chinese", + "instructions": [ + "Please translate the given speech to Chinese." + ] + }, + "27_ST_COVOST2_EN_ZH-CN_ST_V2_6697": { + "apperance": "27. Speech Translation task: English to Chinese", + "instructions": [ + "Please translate the given speech to Chinese." + ] + }, + "28_SI_ALPACA-GPT4-AUDIO_SI_V2_299": { + "apperance": "28. Speech Instruction task: general speech", + "instructions": [ + "Please follow the instruction in the speech." + ] + }, + "29_SI_ALPACA-GPT4-AUDIO_SI_V2_750": { + "apperance": "29. Speech Instruction task: general speech", + "instructions": [ + "Please follow the instruction in the speech." + ] + }, + "30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454": { + "apperance": "30. Speech Instruction task: general speech", + "instructions": [ + "Please follow the instruction in the speech." 
+ ] + } +} + + +exec(os.getenv('APP_CONFIGS')) + + +def init_state_section(): + st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide') + + st.markdown( + ( + '' + ), + unsafe_allow_html=True + ) + + if "logger" not in st.session_state: + st.session_state.logger = load_logger() + st.session_state.session_id = st.session_state.logger.register_session() + + if "server" not in st.session_state: + st.session_state.server = start_server() + + if "client_mapper" not in st.session_state: + st.session_state.client_mapper = load_model() + + if "retriever" not in st.session_state: + st.session_state.retriever = load_retriever() + + for key, value in FIXED_GENERATION_CONFIG.items(): + if key not in st.session_state: + st.session_state[key]=copy.deepcopy(value) + + for key, value in DEFAULT_DIALOGUE_STATES.items(): + if key not in st.session_state: + st.session_state[key]=copy.deepcopy(value) + + +def header_section(component_name, description="", concise_description="", icon="🤖"): + st.markdown( + f"

MERaLiON-AudioLLM {component_name} {icon} ", + unsafe_allow_html=True + ) + + st.markdown( + f"""
This {component_name.lower()} is based on + MERaLiON-AudioLLM, + developed by I2R, A*STAR, in collaboration with AISG, Singapore. + {description} """, + unsafe_allow_html=True + ) + + st.markdown( + f"""
This {component_name.lower()} is based on + MERaLiON-AudioLLM.{concise_description}
""", + unsafe_allow_html=True + ) + + +@st.fragment +def sidebar_fragment(): + with st.container(height=256, border=False): + st.page_link("pages/playground.py", disabled=st.session_state.disprompt, label="🚀 Playground") + st.page_link("pages/agent.py", disabled=st.session_state.disprompt, label="👥 Chatbot") + st.page_link("pages/voice_chat.py", disabled=st.session_state.disprompt, label="🗣️ Voice Chat") + + st.divider() + + st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.1, key='temperature') + + st.slider(label='Top P', min_value=0.0, max_value=1.0, value=0.9, key='top_p') + + st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty") + + +def retrive_response_with_ui( + model_name: str, + text_input: str, + array_audio_input: np.ndarray, + base64_audio_input: str, + prefix: str = "", + stream: bool = True, + history: Optional[List] = None, + show_warning: bool = True, + **kwargs + ): + + if history is None: + history = [] + + generation_params = dict( + model=model_name, + max_completion_tokens=st.session_state.max_completion_tokens, + temperature=st.session_state.temperature, + top_p=st.session_state.top_p, + extra_body={ + "repetition_penalty": st.session_state.repetition_penalty, + "top_k": st.session_state.top_k, + "length_penalty": st.session_state.length_penalty + }, + stream=stream, + seed=st.session_state.seed + ) + + error_msg, warnings, response_obj = retrive_response( + text_input, + array_audio_input, + base64_audio_input=base64_audio_input, + history=history, + **generation_params, + **kwargs + ) + + if error_msg: + st.error(error_msg) + + if show_warning: + for warning_msg in warnings: + st.warning(warning_msg) + + response = "" + if response_obj is not None: + if stream: + response_obj = itertools.chain([prefix], response_obj) + response = st.write_stream(response_obj) + else: + response = response_obj.choices[0].message.content + st.write(prefix+response) + + st.session_state.logger.register_query( + session_id=st.session_state.session_id, + base64_audio=base64_audio_input, + text_input=text_input, + history=history, + params=generation_params, + response=response, + warnings=warnings, + error_msg=error_msg + ) + + return error_msg, warnings, response \ No newline at end of file diff --git a/src/content/playground.py b/src/content/playground.py new file mode 100644 index 0000000000000000000000000000000000000000..bd97920d763032b3a8ead6477351549e96df050b --- /dev/null +++ b/src/content/playground.py @@ -0,0 +1,235 @@ +import copy +import base64 + +import streamlit as st + +from src.generation import MAX_AUDIO_LENGTH +from src.utils import bytes_to_array, array_to_bytes +from src.content.common import ( + MODEL_NAMES, + AUDIO_SAMPLES_W_INSTRUCT, + DEFAULT_DIALOGUE_STATES, + init_state_section, + header_section, + sidebar_fragment, + retrive_response_with_ui +) + + +QUICK_ACTIONS = [ + { + "name": "**Summary**", + "instruction": "Please summarise this speech.", + "width": 10, + }, + { + "name": "**Transcript**", + "instruction": "Please transcribe this speech.", + "width": 9.5, + } +] + + +def _update_audio(audio_bytes): + origin_audio_array = bytes_to_array(audio_bytes) + truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000] + truncated_audio_bytes = array_to_bytes(truncated_audio_array) + + st.session_state.pg_audio_array = origin_audio_array + st.session_state.pg_audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8') + + +@st.fragment +def successful_example_section(): + 
audio_sample_names = [audio_sample_name for audio_sample_name in AUDIO_SAMPLES_W_INSTRUCT.keys()] + + st.markdown(":fire: **Successful Tasks and Examples**") + + sample_name = st.selectbox( + label="**Select Audio:**", + label_visibility="collapsed", + options=audio_sample_names, + format_func=lambda o: AUDIO_SAMPLES_W_INSTRUCT[o]["apperance"], + index=None, + placeholder="Select an audio sample:", + on_change=lambda: st.session_state.update( + on_select=True, + pg_messages=[], + disprompt=True + ), + key='select') + + if sample_name and st.session_state.on_select: + audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read() + st.session_state.update( + on_select=False, + new_prompt=AUDIO_SAMPLES_W_INSTRUCT[sample_name]["instructions"][0] + ) + _update_audio(audio_bytes) + st.rerun(scope="app") + + +@st.dialog("Specify Audio") +def audio_attach_dialogue(): + st.markdown("**Upload**") + + uploaded_file = st.file_uploader( + label="**Upload Audio:**", + label_visibility="collapsed", + type=['wav', 'mp3'], + on_change=lambda: st.session_state.update(on_upload=True, pg_messages=[]), + key='upload' + ) + + if uploaded_file and st.session_state.on_upload: + audio_bytes = uploaded_file.read() + _update_audio(audio_bytes) + st.session_state.on_upload = False + st.rerun() + + st.markdown("**Record**") + + uploaded_file = st.audio_input( + label="**Record Audio:**", + label_visibility="collapsed", + on_change=lambda: st.session_state.update(on_record=True, pg_messages=[]), + key='record' + ) + + if uploaded_file and st.session_state.on_record: + audio_bytes = uploaded_file.read() + _update_audio(audio_bytes) + st.session_state.on_record = False + st.rerun() + + +@st.fragment +def select_model_variants_fradment(): + display_mapper = {value["vllm_name"]: value["ui_name"] for value in MODEL_NAMES.values()} + + st.selectbox( + label=":fire: Explore more MERaLiON-AudioLLM variants!", + options=[value["vllm_name"] for value in MODEL_NAMES.values()], + index=0, + format_func=lambda o: display_mapper[o], + key="pg_model_name", + placeholder=":fire: Explore more MERaLiON-AudioLLM variants!", + disabled=st.session_state.disprompt, + ) + + +def bottom_input_section(): + select_model_variants_fradment() + + bottom_cols = st.columns([0.03, 0.03, 0.94]) + with bottom_cols[0]: + st.button( + 'Clear', + disabled=st.session_state.disprompt, + on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES)) + ) + + with bottom_cols[1]: + if st.button("\+ Audio", disabled=st.session_state.disprompt): + audio_attach_dialogue() + + with bottom_cols[2]: + if chat_input := st.chat_input( + placeholder="Instruction...", + disabled=st.session_state.disprompt, + on_submit=lambda: st.session_state.update(disprompt=True, pg_messages=[]) + ): + st.session_state.new_prompt = chat_input + + +@st.fragment +def quick_actions_fragment(): + action_cols_spec = [_["width"] for _ in QUICK_ACTIONS] + action_cols = st.columns(action_cols_spec) + + for idx, action in enumerate(QUICK_ACTIONS): + action_cols[idx].button( + action["name"], + args=(action["instruction"],), + disabled=st.session_state.disprompt, + on_click=lambda p: st.session_state.update( + disprompt=True, + pg_messages=[], + new_prompt=p, + on_select_quick_action=True + ) + ) + + if st.session_state.on_select_quick_action: + st.session_state.on_select_quick_action = False + st.rerun(scope="app") + + +def conversation_section(): + if st.session_state.pg_audio_array.size: + with st.chat_message("user"): + st.audio(st.session_state.pg_audio_array, 
format="audio/wav", sample_rate=16000) + quick_actions_fragment() + + for message in st.session_state.pg_messages: + with st.chat_message(message["role"]): + if message.get("error"): + st.error(message["error"]) + for warning_msg in message.get("warnings", []): + st.warning(warning_msg) + if message.get("content"): + st.write(message["content"]) + + with st._bottom: + bottom_input_section() + + if one_time_prompt := st.session_state.new_prompt: + st.session_state.update(new_prompt="", pg_messages=[]) + + with st.chat_message("user"): + st.write(one_time_prompt) + st.session_state.pg_messages.append({"role": "user", "content": one_time_prompt}) + + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + error_msg, warnings, response = retrive_response_with_ui( + model_name=st.session_state.pg_model_name, + text_input=one_time_prompt, + array_audio_input=st.session_state.pg_audio_array, + base64_audio_input=st.session_state.pg_audio_base64, + stream=True + ) + + st.session_state.pg_messages.append({ + "role": "assistant", + "error": error_msg, + "warnings": warnings, + "content": response + }) + + st.session_state.disprompt=False + st.rerun(scope="app") + + +def playground_page(): + init_state_section() + header_section( + component_name="Playground", + description=""" It is tailored for Singapore’s multilingual and multicultural landscape. + MERaLiON-AudioLLM supports + Automatic Speech Recognition, + Speech Translation, + Spoken Question Answering, + Spoken Dialogue Summarization, + Speech Instruction, and + Paralinguistics tasks. + This playground currently only support single-round conversation. + """, + concise_description=" It currently only support single-round conversation." + ) + + with st.sidebar: + sidebar_fragment() + + successful_example_section() + conversation_section() \ No newline at end of file diff --git a/src/content/voice_chat.py b/src/content/voice_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..0758f89710c30166090482d592cf70f69f2690ac --- /dev/null +++ b/src/content/voice_chat.py @@ -0,0 +1,127 @@ +import copy +import base64 + +import numpy as np +import streamlit as st + +from src.generation import MAX_AUDIO_LENGTH +from src.utils import bytes_to_array, array_to_bytes +from src.content.common import ( + MODEL_NAMES, + DEFAULT_DIALOGUE_STATES, + init_state_section, + header_section, + sidebar_fragment, + retrive_response_with_ui +) + + +# TODO: change this. +DEFAULT_PROMPT = "Based on the information in this user’s voice, please reply the user in a friendly and helpful way." 
+ + +def _update_audio(audio_bytes): + origin_audio_array = bytes_to_array(audio_bytes) + truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000] + truncated_audio_bytes = array_to_bytes(truncated_audio_array) + + st.session_state.vc_audio_array = origin_audio_array + st.session_state.vc_audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8') + + +def bottom_input_section(): + st.info(":bulb: Ask something with clear intention.") + bottom_cols = st.columns([0.03, 0.97]) + with bottom_cols[0]: + st.button( + 'Clear', + disabled=st.session_state.disprompt, + on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES)) + ) + + with bottom_cols[1]: + uploaded_file = st.audio_input( + label="record audio", + label_visibility="collapsed", + on_change=lambda: st.session_state.update( + on_record=True, + vc_messages=[], + disprompt=True + ), + key='record' + ) + + if uploaded_file and st.session_state.on_record: + audio_bytes = uploaded_file.read() + _update_audio(audio_bytes) + st.session_state.update( + on_record=False, + new_prompt=DEFAULT_PROMPT + ) + + +def conversation_section(): + for message in st.session_state.vc_messages: + with st.chat_message(message["role"]): + if message.get("error"): + st.error(message["error"]) + for warning_msg in message.get("warnings", []): + st.warning(warning_msg) + if message.get("audio", np.array([])).shape[0]: + st.audio(message["audio"], format="audio/wav", sample_rate=16000) + if message.get("content"): + st.write(message["content"]) + + with st._bottom: + bottom_input_section() + + if one_time_prompt := st.session_state.new_prompt: + one_time_array = st.session_state.vc_audio_array + one_time_base64 = st.session_state.vc_audio_base64 + st.session_state.update( + new_prompt="", + one_time_array=np.array([]), + one_time_base64="", + vc_messages=[] + ) + + with st.chat_message("user"): + st.audio(one_time_array, format="audio/wav", sample_rate=16000) + + st.session_state.vc_messages.append({"role": "user", "audio": one_time_array}) + + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + error_msg, warnings, response = retrive_response_with_ui( + model_name=MODEL_NAMES["audiollm-it"]["vllm_name"], + text_input=one_time_prompt, + array_audio_input=one_time_array, + base64_audio_input=one_time_base64, + stream=True + ) + + st.session_state.vc_messages.append({ + "role": "assistant", + "error": error_msg, + "warnings": warnings, + "content": response + }) + + st.session_state.disprompt=False + st.rerun(scope="app") + + +def voice_chat_page(): + init_state_section() + header_section( + component_name="Voice Chat", + description=""" It currently only support single-round conversation. 
+ Feel free to talk about anything.""", + concise_description=" It currently only support single-round conversation.", + icon="🗣️" + ) + + with st.sidebar: + sidebar_fragment() + + conversation_section() \ No newline at end of file diff --git a/src/exceptions.py b/src/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..0f05852f05823ca810c356e1d124a2fdbfeae658 --- /dev/null +++ b/src/exceptions.py @@ -0,0 +1,6 @@ +class NoAudioException(Exception): + pass + + +class TunnelNotRunningException(Exception): + pass \ No newline at end of file diff --git a/src/generation.py b/src/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..304bc11e692de887a8764093970f458a2e336825 --- /dev/null +++ b/src/generation.py @@ -0,0 +1,148 @@ +import os +import re +import time +from typing import List, Dict, Optional + +import numpy as np +import streamlit as st +from openai import OpenAI, APIConnectionError + +from src.exceptions import TunnelNotRunningException + + +FIXED_GENERATION_CONFIG = dict( + max_completion_tokens=1024, + top_k=50, + length_penalty=1.0, + seed=42 +) + +MAX_AUDIO_LENGTH = 120 + + +def load_model() -> Dict: + """ + Create an OpenAI client with connection to vllm server. + """ + openai_api_key = os.getenv('API_KEY') + local_ports = os.getenv('LOCAL_PORTS').split(" ") + + name_to_client_mapper = {} + for port in local_ports: + client = OpenAI( + api_key=openai_api_key, + base_url=f"http://localhost:{port}/v1", + ) + + for model in client.models.list().data: + name_to_client_mapper[model.id] = client + + return name_to_client_mapper + + +def _retrive_response( + model: str, + text_input: str, + base64_audio_input: str, + history: Optional[List] = None, + **kwargs): + """ + Send request through OpenAI client. + """ + if history is None: + history = [] + + if base64_audio_input: + content = [ + { + "type": "text", + "text": f"Text instruction: {text_input}" + }, + { + "type": "audio_url", + "audio_url": { + "url": f"data:audio/ogg;base64,{base64_audio_input}" + }, + }, + ] + else: + content = text_input + + current_client = st.session_state.client_mapper[model] + + return current_client.chat.completions.create( + messages=history + [{"role": "user", "content": content}], + model=model, + **kwargs + ) + + +def _retry_retrive_response_throws_exception(retry=3, **kwargs): + try: + response_object = _retrive_response(**kwargs) + except APIConnectionError as e: + if not st.session_state.server.is_running(): + if retry == 0: + raise TunnelNotRunningException() + + st.toast(f":warning: Internet connection is down. Trying to re-establish connection ({retry}).") + + if st.session_state.server.is_down(): + st.session_state.server.restart() + elif st.session_state.server.is_starting(): + time.sleep(2) + + return _retry_retrive_response_throws_exception(retry-1, **kwargs) + raise e + + return response_object + + +def _validate_input(text_input, array_audio_input) -> List[str]: + """ + TODO: improve the input validation regex. 
+ """ + warnings = [] + if re.search("tool|code|python|java|math|calculate", text_input): + warnings.append("WARNING: MERaLiON-AudioLLM is not intended for use in tool calling, math, and coding tasks.") + + if re.search(r'[\u4e00-\u9fff]+', text_input): + warnings.append("NOTE: Please try to prompt in English for the best performance.") + + if array_audio_input.shape[0] == 0: + warnings.append("NOTE: Please specify audio from examples or local files.") + + if array_audio_input.shape[0] / 16000 > 30.0: + warnings.append(( + "WARNING: MERaLiON-AudioLLM is trained to process audio up to **30 seconds**." + f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated." + )) + + return warnings + + +def retrive_response( + text_input: str, + array_audio_input: np.ndarray, + **kwargs + ): + warnings = _validate_input(text_input, array_audio_input) + + response_object, error_msg = None, "" + try: + response_object = _retry_retrive_response_throws_exception( + text_input=text_input, + **kwargs + ) + except TunnelNotRunningException: + error_msg = "Internet connection cannot be established. Please contact the administrator." + except Exception as e: + error_msg = f"Caught Exception: {repr(e)}. Please contact the administrator." + + return error_msg, warnings, response_object + + +def postprocess_voice_transcription(text): + text = re.sub("<.*>:?|\(.*\)|\[.*\]", "", text) + text = re.sub("\s+", " ", text).strip() + return text \ No newline at end of file diff --git a/src/logger.py b/src/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..426bee2e36d246e79fd1e990c9181893987b9ef7 --- /dev/null +++ b/src/logger.py @@ -0,0 +1,111 @@ +import io +import os +import time +import json +from threading import Thread, Lock + +import streamlit as st +from huggingface_hub import HfApi + +from src.utils import get_current_strftime + + +logger_lock = Lock() + + +def threaded(fn): + def wrapper(*args, **kwargs): + thread = Thread(target=fn, args=args, kwargs=kwargs) + thread.start() + return thread + return wrapper + + +class Logger: + def __init__(self): + self.app_id = get_current_strftime() + self.session_increment = 0 + self.query_increment = 0 + self.sync_interval = 180 + + self.session_data = [] + self.query_data = [] + self.audio_data = [] + + self.sync_data() + + def register_session(self) -> str: + new_session_id = f"{self.app_id}+{self.session_increment}" + with logger_lock: + self.session_data.append({ + "session_id": new_session_id, + "creation_time": get_current_strftime() + }) + + self.session_increment += 1 + return new_session_id + + def register_query(self, + session_id, + base64_audio, + text_input, + response, + **kwargs + ): + new_query_id = self.query_increment + current_time = get_current_strftime() + + with logger_lock: + current_query_data = { + "session_id": session_id, + "query_id": new_query_id, + "creation_time": current_time, + "text": text_input, + "response": response, + } + current_query_data.update(kwargs) + self.query_data.append(current_query_data) + + self.audio_data.append({ + "session_id": session_id, + "query_id": new_query_id, + "creation_time": current_time, + "audio": base64_audio, + }) + self.query_increment += 1 + + + @threaded + def sync_data(self): + api = HfApi() + + while True: + time.sleep(self.sync_interval) + + for data_name in ["session_data", "query_data", "audio_data"]: + with logger_lock: + last_data = getattr(self, data_name, []) + setattr(self, data_name, []) + + if not last_data: + continue + + buffer = io.BytesIO() + for 
diff --git a/src/logger.py b/src/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..426bee2e36d246e79fd1e990c9181893987b9ef7 --- /dev/null +++ b/src/logger.py @@ -0,0 +1,111 @@ +import io +import os +import time +import json +from threading import Thread, Lock + +import streamlit as st +from huggingface_hub import HfApi + +from src.utils import get_current_strftime + + +logger_lock = Lock() + + +def threaded(fn): + def wrapper(*args, **kwargs): + thread = Thread(target=fn, args=args, kwargs=kwargs) + thread.start() + return thread + return wrapper + + +class Logger: + def __init__(self): + self.app_id = get_current_strftime() + self.session_increment = 0 + self.query_increment = 0 + self.sync_interval = 180 + + self.session_data = [] + self.query_data = [] + self.audio_data = [] + + self.sync_data() + + def register_session(self) -> str: + new_session_id = f"{self.app_id}+{self.session_increment}" + with logger_lock: + self.session_data.append({ + "session_id": new_session_id, + "creation_time": get_current_strftime() + }) + + self.session_increment += 1 + return new_session_id + + def register_query(self, + session_id, + base64_audio, + text_input, + response, + **kwargs + ): + new_query_id = self.query_increment + current_time = get_current_strftime() + + with logger_lock: + current_query_data = { + "session_id": session_id, + "query_id": new_query_id, + "creation_time": current_time, + "text": text_input, + "response": response, + } + current_query_data.update(kwargs) + self.query_data.append(current_query_data) + + self.audio_data.append({ + "session_id": session_id, + "query_id": new_query_id, + "creation_time": current_time, + "audio": base64_audio, + }) + self.query_increment += 1 + + + @threaded + def sync_data(self): + api = HfApi() + + while True: + time.sleep(self.sync_interval) + + for data_name in ["session_data", "query_data", "audio_data"]: + with logger_lock: + last_data = getattr(self, data_name, []) + setattr(self, data_name, []) + + if not last_data: + continue + + buffer = io.BytesIO() + for row in last_data: + row_str = json.dumps(row, ensure_ascii=False)+"\n" + buffer.write(row_str.encode("utf-8")) + + buffer.seek(0)  # rewind so the full buffer is uploaded rather than an empty file + + api.upload_file( + path_or_fileobj=buffer, + path_in_repo=f"{data_name}/{get_current_strftime()}.json", + repo_id=os.getenv("LOGGING_REPO_NAME"), + repo_type="dataset", + token=os.getenv('HF_TOKEN') + ) + + buffer.close() + + +@st.cache_resource() +def load_logger(): + return Logger() \ No newline at end of file
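A quick sketch, not part of the diff, of how the logger above is likely used per session and per query. The placeholder strings and the extra `model` field are illustrative, and `HF_TOKEN` / `LOGGING_REPO_NAME` must be set for the background sync to succeed:

```python
from src.logger import load_logger

logger = load_logger()                    # cached across reruns by st.cache_resource
session_id = logger.register_session()    # e.g. "17-01-25-10-30-00+0"

logger.register_query(
    session_id=session_id,
    base64_audio="<base64-encoded wav>",  # placeholder
    text_input="Please transcribe this speech.",
    response="The transcription of the speech is: ...",
    model="illustrative-model-name",      # any extra kwargs land in the query record
)
# The @threaded sync_data loop started in Logger.__init__ uploads buffered records
# to the dataset repo every sync_interval (180 s), then clears the in-memory lists.
```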
diff --git a/src/retrieval.py b/src/retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..ae4c6da2b25aa8f6b86f9b62c366d6a7fd10b653 --- /dev/null +++ b/src/retrieval.py @@ -0,0 +1,113 @@ +from typing import List + +import numpy as np +import streamlit as st +from FlagEmbedding import BGEM3FlagModel + + +STANDARD_QUERIES = [ + { + "query_text": "Please transcribe this speech.", + "doc_text": "Listen to a speech and write down exactly what is being said in text form. It's essentially converting spoken words into written words. Provide the exact transcription of the given audio. Record whatever the speaker has said into written text.", + "response_prefix_text": "The transcription of the speech is: ", + "ui_text": "speech transcription" + }, + { + "query_text": "Please describe what happened in this audio.", + "doc_text": "Text captions describing the sound events and environments in the audio clips, describing the events and actions that happened in the audio.", + "response_prefix_text": "Events in this audio clip: ", + "ui_text": "audio caption" + }, + { + "query_text": "May I know the gender of the speakers?", + "doc_text": "Identify the gender, male or female, based on pitch, formants, harmonics, and prosody features, and other speech pattern differences between genders.", + "response_prefix_text": "By analyzing pitch, formants, harmonics, and prosody features, which reflect physiological and speech pattern differences between genders: ", + "ui_text": "gender recognition" + }, + { + "query_text": "May I know the nationality of the speakers?", + "doc_text": "Discover speakers' nationality, country, or place of origin from their accent, pronunciation patterns, and other language-specific speech features influenced by cultural and linguistic backgrounds.", + "response_prefix_text": "By analyzing accent, pronunciation patterns, intonation, rhythm, phoneme usage, and language-specific speech features influenced by cultural and linguistic backgrounds: ", + "ui_text": "nationality recognition" + }, + { + "query_text": "Can you guess which ethnic group this person is from based on their accent?", + "doc_text": "Discover speakers' ethnic group, home country, or place of origin from their accent, tone, and other vocal characteristics influenced by cultural, regional, and linguistic factors.", + "response_prefix_text": "By analyzing speech features like accent, tone, intonation, phoneme variations, and vocal characteristics influenced by cultural, regional, and linguistic factors: ", + "ui_text": "ethnic group recognition" + }, + { + "query_text": "What do you think the speakers are feeling?", + "doc_text": "What do you think the speakers are feeling? Please identify speakers' emotions by analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy, which reflect emotional states such as happiness, anger, sadness, or fear.", + "response_prefix_text": "By analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy: ", + "ui_text": "emotion recognition" + }, +] + + +def _colbert_score(q_reps, p_reps): + """Compute the ColBERT score between input queries and passages. + + Args: + q_reps (np.ndarray): Multi-vector embeddings for queries. + p_reps (np.ndarray): Multi-vector embeddings for passages/corpus. + + Returns: + float: Computed ColBERT score. + """ + # q_reps, p_reps = torch.from_numpy(q_reps), torch.from_numpy(p_reps) + token_scores = np.einsum('in,jn->ij', q_reps, p_reps) + scores = token_scores.max(-1) + scores = np.sum(scores) / q_reps.shape[0] + return scores + +class QueryRetriever: + def __init__(self, docs): + self.model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) + self.docs = docs + self.doc_vectors = self.model.encode( + [d["doc_text"] for d in self.docs], + return_sparse=True, + return_colbert_vecs=True + ) + self.scorer_attrs = { + "lexical_weights": { + "method": self.model.compute_lexical_matching_score, + "weight": 0.2 + }, + "colbert_vecs": { + "method": _colbert_score, + "weight": 0.8 + }, + } + + def get_relevant_doc_indices(self, prompt, normalize=False) -> np.ndarray: + scores = np.zeros(len(self.docs)) + + if not prompt: + return scores + + prompt_vector = self.model.encode( + prompt, + return_sparse=True, + return_colbert_vecs=True + ) + + for scorer_name, scorer_attrs in self.scorer_attrs.items(): + for i, doc_vec in enumerate(self.doc_vectors[scorer_name]): + scores[i] += scorer_attrs["weight"] * scorer_attrs["method"](prompt_vector[scorer_name], doc_vec) + + if normalize: + scores = scores / np.sum(scores) + return scores + + +@st.cache_resource() +def load_retriever(): + return QueryRetriever(docs=STANDARD_QUERIES) + + +def retrieve_relevant_docs(user_question: str) -> List[int]: + scores = st.session_state.retriever.get_relevant_doc_indices(user_question, normalize=True) + selected_indices = np.where(scores > 0.2)[0] + return selected_indices.tolist() diff --git a/src/tunnel.py b/src/tunnel.py new file mode 100644 index 0000000000000000000000000000000000000000..7fc714725afb869e25deda37df16964e25afd07e --- /dev/null +++ b/src/tunnel.py @@ -0,0 +1,68 @@ +import io +import os + +import paramiko +import streamlit as st +from sshtunnel import SSHTunnelForwarder + + +@st.cache_resource() +def start_server(): + server = SSHTunnelManager() + server.start() + return server + + +class SSHTunnelManager: + def __init__(self): + pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY'))) + + self.server = SSHTunnelForwarder( + ssh_address_or_host=os.getenv('SERVER_DNS_NAME'), + ssh_username="ec2-user", + ssh_pkey=pkey, + local_bind_addresses=[ + ("127.0.0.1", int(port)) + for port in os.getenv('LOCAL_PORTS').split(" ") + ], + remote_bind_addresses=[ + ("127.0.0.1", 8000), + ("127.0.0.1", 8001), + ] + ) + + self._is_starting = False + self._is_running = False + + def update_status(self): + if not self._is_starting: + self.server.check_tunnels() + self._is_running = all( + list(self.server.tunnel_is_up.values()) + ) + else: + self._is_running = False + + def is_starting(self): + self.update_status() + return self._is_starting + + def is_running(self): + self.update_status() + return self._is_running + + def is_down(self): + self.update_status() + return (not 
self._is_running) and (not self._is_starting) + + def start(self, *args, **kwargs): + if not self._is_starting: + self._is_starting = True + self.server.start(*args, **kwargs) + self._is_starting = False + + def restart(self, *args, **kwargs): + if not self._is_starting: + self._is_starting = True + self.server.restart(*args, **kwargs) + self._is_starting = False \ No newline at end of file diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3855d25dbb7f8a1b65c6242d0f112b7bb47ca9c2 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,24 @@ +import io +from datetime import datetime +from scipy.io.wavfile import write + +import librosa + + +def get_current_strftime(): + return datetime.now().strftime(r'%d-%m-%y-%H-%M-%S') + + +def bytes_to_array(audio_bytes): + audio_array, _ = librosa.load( + io.BytesIO(audio_bytes), + sr=16000 + ) + return audio_array + + +def array_to_bytes(audio_array): + bytes_wav = bytes() + byte_io = io.BytesIO(bytes_wav) + write(byte_io, 16000, audio_array) + return byte_io.read() \ No newline at end of file diff --git a/style/app_style.css b/style/app_style.css new file mode 100644 index 0000000000000000000000000000000000000000..b9d3a3f213c9506786f0ef682d2a459b1b6c55c0 --- /dev/null +++ b/style/app_style.css @@ -0,0 +1,116 @@ +div[data-testid="stMainBlockContainer"] { + padding-top: 2rem; + padding-bottom: 1rem; +} + +div[data-testid="stMainBlockContainer"]:has( div[height="480"][data-testid="stVerticalBlockBorderWrapper"]) { + height: calc(100% - 90px); +} + +div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"] { + height: 100%; +} + +div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"]>div { + height: 100%; +} + +div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"]>div>div { + height: 100%; +} + +div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div { + max-height: 3rem; +} + +div[data-testid="stMainBlockContainer"] h1 { + padding-top: 0.25rem; +} + +div[class="sidebar-intro"] p { + margin-bottom: 0.75rem; +} + +[class='stAudio'] { + max-width: 500px !important; + margin: auto !important; +} + +div[data-testid="stChatMessage"]:has(> div[data-testid="stChatMessageAvatarUser"]) { + flex-direction: row-reverse; + text-align: right; +} + +div[height="480"][data-testid="stVerticalBlockBorderWrapper"] { + height: 100%; + min-height: 20px; +} + +/* audio quick actions */ + +div[data-testid="stChatMessage"] div[data-testid="stVerticalBlock"]:has( audio[data-testid="stAudio"]) { + gap: 2px; +} + +div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) { + flex-direction: row-reverse; + gap: 4px; +} + +div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]>div[data-testid="stColumn"]:has( div[data-testid="stButton"]) { + width: 6rem; + min-width: 6rem; + flex: 0 0 6rem; +} + +/* File uploader */ + +section[data-testid='stFileUploaderDropzone'] { + padding:6px 2rem; +} + +section[data-testid='stFileUploaderDropzone']>button { + display:none; +} + +div[data-testid="stFileUploaderDropzoneInstructions"]>div>span { + display:none; +} + +div[data-testid="stBottomBlockContainer"] { + padding-bottom: 2rem; +} + +/* Chat input component at the bottom */ + +div[data-testid="stBottomBlockContainer"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) { + gap: 4px; +} + +div[data-testid="stBottomBlockContainer"] 
div[data-testid="stColumn"]:has( div[data-testid="stButton"]):first-of-type { + width: 61px; + min-width: 61px; + flex: 0 0 61px; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):nth-of-type(2) { + width: 76px; + min-width: 76px; + flex: 0 0 76px; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stChatInput"]) { + width: 10rem; + min-width: 10rem; + flex: 1 1 10rem; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stAudioInput"]) { + width: 10rem; + min-width: 10rem; + flex: 1 1 10rem; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div { + max-height: 40px; +} \ No newline at end of file diff --git a/style/normal_window.css b/style/normal_window.css new file mode 100644 index 0000000000000000000000000000000000000000..f66ffb8ca6da843c656cdb7ec5f6b4fac2283552 --- /dev/null +++ b/style/normal_window.css @@ -0,0 +1,18 @@ +@media(min-width: 800px) { + div[data-testid="stMainBlockContainer"] { + padding-left: 5rem; + padding-right: 5rem; + } + + div[data-testid="stBottomBlockContainer"] { + padding-left: 5rem; + padding-right: 5rem; + } +} + + +@media(min-width: 800px) and (min-height: 800px) { + div[class="main-intro-small-window"] { + display: none; + } +} \ No newline at end of file diff --git a/style/small_window.css b/style/small_window.css new file mode 100644 index 0000000000000000000000000000000000000000..63875f8d6a6ef7872462517ddc5cdee3859ec342 --- /dev/null +++ b/style/small_window.css @@ -0,0 +1,25 @@ +@media(max-width: 800px) { + div[data-testid="stMainBlockContainer"] { + padding-left: 1rem; + padding-right: 1rem; + } + + div[data-testid="stBottomBlockContainer"] { + padding-left: 1rem; + padding-right: 1rem; + } + + div[data-testid="stSidebarCollapsedControl"] button[data-testid="stBaseButton-headerNoPadding"]::after { + content: "More Use Cases" + } +} + +@media(max-width: 800px) or (max-height: 800px) { + div[data-testid="stMainBlockContainer"] div[data-testid="stVerticalBlock"]>div[data-testid="stElementContainer"]:has( div[data-testid="stHeadingWithActionElements"]) { + display: none; + } + + div[class="main-intro-normal-window"] { + display: none; + } +} \ No newline at end of file