diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..74812cd03e02b4f059f0fdb02781b9d0a16cb546 --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,2 @@ +[client] +showSidebarNavigation = false \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8a00bb12a279eee5165d19eb79bef0c8c3e02ba9 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +--- +title: Interactive-Demo / MERaLiON-AudioLLM +emoji: 🚀 +colorFrom: indigo +colorTo: indigo +sdk: streamlit +sdk_version: 1.41.1 +app_file: app.py +pinned: true +models: +- MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..e497863323cffda8c0dcd856a502419068d5c166 --- /dev/null +++ b/app.py @@ -0,0 +1,3 @@ +from src.content.playground import playground_page + +playground_page() \ No newline at end of file diff --git a/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav b/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav new file mode 100644 index 0000000000000000000000000000000000000000..d1fa45ef18df244757240868794d54063f723d56 Binary files /dev/null and b/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav differ diff --git a/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav b/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav new file mode 100644 index 0000000000000000000000000000000000000000..f8c81dfdd16b9167a273a3964a6a3dd47de80d98 Binary files /dev/null and b/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav differ diff --git a/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav 
b/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav new file mode 100644 index 0000000000000000000000000000000000000000..186c8c090c344c63e0ea828cc12a51207df58aff Binary files /dev/null and b/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav differ diff --git a/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav b/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav new file mode 100644 index 0000000000000000000000000000000000000000..0e606bfd770bb85abab4957d0380fef221c254c4 Binary files /dev/null and b/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav differ diff --git a/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav b/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav new file mode 100644 index 0000000000000000000000000000000000000000..cbf2fad1cc780f4a7d691974033dcef7fd0fcef0 Binary files /dev/null and b/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav differ diff --git a/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav b/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav new file mode 100644 index 0000000000000000000000000000000000000000..8ea95dd7dbab7762154da8588b7ead701317e956 Binary files /dev/null and b/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav differ diff --git a/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav b/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav new file mode 100644 index 0000000000000000000000000000000000000000..df7b46f0a8f4f93b280cd9c91e486956e5d89b11 Binary files /dev/null and b/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav differ diff --git a/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav b/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav new file mode 100644 index 0000000000000000000000000000000000000000..69c2b3ee39465bc58efe769fde69c95c9d5092fc Binary files /dev/null and b/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav differ diff --git a/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav b/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav new file mode 100644 index 0000000000000000000000000000000000000000..1d89e648d87d2bc193f728ac86b54ea7a4e07634 Binary files /dev/null and b/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav differ diff --git a/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav b/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav new file mode 100644 index 0000000000000000000000000000000000000000..b296224725ec5acf74a02304f6beb6a7723d2c89 Binary files /dev/null and b/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav differ diff --git a/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav b/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav new file mode 100644 index 0000000000000000000000000000000000000000..17c5fc99647aaa658eabe035b40f97f8ea7638d1 Binary files /dev/null and b/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav differ diff --git a/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav b/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav new file mode 100644 index 0000000000000000000000000000000000000000..1ec609efd1c3790487c3c0aec77e5e3e5b0c3eda Binary files /dev/null and b/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav differ diff --git a/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav b/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav new file mode 100644 index 0000000000000000000000000000000000000000..55c435f3b228e136e3c1047a4b43e992b9acfc0f Binary files /dev/null and b/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav differ diff --git a/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav b/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav new file mode 100644 index 
0000000000000000000000000000000000000000..f2780b3b7da1d553f59f4f29256b4e848049cf52 Binary files /dev/null and b/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav differ diff --git a/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav b/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav new file mode 100644 index 0000000000000000000000000000000000000000..234f811d4c60ab67659f06bcd1db481a11648ca9 Binary files /dev/null and b/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav differ diff --git a/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav b/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav new file mode 100644 index 0000000000000000000000000000000000000000..239fff4d4cfcf2653e00d97ca842f334bd31ed18 Binary files /dev/null and b/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav differ diff --git a/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav b/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav new file mode 100644 index 0000000000000000000000000000000000000000..35d9dfbdc9ca3169a05c50a548cb5836adc65d52 Binary files /dev/null and b/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav differ diff --git a/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav b/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav new file mode 100644 index 0000000000000000000000000000000000000000..1b3ff08f36d5e02043445bd8c0f37b73cdd9f59c Binary files /dev/null and b/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav differ diff --git a/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav b/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav new file mode 100644 index 0000000000000000000000000000000000000000..d84f6abdca95d5bfa3f292f45b370c243bf79f86 Binary files /dev/null and b/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav differ diff --git a/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav b/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav new file mode 100644 index 0000000000000000000000000000000000000000..e0d9a7f61f0a8b0137bc8c5ddd4d03c02686b49b Binary files /dev/null and b/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav differ diff --git a/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav b/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav new file mode 100644 index 0000000000000000000000000000000000000000..4f0aadf1e9ac1e100c052fa9df0760651e2b2c4f Binary files /dev/null and b/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav differ diff --git a/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav b/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav new file mode 100644 index 0000000000000000000000000000000000000000..c2858560478a1b51a6085e0f54a34d4bbca30b8e Binary files /dev/null and b/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav differ diff --git a/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav b/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav new file mode 100644 index 0000000000000000000000000000000000000000..e4f53b20b6210ef6bba708ea1bccb9ad787caf22 Binary files /dev/null and b/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav differ diff --git a/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav b/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav new file mode 100644 index 0000000000000000000000000000000000000000..8e18d39cdceaa84abc9dff3f002a0c6502c30b69 Binary files /dev/null and b/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav differ diff --git a/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav b/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav new file mode 100644 index 
0000000000000000000000000000000000000000..6b381a7b04f312f0b317bd3b6a0581155aeaf4c1 Binary files /dev/null and b/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav differ diff --git a/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav b/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav new file mode 100644 index 0000000000000000000000000000000000000000..738c14bf9ff890820659be0ad4d27ec5576ea7c4 Binary files /dev/null and b/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav differ diff --git a/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav b/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav new file mode 100644 index 0000000000000000000000000000000000000000..507bca925cbe5c433d1021c89f8f5c2108fc00d6 Binary files /dev/null and b/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav differ diff --git a/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav b/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav new file mode 100644 index 0000000000000000000000000000000000000000..6709d5a7b3509690d89d222e8a75120b0a9c4d35 Binary files /dev/null and b/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav differ diff --git a/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav b/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav new file mode 100644 index 0000000000000000000000000000000000000000..593e18ad1ff04af7877072ba964c323786ba580e Binary files /dev/null and b/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav differ diff --git a/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav b/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav new file mode 100644 index 0000000000000000000000000000000000000000..cd143063c19ca28fb3820ded2f1caa2cda0a8861 Binary files /dev/null and b/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav differ diff --git a/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav b/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav new file mode 100644 index 0000000000000000000000000000000000000000..af4fe23487085a3047ad1f0f56b824a6a75907f4 Binary files /dev/null and b/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav differ diff --git a/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav b/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav new file mode 100644 index 0000000000000000000000000000000000000000..20a685bb51cd1670280e104e1f06987e471657bb Binary files /dev/null and b/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav differ diff --git a/audio_samples/49_Paralingual_MELD_ER_V2_676.wav b/audio_samples/49_Paralingual_MELD_ER_V2_676.wav new file mode 100644 index 0000000000000000000000000000000000000000..a614033adb66d5d8b5a0054530336876c0d61d86 Binary files /dev/null and b/audio_samples/49_Paralingual_MELD_ER_V2_676.wav differ diff --git a/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav b/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav new file mode 100644 index 0000000000000000000000000000000000000000..48bfb135fc3eb12814801c49abd0b8250178ad86 Binary files /dev/null and b/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav differ diff --git a/audio_samples/50_Paralingual_MELD_ER_V2_692.wav b/audio_samples/50_Paralingual_MELD_ER_V2_692.wav new file mode 100644 index 0000000000000000000000000000000000000000..69f435f7308b5090f2668d22c1f324d30dd8857e Binary files /dev/null and b/audio_samples/50_Paralingual_MELD_ER_V2_692.wav differ diff --git a/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav b/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav new file mode 100644 index 0000000000000000000000000000000000000000..42d4d89846cfcd0c6bb0de173f584ad2b6d6d131 Binary files 
/dev/null and b/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav differ diff --git a/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav b/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav new file mode 100644 index 0000000000000000000000000000000000000000..ce05d92f8004d6054d39fae59f4d3a34c3b80e49 Binary files /dev/null and b/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav differ diff --git a/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav b/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..f8513f46825e7b386b1f00f058d249044dac82d2 Binary files /dev/null and b/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav differ diff --git a/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav b/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav new file mode 100644 index 0000000000000000000000000000000000000000..f95f167ebe177b0db82f346f9dbd2c51eb828ec1 Binary files /dev/null and b/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav differ diff --git a/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav b/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav new file mode 100644 index 0000000000000000000000000000000000000000..c2decc6d21300257c3fc74b6718f1898dedbf4e2 Binary files /dev/null and b/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav differ diff --git a/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav b/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav new file mode 100644 index 0000000000000000000000000000000000000000..55063388c14bd69df6a8023e5a65e4c9c3a01fb5 Binary files /dev/null and b/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav differ diff --git a/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav b/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav new file mode 100644 index 0000000000000000000000000000000000000000..daf99a1877bae21a5ab72147a7a6359c8953e242 Binary files /dev/null and b/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav differ diff --git a/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav b/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav new file mode 100644 index 0000000000000000000000000000000000000000..5e439cf43817a436692e132e194bcf2b43332126 Binary files /dev/null and b/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav differ diff --git a/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav b/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav new file mode 100644 index 0000000000000000000000000000000000000000..e0929f09849acb481f80ca007bf257a9d937c035 Binary files /dev/null and b/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav differ diff --git a/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav b/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav new file mode 100644 index 0000000000000000000000000000000000000000..11e66f37907da37aa2d90a492e407bc3a7a20bb1 Binary files /dev/null and b/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav differ diff --git a/pages/agent.py b/pages/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..36a9a35623385096495fc8d00d77b7302d9f4a5c --- /dev/null +++ b/pages/agent.py @@ -0,0 +1,3 @@ +from src.content.agent import agent_page + +agent_page() \ No newline at end of file diff --git a/pages/playground.py b/pages/playground.py new file mode 100644 index 0000000000000000000000000000000000000000..da5d8c7953bac6ca10d1fc69c8e755b0284f616d --- /dev/null +++ b/pages/playground.py @@ -0,0 +1,4 @@ +from src.content.playground import playground_page + + +playground_page() diff --git 
a/pages/voice_chat.py b/pages/voice_chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0627ef4f40fc78b1fc85ed7b9db34a0ee603a12
--- /dev/null
+++ b/pages/voice_chat.py
@@ -0,0 +1,4 @@
+from src.content.voice_chat import voice_chat_page
+
+
+voice_chat_page()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..26e328f99799199c667b3f8c88bb84ebd0867bc4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+librosa==0.10.2.post1
+streamlit==1.40.2
+openai==1.57.1
+streamlit_mic_recorder==0.0.8
+sshtunnel
+accelerate==1.3.0
+FlagEmbedding==1.3.3
+sentence-transformers==3.4.0
+sentencepiece==0.1.99
\ No newline at end of file
diff --git a/src/content/agent.py b/src/content/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ec20840a751bb24574c421fda50af0639957d80
--- /dev/null
+++ b/src/content/agent.py
@@ -0,0 +1,289 @@
+import copy
+import base64
+
+import streamlit as st
+
+from src.generation import MAX_AUDIO_LENGTH
+from src.retrieval import STANDARD_QUERIES, retrieve_relevant_docs
+from src.utils import bytes_to_array, array_to_bytes
+from src.content.common import (
+    MODEL_NAMES,
+    AUDIO_SAMPLES_W_INSTRUCT,
+    DEFAULT_DIALOGUE_STATES,
+    init_state_section,
+    header_section,
+    sidebar_fragment,
+    retrive_response_with_ui
+)
+
+
+LLM_NO_AUDIO_PROMPT_TEMPLATE = """{user_question}"""
+
+
+LLM_PROMPT_TEMPLATE = """User asked a question about the audio clip.
+
+## User Question
+{user_question}
+
+{audio_information_prompt}Please reply to the user's question with a friendly, accurate, and helpful answer."""
+
+
+AUDIO_INFO_TEMPLATE = """Here is some information about this audio clip.
+
+## Audio Information
+{audio_information}
+
+However, the audio analysis may or may not contain information relevant to the user's question; please include only the relevant information in your reply.
+
+"""
+
+
+AUDIO_ANALYSIS_STATUS = "MERaLiON-AudioLLM Analysis"
+
+
+def _update_audio(audio_bytes):
+    origin_audio_array = bytes_to_array(audio_bytes)
+    truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
+    truncated_audio_bytes = array_to_bytes(truncated_audio_array)
+
+    st.session_state.ag_audio_array = origin_audio_array
+    st.session_state.ag_audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8')
+
+
+@st.fragment
+def successful_example_section():
+    audio_sample_names = [name for name in AUDIO_SAMPLES_W_INSTRUCT.keys() if "Paral" in name]
+
+    st.markdown(":fire: **Successful Tasks and Examples**")
+
+    sample_name = st.selectbox(
+        label="**Select Audio:**",
+        label_visibility="collapsed",
+        options=audio_sample_names,
+        format_func=lambda o: AUDIO_SAMPLES_W_INSTRUCT[o]["apperance"],
+        index=None,
+        placeholder="Select an audio sample:",
+        on_change=lambda: st.session_state.update(
+            on_select=True,
+            ag_messages=[],
+            ag_model_messages=[],
+            ag_visited_query_indices=[],
+            disprompt=True
+        ),
+        key='select')
+
+    if sample_name and st.session_state.on_select:
+        audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read()
+        st.session_state.update(
+            on_select=False,
+            new_prompt=AUDIO_SAMPLES_W_INSTRUCT[sample_name]["instructions"][0]
+        )
+        _update_audio(audio_bytes)
+        st.rerun(scope="app")
+
+
+@st.dialog("Specify Audio")
+def audio_attach_dialogue():
+    st.markdown("**Upload**")
+
+    uploaded_file = st.file_uploader(
+        label="**Upload Audio:**",
+        label_visibility="collapsed",
+        type=['wav', 'mp3'],
+        on_change=lambda: st.session_state.update(
+            on_upload=True,
+            ag_messages=[],
+            ag_model_messages=[],
+            ag_visited_query_indices=[]
+        ),
+        key='upload'
+    )
+
+    if uploaded_file and st.session_state.on_upload:
+        audio_bytes = uploaded_file.read()
+        _update_audio(audio_bytes)
+        st.session_state.on_upload = False
+        st.rerun()
+
+    st.markdown("**Record**")
+
+    uploaded_file = st.audio_input(
+        label="**Record Audio:**",
+        label_visibility="collapsed",
+        on_change=lambda: st.session_state.update(
+            on_record=True,
+            ag_messages=[],
+            ag_model_messages=[],
+            ag_visited_query_indices=[]
+        ),
+        key='record'
+    )
+
+    if uploaded_file and st.session_state.on_record:
+        audio_bytes = uploaded_file.read()
+        _update_audio(audio_bytes)
+        st.session_state.on_record = False
+        st.rerun()
+
+
+def bottom_input_section():
+    bottom_cols = st.columns([0.03, 0.03, 0.94])
+    with bottom_cols[0]:
+        st.button(
+            'Clear',
+            disabled=st.session_state.disprompt,
+            on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
+        )
+
+    with bottom_cols[1]:
+        if st.button("\+ Audio", disabled=st.session_state.disprompt):
+            audio_attach_dialogue()
+
+    with bottom_cols[2]:
+        if chat_input := st.chat_input(
+            placeholder="Instruction...",
+            disabled=st.session_state.disprompt,
+            on_submit=lambda: st.session_state.update(disprompt=True)
+        ):
+            st.session_state.new_prompt = chat_input
+
+
+def _prepare_final_prompt_with_ui(one_time_prompt):
+    if st.session_state.ag_audio_array.shape[0] == 0:
+        return LLM_NO_AUDIO_PROMPT_TEMPLATE.format(user_question=one_time_prompt)
+
+    with st.spinner("Searching appropriate queries..."):
+        relevant_query_indices = retrieve_relevant_docs(one_time_prompt)
+        if len(st.session_state.ag_messages) <= 2:
+            relevant_query_indices.append(0)
+
+        relevant_query_indices = list(
+            set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices)
+        )
+
+        st.session_state.ag_visited_query_indices.extend(relevant_query_indices)
+
+    if not relevant_query_indices:
+        return LLM_PROMPT_TEMPLATE.format(
+            user_question=one_time_prompt,
+            audio_information_prompt=""
+        )
+
+    audio_info = []
+    with st.status(AUDIO_ANALYSIS_STATUS, expanded=False) as status:
+        for i, standard_idx in enumerate(relevant_query_indices):
+            new_label = (
+                f"{AUDIO_ANALYSIS_STATUS}: "
+                f"{STANDARD_QUERIES[standard_idx]['ui_text']} "
+                f"({i+1}/{len(relevant_query_indices)})"
+            )
+
+            status.update(label=new_label, state="running")
+            error_msg, warnings, response = retrive_response_with_ui(
+                model_name=MODEL_NAMES["audiollm"]["vllm_name"],
+                text_input=STANDARD_QUERIES[standard_idx]["query_text"],
+                array_audio_input=st.session_state.ag_audio_array,
+                base64_audio_input=st.session_state.ag_audio_base64,
+                prefix=f"**{STANDARD_QUERIES[standard_idx]['ui_text']}**: ",
+                stream=True,
+                show_warning=i == 0
+            )
+            audio_info.append(STANDARD_QUERIES[standard_idx]["response_prefix_text"] + response)
+
+            st.session_state.ag_messages[-1]["process"].append({
+                "error": error_msg,
+                "warnings": warnings,
+                "content": response
+            })
+
+        status.update(label=AUDIO_ANALYSIS_STATUS, state="complete")
+
+    audio_information_prompt = AUDIO_INFO_TEMPLATE.format(
+        audio_information="\n".join(audio_info)
+    )
+
+    return LLM_PROMPT_TEMPLATE.format(
+        user_question=one_time_prompt,
+        audio_information_prompt=audio_information_prompt
+    )
+
+
+def conversation_section():
+    chat_message_container = st.container(height=480)
+    if st.session_state.ag_audio_array.size:
+        with chat_message_container.chat_message("user"):
+            st.audio(st.session_state.ag_audio_array, format="audio/wav", sample_rate=16000)
+
+    for message in st.session_state.ag_messages:
+        message_name = "assistant" if "assistant" in message["role"] else message["role"]
+
+        with chat_message_container.chat_message(name=message_name):
+            if message.get("error"):
+                st.error(message["error"])
+            for warning_msg in message.get("warnings", []):
+                st.warning(warning_msg)
+            if process := message.get("process", []):
+                with st.status(AUDIO_ANALYSIS_STATUS, expanded=False, state="complete"):
+                    for proc in process:
+                        if proc.get("error"):
+                            st.error(proc["error"])
+                        for proc_warning_msg in proc.get("warnings", []):
+                            st.warning(proc_warning_msg)
+                        if proc.get("content"):
+                            st.write(proc["content"])
+            if message.get("content"):
+                st.write(message["content"])
+
+    with st._bottom:
+        bottom_input_section()
+
+    if one_time_prompt := st.session_state.new_prompt:
+        st.session_state.update(new_prompt="")
+
+        with chat_message_container.chat_message("user"):
+            st.write(one_time_prompt)
+            st.session_state.ag_messages.append({"role": "user", "content": one_time_prompt})
+
+        with chat_message_container.chat_message("assistant"):
+            assistant_message = {"role": "assistant", "process": []}
+            st.session_state.ag_messages.append(assistant_message)
+
+            final_prompt = _prepare_final_prompt_with_ui(one_time_prompt)
+
+            error_msg, warnings, response = retrive_response_with_ui(
+                model_name=MODEL_NAMES["llm"]["vllm_name"],
+                text_input=final_prompt,
+                array_audio_input=st.session_state.ag_audio_array,
+                base64_audio_input="",
+                prefix=f"**{MODEL_NAMES['llm']['ui_name']}**: ",
+                stream=True,
+                history=st.session_state.ag_model_messages,
+                show_warning=False
+            )
+
+            assistant_message.update({"error": error_msg, "warnings": warnings, "content": response})
+            st.session_state.ag_model_messages.extend([
+                {"role": "user", "content": final_prompt},
+                {"role": "assistant", "content": response}
+            ])
+
+        st.session_state.disprompt = False
+        st.rerun(scope="app")
+
+
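Note on the design: `_prepare_final_prompt_with_ui` above is a small retrieval-gated cascade — the user's question selects which of the fixed `STANDARD_QUERIES` to run against MERaLiON-AudioLLM, already-visited queries are skipped for the rest of the session, and the aggregated answers are packed into `LLM_PROMPT_TEMPLATE` for the text-only LLM. The sketch below restates that control flow outside Streamlit; `retrieve` and `run_audio_llm` are hypothetical stand-ins for `retrieve_relevant_docs` and the vLLM call, and the two example queries are illustrative, not the Space's actual `STANDARD_QUERIES` (which live in src/retrieval.py, outside this diff).

```python
from typing import Callable, Dict, List

# Illustrative stand-ins only -- not the Space's real query set.
STANDARD_QUERIES: List[Dict[str, str]] = [
    {"query_text": "Please transcribe this speech.",
     "response_prefix_text": "Transcription: "},
    {"query_text": "What emotion does the speaker express?",
     "response_prefix_text": "Emotion: "},
]

def prepare_final_prompt(
    question: str,
    visited: List[int],
    retrieve: Callable[[str], List[int]],   # stand-in for retrieve_relevant_docs
    run_audio_llm: Callable[[str], str],    # stand-in for the AudioLLM call
) -> str:
    """Run only the audio-analysis queries not already asked this session."""
    relevant = sorted(set(retrieve(question)) - set(visited))
    visited.extend(relevant)
    if not relevant:
        # Simplified: the Space still wraps this case in LLM_PROMPT_TEMPLATE.
        return question
    audio_info = [
        STANDARD_QUERIES[i]["response_prefix_text"]
        + run_audio_llm(STANDARD_QUERIES[i]["query_text"])
        for i in relevant
    ]
    return (
        "User asked a question about the audio clip.\n\n"
        f"## User Question\n{question}\n\n"
        "## Audio Information\n" + "\n".join(audio_info)
    )
```

Tracking `visited` per session is what keeps multi-round conversations from re-running the same audio analyses on every turn.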
+def agent_page():
+    init_state_section()
+    header_section(
+        component_name="Chatbot",
+        description=""" It is implemented by connecting multiple AI models,
+        offers more flexibility, and supports multi-round conversation.""",
+        concise_description=""" It is implemented by connecting multiple AI models and
+        supports multi-round conversation.""",
+        icon="👥"
+    )
+
+    with st.sidebar:
+        sidebar_fragment()
+
+    successful_example_section()
+    conversation_section()
\ No newline at end of file
diff --git a/src/content/common.py b/src/content/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..40dc6e132087aa499c5411037cb4392d56c1c5a1
--- /dev/null
+++ b/src/content/common.py
@@ -0,0 +1,443 @@
+import os
+import copy
+import itertools
+from collections import OrderedDict
+from typing import List, Optional
+
+import numpy as np
+import streamlit as st
+
+from src.tunnel import start_server
+from src.generation import FIXED_GENERATION_CONFIG, load_model, retrive_response
+from src.retrieval import load_retriever
+from src.logger import load_logger
+
+
+DEFAULT_DIALOGUE_STATES = dict(
+    pg_audio_base64='',
+    pg_audio_array=np.array([]),
+    pg_messages=[],
+    vc_audio_base64='',
+    vc_audio_array=np.array([]),
+    vc_messages=[],
+    ag_audio_base64='',
+    ag_audio_array=np.array([]),
+    ag_visited_query_indices=[],
+    ag_messages=[],
+    ag_model_messages=[],
+    disprompt=False,
+    new_prompt="",
+    on_select=False,
+    on_upload=False,
+    on_record=False,
+    on_select_quick_action=False
+)
+
+
+MODEL_NAMES = OrderedDict({})
+
+
+AUDIO_SAMPLES_W_INSTRUCT = {
+    "7_ASR_IMDA_PART3_30_ASR_v2_2269": {
+        "apperance": "7. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Need this talk written down, please."
+        ]
+    },
+    "11_ASR_IMDA_PART4_30_ASR_v2_3771": {
+        "apperance": "11. Automatic Speech Recognition task: conversation with Singlish code-switch",
+        "instructions": [
+            "Write out the dialogue as text."
+        ]
+    },
+    "12_ASR_IMDA_PART4_30_ASR_v2_103": {
+        "apperance": "12. Automatic Speech Recognition task: conversation with Singlish code-switch",
+        "instructions": [
+            "Write out the dialogue as text."
+        ]
+    },
+    "17_ASR_IMDA_PART6_30_ASR_v2_1413": {
+        "apperance": "17. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Record the spoken word in text form."
+        ]
+    },
+    "32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572": {
+        "apperance": "32. Spoken Question Answering task: general speech",
+        "instructions": [
+            "What does the man think the woman should do at 4:00."
+        ]
+    },
+    "33_SQA_IMDA_PART3_30_SQA_V2_2310": {
+        "apperance": "33. Spoken Question Answering task: conversation in Singapore accent",
+        "instructions": [
+            "Does Speaker2's wife cook for Speaker2 when they are at home."
+        ]
+    },
+    "34_SQA_IMDA_PART3_30_SQA_V2_3621": {
+        "apperance": "34. Spoken Question Answering task: conversation in Singapore accent",
+        "instructions": [
+            "Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."
+        ]
+    },
+    "35_SQA_IMDA_PART3_30_SQA_V2_4062": {
+        "apperance": "35. Spoken Question Answering task: conversation in Singapore accent",
+        "instructions": [
+            "What is the color of the vase mentioned in the dialogue."
+        ]
+    },
+    "36_DS_IMDA_PART4_30_DS_V2_849": {
+        "apperance": "36. Spoken Dialogue Summarization task: conversation with Singlish code-switch",
+        "instructions": [
+            "Condense the dialogue into a concise summary highlighting major topics and conclusions."
+        ]
+    },
+    "39_Paralingual_IEMOCAP_ER_V2_91": {
+        "apperance": "39. Paralinguistics task: general speech",
+        "instructions": [
+            "Based on the speaker's speech patterns, what do you think they are feeling."
+        ]
+    },
+    "40_Paralingual_IEMOCAP_ER_V2_567": {
+        "apperance": "40. Paralinguistics task: general speech",
+        "instructions": [
+            "Based on the speaker's speech patterns, what do you think they are feeling."
+        ]
+    },
+    "42_Paralingual_IEMOCAP_GR_V2_320": {
+        "apperance": "42. Paralinguistics task: general speech",
+        "instructions": [
+            "Is it possible for you to identify whether the speaker in this recording is male or female."
+        ]
+    },
+    "47_Paralingual_IMDA_PART3_30_NR_V2_10479": {
+        "apperance": "47. Paralinguistics task: conversation in Singapore accent",
+        "instructions": [
+            "Can you guess which ethnic group this person is from based on their accent."
+        ]
+    },
+    "49_Paralingual_MELD_ER_V2_676": {
+        "apperance": "49. Paralinguistics task: general speech",
+        "instructions": [
+            "What emotions do you think the speaker is expressing."
+        ]
+    },
+    "50_Paralingual_MELD_ER_V2_692": {
+        "apperance": "50. Paralinguistics task: general speech",
+        "instructions": [
+            "Based on the speaker's speech patterns, what do you think they are feeling."
+        ]
+    },
+    "51_Paralingual_VOXCELEB1_GR_V2_2148": {
+        "apperance": "51. Paralinguistics task: general speech",
+        "instructions": [
+            "May I know the gender of the speaker."
+        ]
+    },
+    "53_Paralingual_VOXCELEB1_NR_V2_2286": {
+        "apperance": "53. Paralinguistics task: general speech",
+        "instructions": [
+            "What's the nationality identity of the speaker."
+        ]
+    },
+    "55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2": {
+        "apperance": "55. Spoken Question Answering task: general speech",
+        "instructions": [
+            "What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth."
+        ]
+    },
+    "56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415": {
+        "apperance": "56. Spoken Question Answering task: general speech",
+        "instructions": [
+            "Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore."
+        ]
+    },
+    "57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460": {
+        "apperance": "57. Spoken Question Answering task: general speech",
+        "instructions": [
+            "How does the author respond to parents' worries about masks in schools."
+        ]
+    },
+    "1_ASR_IMDA_PART1_ASR_v2_141": {
+        "apperance": "1. Automatic Speech Recognition task: phonetically balanced reading",
+        "instructions": [
+            "Turn the spoken language into a text format.",
+            "Please translate the content into Chinese."
+        ]
+    },
+    "2_ASR_IMDA_PART1_ASR_v2_2258": {
+        "apperance": "2. Automatic Speech Recognition task: phonetically balanced reading",
+        "instructions": [
+            "Turn the spoken language into a text format.",
+            "Please translate the content into Chinese."
+        ]
+    },
+    "3_ASR_IMDA_PART1_ASR_v2_2265": {
+        "apperance": "3. Automatic Speech Recognition task: phonetically balanced reading",
+        "instructions": [
+            "Turn the spoken language into a text format."
+        ]
+    },
+    "4_ASR_IMDA_PART2_ASR_v2_999": {
+        "apperance": "4. Automatic Speech Recognition task: reading in Singapore context",
+        "instructions": [
+            "Translate the spoken words into text format."
+        ]
+    },
+    "5_ASR_IMDA_PART2_ASR_v2_2241": {
+        "apperance": "5. Automatic Speech Recognition task: reading in Singapore context",
+        "instructions": [
+            "Translate the spoken words into text format."
+        ]
+    },
+    "6_ASR_IMDA_PART2_ASR_v2_3409": {
+        "apperance": "6. Automatic Speech Recognition task: reading in Singapore context",
+        "instructions": [
+            "Translate the spoken words into text format."
+        ]
+    },
+    "8_ASR_IMDA_PART3_30_ASR_v2_1698": {
+        "apperance": "8. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Need this talk written down, please."
+        ]
+    },
+    "9_ASR_IMDA_PART3_30_ASR_v2_2474": {
+        "apperance": "9. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Need this talk written down, please."
+        ]
+    },
+    "10_ASR_IMDA_PART4_30_ASR_v2_1527": {
+        "apperance": "10. Automatic Speech Recognition task: conversation with Singlish code-switch",
+        "instructions": [
+            "Write out the dialogue as text."
+        ]
+    },
+    "13_ASR_IMDA_PART5_30_ASR_v2_1446": {
+        "apperance": "13. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Translate this vocal recording into a textual format."
+        ]
+    },
+    "14_ASR_IMDA_PART5_30_ASR_v2_2281": {
+        "apperance": "14. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Translate this vocal recording into a textual format."
+        ]
+    },
+    "15_ASR_IMDA_PART5_30_ASR_v2_4388": {
+        "apperance": "15. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Translate this vocal recording into a textual format."
+        ]
+    },
+    "16_ASR_IMDA_PART6_30_ASR_v2_576": {
+        "apperance": "16. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Record the spoken word in text form."
+        ]
+    },
+    "18_ASR_IMDA_PART6_30_ASR_v2_2834": {
+        "apperance": "18. Automatic Speech Recognition task: conversation in Singapore accent",
+        "instructions": [
+            "Record the spoken word in text form."
+        ]
+    },
+    "19_ASR_AIShell_zh_ASR_v2_5044": {
+        "apperance": "19. Automatic Speech Recognition task: speech in Chinese",
+        "instructions": [
+            "Transform the oral presentation into a text document."
+        ]
+    },
+    "20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833": {
+        "apperance": "20. Automatic Speech Recognition task: general speech",
+        "instructions": [
+            "Please provide a written transcription of the speech."
+        ]
+    },
+    "25_ST_COVOST2_ZH-CN_EN_ST_V2_4567": {
+        "apperance": "25. Speech Translation task: Chinese to English",
+        "instructions": [
+            "Please translate the given speech to English."
+        ]
+    },
+    "26_ST_COVOST2_EN_ZH-CN_ST_V2_5422": {
+        "apperance": "26. Speech Translation task: English to Chinese",
+        "instructions": [
+            "Please translate the given speech to Chinese."
+        ]
+    },
+    "27_ST_COVOST2_EN_ZH-CN_ST_V2_6697": {
+        "apperance": "27. Speech Translation task: English to Chinese",
+        "instructions": [
+            "Please translate the given speech to Chinese."
+        ]
+    },
+    "28_SI_ALPACA-GPT4-AUDIO_SI_V2_299": {
+        "apperance": "28. Speech Instruction task: general speech",
+        "instructions": [
+            "Please follow the instruction in the speech."
+        ]
+    },
+    "29_SI_ALPACA-GPT4-AUDIO_SI_V2_750": {
+        "apperance": "29. Speech Instruction task: general speech",
+        "instructions": [
+            "Please follow the instruction in the speech."
+        ]
+    },
+    "30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454": {
+        "apperance": "30. Speech Instruction task: general speech",
+        "instructions": [
+            "Please follow the instruction in the speech."
+        ]
+    }
+}
+
+
+exec(os.getenv('APP_CONFIGS'))
+
+
+def init_state_section():
+    st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon="🔥", layout='wide')
+
+    st.markdown(
+        (
+            ''
+        ),
+        unsafe_allow_html=True
+    )
+
+    if "logger" not in st.session_state:
+        st.session_state.logger = load_logger()
+        st.session_state.session_id = st.session_state.logger.register_session()
+
+    if "server" not in st.session_state:
+        st.session_state.server = start_server()
+
+    if "client_mapper" not in st.session_state:
+        st.session_state.client_mapper = load_model()
+
+    if "retriever" not in st.session_state:
+        st.session_state.retriever = load_retriever()
+
+    for key, value in FIXED_GENERATION_CONFIG.items():
+        if key not in st.session_state:
+            st.session_state[key] = copy.deepcopy(value)
+
+    for key, value in DEFAULT_DIALOGUE_STATES.items():
+        if key not in st.session_state:
+            st.session_state[key] = copy.deepcopy(value)
+
+
+def header_section(component_name, description="", concise_description="", icon="🤖"):
+    st.markdown(
+        f"""This {component_name.lower()} is based on
+        MERaLiON-AudioLLM,
+        developed by I2R, A*STAR, in collaboration with AISG, Singapore.
+        {description}""",
+        unsafe_allow_html=True
+    )
+
+    st.markdown(
+        f"""This {component_name.lower()} is based on
+        MERaLiON-AudioLLM.{concise_description}""",
+        unsafe_allow_html=True
+    )
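For completeness, the audio path shared by all three pages (`_update_audio` in src/content/agent.py, built on `bytes_to_array`/`array_to_bytes` from src.utils, which are not shown in this diff) amounts to: decode the uploaded bytes to a 16 kHz mono float array, truncate to `MAX_AUDIO_LENGTH` seconds, and base64-encode the truncated WAV for the OpenAI-compatible vLLM endpoint. A minimal standalone sketch, assuming librosa plus its soundfile dependency, and assuming a 30-second `MAX_AUDIO_LENGTH` (the real constant lives in src/generation.py, outside this diff):

```python
import base64
import io

import librosa
import numpy as np
import soundfile as sf

MAX_AUDIO_LENGTH = 30   # seconds; an assumption -- the Space defines this in src/generation.py
SAMPLE_RATE = 16_000    # MERaLiON-AudioLLM consumes 16 kHz mono audio

def encode_audio(audio_bytes: bytes) -> tuple[np.ndarray, str]:
    """Return (full waveform, base64-encoded WAV of the first MAX_AUDIO_LENGTH seconds)."""
    # Decode any supported container (wav/mp3) into a mono 16 kHz float array.
    array, _ = librosa.load(io.BytesIO(audio_bytes), sr=SAMPLE_RATE, mono=True)
    truncated = array[: MAX_AUDIO_LENGTH * SAMPLE_RATE]
    # Re-encode the truncated clip as 16-bit PCM WAV for the API payload.
    buffer = io.BytesIO()
    sf.write(buffer, truncated, SAMPLE_RATE, format="WAV", subtype="PCM_16")
    return array, base64.b64encode(buffer.getvalue()).decode("utf-8")
```

This mirrors the split the app keeps in session state: the untruncated array (`ag_audio_array`) is retained for playback in the chat UI, while only the truncated base64 payload (`ag_audio_base64`) is sent to the model.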